SIMD acceleration for LINCS

author Berk Hess <hess@kth.se>

Tue, 13 Jan 2015 20:15:21 +0000 (21:15 +0100)

committer Gerrit Code Review <gerrit@gerrit.gromacs.org>

Thu, 5 Mar 2015 00:11:28 +0000 (01:11 +0100)
author Berk Hess <hess@kth.se>
Tue, 13 Jan 2015 20:15:21 +0000 (21:15 +0100)
committer Gerrit Code Review <gerrit@gerrit.gromacs.org>
Thu, 5 Mar 2015 00:11:28 +0000 (01:11 +0100)
diff --git a/src/gromacs/listed-forces/bonded.cpp b/src/gromacs/listed-forces/bonded.cpp

index d3a8c24a9eac44175a049246701b7e17ffc9d34e..f34528ded50b51981e10ad0688df7e3854389f69 100644 (file)
--- a/src/gromacs/listed-forces/bonded.cpp
+++ b/src/gromacs/listed-forces/bonded.cpp
@@ -57,6 +57,7 @@
  #include "gromacs/math/vec.h"
  #include "gromacs/pbcutil/ishift.h"
  #include "gromacs/pbcutil/mshift.h"
+#include "gromacs/pbcutil/pbc-simd.h"
  #include "gromacs/pbcutil/pbc.h"
  #include "gromacs/simd/simd.h"
  #include "gromacs/simd/simd_math.h"
@@ -105,83 +106,6 @@ static int pbc_rvec_sub(const t_pbc *pbc, const rvec xi, const rvec xj, rvec dx)
      }
  }
  
-#ifdef GMX_SIMD_HAVE_REAL
-
-/* SIMD PBC data structure, containing 1/boxdiag and the box vectors */
-typedef struct {
-    gmx_simd_real_t inv_bzz;
-    gmx_simd_real_t inv_byy;
-    gmx_simd_real_t inv_bxx;
-    gmx_simd_real_t bzx;
-    gmx_simd_real_t bzy;
-    gmx_simd_real_t bzz;
-    gmx_simd_real_t byx;
-    gmx_simd_real_t byy;
-    gmx_simd_real_t bxx;
-} pbc_simd_t;
-
-/*! \brief Set the SIMD pbc data from a normal t_pbc struct */
-static void set_pbc_simd(const t_pbc *pbc, pbc_simd_t *pbc_simd)
-{
-    rvec inv_bdiag;
-    int  d;
-
-    /* Setting inv_bdiag to 0 effectively turns off PBC */
-    clear_rvec(inv_bdiag);
-    if (pbc != NULL)
-    {
-        for (d = 0; d < pbc->ndim_ePBC; d++)
-        {
-            inv_bdiag[d] = 1.0/pbc->box[d][d];
-        }
-    }
-
-    pbc_simd->inv_bzz = gmx_simd_set1_r(inv_bdiag[ZZ]);
-    pbc_simd->inv_byy = gmx_simd_set1_r(inv_bdiag[YY]);
-    pbc_simd->inv_bxx = gmx_simd_set1_r(inv_bdiag[XX]);
-
-    if (pbc != NULL)
-    {
-        pbc_simd->bzx = gmx_simd_set1_r(pbc->box[ZZ][XX]);
-        pbc_simd->bzy = gmx_simd_set1_r(pbc->box[ZZ][YY]);
-        pbc_simd->bzz = gmx_simd_set1_r(pbc->box[ZZ][ZZ]);
-        pbc_simd->byx = gmx_simd_set1_r(pbc->box[YY][XX]);
-        pbc_simd->byy = gmx_simd_set1_r(pbc->box[YY][YY]);
-        pbc_simd->bxx = gmx_simd_set1_r(pbc->box[XX][XX]);
-    }
-    else
-    {
-        pbc_simd->bzx = gmx_simd_setzero_r();
-        pbc_simd->bzy = gmx_simd_setzero_r();
-        pbc_simd->bzz = gmx_simd_setzero_r();
-        pbc_simd->byx = gmx_simd_setzero_r();
-        pbc_simd->byy = gmx_simd_setzero_r();
-        pbc_simd->bxx = gmx_simd_setzero_r();
-    }
-}
-
-/*! \brief Correct distance vector *dx,*dy,*dz for PBC using SIMD */
-static gmx_inline void
-pbc_dx_simd(gmx_simd_real_t *dx, gmx_simd_real_t *dy, gmx_simd_real_t *dz,
-            const pbc_simd_t *pbc)
-{
-    gmx_simd_real_t sh;
-
-    sh  = gmx_simd_round_r(gmx_simd_mul_r(*dz, pbc->inv_bzz));
-    *dx = gmx_simd_fnmadd_r(sh, pbc->bzx, *dx);
-    *dy = gmx_simd_fnmadd_r(sh, pbc->bzy, *dy);
-    *dz = gmx_simd_fnmadd_r(sh, pbc->bzz, *dz);
-
-    sh  = gmx_simd_round_r(gmx_simd_mul_r(*dy, pbc->inv_byy));
-    *dx = gmx_simd_fnmadd_r(sh, pbc->byx, *dx);
-    *dy = gmx_simd_fnmadd_r(sh, pbc->byy, *dy);
-
-    sh  = gmx_simd_round_r(gmx_simd_mul_r(*dx, pbc->inv_bxx));
-    *dx = gmx_simd_fnmadd_r(sh, pbc->bxx, *dx);
-}
-
-#endif /* GMX_SIMD_HAVE_REAL */
-
  /*! \brief Morse potential bond
   *
   * By Frank Everdij. Three parameters needed:
@@ -1101,9 +1025,6 @@ angles_noener_simd(int nbonds,
              coeff[s]                     = forceparams[type].harmonic.krA;
              coeff[GMX_SIMD_REAL_WIDTH+s] = forceparams[type].harmonic.rA*DEG2RAD;
  
-            /* If you can't use pbc_dx_simd below for PBC, e.g. because
-             * you can't round in SIMD, use pbc_rvec_sub here.
-             */
              /* Store the non PBC corrected distances packed and aligned */
              for (m = 0; m < DIM; m++)
              {
@@ -1128,8 +1049,8 @@ angles_noener_simd(int nbonds,
          rkjy_S    = gmx_simd_load_r(dr + 4*GMX_SIMD_REAL_WIDTH);
          rkjz_S    = gmx_simd_load_r(dr + 5*GMX_SIMD_REAL_WIDTH);
  
-        pbc_dx_simd(&rijx_S, &rijy_S, &rijz_S, &pbc_simd);
-        pbc_dx_simd(&rkjx_S, &rkjy_S, &rkjz_S, &pbc_simd);
+        pbc_correct_dx_simd(&rijx_S, &rijy_S, &rijz_S, &pbc_simd);
+        pbc_correct_dx_simd(&rkjx_S, &rkjy_S, &rkjz_S, &pbc_simd);
  
          rij_rkj_S = gmx_simd_iprod_r(rijx_S, rijy_S, rijz_S,
                                       rkjx_S, rkjy_S, rkjz_S);
@@ -1544,9 +1465,6 @@ dih_angle_simd(const rvec *x,
  
      for (s = 0; s < GMX_SIMD_REAL_WIDTH; s++)
      {
-        /* If you can't use pbc_dx_simd below for PBC, e.g. because
-         * you can't round in SIMD, use pbc_rvec_sub here.
-         */
          for (m = 0; m < DIM; m++)
          {
              dr[s + (0*DIM + m)*GMX_SIMD_REAL_WIDTH] = x[ai[s]][m] - x[aj[s]][m];
@@ -1565,9 +1483,9 @@ dih_angle_simd(const rvec *x,
      rkly_S = gmx_simd_load_r(dr + 7*GMX_SIMD_REAL_WIDTH);
      rklz_S = gmx_simd_load_r(dr + 8*GMX_SIMD_REAL_WIDTH);
  
-    pbc_dx_simd(&rijx_S, &rijy_S, &rijz_S, pbc);
-    pbc_dx_simd(&rkjx_S, &rkjy_S, &rkjz_S, pbc);
-    pbc_dx_simd(&rklx_S, &rkly_S, &rklz_S, pbc);
+    pbc_correct_dx_simd(&rijx_S, &rijy_S, &rijz_S, pbc);
+    pbc_correct_dx_simd(&rkjx_S, &rkjy_S, &rkjz_S, pbc);
+    pbc_correct_dx_simd(&rklx_S, &rkly_S, &rklz_S, pbc);
  
      gmx_simd_cprod_r(rijx_S, rijy_S, rijz_S,
                       rkjx_S, rkjy_S, rkjz_S,
diff --git a/src/gromacs/listed-forces/bonded.h b/src/gromacs/listed-forces/bonded.h

index 4c9e185b2ec1ee3ae3cb36297301a40ffaafab8a..d21e8ab2bbd1e882f4ec08bd35370b4722fb3c26 100644 (file)
--- a/src/gromacs/listed-forces/bonded.h
+++ b/src/gromacs/listed-forces/bonded.h
@@ -56,6 +56,7 @@
  #include "gromacs/legacyheaders/types/mdatom.h"
  #include "gromacs/legacyheaders/types/nrnb.h"
  #include "gromacs/math/vec.h"
+#include "gromacs/pbcutil/pbc-simd.h"
  #include "gromacs/simd/simd.h"
  #include "gromacs/topology/idef.h"
  #include "gromacs/utility/basedefinitions.h"
diff --git a/src/gromacs/listed-forces/listed-forces.cpp b/src/gromacs/listed-forces/listed-forces.cpp

index 417d224e7375b3bf2615f7330482ae7e69ff9e92..4cb4e90ac098f720da30eb699aabfc7eddaccd44 100644 (file)
--- a/src/gromacs/listed-forces/listed-forces.cpp
+++ b/src/gromacs/listed-forces/listed-forces.cpp
@@ -259,6 +259,16 @@ calc_one_bond(int thread,
                gmx_bool bCalcEnerVir,
                int *global_atom_index)
  {
+#ifdef GMX_SIMD_HAVE_REAL
+    gmx_bool bUseSIMD;
+    /* MSVC 2010 produces buggy SIMD PBC code, disable SIMD for MSVC <= 2010 */
+#if defined _MSC_VER && _MSC_VER < 1700
+    bUseSIMD = FALSE;
+#else
+    bUseSIMD = fr->use_simd_kernels;
+#endif
+#endif
+
      int      nat1, nbonds, efptFTYPE;
      real     v = 0;
      t_iatom *iatoms;
@@ -291,7 +301,7 @@ calc_one_bond(int thread,
                            md, fcd, global_atom_index);
          }
  #ifdef GMX_SIMD_HAVE_REAL
-        else if (ftype == F_ANGLES &&
+        else if (ftype == F_ANGLES && bUseSIMD &&
                   !bCalcEnerVir && fr->efep == efepNO)
          {
              /* No energies, shift forces, dvdl */
@@ -308,19 +318,27 @@ calc_one_bond(int thread,
          {
              /* No energies, shift forces, dvdl */
  #ifdef GMX_SIMD_HAVE_REAL
-            pdihs_noener_simd
-#else
-            pdihs_noener
+            if (bUseSIMD)
+            {
+                pdihs_noener_simd(nbn, idef->il[ftype].iatoms+nb0,
+                                  idef->iparams,
+                                  x, f,
+                                  pbc, g, lambda[efptFTYPE], md, fcd,
+                                  global_atom_index);
+            }
+            else
  #endif
-                (nbn, idef->il[ftype].iatoms+nb0,
-                idef->iparams,
-                x, f,
-                pbc, g, lambda[efptFTYPE], md, fcd,
-                global_atom_index);
+            {
+                pdihs_noener(nbn, idef->il[ftype].iatoms+nb0,
+                             idef->iparams,
+                             x, f,
+                             pbc, g, lambda[efptFTYPE], md, fcd,
+                             global_atom_index);
+            }
              v = 0;
          }
  #ifdef GMX_SIMD_HAVE_REAL
-        else if (ftype == F_RBDIHS &&
+        else if (ftype == F_RBDIHS && bUseSIMD &&
                   !bCalcEnerVir && fr->efep == efepNO)
          {
              /* No energies, shift forces, dvdl */
diff --git a/src/gromacs/mdlib/clincs.cpp b/src/gromacs/mdlib/clincs.cpp

index 480a96596d841b2b1afcb99627f738aa78b843ab..a2a3b7c12ff3fbd9207b3cd609279d244a70d2ea 100644 (file)
--- a/src/gromacs/mdlib/clincs.cpp
+++ b/src/gromacs/mdlib/clincs.cpp
@@ -3,7 +3,7 @@
   *
   * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   * Copyright (c) 2001-2004, The GROMACS development team.
- * Copyright (c) 2013,2014, by the GROMACS development team, led by
+ * Copyright (c) 2013,2014,2015, by the GROMACS development team, led by
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
@@ -37,9 +37,14 @@
  /* This file is completely threadsafe - keep it that way! */
  #include "gmxpre.h"
  
+#include "config.h"
+
+#include <assert.h>
  #include <math.h>
  #include <stdlib.h>
  
+#include <algorithm>
+
  #include "gromacs/domdec/domdec.h"
  #include "gromacs/fileio/gmxfio.h"
  #include "gromacs/legacyheaders/constr.h"
@@ -50,7 +55,11 @@
  #include "gromacs/legacyheaders/types/commrec.h"
  #include "gromacs/math/units.h"
  #include "gromacs/math/vec.h"
+#include "gromacs/pbcutil/pbc-simd.h"
  #include "gromacs/pbcutil/pbc.h"
+#include "gromacs/simd/simd.h"
+#include "gromacs/simd/simd_math.h"
+#include "gromacs/simd/vector_operations.h"
  #include "gromacs/topology/block.h"
  #include "gromacs/topology/mtop_util.h"
  #include "gromacs/utility/bitmask.h"
@@ -58,6 +67,11 @@
  #include "gromacs/utility/gmxomp.h"
  #include "gromacs/utility/smalloc.h"
  
+/* MSVC 2010 produces buggy SIMD PBC code, disable SIMD for MSVC <= 2010 */
+#if defined GMX_SIMD_HAVE_REAL && !(defined _MSC_VER && _MSC_VER < 1700)
+#define LINCS_SIMD
+#endif
+
  typedef struct {
      int    b0;         /* first constraint for this thread */
      int    b1;         /* b1-1 is the last constraint for this thread */
@@ -68,6 +82,7 @@ typedef struct {
      int    ind_nalloc; /* allocation size of ind and ind_r */
      tensor vir_r_m_dr; /* temporary variable for virial calculation */
      real   dhdlambda;  /* temporary variable for lambda derivative */
+    real  *simd_buf;   /* Aligned work array for SIMD */
  } lincs_thread_t;
  
  typedef struct gmx_lincsdata {
@@ -112,6 +127,15 @@ typedef struct gmx_lincsdata {
      real            rmsd_data[3];
  } t_gmx_lincsdata;
  
+/* Define simd_width for memory allocation used for SIMD code */
+#ifdef LINCS_SIMD
+static const int simd_width = GMX_SIMD_REAL_WIDTH;
+#else
+static const int simd_width = 1;
+#endif
+/* We can't use small memory alignments on many systems, so use min 64 bytes*/
+static const int align_bytes = std::max<int>(64, simd_width*sizeof(real));
+
  real *lincs_rmsd_data(struct gmx_lincsdata *lincsd)
  {
      return lincsd->rmsd_data;
@@ -177,8 +201,8 @@ static void lincs_matrix_expand(const struct gmx_lincsdata *lincsd,
           */
          /* We need to copy the temporary array, since only the elements
           * for constraints involved in triangles are updated and then
-         * the pointers are swapped. This saving copying the whole arrary.
-         * We need barrier as other threads might still be reading from rhs2.
+         * the pointers are swapped. This saves copying the whole array.
+         * We need a barrier as other threads might still be reading from rhs2.
           */
  #pragma omp barrier
          for (b = b0; b < b1; b++)
@@ -357,6 +381,115 @@ static void lincs_update_atoms(struct gmx_lincsdata *li, int th,
      }
  }
  
+#ifdef LINCS_SIMD
+/* Calculate distances between indexed atom coordinate pairs
+ * and store the result in 3 SIMD registers through an aligned buffer.
+ * Start from index is and load GMX_SIMD_REAL_WIDTH elements.
+ * Note that pair_index must contain valid indices up to is+GMX_SIMD_REAL_WIDTH.
+ */
+static gmx_inline void gmx_simdcall
+rvec_dist_to_simd(const rvec      *x,
+                  int              is,
+                  const int       *pair_index,
+                  real            *buf,
+                  gmx_simd_real_t *dx,
+                  gmx_simd_real_t *dy,
+                  gmx_simd_real_t *dz)
+{
+    int s, m;
+
+    for (s = 0; s < GMX_SIMD_REAL_WIDTH; s++)
+    {
+        /* Store the non PBC corrected distances packed and aligned */
+        for (m = 0; m < DIM; m++)
+        {
+            buf[m*GMX_SIMD_REAL_WIDTH + s] =
+                x[pair_index[2*(is+s)]][m] - x[pair_index[2*(is+s) + 1]][m];
+        }
+    }
+    *dx = gmx_simd_load_r(buf + 0*GMX_SIMD_REAL_WIDTH);
+    *dy = gmx_simd_load_r(buf + 1*GMX_SIMD_REAL_WIDTH);
+    *dz = gmx_simd_load_r(buf + 2*GMX_SIMD_REAL_WIDTH);
+}
+
+/* Store a SIMD vector in GMX_SIMD_REAL_WIDTH rvecs through an aligned buffer */
+static gmx_inline void gmx_simdcall
+copy_simd_vec_to_rvec(gmx_simd_real_t  x,
+                      gmx_simd_real_t  y,
+                      gmx_simd_real_t  z,
+                      real            *buf,
+                      rvec            *v,
+                      int              is)
+{
+    int s, m;
+
+    gmx_simd_store_r(buf + 0*GMX_SIMD_REAL_WIDTH, x);
+    gmx_simd_store_r(buf + 1*GMX_SIMD_REAL_WIDTH, y);
+    gmx_simd_store_r(buf + 2*GMX_SIMD_REAL_WIDTH, z);
+
+    for (s = 0; s < GMX_SIMD_REAL_WIDTH; s++)
+    {
+        for (m = 0; m < DIM; m++)
+        {
+            v[is + s][m] = buf[m*GMX_SIMD_REAL_WIDTH + s];
+        }
+    }
+}
+
+/* Calculate the constraint distance vectors r to project on from x.
+ * Determine the right-hand side of the matrix equation using quantity f.
+ * This function only differs from calc_dr_x_xp_simd below in that
+ * no constraint length is subtracted and no PBC is used for f.
+ */
+static void gmx_simdcall
+calc_dr_x_f_simd(int                       b0,
+                 int                       b1,
+                 const int *               bla,
+                 const rvec * gmx_restrict x,
+                 const rvec * gmx_restrict f,
+                 const real * gmx_restrict blc,
+                 const pbc_simd_t *        pbc_simd,
+                 real * gmx_restrict       vbuf1,
+                 real * gmx_restrict       vbuf2,
+                 rvec * gmx_restrict       r,
+                 real * gmx_restrict       rhs,
+                 real * gmx_restrict       sol)
+{
+    int bs;
+
+    assert(b0 % GMX_SIMD_REAL_WIDTH == 0);
+
+    for (bs = b0; bs < b1; bs += GMX_SIMD_REAL_WIDTH)
+    {
+        gmx_simd_real_t rx_S, ry_S, rz_S, n2_S, il_S;
+        gmx_simd_real_t fx_S, fy_S, fz_S, ip_S, rhs_S;
+
+        rvec_dist_to_simd(x, bs, bla, vbuf1, &rx_S, &ry_S, &rz_S);
+
+        pbc_correct_dx_simd(&rx_S, &ry_S, &rz_S, pbc_simd);
+
+        n2_S  = gmx_simd_norm2_r(rx_S, ry_S, rz_S);
+        il_S  = gmx_simd_invsqrt_r(n2_S);
+
+        rx_S  = gmx_simd_mul_r(rx_S, il_S);
+        ry_S  = gmx_simd_mul_r(ry_S, il_S);
+        rz_S  = gmx_simd_mul_r(rz_S, il_S);
+
+        copy_simd_vec_to_rvec(rx_S, ry_S, rz_S, vbuf1, r, bs);
+
+        rvec_dist_to_simd(f, bs, bla, vbuf2, &fx_S, &fy_S, &fz_S);
+
+        ip_S  = gmx_simd_iprod_r(rx_S, ry_S, rz_S,
+                                 fx_S, fy_S, fz_S);
+
+        rhs_S = gmx_simd_mul_r(gmx_simd_load_r(blc + bs), ip_S);
+
+        gmx_simd_store_r(rhs + bs, rhs_S);
+        gmx_simd_store_r(sol + bs, rhs_S);
+    }
+}
+#endif /* LINCS_SIMD */
+
  /* LINCS projection, works on derivatives of the coordinates */
  static void do_lincsp(rvec *x, rvec *f, rvec *fp, t_pbc *pbc,
                        struct gmx_lincsdata *lincsd, int th,
@@ -364,9 +497,7 @@ static void do_lincsp(rvec *x, rvec *f, rvec *fp, t_pbc *pbc,
                        int econq, gmx_bool bCalcDHDL,
                        gmx_bool bCalcVir, tensor rmdf)
  {
-    int      b0, b1, b, i, j, k, n;
-    real     tmp0, tmp1, tmp2, mvb;
-    rvec     dx;
+    int      b0, b1, b;
      int     *bla, *blnr, *blbnb;
      rvec    *r;
      real    *blc, *blmf, *blcc, *rhs1, *rhs2, *sol;
@@ -395,11 +526,35 @@ static void do_lincsp(rvec *x, rvec *f, rvec *fp, t_pbc *pbc,
      rhs2   = lincsd->tmp2;
      sol    = lincsd->tmp3;
  
+#ifdef LINCS_SIMD
+
+    /* This SIMD code does the same as the plain-C code after the #else.
+     * The only difference is that we always call pbc code, as with SIMD
+     * the overhead of pbc computation (when not needed) is small.
+     */
+    pbc_simd_t pbc_simd;
+
+    /* Convert the pbc struct for SIMD */
+    set_pbc_simd(pbc, &pbc_simd);
+
+    /* Compute normalized x i-j vectors, store in r.
+     * Compute the inner product of r and xp i-j and store in rhs1.
+     */
+    calc_dr_x_f_simd(b0, b1, bla, x, f, blc,
+                     &pbc_simd,
+                     lincsd->th[th].simd_buf,
+                     lincsd->th[th].simd_buf + GMX_SIMD_REAL_WIDTH*DIM,
+                     r, rhs1, sol);
+
+#else /* LINCS_SIMD */
+
      /* Compute normalized i-j vectors */
      if (pbc)
      {
          for (b = b0; b < b1; b++)
          {
+            rvec dx;
+
              pbc_dx_aiuc(pbc, x[bla[2*b]], x[bla[2*b+1]], dx);
              unitv(dx, r[b]);
          }
@@ -408,30 +563,44 @@ static void do_lincsp(rvec *x, rvec *f, rvec *fp, t_pbc *pbc,
      {
          for (b = b0; b < b1; b++)
          {
+            rvec dx;
+
              rvec_sub(x[bla[2*b]], x[bla[2*b+1]], dx);
              unitv(dx, r[b]);
          } /* 16 ncons flops */
      }
  
+    for (b = b0; b < b1; b++)
+    {
+        int  i, j;
+        real mvb;
+
+        i       = bla[2*b];
+        j       = bla[2*b+1];
+        mvb     = blc[b]*(r[b][0]*(f[i][0] - f[j][0]) +
+                          r[b][1]*(f[i][1] - f[j][1]) +
+                          r[b][2]*(f[i][2] - f[j][2]));
+        rhs1[b] = mvb;
+        sol[b]  = mvb;
+        /* 7 flops */
+    }
+
+#endif /* LINCS_SIMD */
+
+    /* We need a barrier, since the matrix construction below
+     * can access entries in r of other threads.
+     */
  #pragma omp barrier
+
+    /* Construct the (sparse) LINCS matrix */
      for (b = b0; b < b1; b++)
      {
-        tmp0 = r[b][0];
-        tmp1 = r[b][1];
-        tmp2 = r[b][2];
-        i    = bla[2*b];
-        j    = bla[2*b+1];
+        int n;
+
          for (n = blnr[b]; n < blnr[b+1]; n++)
          {
-            k       = blbnb[n];
-            blcc[n] = blmf[n]*(tmp0*r[k][0] + tmp1*r[k][1] + tmp2*r[k][2]);
+            blcc[n] = blmf[n]*iprod(r[b], r[blbnb[n]]);
          } /* 6 nr flops */
-        mvb = blc[b]*(tmp0*(f[i][0] - f[j][0]) +
-                      tmp1*(f[i][1] - f[j][1]) +
-                      tmp2*(f[i][2] - f[j][2]));
-        rhs1[b] = mvb;
-        sol[b]  = mvb;
-        /* 7 flops */
      }
      /* Together: 23*ncons + 6*nrtot flops */
  
@@ -486,6 +655,9 @@ static void do_lincsp(rvec *x, rvec *f, rvec *fp, t_pbc *pbc,
           */
          for (b = b0; b < b1; b++)
          {
+            real mvb, tmp1;
+            int  i, j;
+
              mvb = lincsd->bllen[b]*sol[b];
              for (i = 0; i < DIM; i++)
              {
@@ -499,18 +671,193 @@ static void do_lincsp(rvec *x, rvec *f, rvec *fp, t_pbc *pbc,
      }
  }
  
+#ifdef LINCS_SIMD
+/* Calculate the constraint distance vectors r to project on from x.
+ * Determine the right-hand side of the matrix equation using coordinates xp.
+ */
+static void gmx_simdcall
+calc_dr_x_xp_simd(int                       b0,
+                  int                       b1,
+                  const int *               bla,
+                  const rvec * gmx_restrict x,
+                  const rvec * gmx_restrict xp,
+                  const real * gmx_restrict bllen,
+                  const real * gmx_restrict blc,
+                  const pbc_simd_t *        pbc_simd,
+                  real * gmx_restrict       vbuf1,
+                  real * gmx_restrict       vbuf2,
+                  rvec * gmx_restrict       r,
+                  real * gmx_restrict       rhs,
+                  real * gmx_restrict       sol)
+{
+    int bs;
+
+    assert(b0 % GMX_SIMD_REAL_WIDTH == 0);
+
+    for (bs = b0; bs < b1; bs += GMX_SIMD_REAL_WIDTH)
+    {
+        gmx_simd_real_t rx_S, ry_S, rz_S, n2_S, il_S;
+        gmx_simd_real_t rxp_S, ryp_S, rzp_S, ip_S, rhs_S;
+
+        rvec_dist_to_simd(x, bs, bla, vbuf1, &rx_S, &ry_S, &rz_S);
+
+        pbc_correct_dx_simd(&rx_S, &ry_S, &rz_S, pbc_simd);
+
+        n2_S  = gmx_simd_norm2_r(rx_S, ry_S, rz_S);
+        il_S  = gmx_simd_invsqrt_r(n2_S);
+
+        rx_S  = gmx_simd_mul_r(rx_S, il_S);
+        ry_S  = gmx_simd_mul_r(ry_S, il_S);
+        rz_S  = gmx_simd_mul_r(rz_S, il_S);
+
+        copy_simd_vec_to_rvec(rx_S, ry_S, rz_S, vbuf1, r, bs);
+
+        rvec_dist_to_simd(xp, bs, bla, vbuf2, &rxp_S, &ryp_S, &rzp_S);
+
+        pbc_correct_dx_simd(&rxp_S, &ryp_S, &rzp_S, pbc_simd);
+
+        ip_S  = gmx_simd_iprod_r(rx_S, ry_S, rz_S,
+                                 rxp_S, ryp_S, rzp_S);
+
+        rhs_S = gmx_simd_mul_r(gmx_simd_load_r(blc + bs),
+                               gmx_simd_sub_r(ip_S, gmx_simd_load_r(bllen + bs)));
+
+        gmx_simd_store_r(rhs + bs, rhs_S);
+        gmx_simd_store_r(sol + bs, rhs_S);
+    }
+}
+#endif /* LINCS_SIMD */
+
+/* Determine the distances and right-hand side for the next iteration */
+static void calc_dist_iter(int                       b0,
+                           int                       b1,
+                           const int *               bla,
+                           const rvec * gmx_restrict xp,
+                           const real * gmx_restrict bllen,
+                           const real * gmx_restrict blc,
+                           const t_pbc *             pbc,
+                           real                      wfac,
+                           real * gmx_restrict       rhs,
+                           real * gmx_restrict       sol,
+                           gmx_bool *                bWarn)
+{
+    int b;
+
+    for (b = b0; b < b1; b++)
+    {
+        real len, len2, dlen2, mvb;
+        rvec dx;
+
+        len = bllen[b];
+        if (pbc)
+        {
+            pbc_dx_aiuc(pbc, xp[bla[2*b]], xp[bla[2*b+1]], dx);
+        }
+        else
+        {
+            rvec_sub(xp[bla[2*b]], xp[bla[2*b+1]], dx);
+        }
+        len2  = len*len;
+        dlen2 = 2*len2 - norm2(dx);
+        if (dlen2 < wfac*len2)
+        {
+            /* not race free - see detailed comment in caller */
+            *bWarn = TRUE;
+        }
+        if (dlen2 > 0)
+        {
+            mvb = blc[b]*(len - dlen2*gmx_invsqrt(dlen2));
+        }
+        else
+        {
+            mvb = blc[b]*len;
+        }
+        rhs[b]  = mvb;
+        sol[b]  = mvb;
+    } /* 20*ncons flops */
+}
+
+#ifdef LINCS_SIMD
+/* As the function above, but using SIMD intrinsics */
+static void gmx_simdcall
+calc_dist_iter_simd(int                       b0,
+                    int                       b1,
+                    const int *               bla,
+                    const rvec * gmx_restrict x,
+                    const real * gmx_restrict bllen,
+                    const real * gmx_restrict blc,
+                    const pbc_simd_t *        pbc_simd,
+                    real                      wfac,
+                    real * gmx_restrict       vbuf,
+                    real * gmx_restrict       rhs,
+                    real * gmx_restrict       sol,
+                    gmx_bool *                bWarn)
+{
+    gmx_simd_real_t min_S  = gmx_simd_set1_r(GMX_REAL_MIN);
+    gmx_simd_real_t two_S  = gmx_simd_set1_r(2.0);
+    gmx_simd_real_t wfac_S = gmx_simd_set1_r(wfac);
+    gmx_simd_bool_t warn_S;
+
+    int             bs;
+
+    assert(b0 % GMX_SIMD_REAL_WIDTH == 0);
+
+    /* Initialize all to FALSE */
+    warn_S = gmx_simd_cmplt_r(two_S, gmx_simd_setzero_r());
+
+    for (bs = b0; bs < b1; bs += GMX_SIMD_REAL_WIDTH)
+    {
+        gmx_simd_real_t rx_S, ry_S, rz_S, n2_S;
+        gmx_simd_real_t len_S, len2_S, dlen2_S, lc_S, blc_S;
+
+        rvec_dist_to_simd(x, bs, bla, vbuf, &rx_S, &ry_S, &rz_S);
+
+        pbc_correct_dx_simd(&rx_S, &ry_S, &rz_S, pbc_simd);
+
+        n2_S    = gmx_simd_norm2_r(rx_S, ry_S, rz_S);
+
+        len_S   = gmx_simd_load_r(bllen + bs);
+        len2_S  = gmx_simd_mul_r(len_S, len_S);
+
+        dlen2_S = gmx_simd_fmsub_r(two_S, len2_S, n2_S);
+
+        warn_S  = gmx_simd_or_b(warn_S,
+                                gmx_simd_cmplt_r(dlen2_S,
+                                                 gmx_simd_mul_r(wfac_S, len2_S)));
+
+        /* Avoid 1/0 by taking the max with REAL_MIN.
+         * Note: when dlen2 is close to zero (90 degree constraint rotation),
+         * the accuracy of the algorithm is no longer relevant.
+         */
+        dlen2_S = gmx_simd_max_r(dlen2_S, min_S);
+
+        lc_S    = gmx_simd_fnmadd_r(dlen2_S, gmx_simd_invsqrt_r(dlen2_S), len_S);
+
+        blc_S   = gmx_simd_load_r(blc + bs);
+
+        lc_S    = gmx_simd_mul_r(blc_S, lc_S);
+
+        gmx_simd_store_r(rhs + bs, lc_S);
+        gmx_simd_store_r(sol + bs, lc_S);
+    }
+
+    if (gmx_simd_anytrue_b(warn_S))
+    {
+        *bWarn = TRUE;
+    }
+}
+#endif /* LINCS_SIMD */
+
  static void do_lincs(rvec *x, rvec *xp, matrix box, t_pbc *pbc,
                       struct gmx_lincsdata *lincsd, int th,
-                     real *invmass,
+                     const real *invmass,
                       t_commrec *cr,
                       gmx_bool bCalcDHDL,
-                     real wangle, int *warn,
-                     real invdt, rvec *v,
+                     real wangle, gmx_bool *bWarn,
+                     real invdt, rvec * gmx_restrict v,
                       gmx_bool bCalcVir, tensor vir_r_m_dr)
  {
-    int      b0, b1, b, i, j, k, n, iter;
-    real     tmp0, tmp1, tmp2, mvb, rlen, len, len2, dlen2, wfac;
-    rvec     dx;
+    int      b0, b1, b, i, j, n, iter;
      int     *bla, *blnr, *blbnb;
      rvec    *r;
      real    *blc, *blmf, *bllen, *blcc, *rhs1, *rhs2, *sol, *blc_sol, *mlambda;
@@ -542,21 +889,39 @@ static void do_lincs(rvec *x, rvec *xp, matrix box, t_pbc *pbc,
          nlocat = NULL;
      }
  
+#ifdef LINCS_SIMD
+
+    /* This SIMD code does the same as the plain-C code after the #else.
+     * The only difference is that we always call pbc code, as with SIMD
+     * the overhead of pbc computation (when not needed) is small.
+     */
+    pbc_simd_t pbc_simd;
+
+    /* Convert the pbc struct for SIMD */
+    set_pbc_simd(pbc, &pbc_simd);
+
+    /* Compute normalized x i-j vectors, store in r.
+     * Compute the inner product of r and xp i-j and store in rhs1.
+     */
+    calc_dr_x_xp_simd(b0, b1, bla, x, xp, bllen, blc,
+                      &pbc_simd,
+                      lincsd->th[th].simd_buf,
+                      lincsd->th[th].simd_buf + GMX_SIMD_REAL_WIDTH*DIM,
+                      r, rhs1, sol);
+
+#else /* LINCS_SIMD */
+
      if (pbc)
      {
          /* Compute normalized i-j vectors */
          for (b = b0; b < b1; b++)
          {
+            rvec dx;
+            real mvb;
+
              pbc_dx_aiuc(pbc, x[bla[2*b]], x[bla[2*b+1]], dx);
              unitv(dx, r[b]);
-        }
-#pragma omp barrier
-        for (b = b0; b < b1; b++)
-        {
-            for (n = blnr[b]; n < blnr[b+1]; n++)
-            {
-                blcc[n] = blmf[n]*iprod(r[b], r[blbnb[n]]);
-            }
+
              pbc_dx_aiuc(pbc, xp[bla[2*b]], xp[bla[2*b+1]], dx);
              mvb     = blc[b]*(iprod(r[b], dx) - bllen[b]);
              rhs1[b] = mvb;
@@ -568,34 +933,24 @@ static void do_lincs(rvec *x, rvec *xp, matrix box, t_pbc *pbc,
          /* Compute normalized i-j vectors */
          for (b = b0; b < b1; b++)
          {
+            real tmp0, tmp1, tmp2, rlen, mvb;
+
              i       = bla[2*b];
              j       = bla[2*b+1];
              tmp0    = x[i][0] - x[j][0];
              tmp1    = x[i][1] - x[j][1];
              tmp2    = x[i][2] - x[j][2];
-            rlen    = gmx_invsqrt(tmp0*tmp0+tmp1*tmp1+tmp2*tmp2);
+            rlen    = gmx_invsqrt(tmp0*tmp0 + tmp1*tmp1 + tmp2*tmp2);
              r[b][0] = rlen*tmp0;
              r[b][1] = rlen*tmp1;
              r[b][2] = rlen*tmp2;
-        } /* 16 ncons flops */
+            /* 16 ncons flops */
  
-#pragma omp barrier
-        for (b = b0; b < b1; b++)
-        {
-            tmp0 = r[b][0];
-            tmp1 = r[b][1];
-            tmp2 = r[b][2];
-            len  = bllen[b];
-            i    = bla[2*b];
-            j    = bla[2*b+1];
-            for (n = blnr[b]; n < blnr[b+1]; n++)
-            {
-                k       = blbnb[n];
-                blcc[n] = blmf[n]*(tmp0*r[k][0] + tmp1*r[k][1] + tmp2*r[k][2]);
-            } /* 6 nr flops */
-            mvb = blc[b]*(tmp0*(xp[i][0] - xp[j][0]) +
-                          tmp1*(xp[i][1] - xp[j][1]) +
-                          tmp2*(xp[i][2] - xp[j][2]) - len);
+            i       = bla[2*b];
+            j       = bla[2*b+1];
+            mvb     = blc[b]*(r[b][0]*(xp[i][0] - xp[j][0]) +
+                              r[b][1]*(xp[i][1] - xp[j][1]) +
+                              r[b][2]*(xp[i][2] - xp[j][2]) - bllen[b]);
              rhs1[b] = mvb;
              sol[b]  = mvb;
              /* 10 flops */
@@ -603,13 +958,39 @@ static void do_lincs(rvec *x, rvec *xp, matrix box, t_pbc *pbc,
          /* Together: 26*ncons + 6*nrtot flops */
      }
  
+#endif /* LINCS_SIMD */
+
+    /* We need a barrier, since the matrix construction below
+     * can access entries in r of other threads.
+     */
+#pragma omp barrier
+
+    /* Construct the (sparse) LINCS matrix */
+    for (b = b0; b < b1; b++)
+    {
+        for (n = blnr[b]; n < blnr[b+1]; n++)
+        {
+            blcc[n] = blmf[n]*iprod(r[b], r[blbnb[n]]);
+        }
+    }
+    /* Together: 26*ncons + 6*nrtot flops */
+
      lincs_matrix_expand(lincsd, b0, b1, blcc, rhs1, rhs2, sol);
      /* nrec*(ncons+2*nrtot) flops */
  
+#ifndef LINCS_SIMD
      for (b = b0; b < b1; b++)
      {
          mlambda[b] = blc[b]*sol[b];
      }
+#else
+    for (b = b0; b < b1; b += GMX_SIMD_REAL_WIDTH)
+    {
+        gmx_simd_store_r(mlambda + b,
+                         gmx_simd_mul_r(gmx_simd_load_r(blc + b),
+                                        gmx_simd_load_r(sol + b)));
+    }
+#endif
  
      /* Update the coordinates */
      lincs_update_atoms(lincsd, th, 1.0, mlambda, r, invmass, xp);
@@ -618,6 +999,8 @@ static void do_lincs(rvec *x, rvec *xp, matrix box, t_pbc *pbc,
       ********  Correction for centripetal effects  ********
       */
  
+    real wfac;
+
      wfac = cos(DEG2RAD*wangle);
      wfac = wfac*wfac;
  
@@ -637,45 +1020,40 @@ static void do_lincs(rvec *x, rvec *xp, matrix box, t_pbc *pbc,
          }
  
  #pragma omp barrier
-        for (b = b0; b < b1; b++)
-        {
-            len = bllen[b];
-            if (pbc)
-            {
-                pbc_dx_aiuc(pbc, xp[bla[2*b]], xp[bla[2*b+1]], dx);
-            }
-            else
-            {
-                rvec_sub(xp[bla[2*b]], xp[bla[2*b+1]], dx);
-            }
-            len2  = len*len;
-            dlen2 = 2*len2 - norm2(dx);
-            if (dlen2 < wfac*len2 && (nlocat == NULL || nlocat[b]))
-            {
-                /* not race free - see detailed comment in caller */
-                *warn = b;
-            }
-            if (dlen2 > 0)
-            {
-                mvb = blc[b]*(len - dlen2*gmx_invsqrt(dlen2));
-            }
-            else
-            {
-                mvb = blc[b]*len;
-            }
-            rhs1[b] = mvb;
-            sol[b]  = mvb;
-        } /* 20*ncons flops */
+
+#ifdef LINCS_SIMD
+        calc_dist_iter_simd(b0, b1, bla, xp, bllen, blc, &pbc_simd, wfac,
+                            lincsd->th[th].simd_buf, rhs1, sol, bWarn);
+#else
+        calc_dist_iter(b0, b1, bla, xp, bllen, blc, pbc, wfac,
+                       rhs1, sol, bWarn);
+        /* 20*ncons flops */
+#endif
  
          lincs_matrix_expand(lincsd, b0, b1, blcc, rhs1, rhs2, sol);
          /* nrec*(ncons+2*nrtot) flops */
  
+#ifndef LINCS_SIMD
          for (b = b0; b < b1; b++)
          {
+            real mvb;
+
              mvb         = blc[b]*sol[b];
              blc_sol[b]  = mvb;
              mlambda[b] += mvb;
          }
+#else
+        for (b = b0; b < b1; b += GMX_SIMD_REAL_WIDTH)
+        {
+            gmx_simd_real_t mvb;
+
+            mvb = gmx_simd_mul_r(gmx_simd_load_r(blc + b),
+                                 gmx_simd_load_r(sol + b));
+            gmx_simd_store_r(blc_sol + b, mvb);
+            gmx_simd_store_r(mlambda + b,
+                             gmx_simd_add_r(gmx_simd_load_r(mlambda + b), mvb));
+        }
+#endif
  
          /* Update the coordinates */
          lincs_update_atoms(lincsd, th, 1.0, blc_sol, r, invmass, xp);
@@ -721,6 +1099,8 @@ static void do_lincs(rvec *x, rvec *xp, matrix box, t_pbc *pbc,
          /* Constraint virial */
          for (b = b0; b < b1; b++)
          {
+            real tmp0, tmp1;
+
              tmp0 = -bllen[b]*mlambda[b];
              for (i = 0; i < DIM; i++)
              {
@@ -987,6 +1367,10 @@ gmx_lincsdata_t init_lincs(FILE *fplog, gmx_mtop_t *mtop,
       * but it could be set in set_lincs().
       */
      li->nth = gmx_omp_nthreads_get(emntLINCS);
+    if (debug)
+    {
+        fprintf(debug, "LINCS: using %d threads\n", li->nth);
+    }
      if (li->nth == 1)
      {
          snew(li->th, 1);
@@ -996,9 +1380,13 @@ gmx_lincsdata_t init_lincs(FILE *fplog, gmx_mtop_t *mtop,
          /* Allocate an extra elements for "thread-overlap" constraints */
          snew(li->th, li->nth+1);
      }
-    if (debug)
+    int th;
+#pragma omp parallel for num_threads(li->nth)
+    for (th = 0; th < li->nth; th++)
      {
-        fprintf(debug, "LINCS: using %d threads\n", li->nth);
+        /* Per thread SIMD load buffer for loading 2 simd_width rvecs */
+        snew_aligned(li->th[th].simd_buf, 2*simd_width*DIM,
+                     align_bytes);
      }
  
      if (bPLINCS || li->ncg_triangle > 0)
@@ -1057,6 +1445,8 @@ static void lincs_thread_setup(struct gmx_lincsdata *li, int natoms)
          gmx_fatal(FARGS, "More than %d threads is not supported for LINCS.", BITMASK_SIZE);
      }
  
+    int block = simd_width;
+
      for (th = 0; th < li->nth; th++)
      {
          lincs_thread_t *li_th;
@@ -1064,9 +1454,12 @@ static void lincs_thread_setup(struct gmx_lincsdata *li, int natoms)
  
          li_th = &li->th[th];
  
-        /* The constraints are divided equally over the threads */
-        li_th->b0 = (li->nc* th   )/li->nth;
-        li_th->b1 = (li->nc*(th+1))/li->nth;
+        /* The constraints are divided equally over the threads,
+         * in chunks of size block to ensure alignment for SIMD.
+         */
+        li_th->b0 = (((li->nc + block - 1)* th   )/(block*li->nth))*block;
+        li_th->b1 = (((li->nc + block - 1)*(th+1))/(block*li->nth))*block;
+        li_th->b1 = std::min<int>(li_th->b1, li->nc);
  
          /* For each atom set a flag for constraints from each */
          for (b = li_th->b0; b < li_th->b1; b++)
@@ -1153,6 +1546,14 @@ static void lincs_thread_setup(struct gmx_lincsdata *li, int natoms)
      }
  }
  
+/* There is no realloc with alignment, so here we make one for reals.
+ * Note that this function does not preserve the contents of the memory.
+ */
+static void resize_real_aligned(real **ptr, int nelem)
+{
+    sfree_aligned(*ptr);
+    snew_aligned(*ptr, nelem, align_bytes);
+}
  
  void set_lincs(t_idef *idef, t_mdatoms *md,
                 gmx_bool bDynamics, t_commrec *cr,
@@ -1161,7 +1562,7 @@ void set_lincs(t_idef *idef, t_mdatoms *md,
      int          start, natoms, nflexcon;
      t_blocka     at2con;
      t_iatom     *iatom;
-    int          i, k, ncc_alloc, ni, con, nconnect, concon;
+    int          i, k, ncc_alloc, ncon_tot, con, nconnect, concon;
      int          type, a1, a2;
      real         lenA = 0, lenB;
  
@@ -1215,23 +1616,25 @@ void set_lincs(t_idef *idef, t_mdatoms *md,
      at2con = make_at2con(start, natoms, idef->il, idef->iparams, bDynamics,
                           &nflexcon);
  
+    ncon_tot = idef->il[F_CONSTR].nr/3;
  
-    if (idef->il[F_CONSTR].nr/3 > li->nc_alloc || li->nc_alloc == 0)
+    /* Ensure we have enough space for aligned loads beyond ncon_tot */
+    if (ncon_tot + simd_width > li->nc_alloc || li->nc_alloc == 0)
      {
-        li->nc_alloc = over_alloc_dd(idef->il[F_CONSTR].nr/3);
-        srenew(li->bllen0, li->nc_alloc);
-        srenew(li->ddist, li->nc_alloc);
+        li->nc_alloc = over_alloc_dd(ncon_tot + simd_width);
+        resize_real_aligned(&li->bllen0, li->nc_alloc);
+        resize_real_aligned(&li->ddist, li->nc_alloc);
          srenew(li->bla, 2*li->nc_alloc);
-        srenew(li->blc, li->nc_alloc);
-        srenew(li->blc1, li->nc_alloc);
+        resize_real_aligned(&li->blc, li->nc_alloc);
+        resize_real_aligned(&li->blc1, li->nc_alloc);
          srenew(li->blnr, li->nc_alloc+1);
-        srenew(li->bllen, li->nc_alloc);
+        resize_real_aligned(&li->bllen, li->nc_alloc);
          srenew(li->tmpv, li->nc_alloc);
-        srenew(li->tmp1, li->nc_alloc);
-        srenew(li->tmp2, li->nc_alloc);
-        srenew(li->tmp3, li->nc_alloc);
-        srenew(li->tmp4, li->nc_alloc);
-        srenew(li->mlambda, li->nc_alloc);
+        resize_real_aligned(&li->tmp1, li->nc_alloc);
+        resize_real_aligned(&li->tmp2, li->nc_alloc);
+        resize_real_aligned(&li->tmp3, li->nc_alloc);
+        resize_real_aligned(&li->tmp4, li->nc_alloc);
+        resize_real_aligned(&li->mlambda, li->nc_alloc);
          if (li->ncg_triangle > 0)
          {
              /* This is allocating too much, but it is difficult to improve */
@@ -1245,12 +1648,10 @@ void set_lincs(t_idef *idef, t_mdatoms *md,
      ncc_alloc   = li->ncc_alloc;
      li->blnr[0] = 0;
  
-    ni = idef->il[F_CONSTR].nr/3;
-
      con           = 0;
      nconnect      = 0;
      li->blnr[con] = nconnect;
-    for (i = 0; i < ni; i++)
+    for (i = 0; i < ncon_tot; i++)
      {
          type   = iatom[3*i];
          a1     = iatom[3*i+1];
@@ -1305,6 +1706,22 @@ void set_lincs(t_idef *idef, t_mdatoms *md,
              con++;
          }
      }
+    if (simd_width > 1)
+    {
+        /* Copy the last atom pair indices and lengths for constraints
+         * up to a multiple of simd_width, such that we can do all
+         * SIMD operations without have to worry about end effects.
+         */
+        int con_roundup, i;
+
+        con_roundup = ((con + simd_width - 1)/simd_width)*simd_width;
+        for (i = con; i < con_roundup; i++)
+        {
+            li->bla[i*2    ] = li->bla[(con - 1)*2    ];
+            li->bla[i*2 + 1] = li->bla[(con - 1)*2 + 1];
+            li->bllen[i]     = li->bllen[con - 1];
+        }
+    }
  
      done_blocka(&at2con);
  
@@ -1483,10 +1900,10 @@ gmx_bool constrain_lincs(FILE *fplog, gmx_bool bLog, gmx_bool bEner,
  {
      gmx_bool  bCalcDHDL;
      char      buf[STRLEN], buf2[22], buf3[STRLEN];
-    int       i, warn, p_imax;
+    int       i, p_imax;
      real      ncons_loc, p_ssd, p_max = 0;
      rvec      dx;
-    gmx_bool  bOK;
+    gmx_bool  bOK, bWarn;
  
      bOK = TRUE;
  
@@ -1567,11 +1984,11 @@ gmx_bool constrain_lincs(FILE *fplog, gmx_bool bLog, gmx_bool bEner,
                      &ncons_loc, &p_ssd, &p_max, &p_imax);
          }
  
-        /* This warn var can be updated by multiple threads
+        /* This bWarn var can be updated by multiple threads
           * at the same time. But as we only need to detect
           * if a warning occured or not, this is not an issue.
           */
-        warn = -1;
+        bWarn = FALSE;
  
          /* The OpenMP parallel region of constrain_lincs for coords */
  #pragma omp parallel num_threads(lincsd->nth)
@@ -1583,7 +2000,7 @@ gmx_bool constrain_lincs(FILE *fplog, gmx_bool bLog, gmx_bool bEner,
              do_lincs(x, xprime, box, pbc, lincsd, th,
                       md->invmass, cr,
                       bCalcDHDL,
-                     ir->LincsWarnAngle, &warn,
+                     ir->LincsWarnAngle, &bWarn,
                       invdt, v, bCalcVir,
                       th == 0 ? vir_r_m_dr : lincsd->th[th].vir_r_m_dr);
          }
@@ -1627,7 +2044,7 @@ gmx_bool constrain_lincs(FILE *fplog, gmx_bool bLog, gmx_bool bEner,
                      ddglatnr(cr->dd, lincsd->bla[2*p_imax+1]));
          }
  
-        if (warn >= 0)
+        if (bWarn)
          {
              if (maxwarn >= 0)
              {
diff --git a/src/gromacs/mdlib/forcerec.cpp b/src/gromacs/mdlib/forcerec.cpp

index 3f6b1874a7ba4aa1d3cacf039344e28e52517220..a34f2dd83c5849e559a7609009e04b1291fb105c 100644 (file)
--- a/src/gromacs/mdlib/forcerec.cpp
+++ b/src/gromacs/mdlib/forcerec.cpp
@@ -2486,7 +2486,8 @@ void init_forcerec(FILE              *fp,
          {
              fprintf(fp,
                      "\nFound environment variable GMX_DISABLE_SIMD_KERNELS.\n"
-                    "Disabling the usage of any SIMD-specific kernel routines (e.g. SSE2/SSE4.1/AVX).\n\n");
+                    "Disabling the usage of any SIMD-specific non-bonded & bonded kernel routines\n"
+                    "(e.g. SSE2/SSE4.1/AVX).\n\n");
          }
      }
  
diff --git a/src/gromacs/pbcutil/pbc-simd.cpp b/src/gromacs/pbcutil/pbc-simd.cpp

new file mode 100644 (file)

index 0000000..7ede573
--- /dev/null
+++ b/src/gromacs/pbcutil/pbc-simd.cpp
@@ -0,0 +1,91 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2015, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \internal \file
+ *
+ * \brief This file defines a low-level function for SIMD PBC calculation.
+ *
+ * \author Berk Hess <hess@kth.se>
+ *
+ * \ingroup module_pbcutil
+ */
+#include "gmxpre.h"
+
+#include "pbc-simd.h"
+
+#include "gromacs/math/vec.h"
+#include "gromacs/pbcutil/pbc.h"
+#include "gromacs/simd/simd.h"
+
+void set_pbc_simd(const t_pbc gmx_unused *pbc,
+                  pbc_simd_t gmx_unused  *pbc_simd)
+{
+#ifdef GMX_SIMD_HAVE_REAL
+    rvec inv_box_diag;
+    int  d;
+
+    /* Setting inv_bdiag to 0 effectively turns off PBC */
+    clear_rvec(inv_box_diag);
+    if (pbc != NULL)
+    {
+        for (d = 0; d < pbc->ndim_ePBC; d++)
+        {
+            inv_box_diag[d] = 1.0/pbc->box[d][d];
+        }
+    }
+
+    pbc_simd->inv_bzz = gmx_simd_set1_r(inv_box_diag[ZZ]);
+    pbc_simd->inv_byy = gmx_simd_set1_r(inv_box_diag[YY]);
+    pbc_simd->inv_bxx = gmx_simd_set1_r(inv_box_diag[XX]);
+
+    if (pbc != NULL)
+    {
+        pbc_simd->bzx = gmx_simd_set1_r(pbc->box[ZZ][XX]);
+        pbc_simd->bzy = gmx_simd_set1_r(pbc->box[ZZ][YY]);
+        pbc_simd->bzz = gmx_simd_set1_r(pbc->box[ZZ][ZZ]);
+        pbc_simd->byx = gmx_simd_set1_r(pbc->box[YY][XX]);
+        pbc_simd->byy = gmx_simd_set1_r(pbc->box[YY][YY]);
+        pbc_simd->bxx = gmx_simd_set1_r(pbc->box[XX][XX]);
+    }
+    else
+    {
+        pbc_simd->bzx = gmx_simd_setzero_r();
+        pbc_simd->bzy = gmx_simd_setzero_r();
+        pbc_simd->bzz = gmx_simd_setzero_r();
+        pbc_simd->byx = gmx_simd_setzero_r();
+        pbc_simd->byy = gmx_simd_setzero_r();
+        pbc_simd->bxx = gmx_simd_setzero_r();
+    }
+#endif
+}
diff --git a/src/gromacs/pbcutil/pbc-simd.h b/src/gromacs/pbcutil/pbc-simd.h

new file mode 100644 (file)

index 0000000..6e1348a
--- /dev/null
+++ b/src/gromacs/pbcutil/pbc-simd.h
@@ -0,0 +1,138 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2015, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \libinternal \file
+ *
+ * \brief This file contains a definition, declaration and inline function
+ * for SIMD accelerated PBC calculations.
+ *
+ * \author Berk Hess <hess@kth.se>
+ * \inlibraryapi
+ * \ingroup module_pbcutil
+ */
+
+#ifndef GMX_PBCUTIL_PBC_SIMD_H
+#define GMX_PBCUTIL_PBC_SIMD_H
+
+#include "config.h"
+
+#include "gromacs/pbcutil/pbc.h"
+#include "gromacs/simd/simd.h"
+#include "gromacs/utility/fatalerror.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*! \cond INTERNAL */
+
+/*! \brief Structure containing the PBC setup for SIMD PBC calculations.
+ *
+ * Without SIMD this is a dummy struct, so it can be declared and passed.
+ * This can avoid some ifdef'ing.
+ */
+typedef struct {
+#ifdef GMX_SIMD_HAVE_REAL
+    gmx_simd_real_t inv_bzz; /**< 1/box[ZZ][ZZ] */
+    gmx_simd_real_t inv_byy; /**< 1/box[YY][YY] */
+    gmx_simd_real_t inv_bxx; /**< 1/box[XX][XX] */
+    gmx_simd_real_t bzx;     /**< box[ZZ][XX] */
+    gmx_simd_real_t bzy;     /**< box[ZZ][YY] */
+    gmx_simd_real_t bzz;     /**< box[ZZ][ZZ] */
+    gmx_simd_real_t byx;     /**< box[YY][XX] */
+    gmx_simd_real_t byy;     /**< box[YY][YY] */
+    gmx_simd_real_t bxx;     /**< bo[XX][XX] */
+#else
+    int             dum;     /**< Dummy variable to avoid empty struct */
+#endif
+} pbc_simd_t;
+
+/*! \endcond */
+
+/*! \brief Set the SIMD PBC data from a normal t_pbc struct.
+ *
+ * NULL can be passed for \p pbc, then no PBC will be used.
+ */
+void set_pbc_simd(const t_pbc *pbc,
+                  pbc_simd_t  *pbc_simd);
+
+#if defined GMX_SIMD_HAVE_REAL
+
+/*! \brief Correct SIMD distance vector *dx,*dy,*dz for PBC using SIMD.
+ *
+ * For rectangular boxes all returned distance vectors are the shortest.
+ * For triclinic boxes only distances up to half the smallest box diagonal
+ * element are guaranteed to be the shortest. This means that distances from
+ * 0.5/sqrt(2) times a box vector length (e.g. for a rhombic dodecahedron)
+ * can use a more distant periodic image.
+ * Note that this routine always does PBC arithmetic, even for dimensions
+ * without PBC. But on modern processors the overhead of this, often called,
+ * routine should be low. On e.g. Intel Haswell/Broadwell it takes 8 cycles.
+ */
+static gmx_inline void gmx_simdcall
+pbc_correct_dx_simd(gmx_simd_real_t  *dx,
+                    gmx_simd_real_t  *dy,
+                    gmx_simd_real_t  *dz,
+                    const pbc_simd_t *pbc)
+{
+    gmx_simd_real_t shz, shy, shx;
+
+#if defined _MSC_VER && _MSC_VER < 1700
+    /* The caller side should make sure we never end up here.
+     * TODO Black-list _MSC_VER < 1700 when it's old enough, so we can rid
+     * of this code complication.
+     */
+    gmx_incons("pbc_correct_dx_simd was called for code compiled with MSVC 2010 or older, which produces incorrect code (probably corrupts memory) and therefore this function should not have been called");
+#endif
+
+    shz = gmx_simd_round_r(gmx_simd_mul_r(*dz, pbc->inv_bzz));
+    *dx = gmx_simd_fnmadd_r(shz, pbc->bzx, *dx);
+    *dy = gmx_simd_fnmadd_r(shz, pbc->bzy, *dy);
+    *dz = gmx_simd_fnmadd_r(shz, pbc->bzz, *dz);
+
+    shy = gmx_simd_round_r(gmx_simd_mul_r(*dy, pbc->inv_byy));
+    *dx = gmx_simd_fnmadd_r(shy, pbc->byx, *dx);
+    *dy = gmx_simd_fnmadd_r(shy, pbc->byy, *dy);
+
+    shx = gmx_simd_round_r(gmx_simd_mul_r(*dx, pbc->inv_bxx));
+    *dx = gmx_simd_fnmadd_r(shx, pbc->bxx, *dx);
+}
+
+#endif /* GMX_SIMD_HAVE_REAL */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
author	Berk Hess <hess@kth.se>
	Tue, 13 Jan 2015 20:15:21 +0000 (21:15 +0100)
committer	Gerrit Code Review <gerrit@gerrit.gromacs.org>
	Thu, 5 Mar 2015 00:11:28 +0000 (01:11 +0100)
src/gromacs/listed-forces/bonded.cpp		patch \| blob \| history
src/gromacs/listed-forces/bonded.h		patch \| blob \| history
src/gromacs/listed-forces/listed-forces.cpp		patch \| blob \| history
src/gromacs/mdlib/clincs.cpp		patch \| blob \| history
src/gromacs/mdlib/forcerec.cpp		patch \| blob \| history
src/gromacs/pbcutil/pbc-simd.cpp	[new file with mode: 0644]	patch \| blob
src/gromacs/pbcutil/pbc-simd.h	[new file with mode: 0644]	patch \| blob