From: Berk Hess <hess@kth.se>
Date: Thu, 19 Feb 2015 14:27:09 +0000 (+0100)
Subject: LINCS thread tasks can now be independent
X-Git-Url: http://biod.pnpi.spb.ru/gitweb/?a=commitdiff_plain;h=b23fad4be3871942cc5e4cf9a6dac311f51005bb;p=alexxy%2Fgromacs.git

LINCS thread tasks can now be independent

With very locally coupled constraints, such as H-bonds only
constraints, the LINCS OpenMP tasks are now independent. This means
that no OpenMP barriers are required, which can significantly speed
up LINCS.
Additionally triangle constraints (which are present in e.g. OH groups
when using pdb2gmx -vsite) are now divided over thread tasks instead
of done by the master thread only. This slightly improves load
balancing and removes two thread barriers.

Change-Id: Ibbafd9c10f51d35a87e9784a0650d849c0d1c1e5
---

diff --git a/src/gromacs/mdlib/clincs.cpp b/src/gromacs/mdlib/clincs.cpp
index a2a3b7c12f..e4de579842 100644
--- a/src/gromacs/mdlib/clincs.cpp
+++ b/src/gromacs/mdlib/clincs.cpp
@@ -73,8 +73,12 @@
 #endif
 
 typedef struct {
-    int    b0;         /* first constraint for this thread */
-    int    b1;         /* b1-1 is the last constraint for this thread */
+    int    b0;         /* first constraint for this task */
+    int    b1;         /* b1-1 is the last constraint for this task */
+    int    ntriangle;  /* the number of constraints in triangles */
+    int   *triangle;   /* the list of triangle constraints */
+    int   *tri_bits;   /* the bits tell if the matrix element should be used */
+    int    tri_alloc;  /* allocation size of triangle and tri_bits */
     int    nind;       /* number of indices */
     int   *ind;        /* constraint index for updating atom data */
     int    nind_r;     /* number of indices */
@@ -83,7 +87,7 @@ typedef struct {
     tensor vir_r_m_dr; /* temporary variable for virial calculation */
     real   dhdlambda;  /* temporary variable for lambda derivative */
     real  *simd_buf;   /* Aligned work array for SIMD */
-} lincs_thread_t;
+} lincs_task_t;
 
 typedef struct gmx_lincsdata {
     int             ncg;          /* the global number of constraints */
@@ -91,11 +95,15 @@ typedef struct gmx_lincsdata {
     int             ncg_triangle; /* the global number of constraints in triangles */
     int             nIter;        /* the number of iterations */
     int             nOrder;       /* the order of the matrix expansion */
-    int             nc;           /* the number of constraints */
+    int             max_connect;  /* the maximum number of constrains connected to a single atom */
+
+    int             nc_real;      /* the number of real constraints */
+    int             nc;           /* the number of constraints including padding for SIMD */
     int             nc_alloc;     /* the number we allocated memory for */
     int             ncc;          /* the number of constraint connections */
     int             ncc_alloc;    /* the number we allocated memory for */
     real            matlam;       /* the FE lambda value used for filling blc and blmf */
+    int            *con_index;    /* mapping from topology to LINCS constraints */
     real           *bllen0;       /* the reference distance in topology A */
     real           *ddist;        /* the reference distance in top B - the r.d. in top A */
     int            *bla;          /* the atom pairs involved in the constraints */
@@ -104,17 +112,18 @@ typedef struct gmx_lincsdata {
     int            *blnr;         /* index into blbnb and blmf */
     int            *blbnb;        /* list of constraint connections */
     int             ntriangle;    /* the local number of constraints in triangles */
-    int            *triangle;     /* the list of triangle constraints */
-    int            *tri_bits;     /* the bits tell if the matrix element should be used */
     int             ncc_triangle; /* the number of constraint connections in triangles */
     gmx_bool        bCommIter;    /* communicate before each LINCS interation */
     real           *blmf;         /* matrix of mass factors for constraint connections */
     real           *blmf1;        /* as blmf, but with all masses 1 */
     real           *bllen;        /* the reference bond length */
-    int             nth;          /* The number of threads doing LINCS */
-    lincs_thread_t *th;           /* LINCS thread division */
+    int            *nlocat;       /* the local atom count per constraint, can be NULL */
+
+    int             ntask;        /* The number of tasks = #threads for LINCS */
+    lincs_task_t   *task;         /* LINCS thread division */
     gmx_bitmask_t  *atf;          /* atom flags for thread parallelization */
     int             atf_nalloc;   /* allocation size of atf */
+    gmx_bool        bTaskDep;     /* are the LINCS tasks interdependent? */
     /* arrays for temporary storage in the LINCS algorithm */
     rvec           *tmpv;
     real           *tmpncc;
@@ -158,39 +167,48 @@ real lincs_rmsd(struct gmx_lincsdata *lincsd, gmx_bool bSD2)
  * constraint data, without an OpenMP barrier.
  */
 static void lincs_matrix_expand(const struct gmx_lincsdata *lincsd,
-                                int b0, int b1,
+                                const lincs_task_t *li_task,
                                 const real *blcc,
                                 real *rhs1, real *rhs2, real *sol)
 {
-    int        nrec, rec, b, j, n, nr0, nr1;
-    real       mvb, *swap;
-    int        ntriangle, tb, bits;
-    const int *blnr     = lincsd->blnr, *blbnb = lincsd->blbnb;
-    const int *triangle = lincsd->triangle, *tri_bits = lincsd->tri_bits;
+    int        b0, b1, nrec, rec;
+    const int *blnr  = lincsd->blnr;
+    const int *blbnb = lincsd->blbnb;
 
-    ntriangle = lincsd->ntriangle;
-    nrec      = lincsd->nOrder;
+    b0   = li_task->b0;
+    b1   = li_task->b1;
+    nrec = lincsd->nOrder;
 
     for (rec = 0; rec < nrec; rec++)
     {
+        int b;
+
+        if (lincsd->bTaskDep)
+        {
 #pragma omp barrier
+        }
         for (b = b0; b < b1; b++)
         {
+            real mvb;
+            int  n;
+
             mvb = 0;
             for (n = blnr[b]; n < blnr[b+1]; n++)
             {
-                j   = blbnb[n];
-                mvb = mvb + blcc[n]*rhs1[j];
+                mvb = mvb + blcc[n]*rhs1[blbnb[n]];
             }
             rhs2[b] = mvb;
             sol[b]  = sol[b] + mvb;
         }
+
+        real *swap;
+
         swap = rhs1;
         rhs1 = rhs2;
         rhs2 = swap;
     } /* nrec*(ncons+2*nrtot) flops */
 
-    if (ntriangle > 0)
+    if (lincsd->ntriangle > 0)
     {
         /* Perform an extra nrec recursions for only the constraints
          * involved in rigid triangles.
@@ -199,49 +217,55 @@ static void lincs_matrix_expand(const struct gmx_lincsdata *lincsd,
          * around 0.7, while the effective eigenvalue for bond constraints
          * is around 0.4 (and 0.7*0.7=0.5).
          */
-        /* We need to copy the temporary array, since only the elements
-         * for constraints involved in triangles are updated and then
-         * the pointers are swapped. This saves copying the whole array.
-         * We need a barrier as other threads might still be reading from rhs2.
-         */
-#pragma omp barrier
-        for (b = b0; b < b1; b++)
+
+        if (lincsd->bTaskDep)
         {
-            rhs2[b] = rhs1[b];
-        }
+            /* We need a barrier here, since other threads might still be
+             * reading the contents of rhs1 and/o rhs2.
+             * We could avoid this barrier by introducing two extra rhs
+             * arrays for the triangle constraints only.
+             */
 #pragma omp barrier
-#pragma omp master
+        }
+
+        /* Constraints involved in a triangle are ensured to be in the same
+         * LINCS task. This means no barriers are required during the extra
+         * iterations for the triangle constraints.
+         */
+        const int *triangle = li_task->triangle;
+        const int *tri_bits = li_task->tri_bits;
+
+        for (rec = 0; rec < nrec; rec++)
         {
-            for (rec = 0; rec < nrec; rec++)
+            int tb;
+
+            for (tb = 0; tb < li_task->ntriangle; tb++)
             {
-                for (tb = 0; tb < ntriangle; tb++)
+                int  b, bits, nr0, nr1, n;
+                real mvb;
+
+                b    = triangle[tb];
+                bits = tri_bits[tb];
+                mvb  = 0;
+                nr0  = blnr[b];
+                nr1  = blnr[b+1];
+                for (n = nr0; n < nr1; n++)
                 {
-                    b    = triangle[tb];
-                    bits = tri_bits[tb];
-                    mvb  = 0;
-                    nr0  = blnr[b];
-                    nr1  = blnr[b+1];
-                    for (n = nr0; n < nr1; n++)
+                    if (bits & (1 << (n - nr0)))
                     {
-                        if (bits & (1<<(n-nr0)))
-                        {
-                            j   = blbnb[n];
-                            mvb = mvb + blcc[n]*rhs1[j];
-                        }
+                        mvb = mvb + blcc[n]*rhs1[blbnb[n]];
                     }
-                    rhs2[b] = mvb;
-                    sol[b]  = sol[b] + mvb;
                 }
-                swap = rhs1;
-                rhs1 = rhs2;
-                rhs2 = swap;
+                rhs2[b] = mvb;
+                sol[b]  = sol[b] + mvb;
             }
-        } /* flops count is missing here */
 
-        /* We need a barrier here as the calling routine will continue
-         * to operate on the thread-local constraints without barrier.
-         */
-#pragma omp barrier
+            real *swap;
+
+            swap = rhs1;
+            rhs1 = rhs2;
+            rhs2 = swap;
+        } /* nrec*(ntriangle + ncc_triangle*2) flops */
     }
 }
 
@@ -351,10 +375,11 @@ static void lincs_update_atoms(struct gmx_lincsdata *li, int th,
                                const real *invmass,
                                rvec *x)
 {
-    if (li->nth == 1)
+    if (li->ntask == 1)
     {
         /* Single thread, we simply update for all constraints */
-        lincs_update_atoms_noind(li->nc, li->bla, prefac, fac, r, invmass, x);
+        lincs_update_atoms_noind(li->nc_real,
+                                 li->bla, prefac, fac, r, invmass, x);
     }
     else
     {
@@ -362,10 +387,10 @@ static void lincs_update_atoms(struct gmx_lincsdata *li, int th,
          * constraints that only access our local atom range.
          * This can be done without a barrier.
          */
-        lincs_update_atoms_ind(li->th[th].nind, li->th[th].ind,
+        lincs_update_atoms_ind(li->task[th].nind, li->task[th].ind,
                                li->bla, prefac, fac, r, invmass, x);
 
-        if (li->th[li->nth].nind > 0)
+        if (li->task[li->ntask].nind > 0)
         {
             /* Update the constraints that operate on atoms
              * in multiple thread atom blocks on the master thread.
@@ -373,8 +398,8 @@ static void lincs_update_atoms(struct gmx_lincsdata *li, int th,
 #pragma omp barrier
 #pragma omp master
             {
-                lincs_update_atoms_ind(li->th[li->nth].nind,
-                                       li->th[li->nth].ind,
+                lincs_update_atoms_ind(li->task[li->ntask].nind,
+                                       li->task[li->ntask].ind,
                                        li->bla, prefac, fac, r, invmass, x);
             }
         }
@@ -502,8 +527,8 @@ static void do_lincsp(rvec *x, rvec *f, rvec *fp, t_pbc *pbc,
     rvec    *r;
     real    *blc, *blmf, *blcc, *rhs1, *rhs2, *sol;
 
-    b0 = lincsd->th[th].b0;
-    b1 = lincsd->th[th].b1;
+    b0 = lincsd->task[th].b0;
+    b1 = lincsd->task[th].b1;
 
     bla    = lincsd->bla;
     r      = lincsd->tmpv;
@@ -542,8 +567,8 @@ static void do_lincsp(rvec *x, rvec *f, rvec *fp, t_pbc *pbc,
      */
     calc_dr_x_f_simd(b0, b1, bla, x, f, blc,
                      &pbc_simd,
-                     lincsd->th[th].simd_buf,
-                     lincsd->th[th].simd_buf + GMX_SIMD_REAL_WIDTH*DIM,
+                     lincsd->task[th].simd_buf,
+                     lincsd->task[th].simd_buf + GMX_SIMD_REAL_WIDTH*DIM,
                      r, rhs1, sol);
 
 #else /* LINCS_SIMD */
@@ -587,10 +612,13 @@ static void do_lincsp(rvec *x, rvec *f, rvec *fp, t_pbc *pbc,
 
 #endif /* LINCS_SIMD */
 
-    /* We need a barrier, since the matrix construction below
-     * can access entries in r of other threads.
-     */
+    if (lincsd->bTaskDep)
+    {
+        /* We need a barrier, since the matrix construction below
+         * can access entries in r of other threads.
+         */
 #pragma omp barrier
+    }
 
     /* Construct the (sparse) LINCS matrix */
     for (b = b0; b < b1; b++)
@@ -604,7 +632,7 @@ static void do_lincsp(rvec *x, rvec *f, rvec *fp, t_pbc *pbc,
     }
     /* Together: 23*ncons + 6*nrtot flops */
 
-    lincs_matrix_expand(lincsd, b0, b1, blcc, rhs1, rhs2, sol);
+    lincs_matrix_expand(lincsd, &lincsd->task[th], blcc, rhs1, rhs2, sol);
     /* nrec*(ncons+2*nrtot) flops */
 
     if (econq == econqDeriv_FlexCon)
@@ -643,7 +671,7 @@ static void do_lincsp(rvec *x, rvec *f, rvec *fp, t_pbc *pbc,
             dhdlambda -= sol[b]*lincsd->ddist[b];
         }
 
-        lincsd->th[th].dhdlambda = dhdlambda;
+        lincsd->task[th].dhdlambda = dhdlambda;
     }
 
     if (bCalcVir)
@@ -863,8 +891,8 @@ static void do_lincs(rvec *x, rvec *xp, matrix box, t_pbc *pbc,
     real    *blc, *blmf, *bllen, *blcc, *rhs1, *rhs2, *sol, *blc_sol, *mlambda;
     int     *nlocat;
 
-    b0 = lincsd->th[th].b0;
-    b1 = lincsd->th[th].b1;
+    b0 = lincsd->task[th].b0;
+    b1 = lincsd->task[th].b1;
 
     bla     = lincsd->bla;
     r       = lincsd->tmpv;
@@ -879,15 +907,7 @@ static void do_lincs(rvec *x, rvec *xp, matrix box, t_pbc *pbc,
     sol     = lincsd->tmp3;
     blc_sol = lincsd->tmp4;
     mlambda = lincsd->mlambda;
-
-    if (DOMAINDECOMP(cr) && cr->dd->constraints)
-    {
-        nlocat = dd_constraints_nlocalatoms(cr->dd);
-    }
-    else
-    {
-        nlocat = NULL;
-    }
+    nlocat  = lincsd->nlocat;
 
 #ifdef LINCS_SIMD
 
@@ -905,8 +925,8 @@ static void do_lincs(rvec *x, rvec *xp, matrix box, t_pbc *pbc,
      */
     calc_dr_x_xp_simd(b0, b1, bla, x, xp, bllen, blc,
                       &pbc_simd,
-                      lincsd->th[th].simd_buf,
-                      lincsd->th[th].simd_buf + GMX_SIMD_REAL_WIDTH*DIM,
+                      lincsd->task[th].simd_buf,
+                      lincsd->task[th].simd_buf + GMX_SIMD_REAL_WIDTH*DIM,
                       r, rhs1, sol);
 
 #else /* LINCS_SIMD */
@@ -960,10 +980,13 @@ static void do_lincs(rvec *x, rvec *xp, matrix box, t_pbc *pbc,
 
 #endif /* LINCS_SIMD */
 
-    /* We need a barrier, since the matrix construction below
-     * can access entries in r of other threads.
-     */
+    if (lincsd->bTaskDep)
+    {
+        /* We need a barrier, since the matrix construction below
+         * can access entries in r of other threads.
+         */
 #pragma omp barrier
+    }
 
     /* Construct the (sparse) LINCS matrix */
     for (b = b0; b < b1; b++)
@@ -975,7 +998,7 @@ static void do_lincs(rvec *x, rvec *xp, matrix box, t_pbc *pbc,
     }
     /* Together: 26*ncons + 6*nrtot flops */
 
-    lincs_matrix_expand(lincsd, b0, b1, blcc, rhs1, rhs2, sol);
+    lincs_matrix_expand(lincsd, &lincsd->task[th], blcc, rhs1, rhs2, sol);
     /* nrec*(ncons+2*nrtot) flops */
 
 #ifndef LINCS_SIMD
@@ -1017,20 +1040,23 @@ static void do_lincs(rvec *x, rvec *xp, matrix box, t_pbc *pbc,
                     dd_move_x_constraints(cr->dd, box, xp, NULL, FALSE);
                 }
             }
+#pragma omp barrier
         }
-
+        else if (lincsd->bTaskDep)
+        {
 #pragma omp barrier
+        }
 
 #ifdef LINCS_SIMD
         calc_dist_iter_simd(b0, b1, bla, xp, bllen, blc, &pbc_simd, wfac,
-                            lincsd->th[th].simd_buf, rhs1, sol, bWarn);
+                            lincsd->task[th].simd_buf, rhs1, sol, bWarn);
 #else
         calc_dist_iter(b0, b1, bla, xp, bllen, blc, pbc, wfac,
                        rhs1, sol, bWarn);
         /* 20*ncons flops */
 #endif
 
-        lincs_matrix_expand(lincsd, b0, b1, blcc, rhs1, rhs2, sol);
+        lincs_matrix_expand(lincsd, &lincsd->task[th], blcc, rhs1, rhs2, sol);
         /* nrec*(ncons+2*nrtot) flops */
 
 #ifndef LINCS_SIMD
@@ -1069,8 +1095,11 @@ static void do_lincs(rvec *x, rvec *xp, matrix box, t_pbc *pbc,
 
     if (nlocat != NULL && (bCalcDHDL || bCalcVir))
     {
-        /* In lincs_update_atoms threads might cross-read mlambda */
+        if (lincsd->bTaskDep)
+        {
+            /* In lincs_update_atoms threads might cross-read mlambda */
 #pragma omp barrier
+        }
 
         /* Only account for local atoms */
         for (b = b0; b < b1; b++)
@@ -1091,7 +1120,7 @@ static void do_lincs(rvec *x, rvec *xp, matrix box, t_pbc *pbc,
              */
             dhdl -= lincsd->mlambda[b]*lincsd->ddist[b];
         }
-        lincsd->th[th].dhdlambda = dhdl;
+        lincsd->task[th].dhdlambda = dhdl;
     }
 
     if (bCalcVir)
@@ -1124,30 +1153,35 @@ static void do_lincs(rvec *x, rvec *xp, matrix box, t_pbc *pbc,
      */
 }
 
-void set_lincs_matrix(struct gmx_lincsdata *li, real *invmass, real lambda)
+/* Sets the elements in the LINCS matrix for task li_task */
+static void set_lincs_matrix_task(struct gmx_lincsdata *li,
+                                  lincs_task_t         *li_task,
+                                  const real           *invmass,
+                                  int                  *ncc_triangle)
 {
-    int        i, a1, a2, n, k, sign, center;
-    int        end, nk, kk;
-    const real invsqrt2 = 0.7071067811865475244;
-
-    for (i = 0; (i < li->nc); i++)
-    {
-        a1          = li->bla[2*i];
-        a2          = li->bla[2*i+1];
-        li->blc[i]  = gmx_invsqrt(invmass[a1] + invmass[a2]);
-        li->blc1[i] = invsqrt2;
-    }
+    int        i;
 
     /* Construct the coupling coefficient matrix blmf */
-    li->ntriangle    = 0;
-    li->ncc_triangle = 0;
-    for (i = 0; (i < li->nc); i++)
+    li_task->ntriangle = 0;
+    *ncc_triangle      = 0;
+    for (i = li_task->b0; i < li_task->b1; i++)
     {
+        int a1, a2, n;
+
         a1 = li->bla[2*i];
         a2 = li->bla[2*i+1];
         for (n = li->blnr[i]; (n < li->blnr[i+1]); n++)
         {
+            int k, sign, center, end;
+
             k = li->blbnb[n];
+
+            /* If we are using multiple, independent tasks for LINCS,
+             * the calls to check_assign_connected should have
+             * put all connected constraints in our task.
+             */
+            assert(li->bTaskDep || (k >= li_task->b0 && k < li_task->b1));
+
             if (a1 == li->bla[2*k] || a2 == li->bla[2*k+1])
             {
                 sign = -1;
@@ -1170,6 +1204,8 @@ void set_lincs_matrix(struct gmx_lincsdata *li, real *invmass, real lambda)
             li->blmf1[n] = sign*0.5;
             if (li->ncg_triangle > 0)
             {
+                int nk, kk;
+
                 /* Look for constraint triangles */
                 for (nk = li->blnr[k]; (nk < li->blnr[k+1]); nk++)
                 {
@@ -1177,27 +1213,62 @@ void set_lincs_matrix(struct gmx_lincsdata *li, real *invmass, real lambda)
                     if (kk != i && kk != k &&
                         (li->bla[2*kk] == end || li->bla[2*kk+1] == end))
                     {
-                        if (li->ntriangle == 0 ||
-                            li->triangle[li->ntriangle-1] < i)
+                        /* If we are using multiple tasks for LINCS,
+                         * the calls to check_assign_triangle should have
+                         * put all constraints in the triangle in our task.
+                         */
+                        assert(k  >= li_task->b0 && k  < li_task->b1);
+                        assert(kk >= li_task->b0 && kk < li_task->b1);
+
+                        if (li_task->ntriangle == 0 ||
+                            li_task->triangle[li_task->ntriangle - 1] < i)
                         {
                             /* Add this constraint to the triangle list */
-                            li->triangle[li->ntriangle] = i;
-                            li->tri_bits[li->ntriangle] = 0;
-                            li->ntriangle++;
-                            if (li->blnr[i+1] - li->blnr[i] > static_cast<int>(sizeof(li->tri_bits[0])*8 - 1))
+                            li_task->triangle[li_task->ntriangle] = i;
+                            li_task->tri_bits[li_task->ntriangle] = 0;
+                            li_task->ntriangle++;
+                            if (li->blnr[i+1] - li->blnr[i] > static_cast<int>(sizeof(li_task->tri_bits[0])*8 - 1))
                             {
                                 gmx_fatal(FARGS, "A constraint is connected to %d constraints, this is more than the %d allowed for constraints participating in triangles",
                                           li->blnr[i+1] - li->blnr[i],
-                                          sizeof(li->tri_bits[0])*8-1);
+                                          sizeof(li_task->tri_bits[0])*8-1);
                             }
                         }
-                        li->tri_bits[li->ntriangle-1] |= (1<<(n-li->blnr[i]));
-                        li->ncc_triangle++;
+                        li_task->tri_bits[li_task->ntriangle-1] |= (1 << (n - li->blnr[i]));
+                        (*ncc_triangle)++;
                     }
                 }
             }
         }
     }
+}
+
+/* Sets the elements in the LINCS matrix */
+void set_lincs_matrix(struct gmx_lincsdata *li, real *invmass, real lambda)
+{
+    int        i;
+    const real invsqrt2 = 0.7071067811865475244;
+
+    for (i = 0; (i < li->nc); i++)
+    {
+        int a1, a2;
+
+        a1          = li->bla[2*i];
+        a2          = li->bla[2*i+1];
+        li->blc[i]  = gmx_invsqrt(invmass[a1] + invmass[a2]);
+        li->blc1[i] = invsqrt2;
+    }
+
+    /* Construct the coupling coefficient matrix blmf */
+    int th, ntriangle = 0, ncc_triangle = 0;
+#pragma omp parallel for reduction(+: ntriangle, ncc_triangle) num_threads(li->ntask) schedule(static)
+    for (th = 0; th < li->ntask; th++)
+    {
+        set_lincs_matrix_task(li, &li->task[th], invmass, &ncc_triangle);
+        ntriangle = li->task[th].ntriangle;
+    }
+    li->ntriangle    = ntriangle;
+    li->ncc_triangle = ncc_triangle;
 
     if (debug)
     {
@@ -1316,8 +1387,9 @@ gmx_lincsdata_t init_lincs(FILE *fplog, gmx_mtop_t *mtop,
                            gmx_bool bPLINCS, int nIter, int nProjOrder)
 {
     struct gmx_lincsdata *li;
-    int                   mb;
+    int                   mt, mb;
     gmx_moltype_t        *molt;
+    gmx_bool              bMoreThanTwoSeq;
 
     if (fplog)
     {
@@ -1335,27 +1407,46 @@ gmx_lincsdata_t init_lincs(FILE *fplog, gmx_mtop_t *mtop,
     li->nIter  = nIter;
     li->nOrder = nProjOrder;
 
+    li->max_connect = 0;
+    for (mt = 0; mt < mtop->nmoltype; mt++)
+    {
+        int a;
+
+        molt = &mtop->moltype[mt];
+        for (a = 0; a < molt->atoms.nr; a++)
+        {
+            li->max_connect = std::max(li->max_connect,
+                                       at2con[mt].index[a + 1] - at2con[mt].index[a]);
+        }
+    }
+
     li->ncg_triangle = 0;
-    li->bCommIter    = FALSE;
+    bMoreThanTwoSeq  = FALSE;
     for (mb = 0; mb < mtop->nmolblock; mb++)
     {
         molt              = &mtop->moltype[mtop->molblock[mb].type];
+
         li->ncg_triangle +=
             mtop->molblock[mb].nmol*
             count_triangle_constraints(molt->ilist,
                                        &at2con[mtop->molblock[mb].type]);
-        if (bPLINCS && li->bCommIter == FALSE)
-        {
-            /* Check if we need to communicate not only before LINCS,
-             * but also before each iteration.
-             * The check for only two sequential constraints is only
-             * useful for the common case of H-bond only constraints.
-             * With more effort we could also make it useful for small
-             * molecules with nr. sequential constraints <= nOrder-1.
-             */
-            li->bCommIter = (li->nOrder < 1 || more_than_two_sequential_constraints(molt->ilist, &at2con[mtop->molblock[mb].type]));
+
+        if (!bMoreThanTwoSeq &&
+            more_than_two_sequential_constraints(molt->ilist, &at2con[mtop->molblock[mb].type]))
+        {
+            bMoreThanTwoSeq = TRUE;
         }
     }
+
+    /* Check if we need to communicate not only before LINCS,
+     * but also before each iteration.
+     * The check for only two sequential constraints is only
+     * useful for the common case of H-bond only constraints.
+     * With more effort we could also make it useful for small
+     * molecules with nr. sequential constraints <= nOrder-1.
+     */
+    li->bCommIter = (bPLINCS && (li->nOrder < 1 || bMoreThanTwoSeq));
+
     if (debug && bPLINCS)
     {
         fprintf(debug, "PLINCS communication before each iteration: %d\n",
@@ -1365,27 +1456,31 @@ gmx_lincsdata_t init_lincs(FILE *fplog, gmx_mtop_t *mtop,
     /* LINCS can run on any number of threads.
      * Currently the number is fixed for the whole simulation,
      * but it could be set in set_lincs().
+     * The current constraint to task assignment code can create independent
+     * tasks only when not more than two constraints are connected sequentially.
      */
-    li->nth = gmx_omp_nthreads_get(emntLINCS);
+    li->ntask    = gmx_omp_nthreads_get(emntLINCS);
+    li->bTaskDep = (li->ntask > 1 && bMoreThanTwoSeq);
     if (debug)
     {
-        fprintf(debug, "LINCS: using %d threads\n", li->nth);
+        fprintf(debug, "LINCS: using %d threads, tasks are %sdependent\n",
+                li->ntask, li->bTaskDep ? "" : "in");
     }
-    if (li->nth == 1)
+    if (li->ntask == 1)
     {
-        snew(li->th, 1);
+        snew(li->task, 1);
     }
     else
     {
-        /* Allocate an extra elements for "thread-overlap" constraints */
-        snew(li->th, li->nth+1);
+        /* Allocate an extra elements for "task-overlap" constraints */
+        snew(li->task, li->ntask + 1);
     }
     int th;
-#pragma omp parallel for num_threads(li->nth)
-    for (th = 0; th < li->nth; th++)
+#pragma omp parallel for num_threads(li->ntask)
+    for (th = 0; th < li->ntask; th++)
     {
         /* Per thread SIMD load buffer for loading 2 simd_width rvecs */
-        snew_aligned(li->th[th].simd_buf, 2*simd_width*DIM,
+        snew_aligned(li->task[th].simd_buf, 2*simd_width*DIM,
                      align_bytes);
     }
 
@@ -1422,7 +1517,7 @@ gmx_lincsdata_t init_lincs(FILE *fplog, gmx_mtop_t *mtop,
 /* Sets up the work division over the threads */
 static void lincs_thread_setup(struct gmx_lincsdata *li, int natoms)
 {
-    lincs_thread_t *li_m;
+    lincs_task_t   *li_m;
     int             th;
     gmx_bitmask_t  *atf;
     int             a;
@@ -1440,56 +1535,47 @@ static void lincs_thread_setup(struct gmx_lincsdata *li, int natoms)
         bitmask_clear(&atf[a]);
     }
 
-    if (li->nth > BITMASK_SIZE)
+    if (li->ntask > BITMASK_SIZE)
     {
         gmx_fatal(FARGS, "More than %d threads is not supported for LINCS.", BITMASK_SIZE);
     }
 
-    int block = simd_width;
-
-    for (th = 0; th < li->nth; th++)
+    for (th = 0; th < li->ntask; th++)
     {
-        lincs_thread_t *li_th;
-        int             b;
+        lincs_task_t *li_task;
+        int           b;
 
-        li_th = &li->th[th];
-
-        /* The constraints are divided equally over the threads,
-         * in chunks of size block to ensure alignment for SIMD.
-         */
-        li_th->b0 = (((li->nc + block - 1)* th   )/(block*li->nth))*block;
-        li_th->b1 = (((li->nc + block - 1)*(th+1))/(block*li->nth))*block;
-        li_th->b1 = std::min<int>(li_th->b1, li->nc);
+        li_task = &li->task[th];
 
         /* For each atom set a flag for constraints from each */
-        for (b = li_th->b0; b < li_th->b1; b++)
+        for (b = li_task->b0; b < li_task->b1; b++)
         {
-            bitmask_set_bit(&atf[li->bla[b*2]], th);
-            bitmask_set_bit(&atf[li->bla[b*2+1]], th);
+            bitmask_set_bit(&atf[li->bla[b*2    ]], th);
+            bitmask_set_bit(&atf[li->bla[b*2 + 1]], th);
         }
     }
 
-#pragma omp parallel for num_threads(li->nth) schedule(static)
-    for (th = 0; th < li->nth; th++)
+#pragma omp parallel for num_threads(li->ntask) schedule(static)
+    for (th = 0; th < li->ntask; th++)
     {
-        lincs_thread_t *li_th;
-        gmx_bitmask_t   mask;
-        int             b;
+        lincs_task_t  *li_task;
+        gmx_bitmask_t  mask;
+        int            b;
 
-        li_th = &li->th[th];
+        li_task = &li->task[th];
 
-        if (li_th->b1 - li_th->b0 > li_th->ind_nalloc)
+        if (li_task->b1 - li_task->b0 > li_task->ind_nalloc)
         {
-            li_th->ind_nalloc = over_alloc_large(li_th->b1-li_th->b0);
-            srenew(li_th->ind, li_th->ind_nalloc);
-            srenew(li_th->ind_r, li_th->ind_nalloc);
+            li_task->ind_nalloc = over_alloc_large(li_task->b1-li_task->b0);
+            srenew(li_task->ind, li_task->ind_nalloc);
+            srenew(li_task->ind_r, li_task->ind_nalloc);
         }
 
         bitmask_init_low_bits(&mask, th);
 
-        li_th->nind   = 0;
-        li_th->nind_r = 0;
-        for (b = li_th->b0; b < li_th->b1; b++)
+        li_task->nind   = 0;
+        li_task->nind_r = 0;
+        for (b = li_task->b0; b < li_task->b1; b++)
         {
             /* We let the constraint with the lowest thread index
              * operate on atoms with constraints from multiple threads.
@@ -1498,12 +1584,12 @@ static void lincs_thread_setup(struct gmx_lincsdata *li, int natoms)
                 bitmask_is_disjoint(atf[li->bla[b*2+1]], mask))
             {
                 /* Add the constraint to the local atom update index */
-                li_th->ind[li_th->nind++] = b;
+                li_task->ind[li_task->nind++] = b;
             }
             else
             {
                 /* Add the constraint to the rest block */
-                li_th->ind_r[li_th->nind_r++] = b;
+                li_task->ind_r[li_task->nind_r++] = b;
             }
         }
     }
@@ -1511,31 +1597,31 @@ static void lincs_thread_setup(struct gmx_lincsdata *li, int natoms)
     /* We need to copy all constraints which have not be assigned
      * to a thread to a separate list which will be handled by one thread.
      */
-    li_m = &li->th[li->nth];
+    li_m = &li->task[li->ntask];
 
     li_m->nind = 0;
-    for (th = 0; th < li->nth; th++)
+    for (th = 0; th < li->ntask; th++)
     {
-        lincs_thread_t *li_th;
-        int             b;
+        lincs_task_t *li_task;
+        int           b;
 
-        li_th   = &li->th[th];
+        li_task   = &li->task[th];
 
-        if (li_m->nind + li_th->nind_r > li_m->ind_nalloc)
+        if (li_m->nind + li_task->nind_r > li_m->ind_nalloc)
         {
-            li_m->ind_nalloc = over_alloc_large(li_m->nind+li_th->nind_r);
+            li_m->ind_nalloc = over_alloc_large(li_m->nind+li_task->nind_r);
             srenew(li_m->ind, li_m->ind_nalloc);
         }
 
-        for (b = 0; b < li_th->nind_r; b++)
+        for (b = 0; b < li_task->nind_r; b++)
         {
-            li_m->ind[li_m->nind++] = li_th->ind_r[b];
+            li_m->ind[li_m->nind++] = li_task->ind_r[b];
         }
 
         if (debug)
         {
             fprintf(debug, "LINCS thread %d: %d constraints\n",
-                    th, li_th->nind);
+                    th, li_task->nind);
         }
     }
 
@@ -1555,31 +1641,261 @@ static void resize_real_aligned(real **ptr, int nelem)
     snew_aligned(*ptr, nelem, align_bytes);
 }
 
+static void assign_constraint(struct gmx_lincsdata *li,
+                              int constraint_index,
+                              int a1, int a2,
+                              real lenA, real lenB,
+                              const t_blocka *at2con)
+{
+    int con;
+
+    con = li->nc;
+
+    /* Make an mapping of local topology constraint index to LINCS index */
+    li->con_index[constraint_index] = con;
+
+    li->bllen0[con]  = lenA;
+    li->ddist[con]   = lenB - lenA;
+    /* Set the length to the topology A length */
+    li->bllen[con]   = lenA;
+    li->bla[2*con]   = a1;
+    li->bla[2*con+1] = a2;
+
+    /* Make space in the constraint connection matrix for constraints
+     * connected to both end of the current constraint.
+     */
+    li->ncc +=
+        at2con->index[a1 + 1] - at2con->index[a1] - 1 +
+        at2con->index[a2 + 1] - at2con->index[a2] - 1;
+
+    li->blnr[con + 1] = li->ncc;
+
+    /* Increase the constraint count */
+    li->nc++;
+}
+
+/* Check if constraint with topology index constraint_index is connected
+ * to other constraints, and if so add those connected constraints to our task.
+ */
+static void check_assign_connected(struct gmx_lincsdata *li,
+                                   const t_iatom *iatom,
+                                   const t_idef *idef,
+                                   int bDynamics,
+                                   int a1, int a2,
+                                   const t_blocka *at2con)
+{
+    /* Currently this function only supports constraint groups
+     * in which all constraints share at least one atom
+     * (e.g. H-bond constraints).
+     * Check both ends of the current constraint for
+     * connected constraints. We need to assign those
+     * to the same task.
+     */
+    int end;
+
+    for (end = 0; end < 2; end++)
+    {
+        int a, k;
+
+        a = (end == 0 ? a1 : a2);
+
+        for (k = at2con->index[a]; k < at2con->index[a + 1]; k++)
+        {
+            int cc;
+
+            cc = at2con->a[k];
+            /* Check if constraint cc has not yet been assigned */
+            if (li->con_index[cc] == -1)
+            {
+                int  type;
+                real lenA, lenB;
+
+                type = iatom[cc*3];
+                lenA = idef->iparams[type].constr.dA;
+                lenB = idef->iparams[type].constr.dB;
+
+                if (bDynamics || lenA != 0 || lenB != 0)
+                {
+                    assign_constraint(li, cc, iatom[3*cc + 1], iatom[3*cc + 2], lenA, lenB, at2con);
+                }
+            }
+        }
+    }
+}
+
+/* Check if constraint with topology index constraint_index is involved
+ * in a constraint triangle, and if so add the other two constraints
+ * in the triangle to our task.
+ */
+static void check_assign_triangle(struct gmx_lincsdata *li,
+                                  const t_iatom *iatom,
+                                  const t_idef *idef,
+                                  int bDynamics,
+                                  int constraint_index,
+                                  int a1, int a2,
+                                  const t_blocka *at2con)
+{
+    int nca, cc[32], ca[32], k;
+    int c_triangle[2] = { -1, -1 };
+
+    nca = 0;
+    for (k = at2con->index[a1]; k < at2con->index[a1 + 1]; k++)
+    {
+        int c;
+
+        c = at2con->a[k];
+        if (c != constraint_index)
+        {
+            int aa1, aa2;
+
+            aa1 = iatom[c*3 + 1];
+            aa2 = iatom[c*3 + 2];
+            if (aa1 != a1)
+            {
+                cc[nca] = c;
+                ca[nca] = aa1;
+                nca++;
+            }
+            if (aa2 != a1)
+            {
+                cc[nca] = c;
+                ca[nca] = aa2;
+                nca++;
+            }
+        }
+    }
+
+    for (k = at2con->index[a2]; k < at2con->index[a2 + 1]; k++)
+    {
+        int c;
+
+        c = at2con->a[k];
+        if (c != constraint_index)
+        {
+            int aa1, aa2, i;
+
+            aa1 = iatom[c*3 + 1];
+            aa2 = iatom[c*3 + 2];
+            if (aa1 != a2)
+            {
+                for (i = 0; i < nca; i++)
+                {
+                    if (aa1 == ca[i])
+                    {
+                        c_triangle[0] = cc[i];
+                        c_triangle[1] = c;
+                    }
+                }
+            }
+            if (aa2 != a2)
+            {
+                for (i = 0; i < nca; i++)
+                {
+                    if (aa2 == ca[i])
+                    {
+                        c_triangle[0] = cc[i];
+                        c_triangle[1] = c;
+                    }
+                }
+            }
+        }
+    }
+
+    if (c_triangle[0] >= 0)
+    {
+        int end;
+
+        for (end = 0; end < 2; end++)
+        {
+            /* Check if constraint c_triangle[end] has not yet been assigned */
+            if (li->con_index[c_triangle[end]] == -1)
+            {
+                int  i, type;
+                real lenA, lenB;
+
+                i    = c_triangle[end]*3;
+                type = iatom[i];
+                lenA = idef->iparams[type].constr.dA;
+                lenB = idef->iparams[type].constr.dB;
+
+                if (bDynamics || lenA != 0 || lenB != 0)
+                {
+                    assign_constraint(li, c_triangle[end], iatom[i + 1], iatom[i + 2], lenA, lenB, at2con);
+                }
+            }
+        }
+    }
+}
+
+static void set_matrix_indices(struct gmx_lincsdata *li,
+                               const lincs_task_t   *li_task,
+                               const t_blocka       *at2con,
+                               gmx_bool              bSortMatrix)
+{
+    int b;
+
+    for (b = li_task->b0; b < li_task->b1; b++)
+    {
+        int a1, a2, i, k;
+
+        a1 = li->bla[b*2];
+        a2 = li->bla[b*2 + 1];
+
+        i = li->blnr[b];
+        for (k = at2con->index[a1]; k < at2con->index[a1 + 1]; k++)
+        {
+            int concon;
+
+            concon = li->con_index[at2con->a[k]];
+            if (concon != b)
+            {
+                li->blbnb[i++] = concon;
+            }
+        }
+        for (k = at2con->index[a2]; k < at2con->index[a2 + 1]; k++)
+        {
+            int concon;
+
+            concon = li->con_index[at2con->a[k]];
+            if (concon != b)
+            {
+                li->blbnb[i++] = concon;
+            }
+        }
+
+        if (bSortMatrix)
+        {
+            /* Order the blbnb matrix to optimize memory access */
+            qsort(&(li->blbnb[li->blnr[b]]), li->blnr[b + 1] - li->blnr[b],
+                  sizeof(li->blbnb[0]), int_comp);
+        }
+    }
+}
+
 void set_lincs(t_idef *idef, t_mdatoms *md,
                gmx_bool bDynamics, t_commrec *cr,
                struct gmx_lincsdata *li)
 {
-    int          start, natoms, nflexcon;
+    int          natoms, nflexcon;
     t_blocka     at2con;
     t_iatom     *iatom;
-    int          i, k, ncc_alloc, ncon_tot, con, nconnect, concon;
-    int          type, a1, a2;
-    real         lenA = 0, lenB;
+    int          i, ncc_alloc_old, ncon_tot;
 
-    li->nc  = 0;
-    li->ncc = 0;
+    li->nc_real = 0;
+    li->nc      = 0;
+    li->ncc     = 0;
     /* Zero the thread index ranges.
      * Otherwise without local constraints we could return with old ranges.
      */
-    for (i = 0; i < li->nth; i++)
+    for (i = 0; i < li->ntask; i++)
     {
-        li->th[i].b0   = 0;
-        li->th[i].b1   = 0;
-        li->th[i].nind = 0;
+        li->task[i].b0   = 0;
+        li->task[i].b1   = 0;
+        li->task[i].nind = 0;
     }
-    if (li->nth > 1)
+    if (li->ntask > 1)
     {
-        li->th[li->nth].nind = 0;
+        li->task[li->ntask].nind = 0;
     }
 
     /* This is the local topology, so there are only F_CONSTR constraints */
@@ -1600,164 +1916,258 @@ void set_lincs(t_idef *idef, t_mdatoms *md,
     {
         if (cr->dd->constraints)
         {
+            int start;
+
             dd_get_constraint_range(cr->dd, &start, &natoms);
         }
         else
         {
             natoms = cr->dd->nat_home;
         }
-        start = 0;
     }
     else
     {
-        start  = 0;
         natoms = md->homenr;
     }
-    at2con = make_at2con(start, natoms, idef->il, idef->iparams, bDynamics,
+    at2con = make_at2con(0, natoms, idef->il, idef->iparams, bDynamics,
                          &nflexcon);
 
     ncon_tot = idef->il[F_CONSTR].nr/3;
 
-    /* Ensure we have enough space for aligned loads beyond ncon_tot */
-    if (ncon_tot + simd_width > li->nc_alloc || li->nc_alloc == 0)
+    /* Ensure we have enough padding for aligned loads for each thread */
+    if (ncon_tot + li->ntask*simd_width > li->nc_alloc || li->nc_alloc == 0)
     {
-        li->nc_alloc = over_alloc_dd(ncon_tot + simd_width);
+        li->nc_alloc = over_alloc_dd(ncon_tot + li->ntask*simd_width);
+        srenew(li->con_index, li->nc_alloc);
         resize_real_aligned(&li->bllen0, li->nc_alloc);
         resize_real_aligned(&li->ddist, li->nc_alloc);
         srenew(li->bla, 2*li->nc_alloc);
         resize_real_aligned(&li->blc, li->nc_alloc);
         resize_real_aligned(&li->blc1, li->nc_alloc);
-        srenew(li->blnr, li->nc_alloc+1);
+        srenew(li->blnr, li->nc_alloc + 1);
         resize_real_aligned(&li->bllen, li->nc_alloc);
         srenew(li->tmpv, li->nc_alloc);
+        if (DOMAINDECOMP(cr))
+        {
+            srenew(li->nlocat, li->nc_alloc);
+        }
         resize_real_aligned(&li->tmp1, li->nc_alloc);
         resize_real_aligned(&li->tmp2, li->nc_alloc);
         resize_real_aligned(&li->tmp3, li->nc_alloc);
         resize_real_aligned(&li->tmp4, li->nc_alloc);
         resize_real_aligned(&li->mlambda, li->nc_alloc);
-        if (li->ncg_triangle > 0)
-        {
-            /* This is allocating too much, but it is difficult to improve */
-            srenew(li->triangle, li->nc_alloc);
-            srenew(li->tri_bits, li->nc_alloc);
-        }
     }
 
     iatom = idef->il[F_CONSTR].iatoms;
 
-    ncc_alloc   = li->ncc_alloc;
-    li->blnr[0] = 0;
-
-    con           = 0;
-    nconnect      = 0;
-    li->blnr[con] = nconnect;
-    for (i = 0; i < ncon_tot; i++)
-    {
-        type   = iatom[3*i];
-        a1     = iatom[3*i+1];
-        a2     = iatom[3*i+2];
-        lenA   = idef->iparams[type].constr.dA;
-        lenB   = idef->iparams[type].constr.dB;
-        /* Skip the flexible constraints when not doing dynamics */
-        if (bDynamics || lenA != 0 || lenB != 0)
-        {
-            li->bllen0[con]  = lenA;
-            li->ddist[con]   = lenB - lenA;
-            /* Set the length to the topology A length */
-            li->bllen[con]   = li->bllen0[con];
-            li->bla[2*con]   = a1;
-            li->bla[2*con+1] = a2;
-            /* Construct the constraint connection matrix blbnb */
-            for (k = at2con.index[a1-start]; k < at2con.index[a1-start+1]; k++)
+    ncc_alloc_old = li->ncc_alloc;
+    li->blnr[0]   = li->ncc;
+
+    /* Assign the constraints for li->ntask LINCS tasks.
+     * We target a uniform distribution of constraints over the tasks.
+     * Note that when flexible constraints are present, but are removed here
+     * (e.g. because we are doing EM) we get imbalance, but since that doesn't
+     * happen during normal MD, that's ok.
+     */
+    int ncon_assign, ncon_target, con, th;
+
+    /* Determine the number of constraints we need to assign here */
+    ncon_assign      = ncon_tot;
+    if (!bDynamics)
+    {
+        /* With energy minimization, flexible constraints are ignored
+         * (and thus minimized, as they should be).
+         */
+        ncon_assign -= nflexcon;
+    }
+
+    /* Set the target constraint count per task to exactly uniform,
+     * this might be overridden below.
+     */
+    ncon_target = (ncon_assign + li->ntask - 1)/li->ntask;
+
+    /* Mark all constraints as unassigned by setting their index to -1 */
+    for (con = 0; con < ncon_tot; con++)
+    {
+        li->con_index[con] = -1;
+    }
+
+    con = 0;
+    for (th = 0; th < li->ntask; th++)
+    {
+        lincs_task_t *li_task;
+
+        li_task = &li->task[th];
+
+#ifdef LINCS_SIMD
+        /* With indepedent tasks we likely have H-bond constraints or constraint
+         * pairs. The connected constraints will be pulled into the task, so the
+         * constraints per task will often exceed ncon_target.
+         * Triangle constraints can also increase the count, but there are
+         * relatively few of those, so we usually expect to get ncon_target.
+         */
+        if (li->bTaskDep)
+        {
+            /* We round ncon_target to a multiple of GMX_SIMD_WIDTH,
+             * since otherwise a lot of operations can be wasted.
+             * There are several ways to round here, we choose the one
+             * that alternates block sizes, which helps with Intel HT.
+             */
+            ncon_target = ((ncon_assign*(th + 1))/li->ntask - li->nc_real + GMX_SIMD_REAL_WIDTH - 1) & ~(GMX_SIMD_REAL_WIDTH - 1);
+        }
+#endif
+
+        /* Continue filling the arrays where we left off with the previous task,
+         * including padding for SIMD.
+         */
+        li_task->b0 = li->nc;
+
+        while (con < ncon_tot && li->nc - li_task->b0 < ncon_target)
+        {
+            if (li->con_index[con] == -1)
             {
-                concon = at2con.a[k];
-                if (concon != i)
+                int  type, a1, a2;
+                real lenA, lenB;
+
+                type   = iatom[3*con];
+                a1     = iatom[3*con + 1];
+                a2     = iatom[3*con + 2];
+                lenA   = idef->iparams[type].constr.dA;
+                lenB   = idef->iparams[type].constr.dB;
+                /* Skip the flexible constraints when not doing dynamics */
+                if (bDynamics || lenA != 0 || lenB != 0)
                 {
-                    if (nconnect >= ncc_alloc)
+                    assign_constraint(li, con, a1, a2, lenA, lenB, &at2con);
+
+                    if (li->ntask > 1 && !li->bTaskDep)
                     {
-                        ncc_alloc = over_alloc_small(nconnect+1);
-                        srenew(li->blbnb, ncc_alloc);
+                        /* We can generate independent tasks. Check if we
+                         * need to assign connected constraints to our task.
+                         */
+                        check_assign_connected(li, iatom, idef, bDynamics,
+                                               a1, a2, &at2con);
                     }
-                    li->blbnb[nconnect++] = concon;
-                }
-            }
-            for (k = at2con.index[a2-start]; k < at2con.index[a2-start+1]; k++)
-            {
-                concon = at2con.a[k];
-                if (concon != i)
-                {
-                    if (nconnect+1 > ncc_alloc)
+                    if (li->ntask > 1 && li->ncg_triangle > 0)
                     {
-                        ncc_alloc = over_alloc_small(nconnect+1);
-                        srenew(li->blbnb, ncc_alloc);
+                        /* Ensure constraints in one triangle are assigned
+                         * to the same task.
+                         */
+                        check_assign_triangle(li, iatom, idef, bDynamics,
+                                              con, a1, a2, &at2con);
                     }
-                    li->blbnb[nconnect++] = concon;
                 }
             }
-            li->blnr[con+1] = nconnect;
 
-            if (cr->dd == NULL)
+            con++;
+        }
+
+        li_task->b1 = li->nc;
+
+        if (simd_width > 1)
+        {
+            /* Copy the last atom pair indices and lengths for constraints
+             * up to a multiple of simd_width, such that we can do all
+             * SIMD operations without having to worry about end effects.
+             */
+            int i, last;
+
+            li->nc = ((li_task->b1 + simd_width - 1)/simd_width)*simd_width;
+            last   = li_task->b1 - 1;
+            for (i = li_task->b1; i < li->nc; i++)
             {
-                /* Order the blbnb matrix to optimize memory access */
-                qsort(&(li->blbnb[li->blnr[con]]), li->blnr[con+1]-li->blnr[con],
-                      sizeof(li->blbnb[0]), int_comp);
+                li->bla[i*2    ] = li->bla[last*2    ];
+                li->bla[i*2 + 1] = li->bla[last*2 + 1];
+                li->bllen0[i]    = li->bllen0[last];
+                li->ddist[i]     = li->ddist[last];
+                li->bllen[i]     = li->bllen[last];
+                li->blnr[i + 1]  = li->blnr[last + 1];
             }
-            /* Increase the constraint count */
-            con++;
         }
+
+        /* Keep track of how many constraints we assigned */
+        li->nc_real += li_task->b1 - li_task->b0;
+
+        if (debug)
+        {
+            fprintf(debug, "LINCS task %d constraints %d - %d\n",
+                    th, li_task->b0, li_task->b1);
+        }
+    }
+
+    assert(li->nc_real == ncon_assign);
+
+    gmx_bool bSortMatrix;
+
+    /* Without DD we order the blbnb matrix to optimize memory access.
+     * With DD the overhead of sorting is more than the gain during access.
+     */
+    bSortMatrix = !DOMAINDECOMP(cr);
+
+    if (li->ncc > li->ncc_alloc)
+    {
+        li->ncc_alloc = over_alloc_small(li->ncc);
+        srenew(li->blbnb, li->ncc_alloc);
     }
-    if (simd_width > 1)
+
+#pragma omp parallel for num_threads(li->ntask) schedule(static)
+    for (th = 0; th < li->ntask; th++)
     {
-        /* Copy the last atom pair indices and lengths for constraints
-         * up to a multiple of simd_width, such that we can do all
-         * SIMD operations without have to worry about end effects.
-         */
-        int con_roundup, i;
+        lincs_task_t *li_task;
+
+        li_task = &li->task[th];
 
-        con_roundup = ((con + simd_width - 1)/simd_width)*simd_width;
-        for (i = con; i < con_roundup; i++)
+        if (li->ncg_triangle > 0 &&
+            li_task->b1 - li_task->b0 > li_task->tri_alloc)
         {
-            li->bla[i*2    ] = li->bla[(con - 1)*2    ];
-            li->bla[i*2 + 1] = li->bla[(con - 1)*2 + 1];
-            li->bllen[i]     = li->bllen[con - 1];
+            /* This is allocating too much, but it is difficult to improve */
+            li_task->tri_alloc = over_alloc_dd(li_task->b1 - li_task->b0);
+            srenew(li_task->triangle, li_task->tri_alloc);
+            srenew(li_task->tri_bits, li_task->tri_alloc);
         }
+
+        set_matrix_indices(li, li_task, &at2con, bSortMatrix);
     }
 
     done_blocka(&at2con);
 
-    /* This is the real number of constraints,
-     * without dynamics the flexible constraints are not present.
-     */
-    li->nc = con;
-
-    li->ncc = li->blnr[con];
     if (cr->dd == NULL)
     {
-        /* Since the matrix is static, we can free some memory */
-        ncc_alloc = li->ncc;
-        srenew(li->blbnb, ncc_alloc);
+        /* Since the matrix is static, we should free some memory */
+        li->ncc_alloc = li->ncc;
+        srenew(li->blbnb, li->ncc_alloc);
     }
 
-    if (ncc_alloc > li->ncc_alloc)
+    if (li->ncc_alloc > ncc_alloc_old)
     {
-        li->ncc_alloc = ncc_alloc;
         srenew(li->blmf, li->ncc_alloc);
         srenew(li->blmf1, li->ncc_alloc);
         srenew(li->tmpncc, li->ncc_alloc);
     }
 
-    if (debug)
+    if (DOMAINDECOMP(cr) && dd_constraints_nlocalatoms(cr->dd) != NULL)
+    {
+        int *nlocat_dd;
+
+        nlocat_dd = dd_constraints_nlocalatoms(cr->dd);
+
+        /* Convert nlocat from local topology to LINCS constraint indexing */
+        for (con = 0; con < ncon_tot; con++)
+        {
+            li->nlocat[li->con_index[con]] = nlocat_dd[con];
+        }
+    }
+    else
     {
-        fprintf(debug, "Number of constraints is %d, couplings %d\n",
-                li->nc, li->ncc);
+        li->nlocat = NULL;
     }
 
-    if (li->nth == 1)
+    if (debug)
     {
-        li->th[0].b0 = 0;
-        li->th[0].b1 = li->nc;
+        fprintf(debug, "Number of constraints is %d, padded %d, couplings %d\n",
+                li->nc_real, li->nc, li->ncc);
     }
-    else
+
+    if (li->ntask > 1)
     {
         lincs_thread_setup(li, md->nr);
     }
@@ -1827,54 +2237,58 @@ static void lincs_warning(FILE *fplog,
     }
 }
 
-static void cconerr(gmx_domdec_t *dd,
-                    int ncons, int *bla, real *bllen, rvec *x, t_pbc *pbc,
+static void cconerr(const struct gmx_lincsdata *lincsd,
+                    rvec *x, t_pbc *pbc,
                     real *ncons_loc, real *ssd, real *max, int *imax)
 {
-    real       len, d, ma, ssd2, r2;
-    int       *nlocat, count, b, im;
-    rvec       dx;
+    const int  *bla, *nlocat;
+    const real *bllen;
+    real        ma, ssd2;
+    int         count, im, task;
 
-    if (dd && dd->constraints)
-    {
-        nlocat = dd_constraints_nlocalatoms(dd);
-    }
-    else
-    {
-        nlocat = 0;
-    }
+    bla    = lincsd->bla;
+    bllen  = lincsd->bllen;
+    nlocat = lincsd->nlocat;
 
     ma    = 0;
     ssd2  = 0;
     im    = 0;
     count = 0;
-    for (b = 0; b < ncons; b++)
+    for (task = 0; task < lincsd->ntask; task++)
     {
-        if (pbc)
-        {
-            pbc_dx_aiuc(pbc, x[bla[2*b]], x[bla[2*b+1]], dx);
-        }
-        else
-        {
-            rvec_sub(x[bla[2*b]], x[bla[2*b+1]], dx);
-        }
-        r2  = norm2(dx);
-        len = r2*gmx_invsqrt(r2);
-        d   = fabs(len/bllen[b]-1);
-        if (d > ma && (nlocat == NULL || nlocat[b]))
-        {
-            ma = d;
-            im = b;
-        }
-        if (nlocat == NULL)
-        {
-            ssd2 += d*d;
-            count++;
-        }
-        else
+        int b;
+
+        for (b = lincsd->task[task].b0; b < lincsd->task[task].b1; b++)
         {
-            ssd2  += nlocat[b]*d*d;
-            count += nlocat[b];
+            real len, d, r2;
+            rvec dx;
+
+            if (pbc)
+            {
+                pbc_dx_aiuc(pbc, x[bla[2*b]], x[bla[2*b+1]], dx);
+            }
+            else
+            {
+                rvec_sub(x[bla[2*b]], x[bla[2*b+1]], dx);
+            }
+            r2  = norm2(dx);
+            len = r2*gmx_invsqrt(r2);
+            d   = fabs(len/bllen[b]-1);
+            if (d > ma && (nlocat == NULL || nlocat[b]))
+            {
+                ma = d;
+                im = b;
+            }
+            if (nlocat == NULL)
+            {
+                ssd2 += d*d;
+                count++;
+            }
+            else
+            {
+                ssd2  += nlocat[b]*d*d;
+                count += nlocat[b];
+            }
         }
     }
 
@@ -1980,7 +2394,7 @@ gmx_bool constrain_lincs(FILE *fplog, gmx_bool bLog, gmx_bool bEner,
 
         if (bLog && fplog)
         {
-            cconerr(cr->dd, lincsd->nc, lincsd->bla, lincsd->bllen, xprime, pbc,
+            cconerr(lincsd, xprime, pbc,
                     &ncons_loc, &p_ssd, &p_max, &p_imax);
         }
 
@@ -1991,18 +2405,18 @@ gmx_bool constrain_lincs(FILE *fplog, gmx_bool bLog, gmx_bool bEner,
         bWarn = FALSE;
 
         /* The OpenMP parallel region of constrain_lincs for coords */
-#pragma omp parallel num_threads(lincsd->nth)
+#pragma omp parallel num_threads(lincsd->ntask)
         {
             int th = gmx_omp_get_thread_num();
 
-            clear_mat(lincsd->th[th].vir_r_m_dr);
+            clear_mat(lincsd->task[th].vir_r_m_dr);
 
             do_lincs(x, xprime, box, pbc, lincsd, th,
                      md->invmass, cr,
                      bCalcDHDL,
                      ir->LincsWarnAngle, &bWarn,
                      invdt, v, bCalcVir,
-                     th == 0 ? vir_r_m_dr : lincsd->th[th].vir_r_m_dr);
+                     th == 0 ? vir_r_m_dr : lincsd->task[th].vir_r_m_dr);
         }
 
         if (bLog && fplog && lincsd->nc > 0)
@@ -2015,7 +2429,7 @@ gmx_bool constrain_lincs(FILE *fplog, gmx_bool bLog, gmx_bool bEner,
         }
         if (bLog || bEner)
         {
-            cconerr(cr->dd, lincsd->nc, lincsd->bla, lincsd->bllen, xprime, pbc,
+            cconerr(lincsd, xprime, pbc,
                     &ncons_loc, &p_ssd, &p_max, &p_imax);
             /* Check if we are doing the second part of SD */
             if (ir->eI == eiSD2 && v == NULL)
@@ -2048,7 +2462,7 @@ gmx_bool constrain_lincs(FILE *fplog, gmx_bool bLog, gmx_bool bEner,
         {
             if (maxwarn >= 0)
             {
-                cconerr(cr->dd, lincsd->nc, lincsd->bla, lincsd->bllen, xprime, pbc,
+                cconerr(lincsd, xprime, pbc,
                         &ncons_loc, &p_ssd, &p_max, &p_imax);
                 if (MULTISIM(cr))
                 {
@@ -2092,13 +2506,13 @@ gmx_bool constrain_lincs(FILE *fplog, gmx_bool bLog, gmx_bool bEner,
     else
     {
         /* The OpenMP parallel region of constrain_lincs for derivatives */
-#pragma omp parallel num_threads(lincsd->nth)
+#pragma omp parallel num_threads(lincsd->ntask)
         {
             int th = gmx_omp_get_thread_num();
 
             do_lincsp(x, xprime, min_proj, pbc, lincsd, th,
                       md->invmass, econq, bCalcDHDL,
-                      bCalcVir, th == 0 ? vir_r_m_dr : lincsd->th[th].vir_r_m_dr);
+                      bCalcVir, th == 0 ? vir_r_m_dr : lincsd->task[th].vir_r_m_dr);
         }
     }
 
@@ -2109,9 +2523,9 @@ gmx_bool constrain_lincs(FILE *fplog, gmx_bool bLog, gmx_bool bEner,
         int  th;
 
         dhdlambda = 0;
-        for (th = 0; th < lincsd->nth; th++)
+        for (th = 0; th < lincsd->ntask; th++)
         {
-            dhdlambda += lincsd->th[th].dhdlambda;
+            dhdlambda += lincsd->task[th].dhdlambda;
         }
         if (econqCoord)
         {
@@ -2122,16 +2536,16 @@ gmx_bool constrain_lincs(FILE *fplog, gmx_bool bLog, gmx_bool bEner,
         *dvdlambda += dhdlambda;
     }
 
-    if (bCalcVir && lincsd->nth > 1)
+    if (bCalcVir && lincsd->ntask > 1)
     {
-        for (i = 1; i < lincsd->nth; i++)
+        for (i = 1; i < lincsd->ntask; i++)
         {
-            m_add(vir_r_m_dr, lincsd->th[i].vir_r_m_dr, vir_r_m_dr);
+            m_add(vir_r_m_dr, lincsd->task[i].vir_r_m_dr, vir_r_m_dr);
         }
     }
 
     /* count assuming nit=1 */
-    inc_nrnb(nrnb, eNR_LINCS, lincsd->nc);
+    inc_nrnb(nrnb, eNR_LINCS, lincsd->nc_real);
     inc_nrnb(nrnb, eNR_LINCSMAT, (2+lincsd->nOrder)*lincsd->ncc);
     if (lincsd->ntriangle > 0)
     {
@@ -2139,11 +2553,11 @@ gmx_bool constrain_lincs(FILE *fplog, gmx_bool bLog, gmx_bool bEner,
     }
     if (v)
     {
-        inc_nrnb(nrnb, eNR_CONSTR_V, lincsd->nc*2);
+        inc_nrnb(nrnb, eNR_CONSTR_V, lincsd->nc_real*2);
     }
     if (bCalcVir)
     {
-        inc_nrnb(nrnb, eNR_CONSTR_VIR, lincsd->nc);
+        inc_nrnb(nrnb, eNR_CONSTR_VIR, lincsd->nc_real);
     }
 
     return bOK;