src/gromacs/mdlib/lincs.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   5  * Copyright (c) 2001-2004, The GROMACS development team.
   6  * Copyright (c) 2013,2014,2015,2016,2017,2018, by the GROMACS development team, led by
   7  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   8  * and including many others, as listed in the AUTHORS file in the
   9  * top-level source directory and at http://www.gromacs.org.
  10  *
  11  * GROMACS is free software; you can redistribute it and/or
  12  * modify it under the terms of the GNU Lesser General Public License
  13  * as published by the Free Software Foundation; either version 2.1
  14  * of the License, or (at your option) any later version.
  15  *
  16  * GROMACS is distributed in the hope that it will be useful,
  17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  19  * Lesser General Public License for more details.
  20  *
  21  * You should have received a copy of the GNU Lesser General Public
  22  * License along with GROMACS; if not, see
  23  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  24  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  25  *
  26  * If you want to redistribute modifications to GROMACS, please
  27  * consider that scientific software is very special. Version
  28  * control is crucial - bugs must be traceable. We will be happy to
  29  * consider code for inclusion in the official distribution, but
  30  * derived work must not be called official GROMACS. Details are found
  31  * in the README & COPYING files - if they are missing, get the
  32  * official version at http://www.gromacs.org.
  33  *
  34  * To help us fund GROMACS development, we humbly ask that you cite
  35  * the research papers on the package. Check out http://www.gromacs.org.
  36  */
  37 /* This file is completely threadsafe - keep it that way! */
  38 #include "gmxpre.h"
  39
  40 #include "lincs.h"
  41
  42 #include "config.h"
  43
  44 #include <assert.h>
  45 #include <stdlib.h>
  46
  47 #include <cmath>
  48
  49 #include <algorithm>
  50
  51 #include "gromacs/domdec/domdec.h"
  52 #include "gromacs/domdec/domdec_struct.h"
  53 #include "gromacs/gmxlib/nrnb.h"
  54 #include "gromacs/math/functions.h"
  55 #include "gromacs/math/units.h"
  56 #include "gromacs/math/vec.h"
  57 #include "gromacs/mdlib/constr.h"
  58 #include "gromacs/mdlib/gmx_omp_nthreads.h"
  59 #include "gromacs/mdlib/mdrun.h"
  60 #include "gromacs/mdtypes/commrec.h"
  61 #include "gromacs/mdtypes/inputrec.h"
  62 #include "gromacs/mdtypes/md_enums.h"
  63 #include "gromacs/mdtypes/mdatom.h"
  64 #include "gromacs/pbcutil/pbc.h"
  65 #include "gromacs/pbcutil/pbc-simd.h"
  66 #include "gromacs/simd/simd.h"
  67 #include "gromacs/simd/simd_math.h"
  68 #include "gromacs/simd/vector_operations.h"
  69 #include "gromacs/topology/block.h"
  70 #include "gromacs/topology/mtop_util.h"
  71 #include "gromacs/utility/basedefinitions.h"
  72 #include "gromacs/utility/bitmask.h"
  73 #include "gromacs/utility/cstringutil.h"
  74 #include "gromacs/utility/exceptions.h"
  75 #include "gromacs/utility/fatalerror.h"
  76 #include "gromacs/utility/gmxomp.h"
  77 #include "gromacs/utility/pleasecite.h"
  78 #include "gromacs/utility/smalloc.h"
  79
  80 using namespace gmx; // TODO: Remove when this file is moved into gmx namespace
  81
  82 typedef struct {
  83     int    b0;         /* first constraint for this task */
  84     int    b1;         /* b1-1 is the last constraint for this task */
  85     int    ntriangle;  /* the number of constraints in triangles */
  86     int   *triangle;   /* the list of triangle constraints */
  87     int   *tri_bits;   /* the bits tell if the matrix element should be used */
  88     int    tri_alloc;  /* allocation size of triangle and tri_bits */
  89     int    nind;       /* number of indices */
  90     int   *ind;        /* constraint index for updating atom data */
  91     int    nind_r;     /* number of indices */
  92     int   *ind_r;      /* constraint index for updating atom data */
  93     int    ind_nalloc; /* allocation size of ind and ind_r */
  94     tensor vir_r_m_dr; /* temporary variable for virial calculation */
  95     real   dhdlambda;  /* temporary variable for lambda derivative */
  96 } lincs_task_t;
  97
  98 typedef struct gmx_lincsdata {
  99     int             ncg;          /* the global number of constraints */
 100     int             ncg_flex;     /* the global number of flexible constraints */
 101     int             ncg_triangle; /* the global number of constraints in triangles */
 102     int             nIter;        /* the number of iterations */
 103     int             nOrder;       /* the order of the matrix expansion */
 104     int             max_connect;  /* the maximum number of constrains connected to a single atom */
 105
 106     int             nc_real;      /* the number of real constraints */
 107     int             nc;           /* the number of constraints including padding for SIMD */
 108     int             nc_alloc;     /* the number we allocated memory for */
 109     int             ncc;          /* the number of constraint connections */
 110     int             ncc_alloc;    /* the number we allocated memory for */
 111     real            matlam;       /* the FE lambda value used for filling blc and blmf */
 112     int            *con_index;    /* mapping from topology to LINCS constraints */
 113     real           *bllen0;       /* the reference distance in topology A */
 114     real           *ddist;        /* the reference distance in top B - the r.d. in top A */
 115     int            *bla;          /* the atom pairs involved in the constraints */
 116     real           *blc;          /* 1/sqrt(invmass1 + invmass2) */
 117     real           *blc1;         /* as blc, but with all masses 1 */
 118     int            *blnr;         /* index into blbnb and blmf */
 119     int            *blbnb;        /* list of constraint connections */
 120     int             ntriangle;    /* the local number of constraints in triangles */
 121     int             ncc_triangle; /* the number of constraint connections in triangles */
 122     gmx_bool        bCommIter;    /* communicate before each LINCS interation */
 123     real           *blmf;         /* matrix of mass factors for constraint connections */
 124     real           *blmf1;        /* as blmf, but with all masses 1 */
 125     real           *bllen;        /* the reference bond length */
 126     int            *nlocat;       /* the local atom count per constraint, can be NULL */
 127
 128     int             ntask;        /* The number of tasks = #threads for LINCS */
 129     lincs_task_t   *task;         /* LINCS thread division */
 130     gmx_bitmask_t  *atf;          /* atom flags for thread parallelization */
 131     int             atf_nalloc;   /* allocation size of atf */
 132     gmx_bool        bTaskDep;     /* are the LINCS tasks interdependent? */
 133     gmx_bool        bTaskDepTri;  /* are there triangle constraints that cross task borders? */
 134     /* arrays for temporary storage in the LINCS algorithm */
 135     rvec           *tmpv;
 136     real           *tmpncc;
 137     real           *tmp1;
 138     real           *tmp2;
 139     real           *tmp3;
 140     real           *tmp4;
 141     real           *mlambda; /* the Lagrange multipliers * -1 */
 142     /* storage for the constraint RMS relative deviation output */
 143     real            rmsd_data[3];
 144 } t_gmx_lincsdata;
 145
 146 /* Define simd_width for memory allocation used for SIMD code */
 147 #if GMX_SIMD_HAVE_REAL
 148 static const int simd_width = GMX_SIMD_REAL_WIDTH;
 149 #else
 150 static const int simd_width = 1;
 151 #endif
 152
 153 /* Align to 128 bytes, consistent with the current implementation of
 154    AlignedAllocator, which currently forces 128 byte alignment. */
 155 static const int align_bytes = 128;
 156
 157 real *lincs_rmsd_data(struct gmx_lincsdata *lincsd)
 158 {
 159     return lincsd->rmsd_data;
 160 }
 161
 162 real lincs_rmsd(struct gmx_lincsdata *lincsd)
 163 {
 164     if (lincsd->rmsd_data[0] > 0)
 165     {
 166         return std::sqrt(lincsd->rmsd_data[1]/lincsd->rmsd_data[0]);
 167     }
 168     else
 169     {
 170         return 0;
 171     }
 172 }
 173
 174 /* Do a set of nrec LINCS matrix multiplications.
 175  * This function will return with up to date thread-local
 176  * constraint data, without an OpenMP barrier.
 177  */
 178 static void lincs_matrix_expand(const gmx_lincsdata *lincsd,
 179                                 const lincs_task_t *li_task,
 180                                 const real *blcc,
 181                                 real *rhs1, real *rhs2, real *sol)
 182 {
 183     int        b0, b1, nrec, rec;
 184     const int *blnr  = lincsd->blnr;
 185     const int *blbnb = lincsd->blbnb;
 186
 187     b0   = li_task->b0;
 188     b1   = li_task->b1;
 189     nrec = lincsd->nOrder;
 190
 191     for (rec = 0; rec < nrec; rec++)
 192     {
 193         int b;
 194
 195         if (lincsd->bTaskDep)
 196         {
 197 #pragma omp barrier
 198         }
 199         for (b = b0; b < b1; b++)
 200         {
 201             real mvb;
 202             int  n;
 203
 204             mvb = 0;
 205             for (n = blnr[b]; n < blnr[b+1]; n++)
 206             {
 207                 mvb = mvb + blcc[n]*rhs1[blbnb[n]];
 208             }
 209             rhs2[b] = mvb;
 210             sol[b]  = sol[b] + mvb;
 211         }
 212
 213         real *swap;
 214
 215         swap = rhs1;
 216         rhs1 = rhs2;
 217         rhs2 = swap;
 218     } /* nrec*(ncons+2*nrtot) flops */
 219
 220     if (lincsd->ntriangle > 0)
 221     {
 222         /* Perform an extra nrec recursions for only the constraints
 223          * involved in rigid triangles.
 224          * In this way their accuracy should come close to those of the other
 225          * constraints, since traingles of constraints can produce eigenvalues
 226          * around 0.7, while the effective eigenvalue for bond constraints
 227          * is around 0.4 (and 0.7*0.7=0.5).
 228          */
 229
 230         if (lincsd->bTaskDep)
 231         {
 232             /* We need a barrier here, since other threads might still be
 233              * reading the contents of rhs1 and/o rhs2.
 234              * We could avoid this barrier by introducing two extra rhs
 235              * arrays for the triangle constraints only.
 236              */
 237 #pragma omp barrier
 238         }
 239
 240         /* Constraints involved in a triangle are ensured to be in the same
 241          * LINCS task. This means no barriers are required during the extra
 242          * iterations for the triangle constraints.
 243          */
 244         const int *triangle = li_task->triangle;
 245         const int *tri_bits = li_task->tri_bits;
 246
 247         for (rec = 0; rec < nrec; rec++)
 248         {
 249             int tb;
 250
 251             for (tb = 0; tb < li_task->ntriangle; tb++)
 252             {
 253                 int  b, bits, nr0, nr1, n;
 254                 real mvb;
 255
 256                 b    = triangle[tb];
 257                 bits = tri_bits[tb];
 258                 mvb  = 0;
 259                 nr0  = blnr[b];
 260                 nr1  = blnr[b+1];
 261                 for (n = nr0; n < nr1; n++)
 262                 {
 263                     if (bits & (1 << (n - nr0)))
 264                     {
 265                         mvb = mvb + blcc[n]*rhs1[blbnb[n]];
 266                     }
 267                 }
 268                 rhs2[b] = mvb;
 269                 sol[b]  = sol[b] + mvb;
 270             }
 271
 272             real *swap;
 273
 274             swap = rhs1;
 275             rhs1 = rhs2;
 276             rhs2 = swap;
 277         } /* nrec*(ntriangle + ncc_triangle*2) flops */
 278
 279         if (lincsd->bTaskDepTri)
 280         {
 281             /* The constraints triangles are decoupled from each other,
 282              * but constraints in one triangle cross thread task borders.
 283              * We could probably avoid this with more advanced setup code.
 284              */
 285 #pragma omp barrier
 286         }
 287     }
 288 }
 289
 290 static void lincs_update_atoms_noind(int ncons, const int *bla,
 291                                      real prefac,
 292                                      const real *fac, rvec *r,
 293                                      const real *invmass,
 294                                      rvec *x)
 295 {
 296     int  b, i, j;
 297     real mvb, im1, im2, tmp0, tmp1, tmp2;
 298
 299     if (invmass != nullptr)
 300     {
 301         for (b = 0; b < ncons; b++)
 302         {
 303             i        = bla[2*b];
 304             j        = bla[2*b+1];
 305             mvb      = prefac*fac[b];
 306             im1      = invmass[i];
 307             im2      = invmass[j];
 308             tmp0     = r[b][0]*mvb;
 309             tmp1     = r[b][1]*mvb;
 310             tmp2     = r[b][2]*mvb;
 311             x[i][0] -= tmp0*im1;
 312             x[i][1] -= tmp1*im1;
 313             x[i][2] -= tmp2*im1;
 314             x[j][0] += tmp0*im2;
 315             x[j][1] += tmp1*im2;
 316             x[j][2] += tmp2*im2;
 317         } /* 16 ncons flops */
 318     }
 319     else
 320     {
 321         for (b = 0; b < ncons; b++)
 322         {
 323             i        = bla[2*b];
 324             j        = bla[2*b+1];
 325             mvb      = prefac*fac[b];
 326             tmp0     = r[b][0]*mvb;
 327             tmp1     = r[b][1]*mvb;
 328             tmp2     = r[b][2]*mvb;
 329             x[i][0] -= tmp0;
 330             x[i][1] -= tmp1;
 331             x[i][2] -= tmp2;
 332             x[j][0] += tmp0;
 333             x[j][1] += tmp1;
 334             x[j][2] += tmp2;
 335         }
 336     }
 337 }
 338
 339 static void lincs_update_atoms_ind(int ncons, const int *ind, const int *bla,
 340                                    real prefac,
 341                                    const real *fac, rvec *r,
 342                                    const real *invmass,
 343                                    rvec *x)
 344 {
 345     int  bi, b, i, j;
 346     real mvb, im1, im2, tmp0, tmp1, tmp2;
 347
 348     if (invmass != nullptr)
 349     {
 350         for (bi = 0; bi < ncons; bi++)
 351         {
 352             b        = ind[bi];
 353             i        = bla[2*b];
 354             j        = bla[2*b+1];
 355             mvb      = prefac*fac[b];
 356             im1      = invmass[i];
 357             im2      = invmass[j];
 358             tmp0     = r[b][0]*mvb;
 359             tmp1     = r[b][1]*mvb;
 360             tmp2     = r[b][2]*mvb;
 361             x[i][0] -= tmp0*im1;
 362             x[i][1] -= tmp1*im1;
 363             x[i][2] -= tmp2*im1;
 364             x[j][0] += tmp0*im2;
 365             x[j][1] += tmp1*im2;
 366             x[j][2] += tmp2*im2;
 367         } /* 16 ncons flops */
 368     }
 369     else
 370     {
 371         for (bi = 0; bi < ncons; bi++)
 372         {
 373             b        = ind[bi];
 374             i        = bla[2*b];
 375             j        = bla[2*b+1];
 376             mvb      = prefac*fac[b];
 377             tmp0     = r[b][0]*mvb;
 378             tmp1     = r[b][1]*mvb;
 379             tmp2     = r[b][2]*mvb;
 380             x[i][0] -= tmp0;
 381             x[i][1] -= tmp1;
 382             x[i][2] -= tmp2;
 383             x[j][0] += tmp0;
 384             x[j][1] += tmp1;
 385             x[j][2] += tmp2;
 386         } /* 16 ncons flops */
 387     }
 388 }
 389
 390 static void lincs_update_atoms(gmx_lincsdata *li, int th,
 391                                real prefac,
 392                                const real *fac, rvec *r,
 393                                const real *invmass,
 394                                rvec *x)
 395 {
 396     if (li->ntask == 1)
 397     {
 398         /* Single thread, we simply update for all constraints */
 399         lincs_update_atoms_noind(li->nc_real,
 400                                  li->bla, prefac, fac, r, invmass, x);
 401     }
 402     else
 403     {
 404         /* Update the atom vector components for our thread local
 405          * constraints that only access our local atom range.
 406          * This can be done without a barrier.
 407          */
 408         lincs_update_atoms_ind(li->task[th].nind, li->task[th].ind,
 409                                li->bla, prefac, fac, r, invmass, x);
 410
 411         if (li->task[li->ntask].nind > 0)
 412         {
 413             /* Update the constraints that operate on atoms
 414              * in multiple thread atom blocks on the master thread.
 415              */
 416 #pragma omp barrier
 417 #pragma omp master
 418             {
 419                 lincs_update_atoms_ind(li->task[li->ntask].nind,
 420                                        li->task[li->ntask].ind,
 421                                        li->bla, prefac, fac, r, invmass, x);
 422             }
 423         }
 424     }
 425 }
 426
 427 #if GMX_SIMD_HAVE_REAL
 428 /* Calculate the constraint distance vectors r to project on from x.
 429  * Determine the right-hand side of the matrix equation using quantity f.
 430  * This function only differs from calc_dr_x_xp_simd below in that
 431  * no constraint length is subtracted and no PBC is used for f.
 432  */
 433 static void gmx_simdcall
 434 calc_dr_x_f_simd(int                       b0,
 435                  int                       b1,
 436                  const int *               bla,
 437                  const rvec * gmx_restrict x,
 438                  const rvec * gmx_restrict f,
 439                  const real * gmx_restrict blc,
 440                  const real *              pbc_simd,
 441                  rvec * gmx_restrict       r,
 442                  real * gmx_restrict       rhs,
 443                  real * gmx_restrict       sol)
 444 {
 445     assert(b0 % GMX_SIMD_REAL_WIDTH == 0);
 446
 447     alignas(GMX_SIMD_ALIGNMENT) std::int32_t offset2[GMX_SIMD_REAL_WIDTH];
 448
 449     for (int i = 0; i < GMX_SIMD_REAL_WIDTH; i++)
 450     {
 451         offset2[i] = i;
 452     }
 453
 454     for (int bs = b0; bs < b1; bs += GMX_SIMD_REAL_WIDTH)
 455     {
 456         SimdReal x0_S, y0_S, z0_S;
 457         SimdReal x1_S, y1_S, z1_S;
 458         SimdReal rx_S, ry_S, rz_S, n2_S, il_S;
 459         SimdReal fx_S, fy_S, fz_S, ip_S, rhs_S;
 460         alignas(GMX_SIMD_ALIGNMENT) std::int32_t      offset0[GMX_SIMD_REAL_WIDTH];
 461         alignas(GMX_SIMD_ALIGNMENT) std::int32_t      offset1[GMX_SIMD_REAL_WIDTH];
 462
 463         for (int i = 0; i < GMX_SIMD_REAL_WIDTH; i++)
 464         {
 465             offset0[i] = bla[bs*2 + i*2];
 466             offset1[i] = bla[bs*2 + i*2 + 1];
 467         }
 468
 469         gatherLoadUTranspose<3>(reinterpret_cast<const real *>(x), offset0, &x0_S, &y0_S, &z0_S);
 470         gatherLoadUTranspose<3>(reinterpret_cast<const real *>(x), offset1, &x1_S, &y1_S, &z1_S);
 471         rx_S = x0_S - x1_S;
 472         ry_S = y0_S - y1_S;
 473         rz_S = z0_S - z1_S;
 474
 475         pbc_correct_dx_simd(&rx_S, &ry_S, &rz_S, pbc_simd);
 476
 477         n2_S  = norm2(rx_S, ry_S, rz_S);
 478         il_S  = invsqrt(n2_S);
 479
 480         rx_S  = rx_S * il_S;
 481         ry_S  = ry_S * il_S;
 482         rz_S  = rz_S * il_S;
 483
 484         transposeScatterStoreU<3>(reinterpret_cast<real *>(r + bs), offset2, rx_S, ry_S, rz_S);
 485
 486         gatherLoadUTranspose<3>(reinterpret_cast<const real *>(f), offset0, &x0_S, &y0_S, &z0_S);
 487         gatherLoadUTranspose<3>(reinterpret_cast<const real *>(f), offset1, &x1_S, &y1_S, &z1_S);
 488         fx_S = x0_S - x1_S;
 489         fy_S = y0_S - y1_S;
 490         fz_S = z0_S - z1_S;
 491
 492         ip_S  = iprod(rx_S, ry_S, rz_S, fx_S, fy_S, fz_S);
 493
 494         rhs_S = load<SimdReal>(blc + bs) * ip_S;
 495
 496         store(rhs + bs, rhs_S);
 497         store(sol + bs, rhs_S);
 498     }
 499 }
 500 #endif // GMX_SIMD_HAVE_REAL
 501
 502 /* LINCS projection, works on derivatives of the coordinates */
 503 static void do_lincsp(rvec *x, rvec *f, rvec *fp, t_pbc *pbc,
 504                       gmx_lincsdata *lincsd, int th,
 505                       real *invmass,
 506                       int econq, gmx_bool bCalcDHDL,
 507                       gmx_bool bCalcVir, tensor rmdf)
 508 {
 509     int      b0, b1, b;
 510     int     *bla, *blnr, *blbnb;
 511     rvec    *r;
 512     real    *blc, *blmf, *blcc, *rhs1, *rhs2, *sol;
 513
 514     b0 = lincsd->task[th].b0;
 515     b1 = lincsd->task[th].b1;
 516
 517     bla    = lincsd->bla;
 518     r      = lincsd->tmpv;
 519     blnr   = lincsd->blnr;
 520     blbnb  = lincsd->blbnb;
 521     if (econq != econqForce)
 522     {
 523         /* Use mass-weighted parameters */
 524         blc  = lincsd->blc;
 525         blmf = lincsd->blmf;
 526     }
 527     else
 528     {
 529         /* Use non mass-weighted parameters */
 530         blc  = lincsd->blc1;
 531         blmf = lincsd->blmf1;
 532     }
 533     blcc   = lincsd->tmpncc;
 534     rhs1   = lincsd->tmp1;
 535     rhs2   = lincsd->tmp2;
 536     sol    = lincsd->tmp3;
 537
 538 #if GMX_SIMD_HAVE_REAL
 539     /* This SIMD code does the same as the plain-C code after the #else.
 540      * The only difference is that we always call pbc code, as with SIMD
 541      * the overhead of pbc computation (when not needed) is small.
 542      */
 543     alignas(GMX_SIMD_ALIGNMENT) real    pbc_simd[9*GMX_SIMD_REAL_WIDTH];
 544
 545     /* Convert the pbc struct for SIMD */
 546     set_pbc_simd(pbc, pbc_simd);
 547
 548     /* Compute normalized x i-j vectors, store in r.
 549      * Compute the inner product of r and xp i-j and store in rhs1.
 550      */
 551     calc_dr_x_f_simd(b0, b1, bla, x, f, blc,
 552                      pbc_simd,
 553                      r, rhs1, sol);
 554
 555 #else // GMX_SIMD_HAVE_REAL
 556
 557     /* Compute normalized i-j vectors */
 558     if (pbc)
 559     {
 560         for (b = b0; b < b1; b++)
 561         {
 562             rvec dx;
 563
 564             pbc_dx_aiuc(pbc, x[bla[2*b]], x[bla[2*b+1]], dx);
 565             unitv(dx, r[b]);
 566         }
 567     }
 568     else
 569     {
 570         for (b = b0; b < b1; b++)
 571         {
 572             rvec dx;
 573
 574             rvec_sub(x[bla[2*b]], x[bla[2*b+1]], dx);
 575             unitv(dx, r[b]);
 576         } /* 16 ncons flops */
 577     }
 578
 579     for (b = b0; b < b1; b++)
 580     {
 581         int  i, j;
 582         real mvb;
 583
 584         i       = bla[2*b];
 585         j       = bla[2*b+1];
 586         mvb     = blc[b]*(r[b][0]*(f[i][0] - f[j][0]) +
 587                           r[b][1]*(f[i][1] - f[j][1]) +
 588                           r[b][2]*(f[i][2] - f[j][2]));
 589         rhs1[b] = mvb;
 590         sol[b]  = mvb;
 591         /* 7 flops */
 592     }
 593
 594 #endif // GMX_SIMD_HAVE_REAL
 595
 596     if (lincsd->bTaskDep)
 597     {
 598         /* We need a barrier, since the matrix construction below
 599          * can access entries in r of other threads.
 600          */
 601 #pragma omp barrier
 602     }
 603
 604     /* Construct the (sparse) LINCS matrix */
 605     for (b = b0; b < b1; b++)
 606     {
 607         int n;
 608
 609         for (n = blnr[b]; n < blnr[b+1]; n++)
 610         {
 611             blcc[n] = blmf[n]*iprod(r[b], r[blbnb[n]]);
 612         } /* 6 nr flops */
 613     }
 614     /* Together: 23*ncons + 6*nrtot flops */
 615
 616     lincs_matrix_expand(lincsd, &lincsd->task[th], blcc, rhs1, rhs2, sol);
 617     /* nrec*(ncons+2*nrtot) flops */
 618
 619     if (econq == econqDeriv_FlexCon)
 620     {
 621         /* We only want to constraint the flexible constraints,
 622          * so we mask out the normal ones by setting sol to 0.
 623          */
 624         for (b = b0; b < b1; b++)
 625         {
 626             if (!(lincsd->bllen0[b] == 0 && lincsd->ddist[b] == 0))
 627             {
 628                 sol[b] = 0;
 629             }
 630         }
 631     }
 632
 633     /* We multiply sol by blc, so we can use lincs_update_atoms for OpenMP */
 634     for (b = b0; b < b1; b++)
 635     {
 636         sol[b] *= blc[b];
 637     }
 638
 639     /* When constraining forces, we should not use mass weighting,
 640      * so we pass invmass=NULL, which results in the use of 1 for all atoms.
 641      */
 642     lincs_update_atoms(lincsd, th, 1.0, sol, r,
 643                        (econq != econqForce) ? invmass : nullptr, fp);
 644
 645     if (bCalcDHDL)
 646     {
 647         real dhdlambda;
 648
 649         dhdlambda = 0;
 650         for (b = b0; b < b1; b++)
 651         {
 652             dhdlambda -= sol[b]*lincsd->ddist[b];
 653         }
 654
 655         lincsd->task[th].dhdlambda = dhdlambda;
 656     }
 657
 658     if (bCalcVir)
 659     {
 660         /* Constraint virial,
 661          * determines sum r_bond x delta f,
 662          * where delta f is the constraint correction
 663          * of the quantity that is being constrained.
 664          */
 665         for (b = b0; b < b1; b++)
 666         {
 667             real mvb, tmp1;
 668             int  i, j;
 669
 670             mvb = lincsd->bllen[b]*sol[b];
 671             for (i = 0; i < DIM; i++)
 672             {
 673                 tmp1 = mvb*r[b][i];
 674                 for (j = 0; j < DIM; j++)
 675                 {
 676                     rmdf[i][j] += tmp1*r[b][j];
 677                 }
 678             }
 679         } /* 23 ncons flops */
 680     }
 681 }
 682
 683 #if GMX_SIMD_HAVE_REAL
 684 /* Calculate the constraint distance vectors r to project on from x.
 685  * Determine the right-hand side of the matrix equation using coordinates xp.
 686  */
 687 static void gmx_simdcall
 688 calc_dr_x_xp_simd(int                       b0,
 689                   int                       b1,
 690                   const int *               bla,
 691                   const rvec * gmx_restrict x,
 692                   const rvec * gmx_restrict xp,
 693                   const real * gmx_restrict bllen,
 694                   const real * gmx_restrict blc,
 695                   const real *              pbc_simd,
 696                   rvec * gmx_restrict       r,
 697                   real * gmx_restrict       rhs,
 698                   real * gmx_restrict       sol)
 699 {
 700     assert(b0 % GMX_SIMD_REAL_WIDTH == 0);
 701     alignas(GMX_SIMD_ALIGNMENT) std::int32_t offset2[GMX_SIMD_REAL_WIDTH];
 702
 703     for (int i = 0; i < GMX_SIMD_REAL_WIDTH; i++)
 704     {
 705         offset2[i] = i;
 706     }
 707
 708     for (int bs = b0; bs < b1; bs += GMX_SIMD_REAL_WIDTH)
 709     {
 710         SimdReal x0_S, y0_S, z0_S;
 711         SimdReal x1_S, y1_S, z1_S;
 712         SimdReal rx_S, ry_S, rz_S, n2_S, il_S;
 713         SimdReal rxp_S, ryp_S, rzp_S, ip_S, rhs_S;
 714         alignas(GMX_SIMD_ALIGNMENT) std::int32_t      offset0[GMX_SIMD_REAL_WIDTH];
 715         alignas(GMX_SIMD_ALIGNMENT) std::int32_t      offset1[GMX_SIMD_REAL_WIDTH];
 716
 717         for (int i = 0; i < GMX_SIMD_REAL_WIDTH; i++)
 718         {
 719             offset0[i] = bla[bs*2 + i*2];
 720             offset1[i] = bla[bs*2 + i*2 + 1];
 721         }
 722
 723         gatherLoadUTranspose<3>(reinterpret_cast<const real *>(x), offset0, &x0_S, &y0_S, &z0_S);
 724         gatherLoadUTranspose<3>(reinterpret_cast<const real *>(x), offset1, &x1_S, &y1_S, &z1_S);
 725         rx_S = x0_S - x1_S;
 726         ry_S = y0_S - y1_S;
 727         rz_S = z0_S - z1_S;
 728
 729         pbc_correct_dx_simd(&rx_S, &ry_S, &rz_S, pbc_simd);
 730
 731         n2_S  = norm2(rx_S, ry_S, rz_S);
 732         il_S  = invsqrt(n2_S);
 733
 734         rx_S  = rx_S * il_S;
 735         ry_S  = ry_S * il_S;
 736         rz_S  = rz_S * il_S;
 737
 738         transposeScatterStoreU<3>(reinterpret_cast<real *>(r + bs), offset2, rx_S, ry_S, rz_S);
 739
 740         gatherLoadUTranspose<3>(reinterpret_cast<const real *>(xp), offset0, &x0_S, &y0_S, &z0_S);
 741         gatherLoadUTranspose<3>(reinterpret_cast<const real *>(xp), offset1, &x1_S, &y1_S, &z1_S);
 742         rxp_S = x0_S - x1_S;
 743         ryp_S = y0_S - y1_S;
 744         rzp_S = z0_S - z1_S;
 745
 746         pbc_correct_dx_simd(&rxp_S, &ryp_S, &rzp_S, pbc_simd);
 747
 748         ip_S  = iprod(rx_S, ry_S, rz_S, rxp_S, ryp_S, rzp_S);
 749
 750         rhs_S = load<SimdReal>(blc + bs) * (ip_S - load<SimdReal>(bllen + bs));
 751
 752         store(rhs + bs, rhs_S);
 753         store(sol + bs, rhs_S);
 754     }
 755 }
 756 #endif // GMX_SIMD_HAVE_REAL
 757
 758 /* Determine the distances and right-hand side for the next iteration */
 759 gmx_unused static void calc_dist_iter(
 760         int                       b0,
 761         int                       b1,
 762         const int *               bla,
 763         const rvec * gmx_restrict xp,
 764         const real * gmx_restrict bllen,
 765         const real * gmx_restrict blc,
 766         const t_pbc *             pbc,
 767         real                      wfac,
 768         real * gmx_restrict       rhs,
 769         real * gmx_restrict       sol,
 770         gmx_bool *                bWarn)
 771 {
 772     int b;
 773
 774     for (b = b0; b < b1; b++)
 775     {
 776         real len, len2, dlen2, mvb;
 777         rvec dx;
 778
 779         len = bllen[b];
 780         if (pbc)
 781         {
 782             pbc_dx_aiuc(pbc, xp[bla[2*b]], xp[bla[2*b+1]], dx);
 783         }
 784         else
 785         {
 786             rvec_sub(xp[bla[2*b]], xp[bla[2*b+1]], dx);
 787         }
 788         len2  = len*len;
 789         dlen2 = 2*len2 - norm2(dx);
 790         if (dlen2 < wfac*len2)
 791         {
 792             /* not race free - see detailed comment in caller */
 793             *bWarn = TRUE;
 794         }
 795         if (dlen2 > 0)
 796         {
 797             mvb = blc[b]*(len - dlen2*gmx::invsqrt(dlen2));
 798         }
 799         else
 800         {
 801             mvb = blc[b]*len;
 802         }
 803         rhs[b]  = mvb;
 804         sol[b]  = mvb;
 805     } /* 20*ncons flops */
 806 }
 807
 808 #if GMX_SIMD_HAVE_REAL
 809 /* As the function above, but using SIMD intrinsics */
 810 static void gmx_simdcall
 811 calc_dist_iter_simd(int                       b0,
 812                     int                       b1,
 813                     const int *               bla,
 814                     const rvec * gmx_restrict x,
 815                     const real * gmx_restrict bllen,
 816                     const real * gmx_restrict blc,
 817                     const real *              pbc_simd,
 818                     real                      wfac,
 819                     real * gmx_restrict       rhs,
 820                     real * gmx_restrict       sol,
 821                     gmx_bool *                bWarn)
 822 {
 823     SimdReal        min_S(GMX_REAL_MIN);
 824     SimdReal        two_S(2.0);
 825     SimdReal        wfac_S(wfac);
 826     SimdBool        warn_S;
 827
 828     int             bs;
 829
 830     assert(b0 % GMX_SIMD_REAL_WIDTH == 0);
 831
 832     /* Initialize all to FALSE */
 833     warn_S = (two_S < setZero());
 834
 835     for (bs = b0; bs < b1; bs += GMX_SIMD_REAL_WIDTH)
 836     {
 837         SimdReal x0_S, y0_S, z0_S;
 838         SimdReal x1_S, y1_S, z1_S;
 839         SimdReal rx_S, ry_S, rz_S, n2_S;
 840         SimdReal len_S, len2_S, dlen2_S, lc_S, blc_S;
 841         alignas(GMX_SIMD_ALIGNMENT) std::int32_t      offset0[GMX_SIMD_REAL_WIDTH];
 842         alignas(GMX_SIMD_ALIGNMENT) std::int32_t      offset1[GMX_SIMD_REAL_WIDTH];
 843
 844         for (int i = 0; i < GMX_SIMD_REAL_WIDTH; i++)
 845         {
 846             offset0[i] = bla[bs*2 + i*2];
 847             offset1[i] = bla[bs*2 + i*2 + 1];
 848         }
 849
 850         gatherLoadUTranspose<3>(reinterpret_cast<const real *>(x), offset0, &x0_S, &y0_S, &z0_S);
 851         gatherLoadUTranspose<3>(reinterpret_cast<const real *>(x), offset1, &x1_S, &y1_S, &z1_S);
 852         rx_S = x0_S - x1_S;
 853         ry_S = y0_S - y1_S;
 854         rz_S = z0_S - z1_S;
 855
 856         pbc_correct_dx_simd(&rx_S, &ry_S, &rz_S, pbc_simd);
 857
 858         n2_S    = norm2(rx_S, ry_S, rz_S);
 859
 860         len_S   = load<SimdReal>(bllen + bs);
 861         len2_S  = len_S * len_S;
 862
 863         dlen2_S = fms(two_S, len2_S, n2_S);
 864
 865         warn_S  = warn_S || (dlen2_S < (wfac_S * len2_S));
 866
 867         /* Avoid 1/0 by taking the max with REAL_MIN.
 868          * Note: when dlen2 is close to zero (90 degree constraint rotation),
 869          * the accuracy of the algorithm is no longer relevant.
 870          */
 871         dlen2_S = max(dlen2_S, min_S);
 872
 873         lc_S    = fnma(dlen2_S, invsqrt(dlen2_S), len_S);
 874
 875         blc_S   = load<SimdReal>(blc + bs);
 876
 877         lc_S    = blc_S * lc_S;
 878
 879         store(rhs + bs, lc_S);
 880         store(sol + bs, lc_S);
 881     }
 882
 883     if (anyTrue(warn_S))
 884     {
 885         *bWarn = TRUE;
 886     }
 887 }
 888 #endif // GMX_SIMD_HAVE_REAL
 889
 890 static void do_lincs(rvec *x, rvec *xp, matrix box, t_pbc *pbc,
 891                      gmx_lincsdata *lincsd, int th,
 892                      const real *invmass,
 893                      const t_commrec *cr,
 894                      gmx_bool bCalcDHDL,
 895                      real wangle, gmx_bool *bWarn,
 896                      real invdt, rvec * gmx_restrict v,
 897                      gmx_bool bCalcVir, tensor vir_r_m_dr)
 898 {
 899     int      b0, b1, b, i, j, n, iter;
 900     int     *bla, *blnr, *blbnb;
 901     rvec    *r;
 902     real    *blc, *blmf, *bllen, *blcc, *rhs1, *rhs2, *sol, *blc_sol, *mlambda;
 903     int     *nlocat;
 904
 905     b0 = lincsd->task[th].b0;
 906     b1 = lincsd->task[th].b1;
 907
 908     bla     = lincsd->bla;
 909     r       = lincsd->tmpv;
 910     blnr    = lincsd->blnr;
 911     blbnb   = lincsd->blbnb;
 912     blc     = lincsd->blc;
 913     blmf    = lincsd->blmf;
 914     bllen   = lincsd->bllen;
 915     blcc    = lincsd->tmpncc;
 916     rhs1    = lincsd->tmp1;
 917     rhs2    = lincsd->tmp2;
 918     sol     = lincsd->tmp3;
 919     blc_sol = lincsd->tmp4;
 920     mlambda = lincsd->mlambda;
 921     nlocat  = lincsd->nlocat;
 922
 923 #if GMX_SIMD_HAVE_REAL
 924
 925     /* This SIMD code does the same as the plain-C code after the #else.
 926      * The only difference is that we always call pbc code, as with SIMD
 927      * the overhead of pbc computation (when not needed) is small.
 928      */
 929     alignas(GMX_SIMD_ALIGNMENT) real    pbc_simd[9*GMX_SIMD_REAL_WIDTH];
 930
 931     /* Convert the pbc struct for SIMD */
 932     set_pbc_simd(pbc, pbc_simd);
 933
 934     /* Compute normalized x i-j vectors, store in r.
 935      * Compute the inner product of r and xp i-j and store in rhs1.
 936      */
 937     calc_dr_x_xp_simd(b0, b1, bla, x, xp, bllen, blc,
 938                       pbc_simd,
 939                       r, rhs1, sol);
 940
 941 #else // GMX_SIMD_HAVE_REAL
 942
 943     if (pbc)
 944     {
 945         /* Compute normalized i-j vectors */
 946         for (b = b0; b < b1; b++)
 947         {
 948             rvec dx;
 949             real mvb;
 950
 951             pbc_dx_aiuc(pbc, x[bla[2*b]], x[bla[2*b+1]], dx);
 952             unitv(dx, r[b]);
 953
 954             pbc_dx_aiuc(pbc, xp[bla[2*b]], xp[bla[2*b+1]], dx);
 955             mvb     = blc[b]*(iprod(r[b], dx) - bllen[b]);
 956             rhs1[b] = mvb;
 957             sol[b]  = mvb;
 958         }
 959     }
 960     else
 961     {
 962         /* Compute normalized i-j vectors */
 963         for (b = b0; b < b1; b++)
 964         {
 965             real tmp0, tmp1, tmp2, rlen, mvb;
 966
 967             i       = bla[2*b];
 968             j       = bla[2*b+1];
 969             tmp0    = x[i][0] - x[j][0];
 970             tmp1    = x[i][1] - x[j][1];
 971             tmp2    = x[i][2] - x[j][2];
 972             rlen    = gmx::invsqrt(tmp0*tmp0 + tmp1*tmp1 + tmp2*tmp2);
 973             r[b][0] = rlen*tmp0;
 974             r[b][1] = rlen*tmp1;
 975             r[b][2] = rlen*tmp2;
 976             /* 16 ncons flops */
 977
 978             i       = bla[2*b];
 979             j       = bla[2*b+1];
 980             mvb     = blc[b]*(r[b][0]*(xp[i][0] - xp[j][0]) +
 981                               r[b][1]*(xp[i][1] - xp[j][1]) +
 982                               r[b][2]*(xp[i][2] - xp[j][2]) - bllen[b]);
 983             rhs1[b] = mvb;
 984             sol[b]  = mvb;
 985             /* 10 flops */
 986         }
 987         /* Together: 26*ncons + 6*nrtot flops */
 988     }
 989
 990 #endif // GMX_SIMD_HAVE_REAL
 991
 992     if (lincsd->bTaskDep)
 993     {
 994         /* We need a barrier, since the matrix construction below
 995          * can access entries in r of other threads.
 996          */
 997 #pragma omp barrier
 998     }
 999
1000     /* Construct the (sparse) LINCS matrix */
1001     for (b = b0; b < b1; b++)
1002     {
1003         for (n = blnr[b]; n < blnr[b+1]; n++)
1004         {
1005             blcc[n] = blmf[n]*iprod(r[b], r[blbnb[n]]);
1006         }
1007     }
1008     /* Together: 26*ncons + 6*nrtot flops */
1009
1010     lincs_matrix_expand(lincsd, &lincsd->task[th], blcc, rhs1, rhs2, sol);
1011     /* nrec*(ncons+2*nrtot) flops */
1012
1013 #if GMX_SIMD_HAVE_REAL
1014     for (b = b0; b < b1; b += GMX_SIMD_REAL_WIDTH)
1015     {
1016         SimdReal t1 = load<SimdReal>(blc + b);
1017         SimdReal t2 = load<SimdReal>(sol + b);
1018         store(mlambda + b, t1 * t2);
1019     }
1020 #else
1021     for (b = b0; b < b1; b++)
1022     {
1023         mlambda[b] = blc[b]*sol[b];
1024     }
1025 #endif // GMX_SIMD_HAVE_REAL
1026
1027     /* Update the coordinates */
1028     lincs_update_atoms(lincsd, th, 1.0, mlambda, r, invmass, xp);
1029
1030     /*
1031      ********  Correction for centripetal effects  ********
1032      */
1033
1034     real wfac;
1035
1036     wfac = std::cos(DEG2RAD*wangle);
1037     wfac = wfac*wfac;
1038
1039     for (iter = 0; iter < lincsd->nIter; iter++)
1040     {
1041         if ((lincsd->bCommIter && DOMAINDECOMP(cr) && cr->dd->constraints))
1042         {
1043 #pragma omp barrier
1044 #pragma omp master
1045             {
1046                 /* Communicate the corrected non-local coordinates */
1047                 if (DOMAINDECOMP(cr))
1048                 {
1049                     dd_move_x_constraints(cr->dd, box, xp, nullptr, FALSE);
1050                 }
1051             }
1052 #pragma omp barrier
1053         }
1054         else if (lincsd->bTaskDep)
1055         {
1056 #pragma omp barrier
1057         }
1058
1059 #if GMX_SIMD_HAVE_REAL
1060         calc_dist_iter_simd(b0, b1, bla, xp, bllen, blc, pbc_simd, wfac,
1061                             rhs1, sol, bWarn);
1062 #else
1063         calc_dist_iter(b0, b1, bla, xp, bllen, blc, pbc, wfac,
1064                        rhs1, sol, bWarn);
1065         /* 20*ncons flops */
1066 #endif  // GMX_SIMD_HAVE_REAL
1067
1068         lincs_matrix_expand(lincsd, &lincsd->task[th], blcc, rhs1, rhs2, sol);
1069         /* nrec*(ncons+2*nrtot) flops */
1070
1071 #if GMX_SIMD_HAVE_REAL
1072         for (b = b0; b < b1; b += GMX_SIMD_REAL_WIDTH)
1073         {
1074             SimdReal t1  = load<SimdReal>(blc + b);
1075             SimdReal t2  = load<SimdReal>(sol + b);
1076             SimdReal mvb = t1 * t2;
1077             store(blc_sol + b, mvb);
1078             store(mlambda + b, load<SimdReal>(mlambda + b) + mvb);
1079         }
1080 #else
1081         for (b = b0; b < b1; b++)
1082         {
1083             real mvb;
1084
1085             mvb         = blc[b]*sol[b];
1086             blc_sol[b]  = mvb;
1087             mlambda[b] += mvb;
1088         }
1089 #endif  // GMX_SIMD_HAVE_REAL
1090
1091         /* Update the coordinates */
1092         lincs_update_atoms(lincsd, th, 1.0, blc_sol, r, invmass, xp);
1093     }
1094     /* nit*ncons*(37+9*nrec) flops */
1095
1096     if (v != nullptr)
1097     {
1098         /* Update the velocities */
1099         lincs_update_atoms(lincsd, th, invdt, mlambda, r, invmass, v);
1100         /* 16 ncons flops */
1101     }
1102
1103     if (nlocat != nullptr && (bCalcDHDL || bCalcVir))
1104     {
1105         if (lincsd->bTaskDep)
1106         {
1107             /* In lincs_update_atoms threads might cross-read mlambda */
1108 #pragma omp barrier
1109         }
1110
1111         /* Only account for local atoms */
1112         for (b = b0; b < b1; b++)
1113         {
1114             mlambda[b] *= 0.5*nlocat[b];
1115         }
1116     }
1117
1118     if (bCalcDHDL)
1119     {
1120         real dhdl;
1121
1122         dhdl = 0;
1123         for (b = b0; b < b1; b++)
1124         {
1125             /* Note that this this is dhdl*dt^2, the dt^2 factor is corrected
1126              * later after the contributions are reduced over the threads.
1127              */
1128             dhdl -= lincsd->mlambda[b]*lincsd->ddist[b];
1129         }
1130         lincsd->task[th].dhdlambda = dhdl;
1131     }
1132
1133     if (bCalcVir)
1134     {
1135         /* Constraint virial */
1136         for (b = b0; b < b1; b++)
1137         {
1138             real tmp0, tmp1;
1139
1140             tmp0 = -bllen[b]*mlambda[b];
1141             for (i = 0; i < DIM; i++)
1142             {
1143                 tmp1 = tmp0*r[b][i];
1144                 for (j = 0; j < DIM; j++)
1145                 {
1146                     vir_r_m_dr[i][j] -= tmp1*r[b][j];
1147                 }
1148             }
1149         } /* 22 ncons flops */
1150     }
1151
1152     /* Total:
1153      * 26*ncons + 6*nrtot + nrec*(ncons+2*nrtot)
1154      * + nit * (20*ncons + nrec*(ncons+2*nrtot) + 17 ncons)
1155      *
1156      * (26+nrec)*ncons + (6+2*nrec)*nrtot
1157      * + nit * ((37+nrec)*ncons + 2*nrec*nrtot)
1158      * if nit=1
1159      * (63+nrec)*ncons + (6+4*nrec)*nrtot
1160      */
1161 }
1162
1163 /* Sets the elements in the LINCS matrix for task li_task */
1164 static void set_lincs_matrix_task(gmx_lincsdata        *li,
1165                                   lincs_task_t         *li_task,
1166                                   const real           *invmass,
1167                                   int                  *ncc_triangle,
1168                                   int                  *nCrossTaskTriangles)
1169 {
1170     int        i;
1171
1172     /* Construct the coupling coefficient matrix blmf */
1173     li_task->ntriangle   = 0;
1174     *ncc_triangle        = 0;
1175     *nCrossTaskTriangles = 0;
1176     for (i = li_task->b0; i < li_task->b1; i++)
1177     {
1178         int a1, a2, n;
1179
1180         a1 = li->bla[2*i];
1181         a2 = li->bla[2*i+1];
1182         for (n = li->blnr[i]; (n < li->blnr[i+1]); n++)
1183         {
1184             int k, sign, center, end;
1185
1186             k = li->blbnb[n];
1187
1188             /* If we are using multiple, independent tasks for LINCS,
1189              * the calls to check_assign_connected should have
1190              * put all connected constraints in our task.
1191              */
1192             assert(li->bTaskDep || (k >= li_task->b0 && k < li_task->b1));
1193
1194             if (a1 == li->bla[2*k] || a2 == li->bla[2*k+1])
1195             {
1196                 sign = -1;
1197             }
1198             else
1199             {
1200                 sign = 1;
1201             }
1202             if (a1 == li->bla[2*k] || a1 == li->bla[2*k+1])
1203             {
1204                 center = a1;
1205                 end    = a2;
1206             }
1207             else
1208             {
1209                 center = a2;
1210                 end    = a1;
1211             }
1212             li->blmf[n]  = sign*invmass[center]*li->blc[i]*li->blc[k];
1213             li->blmf1[n] = sign*0.5;
1214             if (li->ncg_triangle > 0)
1215             {
1216                 int nk, kk;
1217
1218                 /* Look for constraint triangles */
1219                 for (nk = li->blnr[k]; (nk < li->blnr[k+1]); nk++)
1220                 {
1221                     kk = li->blbnb[nk];
1222                     if (kk != i && kk != k &&
1223                         (li->bla[2*kk] == end || li->bla[2*kk+1] == end))
1224                     {
1225                         /* Check if the constraints in this triangle actually
1226                          * belong to a different task. We still assign them
1227                          * here, since it's convenient for the triangle
1228                          * iterations, but we then need an extra barrier.
1229                          */
1230                         if (k  < li_task->b0 || k  >= li_task->b1 ||
1231                             kk < li_task->b0 || kk >= li_task->b1)
1232                         {
1233                             (*nCrossTaskTriangles)++;
1234                         }
1235
1236                         if (li_task->ntriangle == 0 ||
1237                             li_task->triangle[li_task->ntriangle - 1] < i)
1238                         {
1239                             /* Add this constraint to the triangle list */
1240                             li_task->triangle[li_task->ntriangle] = i;
1241                             li_task->tri_bits[li_task->ntriangle] = 0;
1242                             li_task->ntriangle++;
1243                             if (li->blnr[i+1] - li->blnr[i] > static_cast<int>(sizeof(li_task->tri_bits[0])*8 - 1))
1244                             {
1245                                 gmx_fatal(FARGS, "A constraint is connected to %d constraints, this is more than the %d allowed for constraints participating in triangles",
1246                                           li->blnr[i+1] - li->blnr[i],
1247                                           sizeof(li_task->tri_bits[0])*8-1);
1248                             }
1249                         }
1250                         li_task->tri_bits[li_task->ntriangle-1] |= (1 << (n - li->blnr[i]));
1251                         (*ncc_triangle)++;
1252                     }
1253                 }
1254             }
1255         }
1256     }
1257 }
1258
1259 /* Sets the elements in the LINCS matrix */
1260 static void set_lincs_matrix(gmx_lincsdata *li, real *invmass, real lambda)
1261 {
1262     int        i;
1263     const real invsqrt2 = 0.7071067811865475244;
1264
1265     for (i = 0; (i < li->nc); i++)
1266     {
1267         int a1, a2;
1268
1269         a1          = li->bla[2*i];
1270         a2          = li->bla[2*i+1];
1271         li->blc[i]  = gmx::invsqrt(invmass[a1] + invmass[a2]);
1272         li->blc1[i] = invsqrt2;
1273     }
1274
1275     /* Construct the coupling coefficient matrix blmf */
1276     int th, ntriangle = 0, ncc_triangle = 0, nCrossTaskTriangles = 0;
1277 #pragma omp parallel for reduction(+: ntriangle, ncc_triangle, nCrossTaskTriangles) num_threads(li->ntask) schedule(static)
1278     for (th = 0; th < li->ntask; th++)
1279     {
1280         try
1281         {
1282             set_lincs_matrix_task(li, &li->task[th], invmass,
1283                                   &ncc_triangle, &nCrossTaskTriangles);
1284             ntriangle = li->task[th].ntriangle;
1285         }
1286         GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
1287     }
1288     li->ntriangle    = ntriangle;
1289     li->ncc_triangle = ncc_triangle;
1290     li->bTaskDepTri  = (nCrossTaskTriangles > 0);
1291
1292     if (debug)
1293     {
1294         fprintf(debug, "The %d constraints participate in %d triangles\n",
1295                 li->nc, li->ntriangle);
1296         fprintf(debug, "There are %d constraint couplings, of which %d in triangles\n",
1297                 li->ncc, li->ncc_triangle);
1298         if (li->ntriangle > 0 && li->ntask > 1)
1299         {
1300             fprintf(debug, "%d constraint triangles contain constraints assigned to different tasks\n",
1301                     nCrossTaskTriangles);
1302         }
1303     }
1304
1305     /* Set matlam,
1306      * so we know with which lambda value the masses have been set.
1307      */
1308     li->matlam = lambda;
1309 }
1310
1311 static int count_triangle_constraints(const t_ilist  *ilist,
1312                                       const t_blocka *at2con)
1313 {
1314     int      ncon1, ncon_tot;
1315     int      c0, a00, a01, n1, c1, a10, a11, ac1, n2, c2, a20, a21;
1316     int      ncon_triangle;
1317     gmx_bool bTriangle;
1318     t_iatom *ia1, *ia2, *iap;
1319
1320     ncon1    = ilist[F_CONSTR].nr/3;
1321     ncon_tot = ncon1 + ilist[F_CONSTRNC].nr/3;
1322
1323     ia1 = ilist[F_CONSTR].iatoms;
1324     ia2 = ilist[F_CONSTRNC].iatoms;
1325
1326     ncon_triangle = 0;
1327     for (c0 = 0; c0 < ncon_tot; c0++)
1328     {
1329         bTriangle = FALSE;
1330         iap       = constr_iatomptr(ncon1, ia1, ia2, c0);
1331         a00       = iap[1];
1332         a01       = iap[2];
1333         for (n1 = at2con->index[a01]; n1 < at2con->index[a01+1]; n1++)
1334         {
1335             c1 = at2con->a[n1];
1336             if (c1 != c0)
1337             {
1338                 iap = constr_iatomptr(ncon1, ia1, ia2, c1);
1339                 a10 = iap[1];
1340                 a11 = iap[2];
1341                 if (a10 == a01)
1342                 {
1343                     ac1 = a11;
1344                 }
1345                 else
1346                 {
1347                     ac1 = a10;
1348                 }
1349                 for (n2 = at2con->index[ac1]; n2 < at2con->index[ac1+1]; n2++)
1350                 {
1351                     c2 = at2con->a[n2];
1352                     if (c2 != c0 && c2 != c1)
1353                     {
1354                         iap = constr_iatomptr(ncon1, ia1, ia2, c2);
1355                         a20 = iap[1];
1356                         a21 = iap[2];
1357                         if (a20 == a00 || a21 == a00)
1358                         {
1359                             bTriangle = TRUE;
1360                         }
1361                     }
1362                 }
1363             }
1364         }
1365         if (bTriangle)
1366         {
1367             ncon_triangle++;
1368         }
1369     }
1370
1371     return ncon_triangle;
1372 }
1373
1374 static gmx_bool more_than_two_sequential_constraints(const t_ilist  *ilist,
1375                                                      const t_blocka *at2con)
1376 {
1377     t_iatom  *ia1, *ia2, *iap;
1378     int       ncon1, ncon_tot, c;
1379     int       a1, a2;
1380     gmx_bool  bMoreThanTwoSequentialConstraints;
1381
1382     ncon1    = ilist[F_CONSTR].nr/3;
1383     ncon_tot = ncon1 + ilist[F_CONSTRNC].nr/3;
1384
1385     ia1 = ilist[F_CONSTR].iatoms;
1386     ia2 = ilist[F_CONSTRNC].iatoms;
1387
1388     bMoreThanTwoSequentialConstraints = FALSE;
1389     for (c = 0; c < ncon_tot && !bMoreThanTwoSequentialConstraints; c++)
1390     {
1391         iap = constr_iatomptr(ncon1, ia1, ia2, c);
1392         a1  = iap[1];
1393         a2  = iap[2];
1394         /* Check if this constraint has constraints connected at both atoms */
1395         if (at2con->index[a1+1] - at2con->index[a1] > 1 &&
1396             at2con->index[a2+1] - at2con->index[a2] > 1)
1397         {
1398             bMoreThanTwoSequentialConstraints = TRUE;
1399         }
1400     }
1401
1402     return bMoreThanTwoSequentialConstraints;
1403 }
1404
1405 static int int_comp(const void *a, const void *b)
1406 {
1407     return (*(int *)a) - (*(int *)b);
1408 }
1409
1410 gmx_lincsdata_t init_lincs(FILE *fplog, const gmx_mtop_t *mtop,
1411                            int nflexcon_global, const t_blocka *at2con,
1412                            gmx_bool bPLINCS, int nIter, int nProjOrder)
1413 {
1414     gmx_lincsdata        *li;
1415     gmx_bool              bMoreThanTwoSeq;
1416
1417     if (fplog)
1418     {
1419         fprintf(fplog, "\nInitializing%s LINear Constraint Solver\n",
1420                 bPLINCS ? " Parallel" : "");
1421     }
1422
1423     snew(li, 1);
1424
1425     li->ncg      =
1426         gmx_mtop_ftype_count(mtop, F_CONSTR) +
1427         gmx_mtop_ftype_count(mtop, F_CONSTRNC);
1428     li->ncg_flex = nflexcon_global;
1429
1430     li->nIter  = nIter;
1431     li->nOrder = nProjOrder;
1432
1433     li->max_connect = 0;
1434     for (size_t mt = 0; mt < mtop->moltype.size(); mt++)
1435     {
1436         for (int a = 0; a < mtop->moltype[mt].atoms.nr; a++)
1437         {
1438             li->max_connect = std::max(li->max_connect,
1439                                        at2con[mt].index[a + 1] - at2con[mt].index[a]);
1440         }
1441     }
1442
1443     li->ncg_triangle = 0;
1444     bMoreThanTwoSeq  = FALSE;
1445     for (const gmx_molblock_t &molb : mtop->molblock)
1446     {
1447         const gmx_moltype_t &molt = mtop->moltype[molb.type];
1448
1449         li->ncg_triangle +=
1450             molb.nmol*
1451             count_triangle_constraints(molt.ilist, &at2con[molb.type]);
1452
1453         if (!bMoreThanTwoSeq &&
1454             more_than_two_sequential_constraints(molt.ilist, &at2con[molb.type]))
1455         {
1456             bMoreThanTwoSeq = TRUE;
1457         }
1458     }
1459
1460     /* Check if we need to communicate not only before LINCS,
1461      * but also before each iteration.
1462      * The check for only two sequential constraints is only
1463      * useful for the common case of H-bond only constraints.
1464      * With more effort we could also make it useful for small
1465      * molecules with nr. sequential constraints <= nOrder-1.
1466      */
1467     li->bCommIter = (bPLINCS && (li->nOrder < 1 || bMoreThanTwoSeq));
1468
1469     if (debug && bPLINCS)
1470     {
1471         fprintf(debug, "PLINCS communication before each iteration: %d\n",
1472                 li->bCommIter);
1473     }
1474
1475     /* LINCS can run on any number of threads.
1476      * Currently the number is fixed for the whole simulation,
1477      * but it could be set in set_lincs().
1478      * The current constraint to task assignment code can create independent
1479      * tasks only when not more than two constraints are connected sequentially.
1480      */
1481     li->ntask    = gmx_omp_nthreads_get(emntLINCS);
1482     li->bTaskDep = (li->ntask > 1 && bMoreThanTwoSeq);
1483     if (debug)
1484     {
1485         fprintf(debug, "LINCS: using %d threads, tasks are %sdependent\n",
1486                 li->ntask, li->bTaskDep ? "" : "in");
1487     }
1488     if (li->ntask == 1)
1489     {
1490         snew(li->task, 1);
1491     }
1492     else
1493     {
1494         /* Allocate an extra elements for "task-overlap" constraints */
1495         snew(li->task, li->ntask + 1);
1496     }
1497
1498     if (bPLINCS || li->ncg_triangle > 0)
1499     {
1500         please_cite(fplog, "Hess2008a");
1501     }
1502     else
1503     {
1504         please_cite(fplog, "Hess97a");
1505     }
1506
1507     if (fplog)
1508     {
1509         fprintf(fplog, "The number of constraints is %d\n", li->ncg);
1510         if (bPLINCS)
1511         {
1512             fprintf(fplog, "There are inter charge-group constraints,\n"
1513                     "will communicate selected coordinates each lincs iteration\n");
1514         }
1515         if (li->ncg_triangle > 0)
1516         {
1517             fprintf(fplog,
1518                     "%d constraints are involved in constraint triangles,\n"
1519                     "will apply an additional matrix expansion of order %d for couplings\n"
1520                     "between constraints inside triangles\n",
1521                     li->ncg_triangle, li->nOrder);
1522         }
1523     }
1524
1525     return li;
1526 }
1527
1528 /* Sets up the work division over the threads */
1529 static void lincs_thread_setup(gmx_lincsdata *li, int natoms)
1530 {
1531     lincs_task_t   *li_m;
1532     int             th;
1533     gmx_bitmask_t  *atf;
1534     int             a;
1535
1536     if (natoms > li->atf_nalloc)
1537     {
1538         li->atf_nalloc = over_alloc_large(natoms);
1539         srenew(li->atf, li->atf_nalloc);
1540     }
1541
1542     atf = li->atf;
1543     /* Clear the atom flags */
1544     for (a = 0; a < natoms; a++)
1545     {
1546         bitmask_clear(&atf[a]);
1547     }
1548
1549     if (li->ntask > BITMASK_SIZE)
1550     {
1551         gmx_fatal(FARGS, "More than %d threads is not supported for LINCS.", BITMASK_SIZE);
1552     }
1553
1554     for (th = 0; th < li->ntask; th++)
1555     {
1556         lincs_task_t *li_task;
1557         int           b;
1558
1559         li_task = &li->task[th];
1560
1561         /* For each atom set a flag for constraints from each */
1562         for (b = li_task->b0; b < li_task->b1; b++)
1563         {
1564             bitmask_set_bit(&atf[li->bla[b*2    ]], th);
1565             bitmask_set_bit(&atf[li->bla[b*2 + 1]], th);
1566         }
1567     }
1568
1569 #pragma omp parallel for num_threads(li->ntask) schedule(static)
1570     for (th = 0; th < li->ntask; th++)
1571     {
1572         try
1573         {
1574             lincs_task_t  *li_task;
1575             gmx_bitmask_t  mask;
1576             int            b;
1577
1578             li_task = &li->task[th];
1579
1580             if (li_task->b1 - li_task->b0 > li_task->ind_nalloc)
1581             {
1582                 li_task->ind_nalloc = over_alloc_large(li_task->b1-li_task->b0);
1583                 srenew(li_task->ind, li_task->ind_nalloc);
1584                 srenew(li_task->ind_r, li_task->ind_nalloc);
1585             }
1586
1587             bitmask_init_low_bits(&mask, th);
1588
1589             li_task->nind   = 0;
1590             li_task->nind_r = 0;
1591             for (b = li_task->b0; b < li_task->b1; b++)
1592             {
1593                 /* We let the constraint with the lowest thread index
1594                  * operate on atoms with constraints from multiple threads.
1595                  */
1596                 if (bitmask_is_disjoint(atf[li->bla[b*2]], mask) &&
1597                     bitmask_is_disjoint(atf[li->bla[b*2+1]], mask))
1598                 {
1599                     /* Add the constraint to the local atom update index */
1600                     li_task->ind[li_task->nind++] = b;
1601                 }
1602                 else
1603                 {
1604                     /* Add the constraint to the rest block */
1605                     li_task->ind_r[li_task->nind_r++] = b;
1606                 }
1607             }
1608         }
1609         GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
1610     }
1611
1612     /* We need to copy all constraints which have not be assigned
1613      * to a thread to a separate list which will be handled by one thread.
1614      */
1615     li_m = &li->task[li->ntask];
1616
1617     li_m->nind = 0;
1618     for (th = 0; th < li->ntask; th++)
1619     {
1620         lincs_task_t *li_task;
1621         int           b;
1622
1623         li_task   = &li->task[th];
1624
1625         if (li_m->nind + li_task->nind_r > li_m->ind_nalloc)
1626         {
1627             li_m->ind_nalloc = over_alloc_large(li_m->nind+li_task->nind_r);
1628             srenew(li_m->ind, li_m->ind_nalloc);
1629         }
1630
1631         for (b = 0; b < li_task->nind_r; b++)
1632         {
1633             li_m->ind[li_m->nind++] = li_task->ind_r[b];
1634         }
1635
1636         if (debug)
1637         {
1638             fprintf(debug, "LINCS thread %d: %d constraints\n",
1639                     th, li_task->nind);
1640         }
1641     }
1642
1643     if (debug)
1644     {
1645         fprintf(debug, "LINCS thread r: %d constraints\n",
1646                 li_m->nind);
1647     }
1648 }
1649
1650 /* There is no realloc with alignment, so here we make one for reals.
1651  * Note that this function does not preserve the contents of the memory.
1652  */
1653 static void resize_real_aligned(real **ptr, int nelem)
1654 {
1655     sfree_aligned(*ptr);
1656     snew_aligned(*ptr, nelem, align_bytes);
1657 }
1658
1659 static void assign_constraint(gmx_lincsdata *li,
1660                               int constraint_index,
1661                               int a1, int a2,
1662                               real lenA, real lenB,
1663                               const t_blocka *at2con)
1664 {
1665     int con;
1666
1667     con = li->nc;
1668
1669     /* Make an mapping of local topology constraint index to LINCS index */
1670     li->con_index[constraint_index] = con;
1671
1672     li->bllen0[con]  = lenA;
1673     li->ddist[con]   = lenB - lenA;
1674     /* Set the length to the topology A length */
1675     li->bllen[con]   = lenA;
1676     li->bla[2*con]   = a1;
1677     li->bla[2*con+1] = a2;
1678
1679     /* Make space in the constraint connection matrix for constraints
1680      * connected to both end of the current constraint.
1681      */
1682     li->ncc +=
1683         at2con->index[a1 + 1] - at2con->index[a1] - 1 +
1684         at2con->index[a2 + 1] - at2con->index[a2] - 1;
1685
1686     li->blnr[con + 1] = li->ncc;
1687
1688     /* Increase the constraint count */
1689     li->nc++;
1690 }
1691
1692 /* Check if constraint with topology index constraint_index is connected
1693  * to other constraints, and if so add those connected constraints to our task.
1694  */
1695 static void check_assign_connected(gmx_lincsdata *li,
1696                                    const t_iatom *iatom,
1697                                    const t_idef *idef,
1698                                    int bDynamics,
1699                                    int a1, int a2,
1700                                    const t_blocka *at2con)
1701 {
1702     /* Currently this function only supports constraint groups
1703      * in which all constraints share at least one atom
1704      * (e.g. H-bond constraints).
1705      * Check both ends of the current constraint for
1706      * connected constraints. We need to assign those
1707      * to the same task.
1708      */
1709     int end;
1710
1711     for (end = 0; end < 2; end++)
1712     {
1713         int a, k;
1714
1715         a = (end == 0 ? a1 : a2);
1716
1717         for (k = at2con->index[a]; k < at2con->index[a + 1]; k++)
1718         {
1719             int cc;
1720
1721             cc = at2con->a[k];
1722             /* Check if constraint cc has not yet been assigned */
1723             if (li->con_index[cc] == -1)
1724             {
1725                 int  type;
1726                 real lenA, lenB;
1727
1728                 type = iatom[cc*3];
1729                 lenA = idef->iparams[type].constr.dA;
1730                 lenB = idef->iparams[type].constr.dB;
1731
1732                 if (bDynamics || lenA != 0 || lenB != 0)
1733                 {
1734                     assign_constraint(li, cc, iatom[3*cc + 1], iatom[3*cc + 2], lenA, lenB, at2con);
1735                 }
1736             }
1737         }
1738     }
1739 }
1740
1741 /* Check if constraint with topology index constraint_index is involved
1742  * in a constraint triangle, and if so add the other two constraints
1743  * in the triangle to our task.
1744  */
1745 static void check_assign_triangle(gmx_lincsdata *li,
1746                                   const t_iatom *iatom,
1747                                   const t_idef *idef,
1748                                   int bDynamics,
1749                                   int constraint_index,
1750                                   int a1, int a2,
1751                                   const t_blocka *at2con)
1752 {
1753     int nca, cc[32], ca[32], k;
1754     int c_triangle[2] = { -1, -1 };
1755
1756     nca = 0;
1757     for (k = at2con->index[a1]; k < at2con->index[a1 + 1]; k++)
1758     {
1759         int c;
1760
1761         c = at2con->a[k];
1762         if (c != constraint_index)
1763         {
1764             int aa1, aa2;
1765
1766             aa1 = iatom[c*3 + 1];
1767             aa2 = iatom[c*3 + 2];
1768             if (aa1 != a1)
1769             {
1770                 cc[nca] = c;
1771                 ca[nca] = aa1;
1772                 nca++;
1773             }
1774             if (aa2 != a1)
1775             {
1776                 cc[nca] = c;
1777                 ca[nca] = aa2;
1778                 nca++;
1779             }
1780         }
1781     }
1782
1783     for (k = at2con->index[a2]; k < at2con->index[a2 + 1]; k++)
1784     {
1785         int c;
1786
1787         c = at2con->a[k];
1788         if (c != constraint_index)
1789         {
1790             int aa1, aa2, i;
1791
1792             aa1 = iatom[c*3 + 1];
1793             aa2 = iatom[c*3 + 2];
1794             if (aa1 != a2)
1795             {
1796                 for (i = 0; i < nca; i++)
1797                 {
1798                     if (aa1 == ca[i])
1799                     {
1800                         c_triangle[0] = cc[i];
1801                         c_triangle[1] = c;
1802                     }
1803                 }
1804             }
1805             if (aa2 != a2)
1806             {
1807                 for (i = 0; i < nca; i++)
1808                 {
1809                     if (aa2 == ca[i])
1810                     {
1811                         c_triangle[0] = cc[i];
1812                         c_triangle[1] = c;
1813                     }
1814                 }
1815             }
1816         }
1817     }
1818
1819     if (c_triangle[0] >= 0)
1820     {
1821         int end;
1822
1823         for (end = 0; end < 2; end++)
1824         {
1825             /* Check if constraint c_triangle[end] has not yet been assigned */
1826             if (li->con_index[c_triangle[end]] == -1)
1827             {
1828                 int  i, type;
1829                 real lenA, lenB;
1830
1831                 i    = c_triangle[end]*3;
1832                 type = iatom[i];
1833                 lenA = idef->iparams[type].constr.dA;
1834                 lenB = idef->iparams[type].constr.dB;
1835
1836                 if (bDynamics || lenA != 0 || lenB != 0)
1837                 {
1838                     assign_constraint(li, c_triangle[end], iatom[i + 1], iatom[i + 2], lenA, lenB, at2con);
1839                 }
1840             }
1841         }
1842     }
1843 }
1844
1845 static void set_matrix_indices(gmx_lincsdata        *li,
1846                                const lincs_task_t   *li_task,
1847                                const t_blocka       *at2con,
1848                                gmx_bool              bSortMatrix)
1849 {
1850     int b;
1851
1852     for (b = li_task->b0; b < li_task->b1; b++)
1853     {
1854         int a1, a2, i, k;
1855
1856         a1 = li->bla[b*2];
1857         a2 = li->bla[b*2 + 1];
1858
1859         i = li->blnr[b];
1860         for (k = at2con->index[a1]; k < at2con->index[a1 + 1]; k++)
1861         {
1862             int concon;
1863
1864             concon = li->con_index[at2con->a[k]];
1865             if (concon != b)
1866             {
1867                 li->blbnb[i++] = concon;
1868             }
1869         }
1870         for (k = at2con->index[a2]; k < at2con->index[a2 + 1]; k++)
1871         {
1872             int concon;
1873
1874             concon = li->con_index[at2con->a[k]];
1875             if (concon != b)
1876             {
1877                 li->blbnb[i++] = concon;
1878             }
1879         }
1880
1881         if (bSortMatrix)
1882         {
1883             /* Order the blbnb matrix to optimize memory access */
1884             qsort(&(li->blbnb[li->blnr[b]]), li->blnr[b + 1] - li->blnr[b],
1885                   sizeof(li->blbnb[0]), int_comp);
1886         }
1887     }
1888 }
1889
1890 void set_lincs(const t_idef         *idef,
1891                const t_mdatoms      *md,
1892                gmx_bool              bDynamics,
1893                const t_commrec      *cr,
1894                gmx_lincsdata        *li)
1895 {
1896     int          natoms, nflexcon;
1897     t_blocka     at2con;
1898     t_iatom     *iatom;
1899     int          i, ncc_alloc_old, ncon_tot;
1900
1901     li->nc_real = 0;
1902     li->nc      = 0;
1903     li->ncc     = 0;
1904     /* Zero the thread index ranges.
1905      * Otherwise without local constraints we could return with old ranges.
1906      */
1907     for (i = 0; i < li->ntask; i++)
1908     {
1909         li->task[i].b0   = 0;
1910         li->task[i].b1   = 0;
1911         li->task[i].nind = 0;
1912     }
1913     if (li->ntask > 1)
1914     {
1915         li->task[li->ntask].nind = 0;
1916     }
1917
1918     /* This is the local topology, so there are only F_CONSTR constraints */
1919     if (idef->il[F_CONSTR].nr == 0)
1920     {
1921         /* There are no constraints,
1922          * we do not need to fill any data structures.
1923          */
1924         return;
1925     }
1926
1927     if (debug)
1928     {
1929         fprintf(debug, "Building the LINCS connectivity\n");
1930     }
1931
1932     if (DOMAINDECOMP(cr))
1933     {
1934         if (cr->dd->constraints)
1935         {
1936             int start;
1937
1938             dd_get_constraint_range(cr->dd, &start, &natoms);
1939         }
1940         else
1941         {
1942             natoms = cr->dd->nat_home;
1943         }
1944     }
1945     else
1946     {
1947         natoms = md->homenr;
1948     }
1949     at2con = make_at2con(0, natoms, idef->il, idef->iparams, bDynamics,
1950                          &nflexcon);
1951
1952     ncon_tot = idef->il[F_CONSTR].nr/3;
1953
1954     /* Ensure we have enough padding for aligned loads for each thread */
1955     if (ncon_tot + li->ntask*simd_width > li->nc_alloc || li->nc_alloc == 0)
1956     {
1957         li->nc_alloc = over_alloc_dd(ncon_tot + li->ntask*simd_width);
1958         srenew(li->con_index, li->nc_alloc);
1959         resize_real_aligned(&li->bllen0, li->nc_alloc);
1960         resize_real_aligned(&li->ddist, li->nc_alloc);
1961         srenew(li->bla, 2*li->nc_alloc);
1962         resize_real_aligned(&li->blc, li->nc_alloc);
1963         resize_real_aligned(&li->blc1, li->nc_alloc);
1964         srenew(li->blnr, li->nc_alloc + 1);
1965         resize_real_aligned(&li->bllen, li->nc_alloc);
1966         srenew(li->tmpv, li->nc_alloc);
1967         if (DOMAINDECOMP(cr))
1968         {
1969             srenew(li->nlocat, li->nc_alloc);
1970         }
1971         resize_real_aligned(&li->tmp1, li->nc_alloc);
1972         resize_real_aligned(&li->tmp2, li->nc_alloc);
1973         resize_real_aligned(&li->tmp3, li->nc_alloc);
1974         resize_real_aligned(&li->tmp4, li->nc_alloc);
1975         resize_real_aligned(&li->mlambda, li->nc_alloc);
1976     }
1977
1978     iatom = idef->il[F_CONSTR].iatoms;
1979
1980     ncc_alloc_old = li->ncc_alloc;
1981     li->blnr[0]   = li->ncc;
1982
1983     /* Assign the constraints for li->ntask LINCS tasks.
1984      * We target a uniform distribution of constraints over the tasks.
1985      * Note that when flexible constraints are present, but are removed here
1986      * (e.g. because we are doing EM) we get imbalance, but since that doesn't
1987      * happen during normal MD, that's ok.
1988      */
1989     int ncon_assign, ncon_target, con, th;
1990
1991     /* Determine the number of constraints we need to assign here */
1992     ncon_assign      = ncon_tot;
1993     if (!bDynamics)
1994     {
1995         /* With energy minimization, flexible constraints are ignored
1996          * (and thus minimized, as they should be).
1997          */
1998         ncon_assign -= nflexcon;
1999     }
2000
2001     /* Set the target constraint count per task to exactly uniform,
2002      * this might be overridden below.
2003      */
2004     ncon_target = (ncon_assign + li->ntask - 1)/li->ntask;
2005
2006     /* Mark all constraints as unassigned by setting their index to -1 */
2007     for (con = 0; con < ncon_tot; con++)
2008     {
2009         li->con_index[con] = -1;
2010     }
2011
2012     con = 0;
2013     for (th = 0; th < li->ntask; th++)
2014     {
2015         lincs_task_t *li_task;
2016
2017         li_task = &li->task[th];
2018
2019 #if GMX_SIMD_HAVE_REAL
2020         /* With indepedent tasks we likely have H-bond constraints or constraint
2021          * pairs. The connected constraints will be pulled into the task, so the
2022          * constraints per task will often exceed ncon_target.
2023          * Triangle constraints can also increase the count, but there are
2024          * relatively few of those, so we usually expect to get ncon_target.
2025          */
2026         if (li->bTaskDep)
2027         {
2028             /* We round ncon_target to a multiple of GMX_SIMD_WIDTH,
2029              * since otherwise a lot of operations can be wasted.
2030              * There are several ways to round here, we choose the one
2031              * that alternates block sizes, which helps with Intel HT.
2032              */
2033             ncon_target = ((ncon_assign*(th + 1))/li->ntask - li->nc_real + GMX_SIMD_REAL_WIDTH - 1) & ~(GMX_SIMD_REAL_WIDTH - 1);
2034         }
2035 #endif  // GMX_SIMD==2 && GMX_SIMD_HAVE_REAL
2036
2037         /* Continue filling the arrays where we left off with the previous task,
2038          * including padding for SIMD.
2039          */
2040         li_task->b0 = li->nc;
2041
2042         while (con < ncon_tot && li->nc - li_task->b0 < ncon_target)
2043         {
2044             if (li->con_index[con] == -1)
2045             {
2046                 int  type, a1, a2;
2047                 real lenA, lenB;
2048
2049                 type   = iatom[3*con];
2050                 a1     = iatom[3*con + 1];
2051                 a2     = iatom[3*con + 2];
2052                 lenA   = idef->iparams[type].constr.dA;
2053                 lenB   = idef->iparams[type].constr.dB;
2054                 /* Skip the flexible constraints when not doing dynamics */
2055                 if (bDynamics || lenA != 0 || lenB != 0)
2056                 {
2057                     assign_constraint(li, con, a1, a2, lenA, lenB, &at2con);
2058
2059                     if (li->ntask > 1 && !li->bTaskDep)
2060                     {
2061                         /* We can generate independent tasks. Check if we
2062                          * need to assign connected constraints to our task.
2063                          */
2064                         check_assign_connected(li, iatom, idef, bDynamics,
2065                                                a1, a2, &at2con);
2066                     }
2067                     if (li->ntask > 1 && li->ncg_triangle > 0)
2068                     {
2069                         /* Ensure constraints in one triangle are assigned
2070                          * to the same task.
2071                          */
2072                         check_assign_triangle(li, iatom, idef, bDynamics,
2073                                               con, a1, a2, &at2con);
2074                     }
2075                 }
2076             }
2077
2078             con++;
2079         }
2080
2081         li_task->b1 = li->nc;
2082
2083         if (simd_width > 1)
2084         {
2085             /* Copy the last atom pair indices and lengths for constraints
2086              * up to a multiple of simd_width, such that we can do all
2087              * SIMD operations without having to worry about end effects.
2088              */
2089             int i, last;
2090
2091             li->nc = ((li_task->b1 + simd_width - 1)/simd_width)*simd_width;
2092             last   = li_task->b1 - 1;
2093             for (i = li_task->b1; i < li->nc; i++)
2094             {
2095                 li->bla[i*2    ] = li->bla[last*2    ];
2096                 li->bla[i*2 + 1] = li->bla[last*2 + 1];
2097                 li->bllen0[i]    = li->bllen0[last];
2098                 li->ddist[i]     = li->ddist[last];
2099                 li->bllen[i]     = li->bllen[last];
2100                 li->blnr[i + 1]  = li->blnr[last + 1];
2101             }
2102         }
2103
2104         /* Keep track of how many constraints we assigned */
2105         li->nc_real += li_task->b1 - li_task->b0;
2106
2107         if (debug)
2108         {
2109             fprintf(debug, "LINCS task %d constraints %d - %d\n",
2110                     th, li_task->b0, li_task->b1);
2111         }
2112     }
2113
2114     assert(li->nc_real == ncon_assign);
2115
2116     gmx_bool bSortMatrix;
2117
2118     /* Without DD we order the blbnb matrix to optimize memory access.
2119      * With DD the overhead of sorting is more than the gain during access.
2120      */
2121     bSortMatrix = !DOMAINDECOMP(cr);
2122
2123     if (li->ncc > li->ncc_alloc)
2124     {
2125         li->ncc_alloc = over_alloc_small(li->ncc);
2126         srenew(li->blbnb, li->ncc_alloc);
2127     }
2128
2129 #pragma omp parallel for num_threads(li->ntask) schedule(static)
2130     for (th = 0; th < li->ntask; th++)
2131     {
2132         try
2133         {
2134             lincs_task_t *li_task;
2135
2136             li_task = &li->task[th];
2137
2138             if (li->ncg_triangle > 0 &&
2139                 li_task->b1 - li_task->b0 > li_task->tri_alloc)
2140             {
2141                 /* This is allocating too much, but it is difficult to improve */
2142                 li_task->tri_alloc = over_alloc_dd(li_task->b1 - li_task->b0);
2143                 srenew(li_task->triangle, li_task->tri_alloc);
2144                 srenew(li_task->tri_bits, li_task->tri_alloc);
2145             }
2146
2147             set_matrix_indices(li, li_task, &at2con, bSortMatrix);
2148         }
2149         GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
2150     }
2151
2152     done_blocka(&at2con);
2153
2154     if (cr->dd == nullptr)
2155     {
2156         /* Since the matrix is static, we should free some memory */
2157         li->ncc_alloc = li->ncc;
2158         srenew(li->blbnb, li->ncc_alloc);
2159     }
2160
2161     if (li->ncc_alloc > ncc_alloc_old)
2162     {
2163         srenew(li->blmf, li->ncc_alloc);
2164         srenew(li->blmf1, li->ncc_alloc);
2165         srenew(li->tmpncc, li->ncc_alloc);
2166     }
2167
2168     if (DOMAINDECOMP(cr) && dd_constraints_nlocalatoms(cr->dd) != nullptr)
2169     {
2170         int *nlocat_dd;
2171
2172         nlocat_dd = dd_constraints_nlocalatoms(cr->dd);
2173
2174         /* Convert nlocat from local topology to LINCS constraint indexing */
2175         for (con = 0; con < ncon_tot; con++)
2176         {
2177             li->nlocat[li->con_index[con]] = nlocat_dd[con];
2178         }
2179     }
2180     else
2181     {
2182         li->nlocat = nullptr;
2183     }
2184
2185     if (debug)
2186     {
2187         fprintf(debug, "Number of constraints is %d, padded %d, couplings %d\n",
2188                 li->nc_real, li->nc, li->ncc);
2189     }
2190
2191     if (li->ntask > 1)
2192     {
2193         lincs_thread_setup(li, md->nr);
2194     }
2195
2196     set_lincs_matrix(li, md->invmass, md->lambda);
2197 }
2198
2199 static void lincs_warning(FILE *fplog,
2200                           gmx_domdec_t *dd, rvec *x, rvec *xprime, t_pbc *pbc,
2201                           int ncons, int *bla, real *bllen, real wangle,
2202                           int maxwarn, int *warncount)
2203 {
2204     int  b, i, j;
2205     rvec v0, v1;
2206     real wfac, d0, d1, cosine;
2207     char buf[STRLEN];
2208
2209     wfac = std::cos(DEG2RAD*wangle);
2210
2211     sprintf(buf, "bonds that rotated more than %g degrees:\n"
2212             " atom 1 atom 2  angle  previous, current, constraint length\n",
2213             wangle);
2214     fprintf(stderr, "%s", buf);
2215     if (fplog)
2216     {
2217         fprintf(fplog, "%s", buf);
2218     }
2219
2220     for (b = 0; b < ncons; b++)
2221     {
2222         i = bla[2*b];
2223         j = bla[2*b+1];
2224         if (pbc)
2225         {
2226             pbc_dx_aiuc(pbc, x[i], x[j], v0);
2227             pbc_dx_aiuc(pbc, xprime[i], xprime[j], v1);
2228         }
2229         else
2230         {
2231             rvec_sub(x[i], x[j], v0);
2232             rvec_sub(xprime[i], xprime[j], v1);
2233         }
2234         d0     = norm(v0);
2235         d1     = norm(v1);
2236         cosine = iprod(v0, v1)/(d0*d1);
2237         if (cosine < wfac)
2238         {
2239             sprintf(buf, " %6d %6d  %5.1f  %8.4f %8.4f    %8.4f\n",
2240                     ddglatnr(dd, i), ddglatnr(dd, j),
2241                     RAD2DEG*std::acos(cosine), d0, d1, bllen[b]);
2242             fprintf(stderr, "%s", buf);
2243             if (fplog)
2244             {
2245                 fprintf(fplog, "%s", buf);
2246             }
2247             if (!std::isfinite(d1))
2248             {
2249                 gmx_fatal(FARGS, "Bond length not finite.");
2250             }
2251
2252             (*warncount)++;
2253         }
2254     }
2255     if (*warncount > maxwarn)
2256     {
2257         too_many_constraint_warnings(econtLINCS, *warncount);
2258     }
2259 }
2260
2261 static void cconerr(const gmx_lincsdata *lincsd,
2262                     rvec *x, t_pbc *pbc,
2263                     real *ncons_loc, real *ssd, real *max, int *imax)
2264 {
2265     const int  *bla, *nlocat;
2266     const real *bllen;
2267     real        ma, ssd2;
2268     int         count, im, task;
2269
2270     bla    = lincsd->bla;
2271     bllen  = lincsd->bllen;
2272     nlocat = lincsd->nlocat;
2273
2274     ma    = 0;
2275     ssd2  = 0;
2276     im    = 0;
2277     count = 0;
2278     for (task = 0; task < lincsd->ntask; task++)
2279     {
2280         int b;
2281
2282         for (b = lincsd->task[task].b0; b < lincsd->task[task].b1; b++)
2283         {
2284             real len, d, r2;
2285             rvec dx;
2286
2287             if (pbc)
2288             {
2289                 pbc_dx_aiuc(pbc, x[bla[2*b]], x[bla[2*b+1]], dx);
2290             }
2291             else
2292             {
2293                 rvec_sub(x[bla[2*b]], x[bla[2*b+1]], dx);
2294             }
2295             r2  = norm2(dx);
2296             len = r2*gmx::invsqrt(r2);
2297             d   = std::abs(len/bllen[b]-1);
2298             if (d > ma && (nlocat == nullptr || nlocat[b]))
2299             {
2300                 ma = d;
2301                 im = b;
2302             }
2303             if (nlocat == nullptr)
2304             {
2305                 ssd2 += d*d;
2306                 count++;
2307             }
2308             else
2309             {
2310                 ssd2  += nlocat[b]*d*d;
2311                 count += nlocat[b];
2312             }
2313         }
2314     }
2315
2316     *ncons_loc = (nlocat ? 0.5 : 1)*count;
2317     *ssd       = (nlocat ? 0.5 : 1)*ssd2;
2318     *max       = ma;
2319     *imax      = im;
2320 }
2321
2322 gmx_bool constrain_lincs(FILE *fplog, gmx_bool bLog, gmx_bool bEner,
2323                          const t_inputrec *ir,
2324                          gmx_int64_t step,
2325                          gmx_lincsdata *lincsd, t_mdatoms *md,
2326                          const t_commrec *cr,
2327                          const gmx_multisim_t *ms,
2328                          rvec *x, rvec *xprime, rvec *min_proj,
2329                          matrix box, t_pbc *pbc,
2330                          real lambda, real *dvdlambda,
2331                          real invdt, rvec *v,
2332                          gmx_bool bCalcVir, tensor vir_r_m_dr,
2333                          int econq,
2334                          t_nrnb *nrnb,
2335                          int maxwarn, int *warncount)
2336 {
2337     gmx_bool  bCalcDHDL;
2338     char      buf[STRLEN], buf2[22], buf3[STRLEN];
2339     int       i, p_imax;
2340     real      ncons_loc, p_ssd, p_max = 0;
2341     rvec      dx;
2342     gmx_bool  bOK, bWarn;
2343
2344     bOK = TRUE;
2345
2346     /* This boolean should be set by a flag passed to this routine.
2347      * We can also easily check if any constraint length is changed,
2348      * if not dH/dlambda=0 and we can also set the boolean to FALSE.
2349      */
2350     bCalcDHDL = (ir->efep != efepNO && dvdlambda != nullptr);
2351
2352     if (lincsd->nc == 0 && cr->dd == nullptr)
2353     {
2354         if (bLog || bEner)
2355         {
2356             lincsd->rmsd_data[0] = 0;
2357             lincsd->rmsd_data[1] = 0;
2358         }
2359
2360         return bOK;
2361     }
2362
2363     if (econq == econqCoord)
2364     {
2365         /* We can't use bCalcDHDL here, since NULL can be passed for dvdlambda
2366          * also with efep!=fepNO.
2367          */
2368         if (ir->efep != efepNO)
2369         {
2370             if (md->nMassPerturbed && lincsd->matlam != md->lambda)
2371             {
2372                 set_lincs_matrix(lincsd, md->invmass, md->lambda);
2373             }
2374
2375             for (i = 0; i < lincsd->nc; i++)
2376             {
2377                 lincsd->bllen[i] = lincsd->bllen0[i] + lambda*lincsd->ddist[i];
2378             }
2379         }
2380
2381         if (lincsd->ncg_flex)
2382         {
2383             /* Set the flexible constraint lengths to the old lengths */
2384             if (pbc != nullptr)
2385             {
2386                 for (i = 0; i < lincsd->nc; i++)
2387                 {
2388                     if (lincsd->bllen[i] == 0)
2389                     {
2390                         pbc_dx_aiuc(pbc, x[lincsd->bla[2*i]], x[lincsd->bla[2*i+1]], dx);
2391                         lincsd->bllen[i] = norm(dx);
2392                     }
2393                 }
2394             }
2395             else
2396             {
2397                 for (i = 0; i < lincsd->nc; i++)
2398                 {
2399                     if (lincsd->bllen[i] == 0)
2400                     {
2401                         lincsd->bllen[i] =
2402                             std::sqrt(distance2(x[lincsd->bla[2*i]],
2403                                                 x[lincsd->bla[2*i+1]]));
2404                     }
2405                 }
2406             }
2407         }
2408
2409         if (bLog && fplog)
2410         {
2411             cconerr(lincsd, xprime, pbc,
2412                     &ncons_loc, &p_ssd, &p_max, &p_imax);
2413         }
2414
2415         /* This bWarn var can be updated by multiple threads
2416          * at the same time. But as we only need to detect
2417          * if a warning occurred or not, this is not an issue.
2418          */
2419         bWarn = FALSE;
2420
2421         /* The OpenMP parallel region of constrain_lincs for coords */
2422 #pragma omp parallel num_threads(lincsd->ntask)
2423         {
2424             try
2425             {
2426                 int th = gmx_omp_get_thread_num();
2427
2428                 clear_mat(lincsd->task[th].vir_r_m_dr);
2429
2430                 do_lincs(x, xprime, box, pbc, lincsd, th,
2431                          md->invmass, cr,
2432                          bCalcDHDL,
2433                          ir->LincsWarnAngle, &bWarn,
2434                          invdt, v, bCalcVir,
2435                          th == 0 ? vir_r_m_dr : lincsd->task[th].vir_r_m_dr);
2436             }
2437             GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
2438         }
2439
2440         if (bLog && fplog && lincsd->nc > 0)
2441         {
2442             fprintf(fplog, "   Rel. Constraint Deviation:  RMS         MAX     between atoms\n");
2443             fprintf(fplog, "       Before LINCS          %.6f    %.6f %6d %6d\n",
2444                     std::sqrt(p_ssd/ncons_loc), p_max,
2445                     ddglatnr(cr->dd, lincsd->bla[2*p_imax]),
2446                     ddglatnr(cr->dd, lincsd->bla[2*p_imax+1]));
2447         }
2448         if (bLog || bEner)
2449         {
2450             cconerr(lincsd, xprime, pbc,
2451                     &ncons_loc, &p_ssd, &p_max, &p_imax);
2452             lincsd->rmsd_data[0] = ncons_loc;
2453             lincsd->rmsd_data[1] = p_ssd;
2454         }
2455         else
2456         {
2457             lincsd->rmsd_data[0] = 0;
2458             lincsd->rmsd_data[1] = 0;
2459             lincsd->rmsd_data[2] = 0;
2460         }
2461         if (bLog && fplog && lincsd->nc > 0)
2462         {
2463             fprintf(fplog,
2464                     "        After LINCS          %.6f    %.6f %6d %6d\n\n",
2465                     std::sqrt(p_ssd/ncons_loc), p_max,
2466                     ddglatnr(cr->dd, lincsd->bla[2*p_imax]),
2467                     ddglatnr(cr->dd, lincsd->bla[2*p_imax+1]));
2468         }
2469
2470         if (bWarn)
2471         {
2472             if (maxwarn < INT_MAX)
2473             {
2474                 cconerr(lincsd, xprime, pbc,
2475                         &ncons_loc, &p_ssd, &p_max, &p_imax);
2476                 if (isMultiSim(ms))
2477                 {
2478                     sprintf(buf3, " in simulation %d", ms->sim);
2479                 }
2480                 else
2481                 {
2482                     buf3[0] = 0;
2483                 }
2484                 sprintf(buf, "\nStep %s, time %g (ps)  LINCS WARNING%s\n"
2485                         "relative constraint deviation after LINCS:\n"
2486                         "rms %.6f, max %.6f (between atoms %d and %d)\n",
2487                         gmx_step_str(step, buf2), ir->init_t+step*ir->delta_t,
2488                         buf3,
2489                         std::sqrt(p_ssd/ncons_loc), p_max,
2490                         ddglatnr(cr->dd, lincsd->bla[2*p_imax]),
2491                         ddglatnr(cr->dd, lincsd->bla[2*p_imax+1]));
2492                 if (fplog)
2493                 {
2494                     fprintf(fplog, "%s", buf);
2495                 }
2496                 fprintf(stderr, "%s", buf);
2497                 lincs_warning(fplog, cr->dd, x, xprime, pbc,
2498                               lincsd->nc, lincsd->bla, lincsd->bllen,
2499                               ir->LincsWarnAngle, maxwarn, warncount);
2500             }
2501             bOK = (p_max < 0.5);
2502         }
2503
2504         if (lincsd->ncg_flex)
2505         {
2506             for (i = 0; (i < lincsd->nc); i++)
2507             {
2508                 if (lincsd->bllen0[i] == 0 && lincsd->ddist[i] == 0)
2509                 {
2510                     lincsd->bllen[i] = 0;
2511                 }
2512             }
2513         }
2514     }
2515     else
2516     {
2517         /* The OpenMP parallel region of constrain_lincs for derivatives */
2518 #pragma omp parallel num_threads(lincsd->ntask)
2519         {
2520             try
2521             {
2522                 int th = gmx_omp_get_thread_num();
2523
2524                 do_lincsp(x, xprime, min_proj, pbc, lincsd, th,
2525                           md->invmass, econq, bCalcDHDL,
2526                           bCalcVir, th == 0 ? vir_r_m_dr : lincsd->task[th].vir_r_m_dr);
2527             }
2528             GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
2529         }
2530     }
2531
2532     if (bCalcDHDL)
2533     {
2534         /* Reduce the dH/dlambda contributions over the threads */
2535         real dhdlambda;
2536         int  th;
2537
2538         dhdlambda = 0;
2539         for (th = 0; th < lincsd->ntask; th++)
2540         {
2541             dhdlambda += lincsd->task[th].dhdlambda;
2542         }
2543         if (econq == econqCoord)
2544         {
2545             /* dhdlambda contains dH/dlambda*dt^2, correct for this */
2546             /* TODO This should probably use invdt, so that sd integrator scaling works properly */
2547             dhdlambda /= ir->delta_t*ir->delta_t;
2548         }
2549         *dvdlambda += dhdlambda;
2550     }
2551
2552     if (bCalcVir && lincsd->ntask > 1)
2553     {
2554         for (i = 1; i < lincsd->ntask; i++)
2555         {
2556             m_add(vir_r_m_dr, lincsd->task[i].vir_r_m_dr, vir_r_m_dr);
2557         }
2558     }
2559
2560     /* count assuming nit=1 */
2561     inc_nrnb(nrnb, eNR_LINCS, lincsd->nc_real);
2562     inc_nrnb(nrnb, eNR_LINCSMAT, (2+lincsd->nOrder)*lincsd->ncc);
2563     if (lincsd->ntriangle > 0)
2564     {
2565         inc_nrnb(nrnb, eNR_LINCSMAT, lincsd->nOrder*lincsd->ncc_triangle);
2566     }
2567     if (v)
2568     {
2569         inc_nrnb(nrnb, eNR_CONSTR_V, lincsd->nc_real*2);
2570     }
2571     if (bCalcVir)
2572     {
2573         inc_nrnb(nrnb, eNR_CONSTR_VIR, lincsd->nc_real);
2574     }
2575
2576     return bOK;
2577 }