src/gromacs/mdlib/lincs.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   5  * Copyright (c) 2001-2004, The GROMACS development team.
   6  * Copyright (c) 2013,2014,2015,2016,2017 by the GROMACS development team.
   7  * Copyright (c) 2018,2019,2020,2021, by the GROMACS development team, led by
   8  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   9  * and including many others, as listed in the AUTHORS file in the
  10  * top-level source directory and at http://www.gromacs.org.
  11  *
  12  * GROMACS is free software; you can redistribute it and/or
  13  * modify it under the terms of the GNU Lesser General Public License
  14  * as published by the Free Software Foundation; either version 2.1
  15  * of the License, or (at your option) any later version.
  16  *
  17  * GROMACS is distributed in the hope that it will be useful,
  18  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  19  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  20  * Lesser General Public License for more details.
  21  *
  22  * You should have received a copy of the GNU Lesser General Public
  23  * License along with GROMACS; if not, see
  24  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  25  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  26  *
  27  * If you want to redistribute modifications to GROMACS, please
  28  * consider that scientific software is very special. Version
  29  * control is crucial - bugs must be traceable. We will be happy to
  30  * consider code for inclusion in the official distribution, but
  31  * derived work must not be called official GROMACS. Details are found
  32  * in the README & COPYING files - if they are missing, get the
  33  * official version at http://www.gromacs.org.
  34  *
  35  * To help us fund GROMACS development, we humbly ask that you cite
  36  * the research papers on the package. Check out http://www.gromacs.org.
  37  */
  38 /*! \internal \file
  39  * \brief Defines LINCS code.
  40  *
  41  * \author Berk Hess <hess@kth.se>
  42  * \author Mark Abraham <mark.j.abraham@gmail.com>
  43  * \ingroup module_mdlib
  44  */
  45 #include "gmxpre.h"
  46
  47 #include "lincs.h"
  48
  49 #include "config.h"
  50
  51 #include <cassert>
  52 #include <cmath>
  53 #include <cstdlib>
  54
  55 #include <algorithm>
  56 #include <optional>
  57 #include <vector>
  58
  59 #include "gromacs/domdec/domdec.h"
  60 #include "gromacs/domdec/domdec_struct.h"
  61 #include "gromacs/gmxlib/nrnb.h"
  62 #include "gromacs/math/functions.h"
  63 #include "gromacs/math/paddedvector.h"
  64 #include "gromacs/math/units.h"
  65 #include "gromacs/math/vec.h"
  66 #include "gromacs/mdlib/constr.h"
  67 #include "gromacs/mdlib/gmx_omp_nthreads.h"
  68 #include "gromacs/mdrunutility/multisim.h"
  69 #include "gromacs/mdtypes/commrec.h"
  70 #include "gromacs/mdtypes/inputrec.h"
  71 #include "gromacs/mdtypes/md_enums.h"
  72 #include "gromacs/mdtypes/observablesreducer.h"
  73 #include "gromacs/pbcutil/pbc.h"
  74 #include "gromacs/pbcutil/pbc_simd.h"
  75 #include "gromacs/simd/simd.h"
  76 #include "gromacs/simd/simd_math.h"
  77 #include "gromacs/simd/vector_operations.h"
  78 #include "gromacs/topology/mtop_util.h"
  79 #include "gromacs/utility/alignedallocator.h"
  80 #include "gromacs/utility/arrayref.h"
  81 #include "gromacs/utility/basedefinitions.h"
  82 #include "gromacs/utility/bitmask.h"
  83 #include "gromacs/utility/cstringutil.h"
  84 #include "gromacs/utility/exceptions.h"
  85 #include "gromacs/utility/fatalerror.h"
  86 #include "gromacs/utility/gmxomp.h"
  87 #include "gromacs/utility/listoflists.h"
  88 #include "gromacs/utility/pleasecite.h"
  89
  90 namespace gmx
  91 {
  92
  93 //! Indices of the two atoms involved in a single constraint
  94 struct AtomPair
  95 {
  96     //! \brief Constructor, does not initialize to catch bugs and faster construction
  97     AtomPair() {}
  98
  99     //! Index of atom 1
 100     int index1;
 101     //! Index of atom 2
 102     int index2;
 103 };
 104
 105 //! Unit of work within LINCS.
 106 struct Task
 107 {
 108     //! First constraint for this task.
 109     int b0 = 0;
 110     //! b1-1 is the last constraint for this task.
 111     int b1 = 0;
 112     //! The number of constraints in triangles.
 113     int ntriangle = 0;
 114     //! The list of triangle constraints.
 115     std::vector<int> triangle;
 116     //! The bits tell if the matrix element should be used.
 117     std::vector<int> tri_bits;
 118     //! Constraint index for updating atom data.
 119     std::vector<int> ind;
 120     //! Constraint index for updating atom data.
 121     std::vector<int> ind_r;
 122     //! Temporary variable for virial calculation.
 123     tensor vir_r_m_dr = { { 0 } };
 124     //! Temporary variable for lambda derivative.
 125     real dhdlambda;
 126 };
 127
 128 /*! \brief Data for LINCS algorithm.
 129  */
 130 class Lincs
 131 {
 132 public:
 133     //! The global number of constraints.
 134     int ncg = 0;
 135     //! The global number of flexible constraints.
 136     int ncg_flex = 0;
 137     //! The global number of constraints in triangles.
 138     int ncg_triangle = 0;
 139     //! The number of iterations.
 140     int nIter = 0;
 141     //! The order of the matrix expansion.
 142     int nOrder = 0;
 143     //! The maximum number of constraints connected to a single atom.
 144     int max_connect = 0;
 145
 146     //! The number of real constraints.
 147     int nc_real = 0;
 148     //! The number of constraints including padding for SIMD.
 149     int nc = 0;
 150     //! The number of constraint connections.
 151     int ncc = 0;
 152     //! The FE lambda value used for filling blc and blmf.
 153     real matlam = 0;
 154     //! mapping from topology to LINCS constraints.
 155     std::vector<int> con_index;
 156     //! The reference distance in topology A.
 157     std::vector<real, AlignedAllocator<real>> bllen0;
 158     //! The reference distance in top B - the r.d. in top A.
 159     std::vector<real, AlignedAllocator<real>> ddist;
 160     //! The atom pairs involved in the constraints.
 161     std::vector<AtomPair> atoms;
 162     //! 1/sqrt(invmass1  invmass2).
 163     std::vector<real, AlignedAllocator<real>> blc;
 164     //! As blc, but with all masses 1.
 165     std::vector<real, AlignedAllocator<real>> blc1;
 166     //! Index into blbnb and blmf.
 167     std::vector<int> blnr;
 168     //! List of constraint connections.
 169     std::vector<int> blbnb;
 170     //! The local number of constraints in triangles.
 171     int ntriangle = 0;
 172     //! The number of constraint connections in triangles.
 173     int ncc_triangle = 0;
 174     //! Communicate before each LINCS interation.
 175     bool bCommIter = false;
 176     //! Matrix of mass factors for constraint connections.
 177     std::vector<real> blmf;
 178     //! As blmf, but with all masses 1.
 179     std::vector<real> blmf1;
 180     //! The reference bond length.
 181     std::vector<real, AlignedAllocator<real>> bllen;
 182     //! The local atom count per constraint, can be NULL.
 183     std::vector<int> nlocat;
 184
 185     /*! \brief The number of tasks used for LINCS work.
 186      *
 187      * \todo This is mostly used to loop over \c task, which would
 188      * be nicer to do with range-based for loops, but the thread
 189      * index is used for constructing bit masks and organizing the
 190      * virial output buffer, so other things need to change,
 191      * first. */
 192     int ntask = 0;
 193     /*! \brief LINCS thread division */
 194     std::vector<Task> task;
 195     //! Atom flags for thread parallelization.
 196     std::vector<gmx_bitmask_t> atf;
 197     //! Are the LINCS tasks interdependent?
 198     bool bTaskDep = false;
 199     //! Are there triangle constraints that cross task borders?
 200     bool bTaskDepTri = false;
 201     //! Arrays for temporary storage in the LINCS algorithm.
 202     /*! @{ */
 203     PaddedVector<gmx::RVec>                   tmpv;
 204     std::vector<real>                         tmpncc;
 205     std::vector<real, AlignedAllocator<real>> tmp1;
 206     std::vector<real, AlignedAllocator<real>> tmp2;
 207     std::vector<real, AlignedAllocator<real>> tmp3;
 208     std::vector<real, AlignedAllocator<real>> tmp4;
 209     /*! @} */
 210     //! The Lagrange multipliers times -1.
 211     std::vector<real, AlignedAllocator<real>> mlambda;
 212     /*! \brief Callback used after constraining to require reduction
 213      * of values later used to compute the constraint RMS relative
 214      * deviation, so the latter can be output. */
 215     std::optional<ObservablesReducerBuilder::CallbackToRequireReduction> callbackToRequireReduction;
 216     /*! \brief View used for reducing the components of the global
 217      * relative RMS constraint deviation.
 218      *
 219      * Can be written any time, but that is only useful when followed
 220      * by a call of the callbackToRequireReduction. Useful to read
 221      * only from the callback that the ObservablesReducer will later
 222      * make after reduction. */
 223     ArrayRef<double> rmsdReductionBuffer;
 224     /*! \brief The value of the constraint RMS deviation after it has
 225      * been computed.
 226      *
 227      * When DD is active, filled by the ObservablesReducer, otherwise
 228      * filled directly here. */
 229     std::optional<double> constraintRmsDeviation;
 230 };
 231
 232 /*! \brief Define simd_width for memory allocation used for SIMD code */
 233 #if GMX_SIMD_HAVE_REAL
 234 static const int simd_width = GMX_SIMD_REAL_WIDTH;
 235 #else
 236 static const int simd_width = 1;
 237 #endif
 238
 239 real lincs_rmsd(const Lincs* lincsd)
 240 {
 241     if (lincsd->constraintRmsDeviation.has_value())
 242     {
 243         return real(lincsd->constraintRmsDeviation.value());
 244     }
 245     else
 246     {
 247         return 0;
 248     }
 249 }
 250
 251 /*! \brief Do a set of nrec LINCS matrix multiplications.
 252  *
 253  * This function will return with up to date thread-local
 254  * constraint data, without an OpenMP barrier.
 255  */
 256 static void lincs_matrix_expand(const Lincs&              lincsd,
 257                                 const Task&               li_task,
 258                                 gmx::ArrayRef<const real> blcc,
 259                                 gmx::ArrayRef<real>       rhs1,
 260                                 gmx::ArrayRef<real>       rhs2,
 261                                 gmx::ArrayRef<real>       sol)
 262 {
 263     gmx::ArrayRef<const int> blnr  = lincsd.blnr;
 264     gmx::ArrayRef<const int> blbnb = lincsd.blbnb;
 265
 266     const int b0   = li_task.b0;
 267     const int b1   = li_task.b1;
 268     const int nrec = lincsd.nOrder;
 269
 270     for (int rec = 0; rec < nrec; rec++)
 271     {
 272         if (lincsd.bTaskDep)
 273         {
 274 #pragma omp barrier
 275         }
 276         for (int b = b0; b < b1; b++)
 277         {
 278             real mvb;
 279             int  n;
 280
 281             mvb = 0;
 282             for (n = blnr[b]; n < blnr[b + 1]; n++)
 283             {
 284                 mvb = mvb + blcc[n] * rhs1[blbnb[n]];
 285             }
 286             rhs2[b] = mvb;
 287             sol[b]  = sol[b] + mvb;
 288         }
 289
 290         std::swap(rhs1, rhs2);
 291     } /* nrec*(ncons+2*nrtot) flops */
 292
 293     if (lincsd.ntriangle > 0)
 294     {
 295         /* Perform an extra nrec recursions for only the constraints
 296          * involved in rigid triangles.
 297          * In this way their accuracy should come close to those of the other
 298          * constraints, since traingles of constraints can produce eigenvalues
 299          * around 0.7, while the effective eigenvalue for bond constraints
 300          * is around 0.4 (and 0.7*0.7=0.5).
 301          */
 302
 303         if (lincsd.bTaskDep)
 304         {
 305             /* We need a barrier here, since other threads might still be
 306              * reading the contents of rhs1 and/o rhs2.
 307              * We could avoid this barrier by introducing two extra rhs
 308              * arrays for the triangle constraints only.
 309              */
 310 #pragma omp barrier
 311         }
 312
 313         /* Constraints involved in a triangle are ensured to be in the same
 314          * LINCS task. This means no barriers are required during the extra
 315          * iterations for the triangle constraints.
 316          */
 317         gmx::ArrayRef<const int> triangle = li_task.triangle;
 318         gmx::ArrayRef<const int> tri_bits = li_task.tri_bits;
 319
 320         for (int rec = 0; rec < nrec; rec++)
 321         {
 322             for (int tb = 0; tb < li_task.ntriangle; tb++)
 323             {
 324                 int  b, bits, nr0, nr1, n;
 325                 real mvb;
 326
 327                 b    = triangle[tb];
 328                 bits = tri_bits[tb];
 329                 mvb  = 0;
 330                 nr0  = blnr[b];
 331                 nr1  = blnr[b + 1];
 332                 for (n = nr0; n < nr1; n++)
 333                 {
 334                     if (bits & (1 << (n - nr0)))
 335                     {
 336                         mvb = mvb + blcc[n] * rhs1[blbnb[n]];
 337                     }
 338                 }
 339                 rhs2[b] = mvb;
 340                 sol[b]  = sol[b] + mvb;
 341             }
 342
 343             std::swap(rhs1, rhs2);
 344         } /* nrec*(ntriangle + ncc_triangle*2) flops */
 345
 346         if (lincsd.bTaskDepTri)
 347         {
 348             /* The constraints triangles are decoupled from each other,
 349              * but constraints in one triangle cross thread task borders.
 350              * We could probably avoid this with more advanced setup code.
 351              */
 352 #pragma omp barrier
 353         }
 354     }
 355 }
 356
 357 //! Update atomic coordinates when an index is not required.
 358 static void lincs_update_atoms_noind(int                            ncons,
 359                                      gmx::ArrayRef<const AtomPair>  atoms,
 360                                      real                           preFactor,
 361                                      gmx::ArrayRef<const real>      fac,
 362                                      gmx::ArrayRef<const gmx::RVec> r,
 363                                      gmx::ArrayRef<const real>      invmass,
 364                                      rvec*                          x)
 365 {
 366     if (!invmass.empty())
 367     {
 368         for (int b = 0; b < ncons; b++)
 369         {
 370             int  i    = atoms[b].index1;
 371             int  j    = atoms[b].index2;
 372             real mvb  = preFactor * fac[b];
 373             real im1  = invmass[i];
 374             real im2  = invmass[j];
 375             real tmp0 = r[b][0] * mvb;
 376             real tmp1 = r[b][1] * mvb;
 377             real tmp2 = r[b][2] * mvb;
 378             x[i][0] -= tmp0 * im1;
 379             x[i][1] -= tmp1 * im1;
 380             x[i][2] -= tmp2 * im1;
 381             x[j][0] += tmp0 * im2;
 382             x[j][1] += tmp1 * im2;
 383             x[j][2] += tmp2 * im2;
 384         } /* 16 ncons flops */
 385     }
 386     else
 387     {
 388         for (int b = 0; b < ncons; b++)
 389         {
 390             int  i    = atoms[b].index1;
 391             int  j    = atoms[b].index2;
 392             real mvb  = preFactor * fac[b];
 393             real tmp0 = r[b][0] * mvb;
 394             real tmp1 = r[b][1] * mvb;
 395             real tmp2 = r[b][2] * mvb;
 396             x[i][0] -= tmp0;
 397             x[i][1] -= tmp1;
 398             x[i][2] -= tmp2;
 399             x[j][0] += tmp0;
 400             x[j][1] += tmp1;
 401             x[j][2] += tmp2;
 402         }
 403     }
 404 }
 405
 406 //! Update atomic coordinates when an index is required.
 407 static void lincs_update_atoms_ind(gmx::ArrayRef<const int>       ind,
 408                                    gmx::ArrayRef<const AtomPair>  atoms,
 409                                    real                           preFactor,
 410                                    gmx::ArrayRef<const real>      fac,
 411                                    gmx::ArrayRef<const gmx::RVec> r,
 412                                    gmx::ArrayRef<const real>      invmass,
 413                                    rvec*                          x)
 414 {
 415     if (!invmass.empty())
 416     {
 417         for (int b : ind)
 418         {
 419             int  i    = atoms[b].index1;
 420             int  j    = atoms[b].index2;
 421             real mvb  = preFactor * fac[b];
 422             real im1  = invmass[i];
 423             real im2  = invmass[j];
 424             real tmp0 = r[b][0] * mvb;
 425             real tmp1 = r[b][1] * mvb;
 426             real tmp2 = r[b][2] * mvb;
 427             x[i][0] -= tmp0 * im1;
 428             x[i][1] -= tmp1 * im1;
 429             x[i][2] -= tmp2 * im1;
 430             x[j][0] += tmp0 * im2;
 431             x[j][1] += tmp1 * im2;
 432             x[j][2] += tmp2 * im2;
 433         } /* 16 ncons flops */
 434     }
 435     else
 436     {
 437         for (int b : ind)
 438         {
 439             int  i    = atoms[b].index1;
 440             int  j    = atoms[b].index2;
 441             real mvb  = preFactor * fac[b];
 442             real tmp0 = r[b][0] * mvb;
 443             real tmp1 = r[b][1] * mvb;
 444             real tmp2 = r[b][2] * mvb;
 445             x[i][0] -= tmp0;
 446             x[i][1] -= tmp1;
 447             x[i][2] -= tmp2;
 448             x[j][0] += tmp0;
 449             x[j][1] += tmp1;
 450             x[j][2] += tmp2;
 451         } /* 16 ncons flops */
 452     }
 453 }
 454
 455 //! Update coordinates for atoms.
 456 static void lincs_update_atoms(Lincs*                         li,
 457                                int                            th,
 458                                real                           preFactor,
 459                                gmx::ArrayRef<const real>      fac,
 460                                gmx::ArrayRef<const gmx::RVec> r,
 461                                gmx::ArrayRef<const real>      invmass,
 462                                rvec*                          x)
 463 {
 464     if (li->ntask == 1)
 465     {
 466         /* Single thread, we simply update for all constraints */
 467         lincs_update_atoms_noind(li->nc_real, li->atoms, preFactor, fac, r, invmass, x);
 468     }
 469     else
 470     {
 471         /* Update the atom vector components for our thread local
 472          * constraints that only access our local atom range.
 473          * This can be done without a barrier.
 474          */
 475         lincs_update_atoms_ind(li->task[th].ind, li->atoms, preFactor, fac, r, invmass, x);
 476
 477         if (!li->task[li->ntask].ind.empty())
 478         {
 479             /* Update the constraints that operate on atoms
 480              * in multiple thread atom blocks on the master thread.
 481              */
 482 #pragma omp barrier
 483 #pragma omp master
 484             {
 485                 lincs_update_atoms_ind(li->task[li->ntask].ind, li->atoms, preFactor, fac, r, invmass, x);
 486             }
 487         }
 488     }
 489 }
 490
 491 #if GMX_SIMD_HAVE_REAL
 492 //! Helper function so that we can run TSAN with SIMD support (where implemented).
 493 template<int align>
 494 static inline void gmx_simdcall gatherLoadUTransposeTSANSafe(const real*         base,
 495                                                              const std::int32_t* offset,
 496                                                              SimdReal*           v0,
 497                                                              SimdReal*           v1,
 498                                                              SimdReal*           v2)
 499 {
 500 #    if (CMAKE_BUILD_TYPE == CMAKE_BUILD_TYPE_TSAN) && GMX_SIMD_X86_AVX2_256
 501     // This function is only implemented in this case
 502     gatherLoadUTransposeSafe<align>(base, offset, v0, v1, v2);
 503 #    else
 504     gatherLoadUTranspose<align>(base, offset, v0, v1, v2);
 505 #    endif
 506 }
 507
 508 /*! \brief Calculate the constraint distance vectors r to project on from x.
 509  *
 510  * Determine the right-hand side of the matrix equation using quantity f.
 511  * This function only differs from calc_dr_x_xp_simd below in that
 512  * no constraint length is subtracted and no PBC is used for f. */
 513 static void gmx_simdcall calc_dr_x_f_simd(int                           b0,
 514                                           int                           b1,
 515                                           gmx::ArrayRef<const AtomPair> atoms,
 516                                           const rvec* gmx_restrict      x,
 517                                           const rvec* gmx_restrict      f,
 518                                           const real* gmx_restrict      blc,
 519                                           const real*                   pbc_simd,
 520                                           rvec* gmx_restrict            r,
 521                                           real* gmx_restrict            rhs,
 522                                           real* gmx_restrict            sol)
 523 {
 524     assert(b0 % GMX_SIMD_REAL_WIDTH == 0);
 525
 526     alignas(GMX_SIMD_ALIGNMENT) std::int32_t offset2[GMX_SIMD_REAL_WIDTH];
 527
 528     for (int i = 0; i < GMX_SIMD_REAL_WIDTH; i++)
 529     {
 530         offset2[i] = i;
 531     }
 532
 533     for (int bs = b0; bs < b1; bs += GMX_SIMD_REAL_WIDTH)
 534     {
 535         SimdReal                                 x0_S, y0_S, z0_S;
 536         SimdReal                                 x1_S, y1_S, z1_S;
 537         SimdReal                                 rx_S, ry_S, rz_S, n2_S, il_S;
 538         SimdReal                                 fx_S, fy_S, fz_S, ip_S, rhs_S;
 539         alignas(GMX_SIMD_ALIGNMENT) std::int32_t offset0[GMX_SIMD_REAL_WIDTH];
 540         alignas(GMX_SIMD_ALIGNMENT) std::int32_t offset1[GMX_SIMD_REAL_WIDTH];
 541
 542         for (int i = 0; i < GMX_SIMD_REAL_WIDTH; i++)
 543         {
 544             offset0[i] = atoms[bs + i].index1;
 545             offset1[i] = atoms[bs + i].index2;
 546         }
 547
 548         gatherLoadUTransposeTSANSafe<3>(reinterpret_cast<const real*>(x), offset0, &x0_S, &y0_S, &z0_S);
 549         gatherLoadUTransposeTSANSafe<3>(reinterpret_cast<const real*>(x), offset1, &x1_S, &y1_S, &z1_S);
 550         rx_S = x0_S - x1_S;
 551         ry_S = y0_S - y1_S;
 552         rz_S = z0_S - z1_S;
 553
 554         pbc_correct_dx_simd(&rx_S, &ry_S, &rz_S, pbc_simd);
 555
 556         n2_S = norm2(rx_S, ry_S, rz_S);
 557         il_S = invsqrt(n2_S);
 558
 559         rx_S = rx_S * il_S;
 560         ry_S = ry_S * il_S;
 561         rz_S = rz_S * il_S;
 562
 563         transposeScatterStoreU<3>(reinterpret_cast<real*>(r + bs), offset2, rx_S, ry_S, rz_S);
 564
 565         gatherLoadUTransposeTSANSafe<3>(reinterpret_cast<const real*>(f), offset0, &x0_S, &y0_S, &z0_S);
 566         gatherLoadUTransposeTSANSafe<3>(reinterpret_cast<const real*>(f), offset1, &x1_S, &y1_S, &z1_S);
 567         fx_S = x0_S - x1_S;
 568         fy_S = y0_S - y1_S;
 569         fz_S = z0_S - z1_S;
 570
 571         ip_S = iprod(rx_S, ry_S, rz_S, fx_S, fy_S, fz_S);
 572
 573         rhs_S = load<SimdReal>(blc + bs) * ip_S;
 574
 575         store(rhs + bs, rhs_S);
 576         store(sol + bs, rhs_S);
 577     }
 578 }
 579 #endif // GMX_SIMD_HAVE_REAL
 580
 581 /*! \brief LINCS projection, works on derivatives of the coordinates. */
 582 static void do_lincsp(ArrayRefWithPadding<const RVec> xPadded,
 583                       ArrayRefWithPadding<RVec>       fPadded,
 584                       ArrayRef<RVec>                  fp,
 585                       t_pbc*                          pbc,
 586                       Lincs*                          lincsd,
 587                       int                             th,
 588                       ArrayRef<const real>            invmass,
 589                       ConstraintVariable              econq,
 590                       bool                            bCalcDHDL,
 591                       bool                            bCalcVir,
 592                       tensor                          rmdf)
 593 {
 594     const int b0 = lincsd->task[th].b0;
 595     const int b1 = lincsd->task[th].b1;
 596
 597     gmx::ArrayRef<const AtomPair> atoms = lincsd->atoms;
 598     gmx::ArrayRef<gmx::RVec>      r     = lincsd->tmpv;
 599     gmx::ArrayRef<const int>      blnr  = lincsd->blnr;
 600     gmx::ArrayRef<const int>      blbnb = lincsd->blbnb;
 601
 602     gmx::ArrayRef<const real> blc;
 603     gmx::ArrayRef<const real> blmf;
 604     if (econq != ConstraintVariable::Force)
 605     {
 606         /* Use mass-weighted parameters */
 607         blc  = lincsd->blc;
 608         blmf = lincsd->blmf;
 609     }
 610     else
 611     {
 612         /* Use non mass-weighted parameters */
 613         blc  = lincsd->blc1;
 614         blmf = lincsd->blmf1;
 615     }
 616     gmx::ArrayRef<real> blcc = lincsd->tmpncc;
 617     gmx::ArrayRef<real> rhs1 = lincsd->tmp1;
 618     gmx::ArrayRef<real> rhs2 = lincsd->tmp2;
 619     gmx::ArrayRef<real> sol  = lincsd->tmp3;
 620
 621     const rvec* x = as_rvec_array(xPadded.paddedArrayRef().data());
 622     rvec*       f = as_rvec_array(fPadded.paddedArrayRef().data());
 623
 624 #if GMX_SIMD_HAVE_REAL
 625     /* This SIMD code does the same as the plain-C code after the #else.
 626      * The only difference is that we always call pbc code, as with SIMD
 627      * the overhead of pbc computation (when not needed) is small.
 628      */
 629     alignas(GMX_SIMD_ALIGNMENT) real pbc_simd[9 * GMX_SIMD_REAL_WIDTH];
 630
 631     /* Convert the pbc struct for SIMD */
 632     set_pbc_simd(pbc, pbc_simd);
 633
 634     /* Compute normalized x i-j vectors, store in r.
 635      * Compute the inner product of r and xp i-j and store in rhs1.
 636      */
 637     calc_dr_x_f_simd(
 638             b0, b1, atoms, x, f, blc.data(), pbc_simd, as_rvec_array(r.data()), rhs1.data(), sol.data());
 639
 640 #else // GMX_SIMD_HAVE_REAL
 641
 642     /* Compute normalized i-j vectors */
 643     if (pbc)
 644     {
 645         for (int b = b0; b < b1; b++)
 646         {
 647             rvec dx;
 648
 649             pbc_dx_aiuc(pbc, x[atoms[b].index1], x[atoms[b].index2], dx);
 650             unitv(dx, r[b]);
 651         }
 652     }
 653     else
 654     {
 655         for (int b = b0; b < b1; b++)
 656         {
 657             rvec dx;
 658
 659             rvec_sub(x[atoms[b].index1], x[atoms[b].index2], dx);
 660             unitv(dx, r[b]);
 661         } /* 16 ncons flops */
 662     }
 663
 664     for (int b = b0; b < b1; b++)
 665     {
 666         int  i   = atoms[b].index1;
 667         int  j   = atoms[b].index2;
 668         real mvb = blc[b]
 669                    * (r[b][0] * (f[i][0] - f[j][0]) + r[b][1] * (f[i][1] - f[j][1])
 670                       + r[b][2] * (f[i][2] - f[j][2]));
 671         rhs1[b] = mvb;
 672         sol[b]  = mvb;
 673         /* 7 flops */
 674     }
 675
 676 #endif // GMX_SIMD_HAVE_REAL
 677
 678     if (lincsd->bTaskDep)
 679     {
 680         /* We need a barrier, since the matrix construction below
 681          * can access entries in r of other threads.
 682          */
 683 #pragma omp barrier
 684     }
 685
 686     /* Construct the (sparse) LINCS matrix */
 687     for (int b = b0; b < b1; b++)
 688     {
 689         for (int n = blnr[b]; n < blnr[b + 1]; n++)
 690         {
 691             blcc[n] = blmf[n] * ::iprod(r[b], r[blbnb[n]]);
 692         } /* 6 nr flops */
 693     }
 694     /* Together: 23*ncons + 6*nrtot flops */
 695
 696     lincs_matrix_expand(*lincsd, lincsd->task[th], blcc, rhs1, rhs2, sol);
 697     /* nrec*(ncons+2*nrtot) flops */
 698
 699     if (econq == ConstraintVariable::Deriv_FlexCon)
 700     {
 701         /* We only want to constraint the flexible constraints,
 702          * so we mask out the normal ones by setting sol to 0.
 703          */
 704         for (int b = b0; b < b1; b++)
 705         {
 706             if (!(lincsd->bllen0[b] == 0 && lincsd->ddist[b] == 0))
 707             {
 708                 sol[b] = 0;
 709             }
 710         }
 711     }
 712
 713     /* We multiply sol by blc, so we can use lincs_update_atoms for OpenMP */
 714     for (int b = b0; b < b1; b++)
 715     {
 716         sol[b] *= blc[b];
 717     }
 718
 719     /* When constraining forces, we should not use mass weighting,
 720      * so we pass invmass=NULL, which results in the use of 1 for all atoms.
 721      */
 722     lincs_update_atoms(lincsd,
 723                        th,
 724                        1.0,
 725                        sol,
 726                        r,
 727                        (econq != ConstraintVariable::Force) ? invmass : gmx::ArrayRef<real>(),
 728                        as_rvec_array(fp.data()));
 729
 730     if (bCalcDHDL)
 731     {
 732         real dhdlambda = 0;
 733         for (int b = b0; b < b1; b++)
 734         {
 735             dhdlambda -= sol[b] * lincsd->ddist[b];
 736         }
 737
 738         lincsd->task[th].dhdlambda = dhdlambda;
 739     }
 740
 741     if (bCalcVir)
 742     {
 743         /* Constraint virial,
 744          * determines sum r_bond x delta f,
 745          * where delta f is the constraint correction
 746          * of the quantity that is being constrained.
 747          */
 748         for (int b = b0; b < b1; b++)
 749         {
 750             const real mvb = lincsd->bllen[b] * sol[b];
 751             for (int i = 0; i < DIM; i++)
 752             {
 753                 const real tmp1 = mvb * r[b][i];
 754                 for (int j = 0; j < DIM; j++)
 755                 {
 756                     rmdf[i][j] += tmp1 * r[b][j];
 757                 }
 758             }
 759         } /* 23 ncons flops */
 760     }
 761 }
 762
 763 #if GMX_SIMD_HAVE_REAL
 764
 765 /*! \brief Calculate the constraint distance vectors r to project on from x.
 766  *
 767  * Determine the right-hand side of the matrix equation using coordinates xp. */
 768 static void gmx_simdcall calc_dr_x_xp_simd(int                           b0,
 769                                            int                           b1,
 770                                            gmx::ArrayRef<const AtomPair> atoms,
 771                                            const rvec* gmx_restrict      x,
 772                                            const rvec* gmx_restrict      xp,
 773                                            const real* gmx_restrict      bllen,
 774                                            const real* gmx_restrict      blc,
 775                                            const real*                   pbc_simd,
 776                                            rvec* gmx_restrict            r,
 777                                            real* gmx_restrict            rhs,
 778                                            real* gmx_restrict            sol)
 779 {
 780     assert(b0 % GMX_SIMD_REAL_WIDTH == 0);
 781     alignas(GMX_SIMD_ALIGNMENT) std::int32_t offset2[GMX_SIMD_REAL_WIDTH];
 782
 783     for (int i = 0; i < GMX_SIMD_REAL_WIDTH; i++)
 784     {
 785         offset2[i] = i;
 786     }
 787
 788     for (int bs = b0; bs < b1; bs += GMX_SIMD_REAL_WIDTH)
 789     {
 790         SimdReal                                 x0_S, y0_S, z0_S;
 791         SimdReal                                 x1_S, y1_S, z1_S;
 792         SimdReal                                 rx_S, ry_S, rz_S, n2_S, il_S;
 793         SimdReal                                 rxp_S, ryp_S, rzp_S, ip_S, rhs_S;
 794         alignas(GMX_SIMD_ALIGNMENT) std::int32_t offset0[GMX_SIMD_REAL_WIDTH];
 795         alignas(GMX_SIMD_ALIGNMENT) std::int32_t offset1[GMX_SIMD_REAL_WIDTH];
 796
 797         for (int i = 0; i < GMX_SIMD_REAL_WIDTH; i++)
 798         {
 799             offset0[i] = atoms[bs + i].index1;
 800             offset1[i] = atoms[bs + i].index2;
 801         }
 802
 803         gatherLoadUTransposeTSANSafe<3>(reinterpret_cast<const real*>(x), offset0, &x0_S, &y0_S, &z0_S);
 804         gatherLoadUTransposeTSANSafe<3>(reinterpret_cast<const real*>(x), offset1, &x1_S, &y1_S, &z1_S);
 805         rx_S = x0_S - x1_S;
 806         ry_S = y0_S - y1_S;
 807         rz_S = z0_S - z1_S;
 808
 809         pbc_correct_dx_simd(&rx_S, &ry_S, &rz_S, pbc_simd);
 810
 811         n2_S = norm2(rx_S, ry_S, rz_S);
 812         il_S = invsqrt(n2_S);
 813
 814         rx_S = rx_S * il_S;
 815         ry_S = ry_S * il_S;
 816         rz_S = rz_S * il_S;
 817
 818         transposeScatterStoreU<3>(reinterpret_cast<real*>(r + bs), offset2, rx_S, ry_S, rz_S);
 819
 820         gatherLoadUTransposeTSANSafe<3>(reinterpret_cast<const real*>(xp), offset0, &x0_S, &y0_S, &z0_S);
 821         gatherLoadUTransposeTSANSafe<3>(reinterpret_cast<const real*>(xp), offset1, &x1_S, &y1_S, &z1_S);
 822
 823         rxp_S = x0_S - x1_S;
 824         ryp_S = y0_S - y1_S;
 825         rzp_S = z0_S - z1_S;
 826
 827         pbc_correct_dx_simd(&rxp_S, &ryp_S, &rzp_S, pbc_simd);
 828
 829         ip_S = iprod(rx_S, ry_S, rz_S, rxp_S, ryp_S, rzp_S);
 830
 831         rhs_S = load<SimdReal>(blc + bs) * (ip_S - load<SimdReal>(bllen + bs));
 832
 833         store(rhs + bs, rhs_S);
 834         store(sol + bs, rhs_S);
 835     }
 836 }
 837 #endif // GMX_SIMD_HAVE_REAL
 838
 839 /*! \brief Determine the distances and right-hand side for the next iteration. */
 840 gmx_unused static void calc_dist_iter(int                           b0,
 841                                       int                           b1,
 842                                       gmx::ArrayRef<const AtomPair> atoms,
 843                                       const rvec* gmx_restrict      xp,
 844                                       const real* gmx_restrict      bllen,
 845                                       const real* gmx_restrict      blc,
 846                                       const t_pbc*                  pbc,
 847                                       real                          wfac,
 848                                       real* gmx_restrict            rhs,
 849                                       real* gmx_restrict            sol,
 850                                       bool*                         bWarn)
 851 {
 852     for (int b = b0; b < b1; b++)
 853     {
 854         real len = bllen[b];
 855         rvec dx;
 856         if (pbc)
 857         {
 858             pbc_dx_aiuc(pbc, xp[atoms[b].index1], xp[atoms[b].index2], dx);
 859         }
 860         else
 861         {
 862             rvec_sub(xp[atoms[b].index1], xp[atoms[b].index2], dx);
 863         }
 864         real len2  = len * len;
 865         real dlen2 = 2 * len2 - ::norm2(dx);
 866         if (dlen2 < wfac * len2)
 867         {
 868             /* not race free - see detailed comment in caller */
 869             *bWarn = TRUE;
 870         }
 871         real mvb;
 872         if (dlen2 > 0)
 873         {
 874             mvb = blc[b] * (len - dlen2 * gmx::invsqrt(dlen2));
 875         }
 876         else
 877         {
 878             mvb = blc[b] * len;
 879         }
 880         rhs[b] = mvb;
 881         sol[b] = mvb;
 882     } /* 20*ncons flops */
 883 }
 884
 885 #if GMX_SIMD_HAVE_REAL
 886 /*! \brief As calc_dist_iter(), but using SIMD intrinsics. */
 887 static void gmx_simdcall calc_dist_iter_simd(int                           b0,
 888                                              int                           b1,
 889                                              gmx::ArrayRef<const AtomPair> atoms,
 890                                              const rvec* gmx_restrict      x,
 891                                              const real* gmx_restrict      bllen,
 892                                              const real* gmx_restrict      blc,
 893                                              const real*                   pbc_simd,
 894                                              real                          wfac,
 895                                              real* gmx_restrict            rhs,
 896                                              real* gmx_restrict            sol,
 897                                              bool*                         bWarn)
 898 {
 899     SimdReal min_S(GMX_REAL_MIN);
 900     SimdReal two_S(2.0);
 901     SimdReal wfac_S(wfac);
 902     SimdBool warn_S;
 903
 904     assert(b0 % GMX_SIMD_REAL_WIDTH == 0);
 905
 906     /* Initialize all to FALSE */
 907     warn_S = (two_S < setZero());
 908
 909     for (int bs = b0; bs < b1; bs += GMX_SIMD_REAL_WIDTH)
 910     {
 911         SimdReal                                 x0_S, y0_S, z0_S;
 912         SimdReal                                 x1_S, y1_S, z1_S;
 913         SimdReal                                 rx_S, ry_S, rz_S, n2_S;
 914         SimdReal                                 len_S, len2_S, dlen2_S, lc_S, blc_S;
 915         alignas(GMX_SIMD_ALIGNMENT) std::int32_t offset0[GMX_SIMD_REAL_WIDTH];
 916         alignas(GMX_SIMD_ALIGNMENT) std::int32_t offset1[GMX_SIMD_REAL_WIDTH];
 917
 918         for (int i = 0; i < GMX_SIMD_REAL_WIDTH; i++)
 919         {
 920             offset0[i] = atoms[bs + i].index1;
 921             offset1[i] = atoms[bs + i].index2;
 922         }
 923
 924         gatherLoadUTransposeTSANSafe<3>(reinterpret_cast<const real*>(x), offset0, &x0_S, &y0_S, &z0_S);
 925         gatherLoadUTransposeTSANSafe<3>(reinterpret_cast<const real*>(x), offset1, &x1_S, &y1_S, &z1_S);
 926
 927         rx_S = x0_S - x1_S;
 928         ry_S = y0_S - y1_S;
 929         rz_S = z0_S - z1_S;
 930
 931         pbc_correct_dx_simd(&rx_S, &ry_S, &rz_S, pbc_simd);
 932
 933         n2_S = norm2(rx_S, ry_S, rz_S);
 934
 935         len_S  = load<SimdReal>(bllen + bs);
 936         len2_S = len_S * len_S;
 937
 938         dlen2_S = fms(two_S, len2_S, n2_S);
 939
 940         warn_S = warn_S || (dlen2_S < (wfac_S * len2_S));
 941
 942         /* Avoid 1/0 by taking the max with REAL_MIN.
 943          * Note: when dlen2 is close to zero (90 degree constraint rotation),
 944          * the accuracy of the algorithm is no longer relevant.
 945          */
 946         dlen2_S = max(dlen2_S, min_S);
 947
 948         lc_S = fnma(dlen2_S, invsqrt(dlen2_S), len_S);
 949
 950         blc_S = load<SimdReal>(blc + bs);
 951
 952         lc_S = blc_S * lc_S;
 953
 954         store(rhs + bs, lc_S);
 955         store(sol + bs, lc_S);
 956     }
 957
 958     if (anyTrue(warn_S))
 959     {
 960         *bWarn = TRUE;
 961     }
 962 }
 963 #endif // GMX_SIMD_HAVE_REAL
 964
 965 //! Implements LINCS constraining.
 966 static void do_lincs(ArrayRefWithPadding<const RVec> xPadded,
 967                      ArrayRefWithPadding<RVec>       xpPadded,
 968                      const matrix                    box,
 969                      t_pbc*                          pbc,
 970                      Lincs*                          lincsd,
 971                      int                             th,
 972                      ArrayRef<const real>            invmass,
 973                      const t_commrec*                cr,
 974                      bool                            bCalcDHDL,
 975                      real                            wangle,
 976                      bool*                           bWarn,
 977                      real                            invdt,
 978                      ArrayRef<RVec>                  vRef,
 979                      bool                            bCalcVir,
 980                      tensor                          vir_r_m_dr)
 981 {
 982     const rvec*        x  = as_rvec_array(xPadded.paddedArrayRef().data());
 983     rvec*              xp = as_rvec_array(xpPadded.paddedArrayRef().data());
 984     rvec* gmx_restrict v  = as_rvec_array(vRef.data());
 985
 986     const int b0 = lincsd->task[th].b0;
 987     const int b1 = lincsd->task[th].b1;
 988
 989     gmx::ArrayRef<const AtomPair> atoms   = lincsd->atoms;
 990     gmx::ArrayRef<gmx::RVec>      r       = lincsd->tmpv;
 991     gmx::ArrayRef<const int>      blnr    = lincsd->blnr;
 992     gmx::ArrayRef<const int>      blbnb   = lincsd->blbnb;
 993     gmx::ArrayRef<const real>     blc     = lincsd->blc;
 994     gmx::ArrayRef<const real>     blmf    = lincsd->blmf;
 995     gmx::ArrayRef<const real>     bllen   = lincsd->bllen;
 996     gmx::ArrayRef<real>           blcc    = lincsd->tmpncc;
 997     gmx::ArrayRef<real>           rhs1    = lincsd->tmp1;
 998     gmx::ArrayRef<real>           rhs2    = lincsd->tmp2;
 999     gmx::ArrayRef<real>           sol     = lincsd->tmp3;
1000     gmx::ArrayRef<real>           blc_sol = lincsd->tmp4;
1001     gmx::ArrayRef<real>           mlambda = lincsd->mlambda;
1002     gmx::ArrayRef<const int>      nlocat  = lincsd->nlocat;
1003
1004 #if GMX_SIMD_HAVE_REAL
1005
1006     /* This SIMD code does the same as the plain-C code after the #else.
1007      * The only difference is that we always call pbc code, as with SIMD
1008      * the overhead of pbc computation (when not needed) is small.
1009      */
1010     alignas(GMX_SIMD_ALIGNMENT) real pbc_simd[9 * GMX_SIMD_REAL_WIDTH];
1011
1012     /* Convert the pbc struct for SIMD */
1013     set_pbc_simd(pbc, pbc_simd);
1014
1015     /* Compute normalized x i-j vectors, store in r.
1016      * Compute the inner product of r and xp i-j and store in rhs1.
1017      */
1018     calc_dr_x_xp_simd(
1019             b0, b1, atoms, x, xp, bllen.data(), blc.data(), pbc_simd, as_rvec_array(r.data()), rhs1.data(), sol.data());
1020
1021 #else // GMX_SIMD_HAVE_REAL
1022
1023     if (pbc)
1024     {
1025         /* Compute normalized i-j vectors */
1026         for (int b = b0; b < b1; b++)
1027         {
1028             rvec dx;
1029             pbc_dx_aiuc(pbc, x[atoms[b].index1], x[atoms[b].index2], dx);
1030             unitv(dx, r[b]);
1031
1032             pbc_dx_aiuc(pbc, xp[atoms[b].index1], xp[atoms[b].index2], dx);
1033             real mvb = blc[b] * (::iprod(r[b], dx) - bllen[b]);
1034             rhs1[b]  = mvb;
1035             sol[b]   = mvb;
1036         }
1037     }
1038     else
1039     {
1040         /* Compute normalized i-j vectors */
1041         for (int b = b0; b < b1; b++)
1042         {
1043             int  i    = atoms[b].index1;
1044             int  j    = atoms[b].index2;
1045             real tmp0 = x[i][0] - x[j][0];
1046             real tmp1 = x[i][1] - x[j][1];
1047             real tmp2 = x[i][2] - x[j][2];
1048             real rlen = gmx::invsqrt(tmp0 * tmp0 + tmp1 * tmp1 + tmp2 * tmp2);
1049             r[b][0]   = rlen * tmp0;
1050             r[b][1]   = rlen * tmp1;
1051             r[b][2]   = rlen * tmp2;
1052             /* 16 ncons flops */
1053
1054             real mvb = blc[b]
1055                        * (r[b][0] * (xp[i][0] - xp[j][0]) + r[b][1] * (xp[i][1] - xp[j][1])
1056                           + r[b][2] * (xp[i][2] - xp[j][2]) - bllen[b]);
1057             rhs1[b] = mvb;
1058             sol[b]  = mvb;
1059             /* 10 flops */
1060         }
1061         /* Together: 26*ncons + 6*nrtot flops */
1062     }
1063
1064 #endif // GMX_SIMD_HAVE_REAL
1065
1066     if (lincsd->bTaskDep)
1067     {
1068         /* We need a barrier, since the matrix construction below
1069          * can access entries in r of other threads.
1070          */
1071 #pragma omp barrier
1072     }
1073
1074     /* Construct the (sparse) LINCS matrix */
1075     for (int b = b0; b < b1; b++)
1076     {
1077         for (int n = blnr[b]; n < blnr[b + 1]; n++)
1078         {
1079             blcc[n] = blmf[n] * gmx::dot(r[b], r[blbnb[n]]);
1080         }
1081     }
1082     /* Together: 26*ncons + 6*nrtot flops */
1083
1084     lincs_matrix_expand(*lincsd, lincsd->task[th], blcc, rhs1, rhs2, sol);
1085     /* nrec*(ncons+2*nrtot) flops */
1086
1087 #if GMX_SIMD_HAVE_REAL
1088     for (int b = b0; b < b1; b += GMX_SIMD_REAL_WIDTH)
1089     {
1090         SimdReal t1 = load<SimdReal>(blc.data() + b);
1091         SimdReal t2 = load<SimdReal>(sol.data() + b);
1092         store(mlambda.data() + b, t1 * t2);
1093     }
1094 #else
1095     for (int b = b0; b < b1; b++)
1096     {
1097         mlambda[b] = blc[b] * sol[b];
1098     }
1099 #endif // GMX_SIMD_HAVE_REAL
1100
1101     /* Update the coordinates */
1102     lincs_update_atoms(lincsd, th, 1.0, mlambda, r, invmass, xp);
1103
1104     /*
1105      ********  Correction for centripetal effects  ********
1106      */
1107
1108     real wfac;
1109
1110     wfac = std::cos(gmx::c_deg2Rad * wangle);
1111     wfac = wfac * wfac;
1112
1113     for (int iter = 0; iter < lincsd->nIter; iter++)
1114     {
1115         if ((lincsd->bCommIter && haveDDAtomOrdering(*cr) && cr->dd->constraints))
1116         {
1117 #pragma omp barrier
1118 #pragma omp master
1119             {
1120                 /* Communicate the corrected non-local coordinates */
1121                 if (haveDDAtomOrdering(*cr))
1122                 {
1123                     dd_move_x_constraints(cr->dd, box, xpPadded.unpaddedArrayRef(), ArrayRef<RVec>(), FALSE);
1124                 }
1125             }
1126 #pragma omp barrier
1127         }
1128         else if (lincsd->bTaskDep)
1129         {
1130 #pragma omp barrier
1131         }
1132
1133 #if GMX_SIMD_HAVE_REAL
1134         calc_dist_iter_simd(
1135                 b0, b1, atoms, xp, bllen.data(), blc.data(), pbc_simd, wfac, rhs1.data(), sol.data(), bWarn);
1136 #else
1137         calc_dist_iter(b0, b1, atoms, xp, bllen.data(), blc.data(), pbc, wfac, rhs1.data(), sol.data(), bWarn);
1138         /* 20*ncons flops */
1139 #endif // GMX_SIMD_HAVE_REAL
1140
1141         lincs_matrix_expand(*lincsd, lincsd->task[th], blcc, rhs1, rhs2, sol);
1142         /* nrec*(ncons+2*nrtot) flops */
1143
1144 #if GMX_SIMD_HAVE_REAL
1145         for (int b = b0; b < b1; b += GMX_SIMD_REAL_WIDTH)
1146         {
1147             SimdReal t1  = load<SimdReal>(blc.data() + b);
1148             SimdReal t2  = load<SimdReal>(sol.data() + b);
1149             SimdReal mvb = t1 * t2;
1150             store(blc_sol.data() + b, mvb);
1151             store(mlambda.data() + b, load<SimdReal>(mlambda.data() + b) + mvb);
1152         }
1153 #else
1154         for (int b = b0; b < b1; b++)
1155         {
1156             real mvb   = blc[b] * sol[b];
1157             blc_sol[b] = mvb;
1158             mlambda[b] += mvb;
1159         }
1160 #endif // GMX_SIMD_HAVE_REAL
1161
1162         /* Update the coordinates */
1163         lincs_update_atoms(lincsd, th, 1.0, blc_sol, r, invmass, xp);
1164     }
1165     /* nit*ncons*(37+9*nrec) flops */
1166
1167     if (v != nullptr)
1168     {
1169         /* Update the velocities */
1170         lincs_update_atoms(lincsd, th, invdt, mlambda, r, invmass, v);
1171         /* 16 ncons flops */
1172     }
1173
1174     if (!nlocat.empty() && (bCalcDHDL || bCalcVir))
1175     {
1176         if (lincsd->bTaskDep)
1177         {
1178             /* In lincs_update_atoms threads might cross-read mlambda */
1179 #pragma omp barrier
1180         }
1181
1182         /* Only account for local atoms */
1183         for (int b = b0; b < b1; b++)
1184         {
1185             mlambda[b] *= 0.5 * nlocat[b];
1186         }
1187     }
1188
1189     if (bCalcDHDL)
1190     {
1191         real dhdl = 0;
1192         for (int b = b0; b < b1; b++)
1193         {
1194             /* Note that this this is dhdl*dt^2, the dt^2 factor is corrected
1195              * later after the contributions are reduced over the threads.
1196              */
1197             dhdl -= lincsd->mlambda[b] * lincsd->ddist[b];
1198         }
1199         lincsd->task[th].dhdlambda = dhdl;
1200     }
1201
1202     if (bCalcVir)
1203     {
1204         /* Constraint virial */
1205         for (int b = b0; b < b1; b++)
1206         {
1207             real tmp0 = -bllen[b] * mlambda[b];
1208             for (int i = 0; i < DIM; i++)
1209             {
1210                 real tmp1 = tmp0 * r[b][i];
1211                 for (int j = 0; j < DIM; j++)
1212                 {
1213                     vir_r_m_dr[i][j] -= tmp1 * r[b][j];
1214                 }
1215             }
1216         } /* 22 ncons flops */
1217     }
1218
1219     /* Total:
1220      * 26*ncons + 6*nrtot + nrec*(ncons+2*nrtot)
1221      * + nit * (20*ncons + nrec*(ncons+2*nrtot) + 17 ncons)
1222      *
1223      * (26+nrec)*ncons + (6+2*nrec)*nrtot
1224      * + nit * ((37+nrec)*ncons + 2*nrec*nrtot)
1225      * if nit=1
1226      * (63+nrec)*ncons + (6+4*nrec)*nrtot
1227      */
1228 }
1229
1230 /*! \brief Sets the elements in the LINCS matrix for task task. */
1231 static void set_lincs_matrix_task(Lincs*               li,
1232                                   Task*                li_task,
1233                                   ArrayRef<const real> invmass,
1234                                   int*                 ncc_triangle,
1235                                   int*                 nCrossTaskTriangles)
1236 {
1237     /* Construct the coupling coefficient matrix blmf */
1238     li_task->ntriangle   = 0;
1239     *ncc_triangle        = 0;
1240     *nCrossTaskTriangles = 0;
1241     for (int i = li_task->b0; i < li_task->b1; i++)
1242     {
1243         const int a1 = li->atoms[i].index1;
1244         const int a2 = li->atoms[i].index2;
1245         for (int n = li->blnr[i]; n < li->blnr[i + 1]; n++)
1246         {
1247             const int k = li->blbnb[n];
1248
1249             /* If we are using multiple, independent tasks for LINCS,
1250              * the calls to check_assign_connected should have
1251              * put all connected constraints in our task.
1252              */
1253             assert(li->bTaskDep || (k >= li_task->b0 && k < li_task->b1));
1254
1255             int sign;
1256             if (a1 == li->atoms[k].index1 || a2 == li->atoms[k].index2)
1257             {
1258                 sign = -1;
1259             }
1260             else
1261             {
1262                 sign = 1;
1263             }
1264             int center;
1265             int end;
1266             if (a1 == li->atoms[k].index1 || a1 == li->atoms[k].index2)
1267             {
1268                 center = a1;
1269                 end    = a2;
1270             }
1271             else
1272             {
1273                 center = a2;
1274                 end    = a1;
1275             }
1276             li->blmf[n]  = sign * invmass[center] * li->blc[i] * li->blc[k];
1277             li->blmf1[n] = sign * 0.5;
1278             if (li->ncg_triangle > 0)
1279             {
1280                 /* Look for constraint triangles */
1281                 for (int nk = li->blnr[k]; nk < li->blnr[k + 1]; nk++)
1282                 {
1283                     const int kk = li->blbnb[nk];
1284                     if (kk != i && kk != k && (li->atoms[kk].index1 == end || li->atoms[kk].index2 == end))
1285                     {
1286                         /* Check if the constraints in this triangle actually
1287                          * belong to a different task. We still assign them
1288                          * here, since it's convenient for the triangle
1289                          * iterations, but we then need an extra barrier.
1290                          */
1291                         if (k < li_task->b0 || k >= li_task->b1 || kk < li_task->b0 || kk >= li_task->b1)
1292                         {
1293                             (*nCrossTaskTriangles)++;
1294                         }
1295
1296                         if (li_task->ntriangle == 0 || li_task->triangle[li_task->ntriangle - 1] < i)
1297                         {
1298                             /* Add this constraint to the triangle list */
1299                             li_task->triangle[li_task->ntriangle] = i;
1300                             li_task->tri_bits[li_task->ntriangle] = 0;
1301                             li_task->ntriangle++;
1302                             if (li->blnr[i + 1] - li->blnr[i]
1303                                 > static_cast<int>(sizeof(li_task->tri_bits[0]) * 8 - 1))
1304                             {
1305                                 gmx_fatal(FARGS,
1306                                           "A constraint is connected to %d constraints, this is "
1307                                           "more than the %zu allowed for constraints participating "
1308                                           "in triangles",
1309                                           li->blnr[i + 1] - li->blnr[i],
1310                                           sizeof(li_task->tri_bits[0]) * 8 - 1);
1311                             }
1312                         }
1313                         li_task->tri_bits[li_task->ntriangle - 1] |= (1 << (n - li->blnr[i]));
1314                         (*ncc_triangle)++;
1315                     }
1316                 }
1317             }
1318         }
1319     }
1320 }
1321
1322 /*! \brief Sets the elements in the LINCS matrix. */
1323 static void set_lincs_matrix(Lincs* li, ArrayRef<const real> invmass, real lambda)
1324 {
1325     const real invsqrt2 = 0.7071067811865475244;
1326
1327     for (int i = 0; (i < li->nc); i++)
1328     {
1329         const int a1 = li->atoms[i].index1;
1330         const int a2 = li->atoms[i].index2;
1331         li->blc[i]   = gmx::invsqrt(invmass[a1] + invmass[a2]);
1332         li->blc1[i]  = invsqrt2;
1333     }
1334
1335     /* Construct the coupling coefficient matrix blmf */
1336     int ntriangle = 0, ncc_triangle = 0, nCrossTaskTriangles = 0;
1337 #pragma omp parallel for reduction(+: ntriangle, ncc_triangle, nCrossTaskTriangles) num_threads(li->ntask) schedule(static)
1338     for (int th = 0; th < li->ntask; th++)
1339     {
1340         try
1341         {
1342             set_lincs_matrix_task(li, &li->task[th], invmass, &ncc_triangle, &nCrossTaskTriangles);
1343             ntriangle += li->task[th].ntriangle;
1344         }
1345         GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
1346     }
1347     li->ntriangle    = ntriangle;
1348     li->ncc_triangle = ncc_triangle;
1349     li->bTaskDepTri  = (nCrossTaskTriangles > 0);
1350
1351     if (debug)
1352     {
1353         fprintf(debug, "The %d constraints participate in %d triangles\n", li->nc, li->ntriangle);
1354         fprintf(debug, "There are %d constraint couplings, of which %d in triangles\n", li->ncc, li->ncc_triangle);
1355         if (li->ntriangle > 0 && li->ntask > 1)
1356         {
1357             fprintf(debug,
1358                     "%d constraint triangles contain constraints assigned to different tasks\n",
1359                     nCrossTaskTriangles);
1360         }
1361     }
1362
1363     /* Set matlam,
1364      * so we know with which lambda value the masses have been set.
1365      */
1366     li->matlam = lambda;
1367 }
1368
1369 //! Finds all triangles of atoms that share constraints to a central atom.
1370 static int count_triangle_constraints(const InteractionLists& ilist, const ListOfLists<int>& at2con)
1371 {
1372     const int ncon1    = ilist[F_CONSTR].size() / 3;
1373     const int ncon_tot = ncon1 + ilist[F_CONSTRNC].size() / 3;
1374
1375     gmx::ArrayRef<const int> ia1 = ilist[F_CONSTR].iatoms;
1376     gmx::ArrayRef<const int> ia2 = ilist[F_CONSTRNC].iatoms;
1377
1378     int ncon_triangle = 0;
1379     for (int c0 = 0; c0 < ncon_tot; c0++)
1380     {
1381         bool       bTriangle = FALSE;
1382         const int* iap       = constr_iatomptr(ia1, ia2, c0);
1383         const int  a00       = iap[1];
1384         const int  a01       = iap[2];
1385         for (const int c1 : at2con[a01])
1386         {
1387             if (c1 != c0)
1388             {
1389                 const int* iap = constr_iatomptr(ia1, ia2, c1);
1390                 const int  a10 = iap[1];
1391                 const int  a11 = iap[2];
1392                 int        ac1;
1393                 if (a10 == a01)
1394                 {
1395                     ac1 = a11;
1396                 }
1397                 else
1398                 {
1399                     ac1 = a10;
1400                 }
1401                 for (const int c2 : at2con[ac1])
1402                 {
1403                     if (c2 != c0 && c2 != c1)
1404                     {
1405                         const int* iap = constr_iatomptr(ia1, ia2, c2);
1406                         const int  a20 = iap[1];
1407                         const int  a21 = iap[2];
1408                         if (a20 == a00 || a21 == a00)
1409                         {
1410                             bTriangle = TRUE;
1411                         }
1412                     }
1413                 }
1414             }
1415         }
1416         if (bTriangle)
1417         {
1418             ncon_triangle++;
1419         }
1420     }
1421
1422     return ncon_triangle;
1423 }
1424
1425 //! Finds sequences of sequential constraints.
1426 static bool more_than_two_sequential_constraints(const InteractionLists& ilist, const ListOfLists<int>& at2con)
1427 {
1428     const int ncon1    = ilist[F_CONSTR].size() / 3;
1429     const int ncon_tot = ncon1 + ilist[F_CONSTRNC].size() / 3;
1430
1431     gmx::ArrayRef<const int> ia1 = ilist[F_CONSTR].iatoms;
1432     gmx::ArrayRef<const int> ia2 = ilist[F_CONSTRNC].iatoms;
1433
1434     for (int c = 0; c < ncon_tot; c++)
1435     {
1436         const int* iap = constr_iatomptr(ia1, ia2, c);
1437         const int  a1  = iap[1];
1438         const int  a2  = iap[2];
1439         /* Check if this constraint has constraints connected at both atoms */
1440         if (at2con[a1].ssize() > 1 && at2con[a2].ssize() > 1)
1441         {
1442             return true;
1443         }
1444     }
1445
1446     return false;
1447 }
1448
1449 Lincs* init_lincs(FILE*                            fplog,
1450                   const gmx_mtop_t&                mtop,
1451                   int                              nflexcon_global,
1452                   ArrayRef<const ListOfLists<int>> atomToConstraintsPerMolType,
1453                   bool                             bPLINCS,
1454                   int                              nIter,
1455                   int                              nProjOrder,
1456                   ObservablesReducerBuilder*       observablesReducerBuilder)
1457 {
1458     // TODO this should become a unique_ptr
1459     Lincs* li;
1460     bool   bMoreThanTwoSeq;
1461
1462     if (fplog)
1463     {
1464         fprintf(fplog, "\nInitializing%s LINear Constraint Solver\n", bPLINCS ? " Parallel" : "");
1465     }
1466
1467     li = new Lincs;
1468
1469     li->ncg      = gmx_mtop_ftype_count(mtop, F_CONSTR) + gmx_mtop_ftype_count(mtop, F_CONSTRNC);
1470     li->ncg_flex = nflexcon_global;
1471
1472     li->nIter  = nIter;
1473     li->nOrder = nProjOrder;
1474
1475     li->max_connect = 0;
1476     for (size_t mt = 0; mt < mtop.moltype.size(); mt++)
1477     {
1478         const auto& at2con = atomToConstraintsPerMolType[mt];
1479         for (int a = 0; a < mtop.moltype[mt].atoms.nr; a++)
1480         {
1481             li->max_connect = std::max(li->max_connect, int(at2con[a].ssize()));
1482         }
1483     }
1484
1485     li->ncg_triangle = 0;
1486     bMoreThanTwoSeq  = FALSE;
1487     for (const gmx_molblock_t& molb : mtop.molblock)
1488     {
1489         const gmx_moltype_t& molt   = mtop.moltype[molb.type];
1490         const auto&          at2con = atomToConstraintsPerMolType[molb.type];
1491
1492         li->ncg_triangle += molb.nmol * count_triangle_constraints(molt.ilist, at2con);
1493
1494         if (!bMoreThanTwoSeq && more_than_two_sequential_constraints(molt.ilist, at2con))
1495         {
1496             bMoreThanTwoSeq = TRUE;
1497         }
1498     }
1499
1500     /* Check if we need to communicate not only before LINCS,
1501      * but also before each iteration.
1502      * The check for only two sequential constraints is only
1503      * useful for the common case of H-bond only constraints.
1504      * With more effort we could also make it useful for small
1505      * molecules with nr. sequential constraints <= nOrder-1.
1506      */
1507     li->bCommIter = (bPLINCS && (li->nOrder < 1 || bMoreThanTwoSeq));
1508
1509     if (debug && bPLINCS)
1510     {
1511         fprintf(debug, "PLINCS communication before each iteration: %d\n", static_cast<int>(li->bCommIter));
1512     }
1513
1514     /* LINCS can run on any number of threads.
1515      * Currently the number is fixed for the whole simulation,
1516      * but it could be set in set_lincs().
1517      * The current constraint to task assignment code can create independent
1518      * tasks only when not more than two constraints are connected sequentially.
1519      */
1520     li->ntask    = gmx_omp_nthreads_get(ModuleMultiThread::Lincs);
1521     li->bTaskDep = (li->ntask > 1 && bMoreThanTwoSeq);
1522     if (debug)
1523     {
1524         fprintf(debug, "LINCS: using %d threads, tasks are %sdependent\n", li->ntask, li->bTaskDep ? "" : "in");
1525     }
1526     if (li->ntask == 1)
1527     {
1528         li->task.resize(1);
1529     }
1530     else
1531     {
1532         /* Allocate an extra elements for "task-overlap" constraints */
1533         li->task.resize(li->ntask + 1);
1534     }
1535
1536     if (bPLINCS || li->ncg_triangle > 0)
1537     {
1538         please_cite(fplog, "Hess2008a");
1539     }
1540     else
1541     {
1542         please_cite(fplog, "Hess97a");
1543     }
1544
1545     if (fplog)
1546     {
1547         fprintf(fplog, "The number of constraints is %d\n", li->ncg);
1548         if (bPLINCS)
1549         {
1550             fprintf(fplog,
1551                     "There are constraints between atoms in different decomposition domains,\n"
1552                     "will communicate selected coordinates each lincs iteration\n");
1553         }
1554         if (li->ncg_triangle > 0)
1555         {
1556             fprintf(fplog,
1557                     "%d constraints are involved in constraint triangles,\n"
1558                     "will apply an additional matrix expansion of order %d for couplings\n"
1559                     "between constraints inside triangles\n",
1560                     li->ncg_triangle,
1561                     li->nOrder);
1562         }
1563     }
1564
1565     if (observablesReducerBuilder)
1566     {
1567         ObservablesReducerBuilder::CallbackFromBuilder callbackFromBuilder =
1568                 [li](ObservablesReducerBuilder::CallbackToRequireReduction c, gmx::ArrayRef<double> v) {
1569                     li->callbackToRequireReduction = std::move(c);
1570                     li->rmsdReductionBuffer        = v;
1571                 };
1572
1573         // Make the callback that runs afer reduction.
1574         ObservablesReducerBuilder::CallbackAfterReduction callbackAfterReduction = [li](gmx::Step /*step*/) {
1575             if (li->rmsdReductionBuffer[0] > 0)
1576             {
1577                 li->constraintRmsDeviation =
1578                         std::sqrt(li->rmsdReductionBuffer[1] / li->rmsdReductionBuffer[0]);
1579             }
1580         };
1581
1582         const int reductionValuesRequired = 2;
1583         observablesReducerBuilder->addSubscriber(reductionValuesRequired,
1584                                                  std::move(callbackFromBuilder),
1585                                                  std::move(callbackAfterReduction));
1586     }
1587
1588     return li;
1589 }
1590
1591 void done_lincs(Lincs* li)
1592 {
1593     delete li;
1594 }
1595
1596 /*! \brief Sets up the work division over the threads. */
1597 static void lincs_thread_setup(Lincs* li, int natoms)
1598 {
1599     li->atf.resize(natoms);
1600
1601     gmx::ArrayRef<gmx_bitmask_t> atf = li->atf;
1602
1603     /* Clear the atom flags */
1604     for (gmx_bitmask_t& mask : atf)
1605     {
1606         bitmask_clear(&mask);
1607     }
1608
1609     if (li->ntask > BITMASK_SIZE)
1610     {
1611         gmx_fatal(FARGS, "More than %d threads is not supported for LINCS.", BITMASK_SIZE);
1612     }
1613
1614     for (int th = 0; th < li->ntask; th++)
1615     {
1616         const Task* li_task = &li->task[th];
1617
1618         /* For each atom set a flag for constraints from each */
1619         for (int b = li_task->b0; b < li_task->b1; b++)
1620         {
1621             bitmask_set_bit(&atf[li->atoms[b].index1], th);
1622             bitmask_set_bit(&atf[li->atoms[b].index2], th);
1623         }
1624     }
1625
1626 #pragma omp parallel for num_threads(li->ntask) schedule(static)
1627     for (int th = 0; th < li->ntask; th++)
1628     {
1629         try
1630         {
1631             Task*         li_task;
1632             gmx_bitmask_t mask;
1633             int           b;
1634
1635             li_task = &li->task[th];
1636
1637             bitmask_init_low_bits(&mask, th);
1638
1639             li_task->ind.clear();
1640             li_task->ind_r.clear();
1641             for (b = li_task->b0; b < li_task->b1; b++)
1642             {
1643                 /* We let the constraint with the lowest thread index
1644                  * operate on atoms with constraints from multiple threads.
1645                  */
1646                 if (bitmask_is_disjoint(atf[li->atoms[b].index1], mask)
1647                     && bitmask_is_disjoint(atf[li->atoms[b].index2], mask))
1648                 {
1649                     /* Add the constraint to the local atom update index */
1650                     li_task->ind.push_back(b);
1651                 }
1652                 else
1653                 {
1654                     /* Add the constraint to the rest block */
1655                     li_task->ind_r.push_back(b);
1656                 }
1657             }
1658         }
1659         GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
1660     }
1661
1662     /* We need to copy all constraints which have not be assigned
1663      * to a thread to a separate list which will be handled by one thread.
1664      */
1665     Task* li_m = &li->task[li->ntask];
1666
1667     li_m->ind.clear();
1668     for (int th = 0; th < li->ntask; th++)
1669     {
1670         const Task& li_task = li->task[th];
1671
1672         for (int ind_r : li_task.ind_r)
1673         {
1674             li_m->ind.push_back(ind_r);
1675         }
1676
1677         if (debug)
1678         {
1679             fprintf(debug, "LINCS thread %d: %zu constraints\n", th, li_task.ind.size());
1680         }
1681     }
1682
1683     if (debug)
1684     {
1685         fprintf(debug, "LINCS thread r: %zu constraints\n", li_m->ind.size());
1686     }
1687 }
1688
1689 //! Assign a constraint.
1690 static void assign_constraint(Lincs*                  li,
1691                               int                     constraint_index,
1692                               int                     a1,
1693                               int                     a2,
1694                               real                    lenA,
1695                               real                    lenB,
1696                               const ListOfLists<int>& at2con)
1697 {
1698     int con;
1699
1700     con = li->nc;
1701
1702     /* Make an mapping of local topology constraint index to LINCS index */
1703     li->con_index[constraint_index] = con;
1704
1705     li->bllen0[con] = lenA;
1706     li->ddist[con]  = lenB - lenA;
1707     /* Set the length to the topology A length */
1708     li->bllen[con]        = lenA;
1709     li->atoms[con].index1 = a1;
1710     li->atoms[con].index2 = a2;
1711
1712     /* Make space in the constraint connection matrix for constraints
1713      * connected to both end of the current constraint.
1714      */
1715     li->ncc += at2con[a1].ssize() - 1 + at2con[a2].ssize() - 1;
1716
1717     li->blnr[con + 1] = li->ncc;
1718
1719     /* Increase the constraint count */
1720     li->nc++;
1721 }
1722
1723 /*! \brief Check if constraint with topology index constraint_index is connected
1724  * to other constraints, and if so add those connected constraints to our task. */
1725 static void check_assign_connected(Lincs*                        li,
1726                                    gmx::ArrayRef<const int>      iatom,
1727                                    const InteractionDefinitions& idef,
1728                                    bool                          bDynamics,
1729                                    int                           a1,
1730                                    int                           a2,
1731                                    const ListOfLists<int>&       at2con)
1732 {
1733     /* Currently this function only supports constraint groups
1734      * in which all constraints share at least one atom
1735      * (e.g. H-bond constraints).
1736      * Check both ends of the current constraint for
1737      * connected constraints. We need to assign those
1738      * to the same task.
1739      */
1740     for (int end = 0; end < 2; end++)
1741     {
1742         const int a = (end == 0 ? a1 : a2);
1743
1744         for (const int cc : at2con[a])
1745         {
1746             /* Check if constraint cc has not yet been assigned */
1747             if (li->con_index[cc] == -1)
1748             {
1749                 const int  type = iatom[cc * 3];
1750                 const real lenA = idef.iparams[type].constr.dA;
1751                 const real lenB = idef.iparams[type].constr.dB;
1752
1753                 if (bDynamics || lenA != 0 || lenB != 0)
1754                 {
1755                     assign_constraint(li, cc, iatom[3 * cc + 1], iatom[3 * cc + 2], lenA, lenB, at2con);
1756                 }
1757             }
1758         }
1759     }
1760 }
1761
1762 /*! \brief Check if constraint with topology index constraint_index is involved
1763  * in a constraint triangle, and if so add the other two constraints
1764  * in the triangle to our task. */
1765 static void check_assign_triangle(Lincs*                        li,
1766                                   gmx::ArrayRef<const int>      iatom,
1767                                   const InteractionDefinitions& idef,
1768                                   bool                          bDynamics,
1769                                   int                           constraint_index,
1770                                   int                           a1,
1771                                   int                           a2,
1772                                   const ListOfLists<int>&       at2con)
1773 {
1774     int nca, cc[32], ca[32];
1775     int c_triangle[2] = { -1, -1 };
1776
1777     nca = 0;
1778     for (const int c : at2con[a1])
1779     {
1780         if (c != constraint_index)
1781         {
1782             int aa1, aa2;
1783
1784             aa1 = iatom[c * 3 + 1];
1785             aa2 = iatom[c * 3 + 2];
1786             if (aa1 != a1)
1787             {
1788                 cc[nca] = c;
1789                 ca[nca] = aa1;
1790                 nca++;
1791             }
1792             if (aa2 != a1)
1793             {
1794                 cc[nca] = c;
1795                 ca[nca] = aa2;
1796                 nca++;
1797             }
1798         }
1799     }
1800
1801     for (const int c : at2con[a2])
1802     {
1803         if (c != constraint_index)
1804         {
1805             int aa1, aa2, i;
1806
1807             aa1 = iatom[c * 3 + 1];
1808             aa2 = iatom[c * 3 + 2];
1809             if (aa1 != a2)
1810             {
1811                 for (i = 0; i < nca; i++)
1812                 {
1813                     if (aa1 == ca[i])
1814                     {
1815                         c_triangle[0] = cc[i];
1816                         c_triangle[1] = c;
1817                     }
1818                 }
1819             }
1820             if (aa2 != a2)
1821             {
1822                 for (i = 0; i < nca; i++)
1823                 {
1824                     if (aa2 == ca[i])
1825                     {
1826                         c_triangle[0] = cc[i];
1827                         c_triangle[1] = c;
1828                     }
1829                 }
1830             }
1831         }
1832     }
1833
1834     if (c_triangle[0] >= 0)
1835     {
1836         int end;
1837
1838         for (end = 0; end < 2; end++)
1839         {
1840             /* Check if constraint c_triangle[end] has not yet been assigned */
1841             if (li->con_index[c_triangle[end]] == -1)
1842             {
1843                 int  i, type;
1844                 real lenA, lenB;
1845
1846                 i    = c_triangle[end] * 3;
1847                 type = iatom[i];
1848                 lenA = idef.iparams[type].constr.dA;
1849                 lenB = idef.iparams[type].constr.dB;
1850
1851                 if (bDynamics || lenA != 0 || lenB != 0)
1852                 {
1853                     assign_constraint(li, c_triangle[end], iatom[i + 1], iatom[i + 2], lenA, lenB, at2con);
1854                 }
1855             }
1856         }
1857     }
1858 }
1859
1860 //! Sets matrix indices.
1861 static void set_matrix_indices(Lincs* li, const Task& li_task, const ListOfLists<int>& at2con, bool bSortMatrix)
1862 {
1863     for (int b = li_task.b0; b < li_task.b1; b++)
1864     {
1865         const int a1 = li->atoms[b].index1;
1866         const int a2 = li->atoms[b].index2;
1867
1868         int i = li->blnr[b];
1869         for (const int constraint : at2con[a1])
1870         {
1871             const int concon = li->con_index[constraint];
1872             if (concon != b)
1873             {
1874                 li->blbnb[i++] = concon;
1875             }
1876         }
1877         for (const int constraint : at2con[a2])
1878         {
1879             const int concon = li->con_index[constraint];
1880             if (concon != b)
1881             {
1882                 li->blbnb[i++] = concon;
1883             }
1884         }
1885
1886         if (bSortMatrix)
1887         {
1888             /* Order the blbnb matrix to optimize memory access */
1889             std::sort(li->blbnb.begin() + li->blnr[b], li->blbnb.begin() + li->blnr[b + 1]);
1890         }
1891     }
1892 }
1893
1894 void set_lincs(const InteractionDefinitions& idef,
1895                const int                     numAtoms,
1896                ArrayRef<const real>          invmass,
1897                const real                    lambda,
1898                bool                          bDynamics,
1899                const t_commrec*              cr,
1900                Lincs*                        li)
1901 {
1902     li->nc_real = 0;
1903     li->nc      = 0;
1904     li->ncc     = 0;
1905     /* Zero the thread index ranges.
1906      * Otherwise without local constraints we could return with old ranges.
1907      */
1908     for (int i = 0; i < li->ntask; i++)
1909     {
1910         li->task[i].b0 = 0;
1911         li->task[i].b1 = 0;
1912         li->task[i].ind.clear();
1913     }
1914     if (li->ntask > 1)
1915     {
1916         li->task[li->ntask].ind.clear();
1917     }
1918
1919     /* This is the local topology, so there are only F_CONSTR constraints */
1920     if (idef.il[F_CONSTR].empty())
1921     {
1922         /* There are no constraints,
1923          * we do not need to fill any data structures.
1924          */
1925         return;
1926     }
1927
1928     if (debug)
1929     {
1930         fprintf(debug, "Building the LINCS connectivity\n");
1931     }
1932
1933     int natoms;
1934     if (haveDDAtomOrdering(*cr))
1935     {
1936         if (cr->dd->constraints)
1937         {
1938             int start;
1939
1940             dd_get_constraint_range(*cr->dd, &start, &natoms);
1941         }
1942         else
1943         {
1944             natoms = dd_numHomeAtoms(*cr->dd);
1945         }
1946     }
1947     else
1948     {
1949         natoms = numAtoms;
1950     }
1951
1952     const ListOfLists<int> at2con =
1953             make_at2con(natoms, idef.il, idef.iparams, flexibleConstraintTreatment(bDynamics));
1954
1955     const int ncon_tot = idef.il[F_CONSTR].size() / 3;
1956
1957     /* Ensure we have enough padding for aligned loads for each thread */
1958     const int numEntries = ncon_tot + li->ntask * simd_width;
1959     li->con_index.resize(numEntries);
1960     li->bllen0.resize(numEntries);
1961     li->ddist.resize(numEntries);
1962     li->atoms.resize(numEntries);
1963     li->blc.resize(numEntries);
1964     li->blc1.resize(numEntries);
1965     li->blnr.resize(numEntries + 1);
1966     li->bllen.resize(numEntries);
1967     li->tmpv.resizeWithPadding(numEntries);
1968     if (haveDDAtomOrdering(*cr))
1969     {
1970         li->nlocat.resize(numEntries);
1971     }
1972     li->tmp1.resize(numEntries);
1973     li->tmp2.resize(numEntries);
1974     li->tmp3.resize(numEntries);
1975     li->tmp4.resize(numEntries);
1976     li->mlambda.resize(numEntries);
1977
1978     gmx::ArrayRef<const int> iatom = idef.il[F_CONSTR].iatoms;
1979
1980     li->blnr[0] = li->ncc;
1981
1982     /* Assign the constraints for li->ntask LINCS tasks.
1983      * We target a uniform distribution of constraints over the tasks.
1984      * Note that when flexible constraints are present, but are removed here
1985      * (e.g. because we are doing EM) we get imbalance, but since that doesn't
1986      * happen during normal MD, that's ok.
1987      */
1988
1989     /* Determine the number of constraints we need to assign here */
1990     int ncon_assign = ncon_tot;
1991     if (!bDynamics)
1992     {
1993         /* With energy minimization, flexible constraints are ignored
1994          * (and thus minimized, as they should be).
1995          */
1996         ncon_assign -= countFlexibleConstraints(idef.il, idef.iparams);
1997     }
1998
1999     /* Set the target constraint count per task to exactly uniform,
2000      * this might be overridden below.
2001      */
2002     int ncon_target = (ncon_assign + li->ntask - 1) / li->ntask;
2003
2004     /* Mark all constraints as unassigned by setting their index to -1 */
2005     for (int con = 0; con < ncon_tot; con++)
2006     {
2007         li->con_index[con] = -1;
2008     }
2009
2010     int con = 0;
2011     for (int th = 0; th < li->ntask; th++)
2012     {
2013         Task* li_task;
2014
2015         li_task = &li->task[th];
2016
2017 #if GMX_SIMD_HAVE_REAL
2018         /* With indepedent tasks we likely have H-bond constraints or constraint
2019          * pairs. The connected constraints will be pulled into the task, so the
2020          * constraints per task will often exceed ncon_target.
2021          * Triangle constraints can also increase the count, but there are
2022          * relatively few of those, so we usually expect to get ncon_target.
2023          */
2024         if (li->bTaskDep)
2025         {
2026             /* We round ncon_target to a multiple of GMX_SIMD_WIDTH,
2027              * since otherwise a lot of operations can be wasted.
2028              * There are several ways to round here, we choose the one
2029              * that alternates block sizes, which helps with Intel HT.
2030              */
2031             ncon_target = ((ncon_assign * (th + 1)) / li->ntask - li->nc_real + GMX_SIMD_REAL_WIDTH - 1)
2032                           & ~(GMX_SIMD_REAL_WIDTH - 1);
2033         }
2034 #endif // GMX_SIMD==2 && GMX_SIMD_HAVE_REAL
2035
2036         /* Continue filling the arrays where we left off with the previous task,
2037          * including padding for SIMD.
2038          */
2039         li_task->b0 = li->nc;
2040
2041         gmx::ArrayRef<const t_iparams> iparams = idef.iparams;
2042
2043         while (con < ncon_tot && li->nc - li_task->b0 < ncon_target)
2044         {
2045             if (li->con_index[con] == -1)
2046             {
2047                 int  type, a1, a2;
2048                 real lenA, lenB;
2049
2050                 type = iatom[3 * con];
2051                 a1   = iatom[3 * con + 1];
2052                 a2   = iatom[3 * con + 2];
2053                 lenA = iparams[type].constr.dA;
2054                 lenB = iparams[type].constr.dB;
2055                 /* Skip the flexible constraints when not doing dynamics */
2056                 if (bDynamics || lenA != 0 || lenB != 0)
2057                 {
2058                     assign_constraint(li, con, a1, a2, lenA, lenB, at2con);
2059
2060                     if (li->ntask > 1 && !li->bTaskDep)
2061                     {
2062                         /* We can generate independent tasks. Check if we
2063                          * need to assign connected constraints to our task.
2064                          */
2065                         check_assign_connected(li, iatom, idef, bDynamics, a1, a2, at2con);
2066                     }
2067                     if (li->ntask > 1 && li->ncg_triangle > 0)
2068                     {
2069                         /* Ensure constraints in one triangle are assigned
2070                          * to the same task.
2071                          */
2072                         check_assign_triangle(li, iatom, idef, bDynamics, con, a1, a2, at2con);
2073                     }
2074                 }
2075             }
2076
2077             con++;
2078         }
2079
2080         li_task->b1 = li->nc;
2081
2082         if (simd_width > 1)
2083         {
2084             /* Copy the last atom pair indices and lengths for constraints
2085              * up to a multiple of simd_width, such that we can do all
2086              * SIMD operations without having to worry about end effects.
2087              */
2088             int i, last;
2089
2090             li->nc = ((li_task->b1 + simd_width - 1) / simd_width) * simd_width;
2091             last   = li_task->b1 - 1;
2092             for (i = li_task->b1; i < li->nc; i++)
2093             {
2094                 li->atoms[i]    = li->atoms[last];
2095                 li->bllen0[i]   = li->bllen0[last];
2096                 li->ddist[i]    = li->ddist[last];
2097                 li->bllen[i]    = li->bllen[last];
2098                 li->blnr[i + 1] = li->blnr[last + 1];
2099             }
2100         }
2101
2102         /* Keep track of how many constraints we assigned */
2103         li->nc_real += li_task->b1 - li_task->b0;
2104
2105         if (debug)
2106         {
2107             fprintf(debug, "LINCS task %d constraints %d - %d\n", th, li_task->b0, li_task->b1);
2108         }
2109     }
2110
2111     assert(li->nc_real == ncon_assign);
2112
2113     bool bSortMatrix;
2114
2115     /* Without DD we order the blbnb matrix to optimize memory access.
2116      * With DD the overhead of sorting is more than the gain during access.
2117      */
2118     bSortMatrix = !haveDDAtomOrdering(*cr);
2119
2120     li->blbnb.resize(li->ncc);
2121
2122 #pragma omp parallel for num_threads(li->ntask) schedule(static)
2123     for (int th = 0; th < li->ntask; th++)
2124     {
2125         try
2126         {
2127             Task& li_task = li->task[th];
2128
2129             if (li->ncg_triangle > 0)
2130             {
2131                 /* This is allocating too much, but it is difficult to improve */
2132                 li_task.triangle.resize(li_task.b1 - li_task.b0);
2133                 li_task.tri_bits.resize(li_task.b1 - li_task.b0);
2134             }
2135
2136             set_matrix_indices(li, li_task, at2con, bSortMatrix);
2137         }
2138         GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
2139     }
2140
2141     if (cr->dd == nullptr)
2142     {
2143         /* Since the matrix is static, we should free some memory */
2144         li->blbnb.resize(li->ncc);
2145     }
2146
2147     li->blmf.resize(li->ncc);
2148     li->blmf1.resize(li->ncc);
2149     li->tmpncc.resize(li->ncc);
2150
2151     gmx::ArrayRef<const int> nlocat_dd = dd_constraints_nlocalatoms(cr->dd);
2152     if (!nlocat_dd.empty())
2153     {
2154         /* Convert nlocat from local topology to LINCS constraint indexing */
2155         for (con = 0; con < ncon_tot; con++)
2156         {
2157             li->nlocat[li->con_index[con]] = nlocat_dd[con];
2158         }
2159     }
2160     else
2161     {
2162         li->nlocat.clear();
2163     }
2164
2165     if (debug)
2166     {
2167         fprintf(debug, "Number of constraints is %d, padded %d, couplings %d\n", li->nc_real, li->nc, li->ncc);
2168     }
2169
2170     if (li->ntask > 1)
2171     {
2172         lincs_thread_setup(li, numAtoms);
2173     }
2174
2175     set_lincs_matrix(li, invmass, lambda);
2176 }
2177
2178 //! Issues a warning when LINCS constraints cannot be satisfied.
2179 static void lincs_warning(gmx_domdec_t*                 dd,
2180                           ArrayRef<const RVec>          x,
2181                           ArrayRef<const RVec>          xprime,
2182                           t_pbc*                        pbc,
2183                           int                           ncons,
2184                           gmx::ArrayRef<const AtomPair> atoms,
2185                           gmx::ArrayRef<const real>     bllen,
2186                           real                          wangle,
2187                           int                           maxwarn,
2188                           int*                          warncount)
2189 {
2190     real wfac = std::cos(gmx::c_deg2Rad * wangle);
2191
2192     fprintf(stderr,
2193             "bonds that rotated more than %g degrees:\n"
2194             " atom 1 atom 2  angle  previous, current, constraint length\n",
2195             wangle);
2196
2197     for (int b = 0; b < ncons; b++)
2198     {
2199         const int i = atoms[b].index1;
2200         const int j = atoms[b].index2;
2201         rvec      v0;
2202         rvec      v1;
2203         if (pbc)
2204         {
2205             pbc_dx_aiuc(pbc, x[i], x[j], v0);
2206             pbc_dx_aiuc(pbc, xprime[i], xprime[j], v1);
2207         }
2208         else
2209         {
2210             rvec_sub(x[i], x[j], v0);
2211             rvec_sub(xprime[i], xprime[j], v1);
2212         }
2213         real d0     = norm(v0);
2214         real d1     = norm(v1);
2215         real cosine = ::iprod(v0, v1) / (d0 * d1);
2216         if (cosine < wfac)
2217         {
2218             fprintf(stderr,
2219                     " %6d %6d  %5.1f  %8.4f %8.4f    %8.4f\n",
2220                     ddglatnr(dd, i),
2221                     ddglatnr(dd, j),
2222                     gmx::c_rad2Deg * std::acos(cosine),
2223                     d0,
2224                     d1,
2225                     bllen[b]);
2226             if (!std::isfinite(d1))
2227             {
2228                 gmx_fatal(FARGS, "Bond length not finite.");
2229             }
2230
2231             (*warncount)++;
2232         }
2233     }
2234     if (*warncount > maxwarn)
2235     {
2236         too_many_constraint_warnings(ConstraintAlgorithm::Lincs, *warncount);
2237     }
2238 }
2239
2240 //! Status information about how well LINCS satisified the constraints in this domain
2241 struct LincsDeviations
2242 {
2243     //! The maximum over all bonds in this domain of the relative deviation in bond lengths
2244     real maxDeviation = 0;
2245     //! Sum over all bonds in this domain of the squared relative deviation
2246     real sumSquaredDeviation = 0;
2247     //! Index of bond with max deviation
2248     int indexOfMaxDeviation = -1;
2249     //! Number of bonds constrained in this domain
2250     int numConstraints = 0;
2251 };
2252
2253 //! Determine how well the constraints have been satisfied.
2254 static LincsDeviations makeLincsDeviations(const Lincs& lincsd, ArrayRef<const RVec> x, const t_pbc* pbc)
2255 {
2256     LincsDeviations                result;
2257     const ArrayRef<const AtomPair> atoms  = lincsd.atoms;
2258     const ArrayRef<const real>     bllen  = lincsd.bllen;
2259     const ArrayRef<const int>      nlocat = lincsd.nlocat;
2260
2261     for (int task = 0; task < lincsd.ntask; task++)
2262     {
2263         for (int b = lincsd.task[task].b0; b < lincsd.task[task].b1; b++)
2264         {
2265             rvec dx;
2266             if (pbc)
2267             {
2268                 pbc_dx_aiuc(pbc, x[atoms[b].index1], x[atoms[b].index2], dx);
2269             }
2270             else
2271             {
2272                 rvec_sub(x[atoms[b].index1], x[atoms[b].index2], dx);
2273             }
2274             real r2  = ::norm2(dx);
2275             real len = r2 * gmx::invsqrt(r2);
2276             real d   = std::abs(len / bllen[b] - 1.0_real);
2277             if (d > result.maxDeviation && (nlocat.empty() || nlocat[b]))
2278             {
2279                 result.maxDeviation        = d;
2280                 result.indexOfMaxDeviation = b;
2281             }
2282             if (nlocat.empty())
2283             {
2284                 result.sumSquaredDeviation += d * d;
2285                 result.numConstraints++;
2286             }
2287             else
2288             {
2289                 result.sumSquaredDeviation += nlocat[b] * d * d;
2290                 result.numConstraints += nlocat[b];
2291             }
2292         }
2293     }
2294
2295     if (!nlocat.empty())
2296     {
2297         result.numConstraints /= 2;
2298         result.sumSquaredDeviation *= 0.5;
2299     }
2300     return result;
2301 }
2302
2303 bool constrain_lincs(bool                            computeRmsd,
2304                      const t_inputrec&               ir,
2305                      int64_t                         step,
2306                      Lincs*                          lincsd,
2307                      ArrayRef<const real>            invmass,
2308                      const t_commrec*                cr,
2309                      const gmx_multisim_t*           ms,
2310                      ArrayRefWithPadding<const RVec> xPadded,
2311                      ArrayRefWithPadding<RVec>       xprimePadded,
2312                      ArrayRef<RVec>                  min_proj,
2313                      const matrix                    box,
2314                      t_pbc*                          pbc,
2315                      const bool                      hasMassPerturbed,
2316                      real                            lambda,
2317                      real*                           dvdlambda,
2318                      real                            invdt,
2319                      ArrayRef<RVec>                  v,
2320                      bool                            bCalcVir,
2321                      tensor                          vir_r_m_dr,
2322                      ConstraintVariable              econq,
2323                      t_nrnb*                         nrnb,
2324                      int                             maxwarn,
2325                      int*                            warncount)
2326 {
2327     bool bOK = TRUE;
2328
2329     /* This boolean should be set by a flag passed to this routine.
2330      * We can also easily check if any constraint length is changed,
2331      * if not dH/dlambda=0 and we can also set the boolean to FALSE.
2332      */
2333     bool bCalcDHDL = (ir.efep != FreeEnergyPerturbationType::No && dvdlambda != nullptr);
2334
2335     if (lincsd->nc == 0 && cr->dd == nullptr)
2336     {
2337         return bOK;
2338     }
2339
2340     ArrayRef<const RVec> x      = xPadded.unpaddedArrayRef();
2341     ArrayRef<RVec>       xprime = xprimePadded.unpaddedArrayRef();
2342
2343     if (econq == ConstraintVariable::Positions)
2344     {
2345         /* We can't use bCalcDHDL here, since NULL can be passed for dvdlambda
2346          * also with efep!=fepNO.
2347          */
2348         if (ir.efep != FreeEnergyPerturbationType::No)
2349         {
2350             if (hasMassPerturbed && lincsd->matlam != lambda)
2351             {
2352                 set_lincs_matrix(lincsd, invmass, lambda);
2353             }
2354
2355             for (int i = 0; i < lincsd->nc; i++)
2356             {
2357                 lincsd->bllen[i] = lincsd->bllen0[i] + lambda * lincsd->ddist[i];
2358             }
2359         }
2360
2361         if (lincsd->ncg_flex)
2362         {
2363             /* Set the flexible constraint lengths to the old lengths */
2364             if (pbc != nullptr)
2365             {
2366                 for (int i = 0; i < lincsd->nc; i++)
2367                 {
2368                     if (lincsd->bllen[i] == 0)
2369                     {
2370                         rvec dx;
2371                         pbc_dx_aiuc(pbc, x[lincsd->atoms[i].index1], x[lincsd->atoms[i].index2], dx);
2372                         lincsd->bllen[i] = norm(dx);
2373                     }
2374                 }
2375             }
2376             else
2377             {
2378                 for (int i = 0; i < lincsd->nc; i++)
2379                 {
2380                     if (lincsd->bllen[i] == 0)
2381                     {
2382                         lincsd->bllen[i] = std::sqrt(
2383                                 distance2(x[lincsd->atoms[i].index1], x[lincsd->atoms[i].index2]));
2384                     }
2385                 }
2386             }
2387         }
2388
2389         const bool printDebugOutput = ((debug != nullptr) && lincsd->nc > 0);
2390         if (printDebugOutput)
2391         {
2392             LincsDeviations deviations = makeLincsDeviations(*lincsd, xprime, pbc);
2393             fprintf(debug, "   Rel. Constraint Deviation:  RMS         MAX     between atoms\n");
2394             fprintf(debug,
2395                     "       Before LINCS          %.6f    %.6f %6d %6d\n",
2396                     std::sqrt(deviations.sumSquaredDeviation / deviations.numConstraints),
2397                     deviations.maxDeviation,
2398                     ddglatnr(cr->dd, lincsd->atoms[deviations.indexOfMaxDeviation].index1),
2399                     ddglatnr(cr->dd, lincsd->atoms[deviations.indexOfMaxDeviation].index2));
2400         }
2401
2402         /* This bWarn var can be updated by multiple threads
2403          * at the same time. But as we only need to detect
2404          * if a warning occurred or not, this is not an issue.
2405          */
2406         bool bWarn = FALSE;
2407
2408         /* The OpenMP parallel region of constrain_lincs for coords */
2409 #pragma omp parallel num_threads(lincsd->ntask)
2410         {
2411             try
2412             {
2413                 int th = gmx_omp_get_thread_num();
2414
2415                 clear_mat(lincsd->task[th].vir_r_m_dr);
2416
2417                 do_lincs(xPadded,
2418                          xprimePadded,
2419                          box,
2420                          pbc,
2421                          lincsd,
2422                          th,
2423                          invmass,
2424                          cr,
2425                          bCalcDHDL,
2426                          ir.LincsWarnAngle,
2427                          &bWarn,
2428                          invdt,
2429                          v,
2430                          bCalcVir,
2431                          th == 0 ? vir_r_m_dr : lincsd->task[th].vir_r_m_dr);
2432             }
2433             GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
2434         }
2435
2436         if (computeRmsd || printDebugOutput || bWarn)
2437         {
2438             LincsDeviations deviations = makeLincsDeviations(*lincsd, xprime, pbc);
2439
2440             if (computeRmsd)
2441             {
2442                 if (lincsd->callbackToRequireReduction.has_value())
2443                 {
2444                     // This is reduced across domains in compute_globals and
2445                     // reported to the log file.
2446                     lincsd->rmsdReductionBuffer[0] = deviations.numConstraints;
2447                     lincsd->rmsdReductionBuffer[1] = deviations.sumSquaredDeviation;
2448
2449                     // Call the ObservablesReducer via the callback it
2450                     // gave us for the purpose.
2451                     ObservablesReducerStatus status =
2452                             lincsd->callbackToRequireReduction.value()(ReductionRequirement::Soon);
2453                     GMX_RELEASE_ASSERT(status == ObservablesReducerStatus::ReadyToReduce,
2454                                        "The LINCS RMSD is computed after observables have been "
2455                                        "reduced, please reorder them.");
2456                 }
2457                 else
2458                 {
2459                     // Compute the deviation directly
2460                     lincsd->constraintRmsDeviation =
2461                             std::sqrt(deviations.sumSquaredDeviation / deviations.numConstraints);
2462                 }
2463             }
2464             if (printDebugOutput)
2465             {
2466                 fprintf(debug,
2467                         "        After LINCS          %.6f    %.6f %6d %6d\n\n",
2468                         std::sqrt(deviations.sumSquaredDeviation / deviations.numConstraints),
2469                         deviations.maxDeviation,
2470                         ddglatnr(cr->dd, lincsd->atoms[deviations.indexOfMaxDeviation].index1),
2471                         ddglatnr(cr->dd, lincsd->atoms[deviations.indexOfMaxDeviation].index2));
2472             }
2473
2474             if (bWarn)
2475             {
2476                 if (maxwarn < INT_MAX)
2477                 {
2478                     std::string simMesg;
2479                     if (isMultiSim(ms))
2480                     {
2481                         simMesg += gmx::formatString(" in simulation %d", ms->simulationIndex_);
2482                     }
2483                     fprintf(stderr,
2484                             "\nStep %" PRId64
2485                             ", time %g (ps)  LINCS WARNING%s\n"
2486                             "relative constraint deviation after LINCS:\n"
2487                             "rms %.6f, max %.6f (between atoms %d and %d)\n",
2488                             step,
2489                             ir.init_t + step * ir.delta_t,
2490                             simMesg.c_str(),
2491                             std::sqrt(deviations.sumSquaredDeviation / deviations.numConstraints),
2492                             deviations.maxDeviation,
2493                             ddglatnr(cr->dd, lincsd->atoms[deviations.indexOfMaxDeviation].index1),
2494                             ddglatnr(cr->dd, lincsd->atoms[deviations.indexOfMaxDeviation].index2));
2495
2496                     lincs_warning(
2497                             cr->dd, x, xprime, pbc, lincsd->nc, lincsd->atoms, lincsd->bllen, ir.LincsWarnAngle, maxwarn, warncount);
2498                 }
2499                 bOK = (deviations.maxDeviation < 0.5);
2500             }
2501         }
2502
2503         if (lincsd->ncg_flex)
2504         {
2505             for (int i = 0; (i < lincsd->nc); i++)
2506             {
2507                 if (lincsd->bllen0[i] == 0 && lincsd->ddist[i] == 0)
2508                 {
2509                     lincsd->bllen[i] = 0;
2510                 }
2511             }
2512         }
2513     }
2514     else
2515     {
2516         /* The OpenMP parallel region of constrain_lincs for derivatives */
2517 #pragma omp parallel num_threads(lincsd->ntask)
2518         {
2519             try
2520             {
2521                 int th = gmx_omp_get_thread_num();
2522
2523                 do_lincsp(xPadded,
2524                           xprimePadded,
2525                           min_proj,
2526                           pbc,
2527                           lincsd,
2528                           th,
2529                           invmass,
2530                           econq,
2531                           bCalcDHDL,
2532                           bCalcVir,
2533                           th == 0 ? vir_r_m_dr : lincsd->task[th].vir_r_m_dr);
2534             }
2535             GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
2536         }
2537     }
2538
2539     if (bCalcDHDL)
2540     {
2541         /* Reduce the dH/dlambda contributions over the threads */
2542         real dhdlambda;
2543         int  th;
2544
2545         dhdlambda = 0;
2546         for (th = 0; th < lincsd->ntask; th++)
2547         {
2548             dhdlambda += lincsd->task[th].dhdlambda;
2549         }
2550         if (econq == ConstraintVariable::Positions)
2551         {
2552             /* dhdlambda contains dH/dlambda*dt^2, correct for this */
2553             /* TODO This should probably use invdt, so that sd integrator scaling works properly */
2554             dhdlambda /= ir.delta_t * ir.delta_t;
2555         }
2556         *dvdlambda += dhdlambda;
2557     }
2558
2559     if (bCalcVir && lincsd->ntask > 1)
2560     {
2561         for (int i = 1; i < lincsd->ntask; i++)
2562         {
2563             m_add(vir_r_m_dr, lincsd->task[i].vir_r_m_dr, vir_r_m_dr);
2564         }
2565     }
2566
2567     /* count assuming nit=1 */
2568     inc_nrnb(nrnb, eNR_LINCS, lincsd->nc_real);
2569     inc_nrnb(nrnb, eNR_LINCSMAT, (2 + lincsd->nOrder) * lincsd->ncc);
2570     if (lincsd->ntriangle > 0)
2571     {
2572         inc_nrnb(nrnb, eNR_LINCSMAT, lincsd->nOrder * lincsd->ncc_triangle);
2573     }
2574     if (!v.empty())
2575     {
2576         inc_nrnb(nrnb, eNR_CONSTR_V, lincsd->nc_real * 2);
2577     }
2578     if (bCalcVir)
2579     {
2580         inc_nrnb(nrnb, eNR_CONSTR_VIR, lincsd->nc_real);
2581     }
2582
2583     return bOK;
2584 }
2585
2586 } // namespace gmx