src/gromacs/mdlib/lincs.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   5  * Copyright (c) 2001-2004, The GROMACS development team.
   6  * Copyright (c) 2013,2014,2015,2016,2017 by the GROMACS development team.
   7  * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
   8  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   9  * and including many others, as listed in the AUTHORS file in the
  10  * top-level source directory and at http://www.gromacs.org.
  11  *
  12  * GROMACS is free software; you can redistribute it and/or
  13  * modify it under the terms of the GNU Lesser General Public License
  14  * as published by the Free Software Foundation; either version 2.1
  15  * of the License, or (at your option) any later version.
  16  *
  17  * GROMACS is distributed in the hope that it will be useful,
  18  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  19  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  20  * Lesser General Public License for more details.
  21  *
  22  * You should have received a copy of the GNU Lesser General Public
  23  * License along with GROMACS; if not, see
  24  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  25  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  26  *
  27  * If you want to redistribute modifications to GROMACS, please
  28  * consider that scientific software is very special. Version
  29  * control is crucial - bugs must be traceable. We will be happy to
  30  * consider code for inclusion in the official distribution, but
  31  * derived work must not be called official GROMACS. Details are found
  32  * in the README & COPYING files - if they are missing, get the
  33  * official version at http://www.gromacs.org.
  34  *
  35  * To help us fund GROMACS development, we humbly ask that you cite
  36  * the research papers on the package. Check out http://www.gromacs.org.
  37  */
  38 /*! \internal \file
  39  * \brief Defines LINCS code.
  40  *
  41  * \author Berk Hess <hess@kth.se>
  42  * \author Mark Abraham <mark.j.abraham@gmail.com>
  43  * \ingroup module_mdlib
  44  */
  45 #include "gmxpre.h"
  46
  47 #include "lincs.h"
  48
  49 #include "config.h"
  50
  51 #include <cassert>
  52 #include <cmath>
  53 #include <cstdlib>
  54
  55 #include <algorithm>
  56 #include <vector>
  57
  58 #include "gromacs/domdec/domdec.h"
  59 #include "gromacs/domdec/domdec_struct.h"
  60 #include "gromacs/gmxlib/nrnb.h"
  61 #include "gromacs/math/functions.h"
  62 #include "gromacs/math/paddedvector.h"
  63 #include "gromacs/math/units.h"
  64 #include "gromacs/math/vec.h"
  65 #include "gromacs/mdlib/constr.h"
  66 #include "gromacs/mdlib/gmx_omp_nthreads.h"
  67 #include "gromacs/mdrunutility/multisim.h"
  68 #include "gromacs/mdtypes/commrec.h"
  69 #include "gromacs/mdtypes/inputrec.h"
  70 #include "gromacs/mdtypes/md_enums.h"
  71 #include "gromacs/mdtypes/mdatom.h"
  72 #include "gromacs/pbcutil/pbc.h"
  73 #include "gromacs/pbcutil/pbc_simd.h"
  74 #include "gromacs/simd/simd.h"
  75 #include "gromacs/simd/simd_math.h"
  76 #include "gromacs/simd/vector_operations.h"
  77 #include "gromacs/topology/mtop_util.h"
  78 #include "gromacs/utility/alignedallocator.h"
  79 #include "gromacs/utility/arrayref.h"
  80 #include "gromacs/utility/basedefinitions.h"
  81 #include "gromacs/utility/bitmask.h"
  82 #include "gromacs/utility/cstringutil.h"
  83 #include "gromacs/utility/exceptions.h"
  84 #include "gromacs/utility/fatalerror.h"
  85 #include "gromacs/utility/gmxomp.h"
  86 #include "gromacs/utility/listoflists.h"
  87 #include "gromacs/utility/pleasecite.h"
  88
  89 using namespace gmx; // TODO: Remove when this file is moved into gmx namespace
  90
  91 namespace gmx
  92 {
  93
  94 //! Indices of the two atoms involved in a single constraint
  95 struct AtomPair
  96 {
  97     //! \brief Constructor, does not initialize to catch bugs and faster construction
  98     AtomPair() {}
  99
 100     //! Index of atom 1
 101     int index1;
 102     //! Index of atom 2
 103     int index2;
 104 };
 105
 106 //! Unit of work within LINCS.
 107 struct Task
 108 {
 109     //! First constraint for this task.
 110     int b0 = 0;
 111     //! b1-1 is the last constraint for this task.
 112     int b1 = 0;
 113     //! The number of constraints in triangles.
 114     int ntriangle = 0;
 115     //! The list of triangle constraints.
 116     std::vector<int> triangle;
 117     //! The bits tell if the matrix element should be used.
 118     std::vector<int> tri_bits;
 119     //! Constraint index for updating atom data.
 120     std::vector<int> ind;
 121     //! Constraint index for updating atom data.
 122     std::vector<int> ind_r;
 123     //! Temporary variable for virial calculation.
 124     tensor vir_r_m_dr = { { 0 } };
 125     //! Temporary variable for lambda derivative.
 126     real dhdlambda;
 127 };
 128
 129 /*! \brief Data for LINCS algorithm.
 130  */
 131 class Lincs
 132 {
 133 public:
 134     //! The global number of constraints.
 135     int ncg = 0;
 136     //! The global number of flexible constraints.
 137     int ncg_flex = 0;
 138     //! The global number of constraints in triangles.
 139     int ncg_triangle = 0;
 140     //! The number of iterations.
 141     int nIter = 0;
 142     //! The order of the matrix expansion.
 143     int nOrder = 0;
 144     //! The maximum number of constraints connected to a single atom.
 145     int max_connect = 0;
 146
 147     //! The number of real constraints.
 148     int nc_real = 0;
 149     //! The number of constraints including padding for SIMD.
 150     int nc = 0;
 151     //! The number of constraint connections.
 152     int ncc = 0;
 153     //! The FE lambda value used for filling blc and blmf.
 154     real matlam = 0;
 155     //! mapping from topology to LINCS constraints.
 156     std::vector<int> con_index;
 157     //! The reference distance in topology A.
 158     std::vector<real, AlignedAllocator<real>> bllen0;
 159     //! The reference distance in top B - the r.d. in top A.
 160     std::vector<real, AlignedAllocator<real>> ddist;
 161     //! The atom pairs involved in the constraints.
 162     std::vector<AtomPair> atoms;
 163     //! 1/sqrt(invmass1  invmass2).
 164     std::vector<real, AlignedAllocator<real>> blc;
 165     //! As blc, but with all masses 1.
 166     std::vector<real, AlignedAllocator<real>> blc1;
 167     //! Index into blbnb and blmf.
 168     std::vector<int> blnr;
 169     //! List of constraint connections.
 170     std::vector<int> blbnb;
 171     //! The local number of constraints in triangles.
 172     int ntriangle = 0;
 173     //! The number of constraint connections in triangles.
 174     int ncc_triangle = 0;
 175     //! Communicate before each LINCS interation.
 176     bool bCommIter = false;
 177     //! Matrix of mass factors for constraint connections.
 178     std::vector<real> blmf;
 179     //! As blmf, but with all masses 1.
 180     std::vector<real> blmf1;
 181     //! The reference bond length.
 182     std::vector<real, AlignedAllocator<real>> bllen;
 183     //! The local atom count per constraint, can be NULL.
 184     std::vector<int> nlocat;
 185
 186     /*! \brief The number of tasks used for LINCS work.
 187      *
 188      * \todo This is mostly used to loop over \c task, which would
 189      * be nicer to do with range-based for loops, but the thread
 190      * index is used for constructing bit masks and organizing the
 191      * virial output buffer, so other things need to change,
 192      * first. */
 193     int ntask = 0;
 194     /*! \brief LINCS thread division */
 195     std::vector<Task> task;
 196     //! Atom flags for thread parallelization.
 197     std::vector<gmx_bitmask_t> atf;
 198     //! Are the LINCS tasks interdependent?
 199     bool bTaskDep = false;
 200     //! Are there triangle constraints that cross task borders?
 201     bool bTaskDepTri = false;
 202     //! Arrays for temporary storage in the LINCS algorithm.
 203     /*! @{ */
 204     PaddedVector<gmx::RVec>                   tmpv;
 205     std::vector<real>                         tmpncc;
 206     std::vector<real, AlignedAllocator<real>> tmp1;
 207     std::vector<real, AlignedAllocator<real>> tmp2;
 208     std::vector<real, AlignedAllocator<real>> tmp3;
 209     std::vector<real, AlignedAllocator<real>> tmp4;
 210     /*! @} */
 211     //! The Lagrange multipliers times -1.
 212     std::vector<real, AlignedAllocator<real>> mlambda;
 213     //! Storage for the constraint RMS relative deviation output.
 214     std::array<real, 2> rmsdData = { { 0 } };
 215 };
 216
 217 /*! \brief Define simd_width for memory allocation used for SIMD code */
 218 #if GMX_SIMD_HAVE_REAL
 219 static const int simd_width = GMX_SIMD_REAL_WIDTH;
 220 #else
 221 static const int simd_width = 1;
 222 #endif
 223
 224 ArrayRef<real> lincs_rmsdData(Lincs* lincsd)
 225 {
 226     return lincsd->rmsdData;
 227 }
 228
 229 real lincs_rmsd(const Lincs* lincsd)
 230 {
 231     if (lincsd->rmsdData[0] > 0)
 232     {
 233         return std::sqrt(lincsd->rmsdData[1] / lincsd->rmsdData[0]);
 234     }
 235     else
 236     {
 237         return 0;
 238     }
 239 }
 240
 241 /*! \brief Do a set of nrec LINCS matrix multiplications.
 242  *
 243  * This function will return with up to date thread-local
 244  * constraint data, without an OpenMP barrier.
 245  */
 246 static void lincs_matrix_expand(const Lincs&              lincsd,
 247                                 const Task&               li_task,
 248                                 gmx::ArrayRef<const real> blcc,
 249                                 gmx::ArrayRef<real>       rhs1,
 250                                 gmx::ArrayRef<real>       rhs2,
 251                                 gmx::ArrayRef<real>       sol)
 252 {
 253     gmx::ArrayRef<const int> blnr  = lincsd.blnr;
 254     gmx::ArrayRef<const int> blbnb = lincsd.blbnb;
 255
 256     const int b0   = li_task.b0;
 257     const int b1   = li_task.b1;
 258     const int nrec = lincsd.nOrder;
 259
 260     for (int rec = 0; rec < nrec; rec++)
 261     {
 262         if (lincsd.bTaskDep)
 263         {
 264 #pragma omp barrier
 265         }
 266         for (int b = b0; b < b1; b++)
 267         {
 268             real mvb;
 269             int  n;
 270
 271             mvb = 0;
 272             for (n = blnr[b]; n < blnr[b + 1]; n++)
 273             {
 274                 mvb = mvb + blcc[n] * rhs1[blbnb[n]];
 275             }
 276             rhs2[b] = mvb;
 277             sol[b]  = sol[b] + mvb;
 278         }
 279
 280         std::swap(rhs1, rhs2);
 281     } /* nrec*(ncons+2*nrtot) flops */
 282
 283     if (lincsd.ntriangle > 0)
 284     {
 285         /* Perform an extra nrec recursions for only the constraints
 286          * involved in rigid triangles.
 287          * In this way their accuracy should come close to those of the other
 288          * constraints, since traingles of constraints can produce eigenvalues
 289          * around 0.7, while the effective eigenvalue for bond constraints
 290          * is around 0.4 (and 0.7*0.7=0.5).
 291          */
 292
 293         if (lincsd.bTaskDep)
 294         {
 295             /* We need a barrier here, since other threads might still be
 296              * reading the contents of rhs1 and/o rhs2.
 297              * We could avoid this barrier by introducing two extra rhs
 298              * arrays for the triangle constraints only.
 299              */
 300 #pragma omp barrier
 301         }
 302
 303         /* Constraints involved in a triangle are ensured to be in the same
 304          * LINCS task. This means no barriers are required during the extra
 305          * iterations for the triangle constraints.
 306          */
 307         gmx::ArrayRef<const int> triangle = li_task.triangle;
 308         gmx::ArrayRef<const int> tri_bits = li_task.tri_bits;
 309
 310         for (int rec = 0; rec < nrec; rec++)
 311         {
 312             for (int tb = 0; tb < li_task.ntriangle; tb++)
 313             {
 314                 int  b, bits, nr0, nr1, n;
 315                 real mvb;
 316
 317                 b    = triangle[tb];
 318                 bits = tri_bits[tb];
 319                 mvb  = 0;
 320                 nr0  = blnr[b];
 321                 nr1  = blnr[b + 1];
 322                 for (n = nr0; n < nr1; n++)
 323                 {
 324                     if (bits & (1 << (n - nr0)))
 325                     {
 326                         mvb = mvb + blcc[n] * rhs1[blbnb[n]];
 327                     }
 328                 }
 329                 rhs2[b] = mvb;
 330                 sol[b]  = sol[b] + mvb;
 331             }
 332
 333             std::swap(rhs1, rhs2);
 334         } /* nrec*(ntriangle + ncc_triangle*2) flops */
 335
 336         if (lincsd.bTaskDepTri)
 337         {
 338             /* The constraints triangles are decoupled from each other,
 339              * but constraints in one triangle cross thread task borders.
 340              * We could probably avoid this with more advanced setup code.
 341              */
 342 #pragma omp barrier
 343         }
 344     }
 345 }
 346
 347 //! Update atomic coordinates when an index is not required.
 348 static void lincs_update_atoms_noind(int                            ncons,
 349                                      gmx::ArrayRef<const AtomPair>  atoms,
 350                                      real                           preFactor,
 351                                      gmx::ArrayRef<const real>      fac,
 352                                      gmx::ArrayRef<const gmx::RVec> r,
 353                                      const real*                    invmass,
 354                                      rvec*                          x)
 355 {
 356     if (invmass != nullptr)
 357     {
 358         for (int b = 0; b < ncons; b++)
 359         {
 360             int  i    = atoms[b].index1;
 361             int  j    = atoms[b].index2;
 362             real mvb  = preFactor * fac[b];
 363             real im1  = invmass[i];
 364             real im2  = invmass[j];
 365             real tmp0 = r[b][0] * mvb;
 366             real tmp1 = r[b][1] * mvb;
 367             real tmp2 = r[b][2] * mvb;
 368             x[i][0] -= tmp0 * im1;
 369             x[i][1] -= tmp1 * im1;
 370             x[i][2] -= tmp2 * im1;
 371             x[j][0] += tmp0 * im2;
 372             x[j][1] += tmp1 * im2;
 373             x[j][2] += tmp2 * im2;
 374         } /* 16 ncons flops */
 375     }
 376     else
 377     {
 378         for (int b = 0; b < ncons; b++)
 379         {
 380             int  i    = atoms[b].index1;
 381             int  j    = atoms[b].index2;
 382             real mvb  = preFactor * fac[b];
 383             real tmp0 = r[b][0] * mvb;
 384             real tmp1 = r[b][1] * mvb;
 385             real tmp2 = r[b][2] * mvb;
 386             x[i][0] -= tmp0;
 387             x[i][1] -= tmp1;
 388             x[i][2] -= tmp2;
 389             x[j][0] += tmp0;
 390             x[j][1] += tmp1;
 391             x[j][2] += tmp2;
 392         }
 393     }
 394 }
 395
 396 //! Update atomic coordinates when an index is required.
 397 static void lincs_update_atoms_ind(gmx::ArrayRef<const int>       ind,
 398                                    gmx::ArrayRef<const AtomPair>  atoms,
 399                                    real                           preFactor,
 400                                    gmx::ArrayRef<const real>      fac,
 401                                    gmx::ArrayRef<const gmx::RVec> r,
 402                                    const real*                    invmass,
 403                                    rvec*                          x)
 404 {
 405     if (invmass != nullptr)
 406     {
 407         for (int b : ind)
 408         {
 409             int  i    = atoms[b].index1;
 410             int  j    = atoms[b].index2;
 411             real mvb  = preFactor * fac[b];
 412             real im1  = invmass[i];
 413             real im2  = invmass[j];
 414             real tmp0 = r[b][0] * mvb;
 415             real tmp1 = r[b][1] * mvb;
 416             real tmp2 = r[b][2] * mvb;
 417             x[i][0] -= tmp0 * im1;
 418             x[i][1] -= tmp1 * im1;
 419             x[i][2] -= tmp2 * im1;
 420             x[j][0] += tmp0 * im2;
 421             x[j][1] += tmp1 * im2;
 422             x[j][2] += tmp2 * im2;
 423         } /* 16 ncons flops */
 424     }
 425     else
 426     {
 427         for (int b : ind)
 428         {
 429             int  i    = atoms[b].index1;
 430             int  j    = atoms[b].index2;
 431             real mvb  = preFactor * fac[b];
 432             real tmp0 = r[b][0] * mvb;
 433             real tmp1 = r[b][1] * mvb;
 434             real tmp2 = r[b][2] * mvb;
 435             x[i][0] -= tmp0;
 436             x[i][1] -= tmp1;
 437             x[i][2] -= tmp2;
 438             x[j][0] += tmp0;
 439             x[j][1] += tmp1;
 440             x[j][2] += tmp2;
 441         } /* 16 ncons flops */
 442     }
 443 }
 444
 445 //! Update coordinates for atoms.
 446 static void lincs_update_atoms(Lincs*                         li,
 447                                int                            th,
 448                                real                           preFactor,
 449                                gmx::ArrayRef<const real>      fac,
 450                                gmx::ArrayRef<const gmx::RVec> r,
 451                                const real*                    invmass,
 452                                rvec*                          x)
 453 {
 454     if (li->ntask == 1)
 455     {
 456         /* Single thread, we simply update for all constraints */
 457         lincs_update_atoms_noind(li->nc_real, li->atoms, preFactor, fac, r, invmass, x);
 458     }
 459     else
 460     {
 461         /* Update the atom vector components for our thread local
 462          * constraints that only access our local atom range.
 463          * This can be done without a barrier.
 464          */
 465         lincs_update_atoms_ind(li->task[th].ind, li->atoms, preFactor, fac, r, invmass, x);
 466
 467         if (!li->task[li->ntask].ind.empty())
 468         {
 469             /* Update the constraints that operate on atoms
 470              * in multiple thread atom blocks on the master thread.
 471              */
 472 #pragma omp barrier
 473 #pragma omp master
 474             {
 475                 lincs_update_atoms_ind(li->task[li->ntask].ind, li->atoms, preFactor, fac, r, invmass, x);
 476             }
 477         }
 478     }
 479 }
 480
 481 #if GMX_SIMD_HAVE_REAL
 482 //! Helper function so that we can run TSAN with SIMD support (where implemented).
 483 template<int align>
 484 static inline void gmx_simdcall gatherLoadUTransposeTSANSafe(const real*         base,
 485                                                              const std::int32_t* offset,
 486                                                              SimdReal*           v0,
 487                                                              SimdReal*           v1,
 488                                                              SimdReal*           v2)
 489 {
 490 #    if (CMAKE_BUILD_TYPE == CMAKE_BUILD_TYPE_TSAN) && GMX_SIMD_X86_AVX2_256
 491     // This function is only implemented in this case
 492     gatherLoadUTransposeSafe<align>(base, offset, v0, v1, v2);
 493 #    else
 494     gatherLoadUTranspose<align>(base, offset, v0, v1, v2);
 495 #    endif
 496 }
 497
 498 /*! \brief Calculate the constraint distance vectors r to project on from x.
 499  *
 500  * Determine the right-hand side of the matrix equation using quantity f.
 501  * This function only differs from calc_dr_x_xp_simd below in that
 502  * no constraint length is subtracted and no PBC is used for f. */
 503 static void gmx_simdcall calc_dr_x_f_simd(int                           b0,
 504                                           int                           b1,
 505                                           gmx::ArrayRef<const AtomPair> atoms,
 506                                           const rvec* gmx_restrict x,
 507                                           const rvec* gmx_restrict f,
 508                                           const real* gmx_restrict blc,
 509                                           const real*              pbc_simd,
 510                                           rvec* gmx_restrict r,
 511                                           real* gmx_restrict rhs,
 512                                           real* gmx_restrict sol)
 513 {
 514     assert(b0 % GMX_SIMD_REAL_WIDTH == 0);
 515
 516     alignas(GMX_SIMD_ALIGNMENT) std::int32_t offset2[GMX_SIMD_REAL_WIDTH];
 517
 518     for (int i = 0; i < GMX_SIMD_REAL_WIDTH; i++)
 519     {
 520         offset2[i] = i;
 521     }
 522
 523     for (int bs = b0; bs < b1; bs += GMX_SIMD_REAL_WIDTH)
 524     {
 525         SimdReal                                 x0_S, y0_S, z0_S;
 526         SimdReal                                 x1_S, y1_S, z1_S;
 527         SimdReal                                 rx_S, ry_S, rz_S, n2_S, il_S;
 528         SimdReal                                 fx_S, fy_S, fz_S, ip_S, rhs_S;
 529         alignas(GMX_SIMD_ALIGNMENT) std::int32_t offset0[GMX_SIMD_REAL_WIDTH];
 530         alignas(GMX_SIMD_ALIGNMENT) std::int32_t offset1[GMX_SIMD_REAL_WIDTH];
 531
 532         for (int i = 0; i < GMX_SIMD_REAL_WIDTH; i++)
 533         {
 534             offset0[i] = atoms[bs + i].index1;
 535             offset1[i] = atoms[bs + i].index2;
 536         }
 537
 538         gatherLoadUTransposeTSANSafe<3>(reinterpret_cast<const real*>(x), offset0, &x0_S, &y0_S, &z0_S);
 539         gatherLoadUTransposeTSANSafe<3>(reinterpret_cast<const real*>(x), offset1, &x1_S, &y1_S, &z1_S);
 540         rx_S = x0_S - x1_S;
 541         ry_S = y0_S - y1_S;
 542         rz_S = z0_S - z1_S;
 543
 544         pbc_correct_dx_simd(&rx_S, &ry_S, &rz_S, pbc_simd);
 545
 546         n2_S = norm2(rx_S, ry_S, rz_S);
 547         il_S = invsqrt(n2_S);
 548
 549         rx_S = rx_S * il_S;
 550         ry_S = ry_S * il_S;
 551         rz_S = rz_S * il_S;
 552
 553         transposeScatterStoreU<3>(reinterpret_cast<real*>(r + bs), offset2, rx_S, ry_S, rz_S);
 554
 555         gatherLoadUTransposeTSANSafe<3>(reinterpret_cast<const real*>(f), offset0, &x0_S, &y0_S, &z0_S);
 556         gatherLoadUTransposeTSANSafe<3>(reinterpret_cast<const real*>(f), offset1, &x1_S, &y1_S, &z1_S);
 557         fx_S = x0_S - x1_S;
 558         fy_S = y0_S - y1_S;
 559         fz_S = z0_S - z1_S;
 560
 561         ip_S = iprod(rx_S, ry_S, rz_S, fx_S, fy_S, fz_S);
 562
 563         rhs_S = load<SimdReal>(blc + bs) * ip_S;
 564
 565         store(rhs + bs, rhs_S);
 566         store(sol + bs, rhs_S);
 567     }
 568 }
 569 #endif // GMX_SIMD_HAVE_REAL
 570
 571 /*! \brief LINCS projection, works on derivatives of the coordinates. */
 572 static void do_lincsp(ArrayRefWithPadding<const RVec> xPadded,
 573                       ArrayRefWithPadding<RVec>       fPadded,
 574                       ArrayRef<RVec>                  fp,
 575                       t_pbc*                          pbc,
 576                       Lincs*                          lincsd,
 577                       int                             th,
 578                       real*                           invmass,
 579                       ConstraintVariable              econq,
 580                       bool                            bCalcDHDL,
 581                       bool                            bCalcVir,
 582                       tensor                          rmdf)
 583 {
 584     const int b0 = lincsd->task[th].b0;
 585     const int b1 = lincsd->task[th].b1;
 586
 587     gmx::ArrayRef<const AtomPair> atoms = lincsd->atoms;
 588     gmx::ArrayRef<gmx::RVec>      r     = lincsd->tmpv;
 589     gmx::ArrayRef<const int>      blnr  = lincsd->blnr;
 590     gmx::ArrayRef<const int>      blbnb = lincsd->blbnb;
 591
 592     gmx::ArrayRef<const real> blc;
 593     gmx::ArrayRef<const real> blmf;
 594     if (econq != ConstraintVariable::Force)
 595     {
 596         /* Use mass-weighted parameters */
 597         blc  = lincsd->blc;
 598         blmf = lincsd->blmf;
 599     }
 600     else
 601     {
 602         /* Use non mass-weighted parameters */
 603         blc  = lincsd->blc1;
 604         blmf = lincsd->blmf1;
 605     }
 606     gmx::ArrayRef<real> blcc = lincsd->tmpncc;
 607     gmx::ArrayRef<real> rhs1 = lincsd->tmp1;
 608     gmx::ArrayRef<real> rhs2 = lincsd->tmp2;
 609     gmx::ArrayRef<real> sol  = lincsd->tmp3;
 610
 611     const rvec* x = as_rvec_array(xPadded.paddedArrayRef().data());
 612     rvec*       f = as_rvec_array(fPadded.paddedArrayRef().data());
 613
 614 #if GMX_SIMD_HAVE_REAL
 615     /* This SIMD code does the same as the plain-C code after the #else.
 616      * The only difference is that we always call pbc code, as with SIMD
 617      * the overhead of pbc computation (when not needed) is small.
 618      */
 619     alignas(GMX_SIMD_ALIGNMENT) real pbc_simd[9 * GMX_SIMD_REAL_WIDTH];
 620
 621     /* Convert the pbc struct for SIMD */
 622     set_pbc_simd(pbc, pbc_simd);
 623
 624     /* Compute normalized x i-j vectors, store in r.
 625      * Compute the inner product of r and xp i-j and store in rhs1.
 626      */
 627     calc_dr_x_f_simd(b0, b1, atoms, x, f, blc.data(), pbc_simd, as_rvec_array(r.data()),
 628                      rhs1.data(), sol.data());
 629
 630 #else // GMX_SIMD_HAVE_REAL
 631
 632     /* Compute normalized i-j vectors */
 633     if (pbc)
 634     {
 635         for (int b = b0; b < b1; b++)
 636         {
 637             rvec dx;
 638
 639             pbc_dx_aiuc(pbc, x[atoms[b].index1], x[atoms[b].index2], dx);
 640             unitv(dx, r[b]);
 641         }
 642     }
 643     else
 644     {
 645         for (int b = b0; b < b1; b++)
 646         {
 647             rvec dx;
 648
 649             rvec_sub(x[atoms[b].index1], x[atoms[b].index2], dx);
 650             unitv(dx, r[b]);
 651         } /* 16 ncons flops */
 652     }
 653
 654     for (int b = b0; b < b1; b++)
 655     {
 656         int  i   = atoms[b].index1;
 657         int  j   = atoms[b].index2;
 658         real mvb = blc[b]
 659                    * (r[b][0] * (f[i][0] - f[j][0]) + r[b][1] * (f[i][1] - f[j][1])
 660                       + r[b][2] * (f[i][2] - f[j][2]));
 661         rhs1[b] = mvb;
 662         sol[b]  = mvb;
 663         /* 7 flops */
 664     }
 665
 666 #endif // GMX_SIMD_HAVE_REAL
 667
 668     if (lincsd->bTaskDep)
 669     {
 670         /* We need a barrier, since the matrix construction below
 671          * can access entries in r of other threads.
 672          */
 673 #pragma omp barrier
 674     }
 675
 676     /* Construct the (sparse) LINCS matrix */
 677     for (int b = b0; b < b1; b++)
 678     {
 679         for (int n = blnr[b]; n < blnr[b + 1]; n++)
 680         {
 681             blcc[n] = blmf[n] * ::iprod(r[b], r[blbnb[n]]);
 682         } /* 6 nr flops */
 683     }
 684     /* Together: 23*ncons + 6*nrtot flops */
 685
 686     lincs_matrix_expand(*lincsd, lincsd->task[th], blcc, rhs1, rhs2, sol);
 687     /* nrec*(ncons+2*nrtot) flops */
 688
 689     if (econq == ConstraintVariable::Deriv_FlexCon)
 690     {
 691         /* We only want to constraint the flexible constraints,
 692          * so we mask out the normal ones by setting sol to 0.
 693          */
 694         for (int b = b0; b < b1; b++)
 695         {
 696             if (!(lincsd->bllen0[b] == 0 && lincsd->ddist[b] == 0))
 697             {
 698                 sol[b] = 0;
 699             }
 700         }
 701     }
 702
 703     /* We multiply sol by blc, so we can use lincs_update_atoms for OpenMP */
 704     for (int b = b0; b < b1; b++)
 705     {
 706         sol[b] *= blc[b];
 707     }
 708
 709     /* When constraining forces, we should not use mass weighting,
 710      * so we pass invmass=NULL, which results in the use of 1 for all atoms.
 711      */
 712     lincs_update_atoms(lincsd, th, 1.0, sol, r, (econq != ConstraintVariable::Force) ? invmass : nullptr,
 713                        as_rvec_array(fp.data()));
 714
 715     if (bCalcDHDL)
 716     {
 717         real dhdlambda = 0;
 718         for (int b = b0; b < b1; b++)
 719         {
 720             dhdlambda -= sol[b] * lincsd->ddist[b];
 721         }
 722
 723         lincsd->task[th].dhdlambda = dhdlambda;
 724     }
 725
 726     if (bCalcVir)
 727     {
 728         /* Constraint virial,
 729          * determines sum r_bond x delta f,
 730          * where delta f is the constraint correction
 731          * of the quantity that is being constrained.
 732          */
 733         for (int b = b0; b < b1; b++)
 734         {
 735             const real mvb = lincsd->bllen[b] * sol[b];
 736             for (int i = 0; i < DIM; i++)
 737             {
 738                 const real tmp1 = mvb * r[b][i];
 739                 for (int j = 0; j < DIM; j++)
 740                 {
 741                     rmdf[i][j] += tmp1 * r[b][j];
 742                 }
 743             }
 744         } /* 23 ncons flops */
 745     }
 746 }
 747
 748 #if GMX_SIMD_HAVE_REAL
 749
 750 /*! \brief Calculate the constraint distance vectors r to project on from x.
 751  *
 752  * Determine the right-hand side of the matrix equation using coordinates xp. */
 753 static void gmx_simdcall calc_dr_x_xp_simd(int                           b0,
 754                                            int                           b1,
 755                                            gmx::ArrayRef<const AtomPair> atoms,
 756                                            const rvec* gmx_restrict x,
 757                                            const rvec* gmx_restrict xp,
 758                                            const real* gmx_restrict bllen,
 759                                            const real* gmx_restrict blc,
 760                                            const real*              pbc_simd,
 761                                            rvec* gmx_restrict r,
 762                                            real* gmx_restrict rhs,
 763                                            real* gmx_restrict sol)
 764 {
 765     assert(b0 % GMX_SIMD_REAL_WIDTH == 0);
 766     alignas(GMX_SIMD_ALIGNMENT) std::int32_t offset2[GMX_SIMD_REAL_WIDTH];
 767
 768     for (int i = 0; i < GMX_SIMD_REAL_WIDTH; i++)
 769     {
 770         offset2[i] = i;
 771     }
 772
 773     for (int bs = b0; bs < b1; bs += GMX_SIMD_REAL_WIDTH)
 774     {
 775         SimdReal                                 x0_S, y0_S, z0_S;
 776         SimdReal                                 x1_S, y1_S, z1_S;
 777         SimdReal                                 rx_S, ry_S, rz_S, n2_S, il_S;
 778         SimdReal                                 rxp_S, ryp_S, rzp_S, ip_S, rhs_S;
 779         alignas(GMX_SIMD_ALIGNMENT) std::int32_t offset0[GMX_SIMD_REAL_WIDTH];
 780         alignas(GMX_SIMD_ALIGNMENT) std::int32_t offset1[GMX_SIMD_REAL_WIDTH];
 781
 782         for (int i = 0; i < GMX_SIMD_REAL_WIDTH; i++)
 783         {
 784             offset0[i] = atoms[bs + i].index1;
 785             offset1[i] = atoms[bs + i].index2;
 786         }
 787
 788         gatherLoadUTransposeTSANSafe<3>(reinterpret_cast<const real*>(x), offset0, &x0_S, &y0_S, &z0_S);
 789         gatherLoadUTransposeTSANSafe<3>(reinterpret_cast<const real*>(x), offset1, &x1_S, &y1_S, &z1_S);
 790         rx_S = x0_S - x1_S;
 791         ry_S = y0_S - y1_S;
 792         rz_S = z0_S - z1_S;
 793
 794         pbc_correct_dx_simd(&rx_S, &ry_S, &rz_S, pbc_simd);
 795
 796         n2_S = norm2(rx_S, ry_S, rz_S);
 797         il_S = invsqrt(n2_S);
 798
 799         rx_S = rx_S * il_S;
 800         ry_S = ry_S * il_S;
 801         rz_S = rz_S * il_S;
 802
 803         transposeScatterStoreU<3>(reinterpret_cast<real*>(r + bs), offset2, rx_S, ry_S, rz_S);
 804
 805         gatherLoadUTransposeTSANSafe<3>(reinterpret_cast<const real*>(xp), offset0, &x0_S, &y0_S, &z0_S);
 806         gatherLoadUTransposeTSANSafe<3>(reinterpret_cast<const real*>(xp), offset1, &x1_S, &y1_S, &z1_S);
 807
 808         rxp_S = x0_S - x1_S;
 809         ryp_S = y0_S - y1_S;
 810         rzp_S = z0_S - z1_S;
 811
 812         pbc_correct_dx_simd(&rxp_S, &ryp_S, &rzp_S, pbc_simd);
 813
 814         ip_S = iprod(rx_S, ry_S, rz_S, rxp_S, ryp_S, rzp_S);
 815
 816         rhs_S = load<SimdReal>(blc + bs) * (ip_S - load<SimdReal>(bllen + bs));
 817
 818         store(rhs + bs, rhs_S);
 819         store(sol + bs, rhs_S);
 820     }
 821 }
 822 #endif // GMX_SIMD_HAVE_REAL
 823
 824 /*! \brief Determine the distances and right-hand side for the next iteration. */
 825 gmx_unused static void calc_dist_iter(int                           b0,
 826                                       int                           b1,
 827                                       gmx::ArrayRef<const AtomPair> atoms,
 828                                       const rvec* gmx_restrict xp,
 829                                       const real* gmx_restrict bllen,
 830                                       const real* gmx_restrict blc,
 831                                       const t_pbc*             pbc,
 832                                       real                     wfac,
 833                                       real* gmx_restrict rhs,
 834                                       real* gmx_restrict sol,
 835                                       bool*              bWarn)
 836 {
 837     for (int b = b0; b < b1; b++)
 838     {
 839         real len = bllen[b];
 840         rvec dx;
 841         if (pbc)
 842         {
 843             pbc_dx_aiuc(pbc, xp[atoms[b].index1], xp[atoms[b].index2], dx);
 844         }
 845         else
 846         {
 847             rvec_sub(xp[atoms[b].index1], xp[atoms[b].index2], dx);
 848         }
 849         real len2  = len * len;
 850         real dlen2 = 2 * len2 - ::norm2(dx);
 851         if (dlen2 < wfac * len2)
 852         {
 853             /* not race free - see detailed comment in caller */
 854             *bWarn = TRUE;
 855         }
 856         real mvb;
 857         if (dlen2 > 0)
 858         {
 859             mvb = blc[b] * (len - dlen2 * gmx::invsqrt(dlen2));
 860         }
 861         else
 862         {
 863             mvb = blc[b] * len;
 864         }
 865         rhs[b] = mvb;
 866         sol[b] = mvb;
 867     } /* 20*ncons flops */
 868 }
 869
 870 #if GMX_SIMD_HAVE_REAL
 871 /*! \brief As calc_dist_iter(), but using SIMD intrinsics. */
 872 static void gmx_simdcall calc_dist_iter_simd(int                           b0,
 873                                              int                           b1,
 874                                              gmx::ArrayRef<const AtomPair> atoms,
 875                                              const rvec* gmx_restrict x,
 876                                              const real* gmx_restrict bllen,
 877                                              const real* gmx_restrict blc,
 878                                              const real*              pbc_simd,
 879                                              real                     wfac,
 880                                              real* gmx_restrict rhs,
 881                                              real* gmx_restrict sol,
 882                                              bool*              bWarn)
 883 {
 884     SimdReal min_S(GMX_REAL_MIN);
 885     SimdReal two_S(2.0);
 886     SimdReal wfac_S(wfac);
 887     SimdBool warn_S;
 888
 889     assert(b0 % GMX_SIMD_REAL_WIDTH == 0);
 890
 891     /* Initialize all to FALSE */
 892     warn_S = (two_S < setZero());
 893
 894     for (int bs = b0; bs < b1; bs += GMX_SIMD_REAL_WIDTH)
 895     {
 896         SimdReal                                 x0_S, y0_S, z0_S;
 897         SimdReal                                 x1_S, y1_S, z1_S;
 898         SimdReal                                 rx_S, ry_S, rz_S, n2_S;
 899         SimdReal                                 len_S, len2_S, dlen2_S, lc_S, blc_S;
 900         alignas(GMX_SIMD_ALIGNMENT) std::int32_t offset0[GMX_SIMD_REAL_WIDTH];
 901         alignas(GMX_SIMD_ALIGNMENT) std::int32_t offset1[GMX_SIMD_REAL_WIDTH];
 902
 903         for (int i = 0; i < GMX_SIMD_REAL_WIDTH; i++)
 904         {
 905             offset0[i] = atoms[bs + i].index1;
 906             offset1[i] = atoms[bs + i].index2;
 907         }
 908
 909         gatherLoadUTransposeTSANSafe<3>(reinterpret_cast<const real*>(x), offset0, &x0_S, &y0_S, &z0_S);
 910         gatherLoadUTransposeTSANSafe<3>(reinterpret_cast<const real*>(x), offset1, &x1_S, &y1_S, &z1_S);
 911
 912         rx_S = x0_S - x1_S;
 913         ry_S = y0_S - y1_S;
 914         rz_S = z0_S - z1_S;
 915
 916         pbc_correct_dx_simd(&rx_S, &ry_S, &rz_S, pbc_simd);
 917
 918         n2_S = norm2(rx_S, ry_S, rz_S);
 919
 920         len_S  = load<SimdReal>(bllen + bs);
 921         len2_S = len_S * len_S;
 922
 923         dlen2_S = fms(two_S, len2_S, n2_S);
 924
 925         warn_S = warn_S || (dlen2_S < (wfac_S * len2_S));
 926
 927         /* Avoid 1/0 by taking the max with REAL_MIN.
 928          * Note: when dlen2 is close to zero (90 degree constraint rotation),
 929          * the accuracy of the algorithm is no longer relevant.
 930          */
 931         dlen2_S = max(dlen2_S, min_S);
 932
 933         lc_S = fnma(dlen2_S, invsqrt(dlen2_S), len_S);
 934
 935         blc_S = load<SimdReal>(blc + bs);
 936
 937         lc_S = blc_S * lc_S;
 938
 939         store(rhs + bs, lc_S);
 940         store(sol + bs, lc_S);
 941     }
 942
 943     if (anyTrue(warn_S))
 944     {
 945         *bWarn = TRUE;
 946     }
 947 }
 948 #endif // GMX_SIMD_HAVE_REAL
 949
 950 //! Implements LINCS constraining.
 951 static void do_lincs(ArrayRefWithPadding<const RVec> xPadded,
 952                      ArrayRefWithPadding<RVec>       xpPadded,
 953                      const matrix                    box,
 954                      t_pbc*                          pbc,
 955                      Lincs*                          lincsd,
 956                      int                             th,
 957                      const real*                     invmass,
 958                      const t_commrec*                cr,
 959                      bool                            bCalcDHDL,
 960                      real                            wangle,
 961                      bool*                           bWarn,
 962                      real                            invdt,
 963                      ArrayRef<RVec>                  vRef,
 964                      bool                            bCalcVir,
 965                      tensor                          vir_r_m_dr)
 966 {
 967     const rvec* x        = as_rvec_array(xPadded.paddedArrayRef().data());
 968     rvec*       xp       = as_rvec_array(xpPadded.paddedArrayRef().data());
 969     rvec* gmx_restrict v = as_rvec_array(vRef.data());
 970
 971     const int b0 = lincsd->task[th].b0;
 972     const int b1 = lincsd->task[th].b1;
 973
 974     gmx::ArrayRef<const AtomPair> atoms   = lincsd->atoms;
 975     gmx::ArrayRef<gmx::RVec>      r       = lincsd->tmpv;
 976     gmx::ArrayRef<const int>      blnr    = lincsd->blnr;
 977     gmx::ArrayRef<const int>      blbnb   = lincsd->blbnb;
 978     gmx::ArrayRef<const real>     blc     = lincsd->blc;
 979     gmx::ArrayRef<const real>     blmf    = lincsd->blmf;
 980     gmx::ArrayRef<const real>     bllen   = lincsd->bllen;
 981     gmx::ArrayRef<real>           blcc    = lincsd->tmpncc;
 982     gmx::ArrayRef<real>           rhs1    = lincsd->tmp1;
 983     gmx::ArrayRef<real>           rhs2    = lincsd->tmp2;
 984     gmx::ArrayRef<real>           sol     = lincsd->tmp3;
 985     gmx::ArrayRef<real>           blc_sol = lincsd->tmp4;
 986     gmx::ArrayRef<real>           mlambda = lincsd->mlambda;
 987     gmx::ArrayRef<const int>      nlocat  = lincsd->nlocat;
 988
 989 #if GMX_SIMD_HAVE_REAL
 990
 991     /* This SIMD code does the same as the plain-C code after the #else.
 992      * The only difference is that we always call pbc code, as with SIMD
 993      * the overhead of pbc computation (when not needed) is small.
 994      */
 995     alignas(GMX_SIMD_ALIGNMENT) real pbc_simd[9 * GMX_SIMD_REAL_WIDTH];
 996
 997     /* Convert the pbc struct for SIMD */
 998     set_pbc_simd(pbc, pbc_simd);
 999
1000     /* Compute normalized x i-j vectors, store in r.
1001      * Compute the inner product of r and xp i-j and store in rhs1.
1002      */
1003     calc_dr_x_xp_simd(b0, b1, atoms, x, xp, bllen.data(), blc.data(), pbc_simd,
1004                       as_rvec_array(r.data()), rhs1.data(), sol.data());
1005
1006 #else // GMX_SIMD_HAVE_REAL
1007
1008     if (pbc)
1009     {
1010         /* Compute normalized i-j vectors */
1011         for (int b = b0; b < b1; b++)
1012         {
1013             rvec dx;
1014             pbc_dx_aiuc(pbc, x[atoms[b].index1], x[atoms[b].index2], dx);
1015             unitv(dx, r[b]);
1016
1017             pbc_dx_aiuc(pbc, xp[atoms[b].index1], xp[atoms[b].index2], dx);
1018             real mvb = blc[b] * (::iprod(r[b], dx) - bllen[b]);
1019             rhs1[b]  = mvb;
1020             sol[b]   = mvb;
1021         }
1022     }
1023     else
1024     {
1025         /* Compute normalized i-j vectors */
1026         for (int b = b0; b < b1; b++)
1027         {
1028             int  i    = atoms[b].index1;
1029             int  j    = atoms[b].index2;
1030             real tmp0 = x[i][0] - x[j][0];
1031             real tmp1 = x[i][1] - x[j][1];
1032             real tmp2 = x[i][2] - x[j][2];
1033             real rlen = gmx::invsqrt(tmp0 * tmp0 + tmp1 * tmp1 + tmp2 * tmp2);
1034             r[b][0]   = rlen * tmp0;
1035             r[b][1]   = rlen * tmp1;
1036             r[b][2]   = rlen * tmp2;
1037             /* 16 ncons flops */
1038
1039             real mvb = blc[b]
1040                        * (r[b][0] * (xp[i][0] - xp[j][0]) + r[b][1] * (xp[i][1] - xp[j][1])
1041                           + r[b][2] * (xp[i][2] - xp[j][2]) - bllen[b]);
1042             rhs1[b] = mvb;
1043             sol[b]  = mvb;
1044             /* 10 flops */
1045         }
1046         /* Together: 26*ncons + 6*nrtot flops */
1047     }
1048
1049 #endif // GMX_SIMD_HAVE_REAL
1050
1051     if (lincsd->bTaskDep)
1052     {
1053         /* We need a barrier, since the matrix construction below
1054          * can access entries in r of other threads.
1055          */
1056 #pragma omp barrier
1057     }
1058
1059     /* Construct the (sparse) LINCS matrix */
1060     for (int b = b0; b < b1; b++)
1061     {
1062         for (int n = blnr[b]; n < blnr[b + 1]; n++)
1063         {
1064             blcc[n] = blmf[n] * gmx::dot(r[b], r[blbnb[n]]);
1065         }
1066     }
1067     /* Together: 26*ncons + 6*nrtot flops */
1068
1069     lincs_matrix_expand(*lincsd, lincsd->task[th], blcc, rhs1, rhs2, sol);
1070     /* nrec*(ncons+2*nrtot) flops */
1071
1072 #if GMX_SIMD_HAVE_REAL
1073     for (int b = b0; b < b1; b += GMX_SIMD_REAL_WIDTH)
1074     {
1075         SimdReal t1 = load<SimdReal>(blc.data() + b);
1076         SimdReal t2 = load<SimdReal>(sol.data() + b);
1077         store(mlambda.data() + b, t1 * t2);
1078     }
1079 #else
1080     for (int b = b0; b < b1; b++)
1081     {
1082         mlambda[b] = blc[b] * sol[b];
1083     }
1084 #endif // GMX_SIMD_HAVE_REAL
1085
1086     /* Update the coordinates */
1087     lincs_update_atoms(lincsd, th, 1.0, mlambda, r, invmass, xp);
1088
1089     /*
1090      ********  Correction for centripetal effects  ********
1091      */
1092
1093     real wfac;
1094
1095     wfac = std::cos(DEG2RAD * wangle);
1096     wfac = wfac * wfac;
1097
1098     for (int iter = 0; iter < lincsd->nIter; iter++)
1099     {
1100         if ((lincsd->bCommIter && DOMAINDECOMP(cr) && cr->dd->constraints))
1101         {
1102 #pragma omp barrier
1103 #pragma omp master
1104             {
1105                 /* Communicate the corrected non-local coordinates */
1106                 if (DOMAINDECOMP(cr))
1107                 {
1108                     dd_move_x_constraints(cr->dd, box, xpPadded.unpaddedArrayRef(), ArrayRef<RVec>(), FALSE);
1109                 }
1110             }
1111 #pragma omp barrier
1112         }
1113         else if (lincsd->bTaskDep)
1114         {
1115 #pragma omp barrier
1116         }
1117
1118 #if GMX_SIMD_HAVE_REAL
1119         calc_dist_iter_simd(b0, b1, atoms, xp, bllen.data(), blc.data(), pbc_simd, wfac,
1120                             rhs1.data(), sol.data(), bWarn);
1121 #else
1122         calc_dist_iter(b0, b1, atoms, xp, bllen.data(), blc.data(), pbc, wfac, rhs1.data(),
1123                        sol.data(), bWarn);
1124         /* 20*ncons flops */
1125 #endif // GMX_SIMD_HAVE_REAL
1126
1127         lincs_matrix_expand(*lincsd, lincsd->task[th], blcc, rhs1, rhs2, sol);
1128         /* nrec*(ncons+2*nrtot) flops */
1129
1130 #if GMX_SIMD_HAVE_REAL
1131         for (int b = b0; b < b1; b += GMX_SIMD_REAL_WIDTH)
1132         {
1133             SimdReal t1  = load<SimdReal>(blc.data() + b);
1134             SimdReal t2  = load<SimdReal>(sol.data() + b);
1135             SimdReal mvb = t1 * t2;
1136             store(blc_sol.data() + b, mvb);
1137             store(mlambda.data() + b, load<SimdReal>(mlambda.data() + b) + mvb);
1138         }
1139 #else
1140         for (int b = b0; b < b1; b++)
1141         {
1142             real mvb   = blc[b] * sol[b];
1143             blc_sol[b] = mvb;
1144             mlambda[b] += mvb;
1145         }
1146 #endif // GMX_SIMD_HAVE_REAL
1147
1148         /* Update the coordinates */
1149         lincs_update_atoms(lincsd, th, 1.0, blc_sol, r, invmass, xp);
1150     }
1151     /* nit*ncons*(37+9*nrec) flops */
1152
1153     if (v != nullptr)
1154     {
1155         /* Update the velocities */
1156         lincs_update_atoms(lincsd, th, invdt, mlambda, r, invmass, v);
1157         /* 16 ncons flops */
1158     }
1159
1160     if (!nlocat.empty() && (bCalcDHDL || bCalcVir))
1161     {
1162         if (lincsd->bTaskDep)
1163         {
1164             /* In lincs_update_atoms threads might cross-read mlambda */
1165 #pragma omp barrier
1166         }
1167
1168         /* Only account for local atoms */
1169         for (int b = b0; b < b1; b++)
1170         {
1171             mlambda[b] *= 0.5 * nlocat[b];
1172         }
1173     }
1174
1175     if (bCalcDHDL)
1176     {
1177         real dhdl = 0;
1178         for (int b = b0; b < b1; b++)
1179         {
1180             /* Note that this this is dhdl*dt^2, the dt^2 factor is corrected
1181              * later after the contributions are reduced over the threads.
1182              */
1183             dhdl -= lincsd->mlambda[b] * lincsd->ddist[b];
1184         }
1185         lincsd->task[th].dhdlambda = dhdl;
1186     }
1187
1188     if (bCalcVir)
1189     {
1190         /* Constraint virial */
1191         for (int b = b0; b < b1; b++)
1192         {
1193             real tmp0 = -bllen[b] * mlambda[b];
1194             for (int i = 0; i < DIM; i++)
1195             {
1196                 real tmp1 = tmp0 * r[b][i];
1197                 for (int j = 0; j < DIM; j++)
1198                 {
1199                     vir_r_m_dr[i][j] -= tmp1 * r[b][j];
1200                 }
1201             }
1202         } /* 22 ncons flops */
1203     }
1204
1205     /* Total:
1206      * 26*ncons + 6*nrtot + nrec*(ncons+2*nrtot)
1207      * + nit * (20*ncons + nrec*(ncons+2*nrtot) + 17 ncons)
1208      *
1209      * (26+nrec)*ncons + (6+2*nrec)*nrtot
1210      * + nit * ((37+nrec)*ncons + 2*nrec*nrtot)
1211      * if nit=1
1212      * (63+nrec)*ncons + (6+4*nrec)*nrtot
1213      */
1214 }
1215
1216 /*! \brief Sets the elements in the LINCS matrix for task task. */
1217 static void set_lincs_matrix_task(Lincs* li, Task* li_task, const real* invmass, int* ncc_triangle, int* nCrossTaskTriangles)
1218 {
1219     /* Construct the coupling coefficient matrix blmf */
1220     li_task->ntriangle   = 0;
1221     *ncc_triangle        = 0;
1222     *nCrossTaskTriangles = 0;
1223     for (int i = li_task->b0; i < li_task->b1; i++)
1224     {
1225         const int a1 = li->atoms[i].index1;
1226         const int a2 = li->atoms[i].index2;
1227         for (int n = li->blnr[i]; n < li->blnr[i + 1]; n++)
1228         {
1229             const int k = li->blbnb[n];
1230
1231             /* If we are using multiple, independent tasks for LINCS,
1232              * the calls to check_assign_connected should have
1233              * put all connected constraints in our task.
1234              */
1235             assert(li->bTaskDep || (k >= li_task->b0 && k < li_task->b1));
1236
1237             int sign;
1238             if (a1 == li->atoms[k].index1 || a2 == li->atoms[k].index2)
1239             {
1240                 sign = -1;
1241             }
1242             else
1243             {
1244                 sign = 1;
1245             }
1246             int center;
1247             int end;
1248             if (a1 == li->atoms[k].index1 || a1 == li->atoms[k].index2)
1249             {
1250                 center = a1;
1251                 end    = a2;
1252             }
1253             else
1254             {
1255                 center = a2;
1256                 end    = a1;
1257             }
1258             li->blmf[n]  = sign * invmass[center] * li->blc[i] * li->blc[k];
1259             li->blmf1[n] = sign * 0.5;
1260             if (li->ncg_triangle > 0)
1261             {
1262                 /* Look for constraint triangles */
1263                 for (int nk = li->blnr[k]; nk < li->blnr[k + 1]; nk++)
1264                 {
1265                     const int kk = li->blbnb[nk];
1266                     if (kk != i && kk != k && (li->atoms[kk].index1 == end || li->atoms[kk].index2 == end))
1267                     {
1268                         /* Check if the constraints in this triangle actually
1269                          * belong to a different task. We still assign them
1270                          * here, since it's convenient for the triangle
1271                          * iterations, but we then need an extra barrier.
1272                          */
1273                         if (k < li_task->b0 || k >= li_task->b1 || kk < li_task->b0 || kk >= li_task->b1)
1274                         {
1275                             (*nCrossTaskTriangles)++;
1276                         }
1277
1278                         if (li_task->ntriangle == 0 || li_task->triangle[li_task->ntriangle - 1] < i)
1279                         {
1280                             /* Add this constraint to the triangle list */
1281                             li_task->triangle[li_task->ntriangle] = i;
1282                             li_task->tri_bits[li_task->ntriangle] = 0;
1283                             li_task->ntriangle++;
1284                             if (li->blnr[i + 1] - li->blnr[i]
1285                                 > static_cast<int>(sizeof(li_task->tri_bits[0]) * 8 - 1))
1286                             {
1287                                 gmx_fatal(FARGS,
1288                                           "A constraint is connected to %d constraints, this is "
1289                                           "more than the %zu allowed for constraints participating "
1290                                           "in triangles",
1291                                           li->blnr[i + 1] - li->blnr[i],
1292                                           sizeof(li_task->tri_bits[0]) * 8 - 1);
1293                             }
1294                         }
1295                         li_task->tri_bits[li_task->ntriangle - 1] |= (1 << (n - li->blnr[i]));
1296                         (*ncc_triangle)++;
1297                     }
1298                 }
1299             }
1300         }
1301     }
1302 }
1303
1304 /*! \brief Sets the elements in the LINCS matrix. */
1305 static void set_lincs_matrix(Lincs* li, real* invmass, real lambda)
1306 {
1307     const real invsqrt2 = 0.7071067811865475244;
1308
1309     for (int i = 0; (i < li->nc); i++)
1310     {
1311         const int a1 = li->atoms[i].index1;
1312         const int a2 = li->atoms[i].index2;
1313         li->blc[i]   = gmx::invsqrt(invmass[a1] + invmass[a2]);
1314         li->blc1[i]  = invsqrt2;
1315     }
1316
1317     /* Construct the coupling coefficient matrix blmf */
1318     int ntriangle = 0, ncc_triangle = 0, nCrossTaskTriangles = 0;
1319 #pragma omp parallel for reduction(+: ntriangle, ncc_triangle, nCrossTaskTriangles) num_threads(li->ntask) schedule(static)
1320     for (int th = 0; th < li->ntask; th++)
1321     {
1322         try
1323         {
1324             set_lincs_matrix_task(li, &li->task[th], invmass, &ncc_triangle, &nCrossTaskTriangles);
1325             ntriangle += li->task[th].ntriangle;
1326         }
1327         GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
1328     }
1329     li->ntriangle    = ntriangle;
1330     li->ncc_triangle = ncc_triangle;
1331     li->bTaskDepTri  = (nCrossTaskTriangles > 0);
1332
1333     if (debug)
1334     {
1335         fprintf(debug, "The %d constraints participate in %d triangles\n", li->nc, li->ntriangle);
1336         fprintf(debug, "There are %d constraint couplings, of which %d in triangles\n", li->ncc,
1337                 li->ncc_triangle);
1338         if (li->ntriangle > 0 && li->ntask > 1)
1339         {
1340             fprintf(debug,
1341                     "%d constraint triangles contain constraints assigned to different tasks\n",
1342                     nCrossTaskTriangles);
1343         }
1344     }
1345
1346     /* Set matlam,
1347      * so we know with which lambda value the masses have been set.
1348      */
1349     li->matlam = lambda;
1350 }
1351
1352 //! Finds all triangles of atoms that share constraints to a central atom.
1353 static int count_triangle_constraints(const InteractionLists& ilist, const ListOfLists<int>& at2con)
1354 {
1355     const int ncon1    = ilist[F_CONSTR].size() / 3;
1356     const int ncon_tot = ncon1 + ilist[F_CONSTRNC].size() / 3;
1357
1358     gmx::ArrayRef<const int> ia1 = ilist[F_CONSTR].iatoms;
1359     gmx::ArrayRef<const int> ia2 = ilist[F_CONSTRNC].iatoms;
1360
1361     int ncon_triangle = 0;
1362     for (int c0 = 0; c0 < ncon_tot; c0++)
1363     {
1364         bool       bTriangle = FALSE;
1365         const int* iap       = constr_iatomptr(ia1, ia2, c0);
1366         const int  a00       = iap[1];
1367         const int  a01       = iap[2];
1368         for (const int c1 : at2con[a01])
1369         {
1370             if (c1 != c0)
1371             {
1372                 const int* iap = constr_iatomptr(ia1, ia2, c1);
1373                 const int  a10 = iap[1];
1374                 const int  a11 = iap[2];
1375                 int        ac1;
1376                 if (a10 == a01)
1377                 {
1378                     ac1 = a11;
1379                 }
1380                 else
1381                 {
1382                     ac1 = a10;
1383                 }
1384                 for (const int c2 : at2con[ac1])
1385                 {
1386                     if (c2 != c0 && c2 != c1)
1387                     {
1388                         const int* iap = constr_iatomptr(ia1, ia2, c2);
1389                         const int  a20 = iap[1];
1390                         const int  a21 = iap[2];
1391                         if (a20 == a00 || a21 == a00)
1392                         {
1393                             bTriangle = TRUE;
1394                         }
1395                     }
1396                 }
1397             }
1398         }
1399         if (bTriangle)
1400         {
1401             ncon_triangle++;
1402         }
1403     }
1404
1405     return ncon_triangle;
1406 }
1407
1408 //! Finds sequences of sequential constraints.
1409 static bool more_than_two_sequential_constraints(const InteractionLists& ilist, const ListOfLists<int>& at2con)
1410 {
1411     const int ncon1    = ilist[F_CONSTR].size() / 3;
1412     const int ncon_tot = ncon1 + ilist[F_CONSTRNC].size() / 3;
1413
1414     gmx::ArrayRef<const int> ia1 = ilist[F_CONSTR].iatoms;
1415     gmx::ArrayRef<const int> ia2 = ilist[F_CONSTRNC].iatoms;
1416
1417     for (int c = 0; c < ncon_tot; c++)
1418     {
1419         const int* iap = constr_iatomptr(ia1, ia2, c);
1420         const int  a1  = iap[1];
1421         const int  a2  = iap[2];
1422         /* Check if this constraint has constraints connected at both atoms */
1423         if (at2con[a1].ssize() > 1 && at2con[a2].ssize() > 1)
1424         {
1425             return true;
1426         }
1427     }
1428
1429     return false;
1430 }
1431
1432 Lincs* init_lincs(FILE*                            fplog,
1433                   const gmx_mtop_t&                mtop,
1434                   int                              nflexcon_global,
1435                   ArrayRef<const ListOfLists<int>> atomToConstraintsPerMolType,
1436                   bool                             bPLINCS,
1437                   int                              nIter,
1438                   int                              nProjOrder)
1439 {
1440     // TODO this should become a unique_ptr
1441     Lincs* li;
1442     bool   bMoreThanTwoSeq;
1443
1444     if (fplog)
1445     {
1446         fprintf(fplog, "\nInitializing%s LINear Constraint Solver\n", bPLINCS ? " Parallel" : "");
1447     }
1448
1449     li = new Lincs;
1450
1451     li->ncg      = gmx_mtop_ftype_count(mtop, F_CONSTR) + gmx_mtop_ftype_count(mtop, F_CONSTRNC);
1452     li->ncg_flex = nflexcon_global;
1453
1454     li->nIter  = nIter;
1455     li->nOrder = nProjOrder;
1456
1457     li->max_connect = 0;
1458     for (size_t mt = 0; mt < mtop.moltype.size(); mt++)
1459     {
1460         const auto& at2con = atomToConstraintsPerMolType[mt];
1461         for (int a = 0; a < mtop.moltype[mt].atoms.nr; a++)
1462         {
1463             li->max_connect = std::max(li->max_connect, int(at2con[a].ssize()));
1464         }
1465     }
1466
1467     li->ncg_triangle = 0;
1468     bMoreThanTwoSeq  = FALSE;
1469     for (const gmx_molblock_t& molb : mtop.molblock)
1470     {
1471         const gmx_moltype_t& molt   = mtop.moltype[molb.type];
1472         const auto&          at2con = atomToConstraintsPerMolType[molb.type];
1473
1474         li->ncg_triangle += molb.nmol * count_triangle_constraints(molt.ilist, at2con);
1475
1476         if (!bMoreThanTwoSeq && more_than_two_sequential_constraints(molt.ilist, at2con))
1477         {
1478             bMoreThanTwoSeq = TRUE;
1479         }
1480     }
1481
1482     /* Check if we need to communicate not only before LINCS,
1483      * but also before each iteration.
1484      * The check for only two sequential constraints is only
1485      * useful for the common case of H-bond only constraints.
1486      * With more effort we could also make it useful for small
1487      * molecules with nr. sequential constraints <= nOrder-1.
1488      */
1489     li->bCommIter = (bPLINCS && (li->nOrder < 1 || bMoreThanTwoSeq));
1490
1491     if (debug && bPLINCS)
1492     {
1493         fprintf(debug, "PLINCS communication before each iteration: %d\n", static_cast<int>(li->bCommIter));
1494     }
1495
1496     /* LINCS can run on any number of threads.
1497      * Currently the number is fixed for the whole simulation,
1498      * but it could be set in set_lincs().
1499      * The current constraint to task assignment code can create independent
1500      * tasks only when not more than two constraints are connected sequentially.
1501      */
1502     li->ntask    = gmx_omp_nthreads_get(emntLINCS);
1503     li->bTaskDep = (li->ntask > 1 && bMoreThanTwoSeq);
1504     if (debug)
1505     {
1506         fprintf(debug, "LINCS: using %d threads, tasks are %sdependent\n", li->ntask,
1507                 li->bTaskDep ? "" : "in");
1508     }
1509     if (li->ntask == 1)
1510     {
1511         li->task.resize(1);
1512     }
1513     else
1514     {
1515         /* Allocate an extra elements for "task-overlap" constraints */
1516         li->task.resize(li->ntask + 1);
1517     }
1518
1519     if (bPLINCS || li->ncg_triangle > 0)
1520     {
1521         please_cite(fplog, "Hess2008a");
1522     }
1523     else
1524     {
1525         please_cite(fplog, "Hess97a");
1526     }
1527
1528     if (fplog)
1529     {
1530         fprintf(fplog, "The number of constraints is %d\n", li->ncg);
1531         if (bPLINCS)
1532         {
1533             fprintf(fplog,
1534                     "There are constraints between atoms in different decomposition domains,\n"
1535                     "will communicate selected coordinates each lincs iteration\n");
1536         }
1537         if (li->ncg_triangle > 0)
1538         {
1539             fprintf(fplog,
1540                     "%d constraints are involved in constraint triangles,\n"
1541                     "will apply an additional matrix expansion of order %d for couplings\n"
1542                     "between constraints inside triangles\n",
1543                     li->ncg_triangle, li->nOrder);
1544         }
1545     }
1546
1547     return li;
1548 }
1549
1550 void done_lincs(Lincs* li)
1551 {
1552     delete li;
1553 }
1554
1555 /*! \brief Sets up the work division over the threads. */
1556 static void lincs_thread_setup(Lincs* li, int natoms)
1557 {
1558     li->atf.resize(natoms);
1559
1560     gmx::ArrayRef<gmx_bitmask_t> atf = li->atf;
1561
1562     /* Clear the atom flags */
1563     for (gmx_bitmask_t& mask : atf)
1564     {
1565         bitmask_clear(&mask);
1566     }
1567
1568     if (li->ntask > BITMASK_SIZE)
1569     {
1570         gmx_fatal(FARGS, "More than %d threads is not supported for LINCS.", BITMASK_SIZE);
1571     }
1572
1573     for (int th = 0; th < li->ntask; th++)
1574     {
1575         const Task* li_task = &li->task[th];
1576
1577         /* For each atom set a flag for constraints from each */
1578         for (int b = li_task->b0; b < li_task->b1; b++)
1579         {
1580             bitmask_set_bit(&atf[li->atoms[b].index1], th);
1581             bitmask_set_bit(&atf[li->atoms[b].index2], th);
1582         }
1583     }
1584
1585 #pragma omp parallel for num_threads(li->ntask) schedule(static)
1586     for (int th = 0; th < li->ntask; th++)
1587     {
1588         try
1589         {
1590             Task*         li_task;
1591             gmx_bitmask_t mask;
1592             int           b;
1593
1594             li_task = &li->task[th];
1595
1596             bitmask_init_low_bits(&mask, th);
1597
1598             li_task->ind.clear();
1599             li_task->ind_r.clear();
1600             for (b = li_task->b0; b < li_task->b1; b++)
1601             {
1602                 /* We let the constraint with the lowest thread index
1603                  * operate on atoms with constraints from multiple threads.
1604                  */
1605                 if (bitmask_is_disjoint(atf[li->atoms[b].index1], mask)
1606                     && bitmask_is_disjoint(atf[li->atoms[b].index2], mask))
1607                 {
1608                     /* Add the constraint to the local atom update index */
1609                     li_task->ind.push_back(b);
1610                 }
1611                 else
1612                 {
1613                     /* Add the constraint to the rest block */
1614                     li_task->ind_r.push_back(b);
1615                 }
1616             }
1617         }
1618         GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
1619     }
1620
1621     /* We need to copy all constraints which have not be assigned
1622      * to a thread to a separate list which will be handled by one thread.
1623      */
1624     Task* li_m = &li->task[li->ntask];
1625
1626     li_m->ind.clear();
1627     for (int th = 0; th < li->ntask; th++)
1628     {
1629         const Task& li_task = li->task[th];
1630
1631         for (int ind_r : li_task.ind_r)
1632         {
1633             li_m->ind.push_back(ind_r);
1634         }
1635
1636         if (debug)
1637         {
1638             fprintf(debug, "LINCS thread %d: %zu constraints\n", th, li_task.ind.size());
1639         }
1640     }
1641
1642     if (debug)
1643     {
1644         fprintf(debug, "LINCS thread r: %zu constraints\n", li_m->ind.size());
1645     }
1646 }
1647
1648 //! Assign a constraint.
1649 static void assign_constraint(Lincs*                  li,
1650                               int                     constraint_index,
1651                               int                     a1,
1652                               int                     a2,
1653                               real                    lenA,
1654                               real                    lenB,
1655                               const ListOfLists<int>& at2con)
1656 {
1657     int con;
1658
1659     con = li->nc;
1660
1661     /* Make an mapping of local topology constraint index to LINCS index */
1662     li->con_index[constraint_index] = con;
1663
1664     li->bllen0[con] = lenA;
1665     li->ddist[con]  = lenB - lenA;
1666     /* Set the length to the topology A length */
1667     li->bllen[con]        = lenA;
1668     li->atoms[con].index1 = a1;
1669     li->atoms[con].index2 = a2;
1670
1671     /* Make space in the constraint connection matrix for constraints
1672      * connected to both end of the current constraint.
1673      */
1674     li->ncc += at2con[a1].ssize() - 1 + at2con[a2].ssize() - 1;
1675
1676     li->blnr[con + 1] = li->ncc;
1677
1678     /* Increase the constraint count */
1679     li->nc++;
1680 }
1681
1682 /*! \brief Check if constraint with topology index constraint_index is connected
1683  * to other constraints, and if so add those connected constraints to our task. */
1684 static void check_assign_connected(Lincs*                  li,
1685                                    const t_iatom*          iatom,
1686                                    const t_idef&           idef,
1687                                    bool                    bDynamics,
1688                                    int                     a1,
1689                                    int                     a2,
1690                                    const ListOfLists<int>& at2con)
1691 {
1692     /* Currently this function only supports constraint groups
1693      * in which all constraints share at least one atom
1694      * (e.g. H-bond constraints).
1695      * Check both ends of the current constraint for
1696      * connected constraints. We need to assign those
1697      * to the same task.
1698      */
1699     for (int end = 0; end < 2; end++)
1700     {
1701         const int a = (end == 0 ? a1 : a2);
1702
1703         for (const int cc : at2con[a])
1704         {
1705             /* Check if constraint cc has not yet been assigned */
1706             if (li->con_index[cc] == -1)
1707             {
1708                 const int  type = iatom[cc * 3];
1709                 const real lenA = idef.iparams[type].constr.dA;
1710                 const real lenB = idef.iparams[type].constr.dB;
1711
1712                 if (bDynamics || lenA != 0 || lenB != 0)
1713                 {
1714                     assign_constraint(li, cc, iatom[3 * cc + 1], iatom[3 * cc + 2], lenA, lenB, at2con);
1715                 }
1716             }
1717         }
1718     }
1719 }
1720
1721 /*! \brief Check if constraint with topology index constraint_index is involved
1722  * in a constraint triangle, and if so add the other two constraints
1723  * in the triangle to our task. */
1724 static void check_assign_triangle(Lincs*                  li,
1725                                   const t_iatom*          iatom,
1726                                   const t_idef&           idef,
1727                                   bool                    bDynamics,
1728                                   int                     constraint_index,
1729                                   int                     a1,
1730                                   int                     a2,
1731                                   const ListOfLists<int>& at2con)
1732 {
1733     int nca, cc[32], ca[32];
1734     int c_triangle[2] = { -1, -1 };
1735
1736     nca = 0;
1737     for (const int c : at2con[a1])
1738     {
1739         if (c != constraint_index)
1740         {
1741             int aa1, aa2;
1742
1743             aa1 = iatom[c * 3 + 1];
1744             aa2 = iatom[c * 3 + 2];
1745             if (aa1 != a1)
1746             {
1747                 cc[nca] = c;
1748                 ca[nca] = aa1;
1749                 nca++;
1750             }
1751             if (aa2 != a1)
1752             {
1753                 cc[nca] = c;
1754                 ca[nca] = aa2;
1755                 nca++;
1756             }
1757         }
1758     }
1759
1760     for (const int c : at2con[a2])
1761     {
1762         if (c != constraint_index)
1763         {
1764             int aa1, aa2, i;
1765
1766             aa1 = iatom[c * 3 + 1];
1767             aa2 = iatom[c * 3 + 2];
1768             if (aa1 != a2)
1769             {
1770                 for (i = 0; i < nca; i++)
1771                 {
1772                     if (aa1 == ca[i])
1773                     {
1774                         c_triangle[0] = cc[i];
1775                         c_triangle[1] = c;
1776                     }
1777                 }
1778             }
1779             if (aa2 != a2)
1780             {
1781                 for (i = 0; i < nca; i++)
1782                 {
1783                     if (aa2 == ca[i])
1784                     {
1785                         c_triangle[0] = cc[i];
1786                         c_triangle[1] = c;
1787                     }
1788                 }
1789             }
1790         }
1791     }
1792
1793     if (c_triangle[0] >= 0)
1794     {
1795         int end;
1796
1797         for (end = 0; end < 2; end++)
1798         {
1799             /* Check if constraint c_triangle[end] has not yet been assigned */
1800             if (li->con_index[c_triangle[end]] == -1)
1801             {
1802                 int  i, type;
1803                 real lenA, lenB;
1804
1805                 i    = c_triangle[end] * 3;
1806                 type = iatom[i];
1807                 lenA = idef.iparams[type].constr.dA;
1808                 lenB = idef.iparams[type].constr.dB;
1809
1810                 if (bDynamics || lenA != 0 || lenB != 0)
1811                 {
1812                     assign_constraint(li, c_triangle[end], iatom[i + 1], iatom[i + 2], lenA, lenB, at2con);
1813                 }
1814             }
1815         }
1816     }
1817 }
1818
1819 //! Sets matrix indices.
1820 static void set_matrix_indices(Lincs* li, const Task& li_task, const ListOfLists<int>& at2con, bool bSortMatrix)
1821 {
1822     for (int b = li_task.b0; b < li_task.b1; b++)
1823     {
1824         const int a1 = li->atoms[b].index1;
1825         const int a2 = li->atoms[b].index2;
1826
1827         int i = li->blnr[b];
1828         for (const int constraint : at2con[a1])
1829         {
1830             const int concon = li->con_index[constraint];
1831             if (concon != b)
1832             {
1833                 li->blbnb[i++] = concon;
1834             }
1835         }
1836         for (const int constraint : at2con[a2])
1837         {
1838             const int concon = li->con_index[constraint];
1839             if (concon != b)
1840             {
1841                 li->blbnb[i++] = concon;
1842             }
1843         }
1844
1845         if (bSortMatrix)
1846         {
1847             /* Order the blbnb matrix to optimize memory access */
1848             std::sort(li->blbnb.begin() + li->blnr[b], li->blbnb.begin() + li->blnr[b + 1]);
1849         }
1850     }
1851 }
1852
1853 void set_lincs(const t_idef& idef, const t_mdatoms& md, bool bDynamics, const t_commrec* cr, Lincs* li)
1854 {
1855     int      natoms;
1856     t_iatom* iatom;
1857
1858     li->nc_real = 0;
1859     li->nc      = 0;
1860     li->ncc     = 0;
1861     /* Zero the thread index ranges.
1862      * Otherwise without local constraints we could return with old ranges.
1863      */
1864     for (int i = 0; i < li->ntask; i++)
1865     {
1866         li->task[i].b0 = 0;
1867         li->task[i].b1 = 0;
1868         li->task[i].ind.clear();
1869     }
1870     if (li->ntask > 1)
1871     {
1872         li->task[li->ntask].ind.clear();
1873     }
1874
1875     /* This is the local topology, so there are only F_CONSTR constraints */
1876     if (idef.il[F_CONSTR].nr == 0)
1877     {
1878         /* There are no constraints,
1879          * we do not need to fill any data structures.
1880          */
1881         return;
1882     }
1883
1884     if (debug)
1885     {
1886         fprintf(debug, "Building the LINCS connectivity\n");
1887     }
1888
1889     if (DOMAINDECOMP(cr))
1890     {
1891         if (cr->dd->constraints)
1892         {
1893             int start;
1894
1895             dd_get_constraint_range(cr->dd, &start, &natoms);
1896         }
1897         else
1898         {
1899             natoms = dd_numHomeAtoms(*cr->dd);
1900         }
1901     }
1902     else
1903     {
1904         natoms = md.homenr;
1905     }
1906
1907     const ListOfLists<int> at2con =
1908             make_at2con(natoms, idef.il, idef.iparams, flexibleConstraintTreatment(bDynamics));
1909
1910     const int ncon_tot = idef.il[F_CONSTR].nr / 3;
1911
1912     /* Ensure we have enough padding for aligned loads for each thread */
1913     const int numEntries = ncon_tot + li->ntask * simd_width;
1914     li->con_index.resize(numEntries);
1915     li->bllen0.resize(numEntries);
1916     li->ddist.resize(numEntries);
1917     li->atoms.resize(numEntries);
1918     li->blc.resize(numEntries);
1919     li->blc1.resize(numEntries);
1920     li->blnr.resize(numEntries + 1);
1921     li->bllen.resize(numEntries);
1922     li->tmpv.resizeWithPadding(numEntries);
1923     if (DOMAINDECOMP(cr))
1924     {
1925         li->nlocat.resize(numEntries);
1926     }
1927     li->tmp1.resize(numEntries);
1928     li->tmp2.resize(numEntries);
1929     li->tmp3.resize(numEntries);
1930     li->tmp4.resize(numEntries);
1931     li->mlambda.resize(numEntries);
1932
1933     iatom = idef.il[F_CONSTR].iatoms;
1934
1935     li->blnr[0] = li->ncc;
1936
1937     /* Assign the constraints for li->ntask LINCS tasks.
1938      * We target a uniform distribution of constraints over the tasks.
1939      * Note that when flexible constraints are present, but are removed here
1940      * (e.g. because we are doing EM) we get imbalance, but since that doesn't
1941      * happen during normal MD, that's ok.
1942      */
1943
1944     /* Determine the number of constraints we need to assign here */
1945     int ncon_assign = ncon_tot;
1946     if (!bDynamics)
1947     {
1948         /* With energy minimization, flexible constraints are ignored
1949          * (and thus minimized, as they should be).
1950          */
1951         ncon_assign -= countFlexibleConstraints(idef.il, idef.iparams);
1952     }
1953
1954     /* Set the target constraint count per task to exactly uniform,
1955      * this might be overridden below.
1956      */
1957     int ncon_target = (ncon_assign + li->ntask - 1) / li->ntask;
1958
1959     /* Mark all constraints as unassigned by setting their index to -1 */
1960     for (int con = 0; con < ncon_tot; con++)
1961     {
1962         li->con_index[con] = -1;
1963     }
1964
1965     int con = 0;
1966     for (int th = 0; th < li->ntask; th++)
1967     {
1968         Task* li_task;
1969
1970         li_task = &li->task[th];
1971
1972 #if GMX_SIMD_HAVE_REAL
1973         /* With indepedent tasks we likely have H-bond constraints or constraint
1974          * pairs. The connected constraints will be pulled into the task, so the
1975          * constraints per task will often exceed ncon_target.
1976          * Triangle constraints can also increase the count, but there are
1977          * relatively few of those, so we usually expect to get ncon_target.
1978          */
1979         if (li->bTaskDep)
1980         {
1981             /* We round ncon_target to a multiple of GMX_SIMD_WIDTH,
1982              * since otherwise a lot of operations can be wasted.
1983              * There are several ways to round here, we choose the one
1984              * that alternates block sizes, which helps with Intel HT.
1985              */
1986             ncon_target = ((ncon_assign * (th + 1)) / li->ntask - li->nc_real + GMX_SIMD_REAL_WIDTH - 1)
1987                           & ~(GMX_SIMD_REAL_WIDTH - 1);
1988         }
1989 #endif // GMX_SIMD==2 && GMX_SIMD_HAVE_REAL
1990
1991         /* Continue filling the arrays where we left off with the previous task,
1992          * including padding for SIMD.
1993          */
1994         li_task->b0 = li->nc;
1995
1996         while (con < ncon_tot && li->nc - li_task->b0 < ncon_target)
1997         {
1998             if (li->con_index[con] == -1)
1999             {
2000                 int  type, a1, a2;
2001                 real lenA, lenB;
2002
2003                 type = iatom[3 * con];
2004                 a1   = iatom[3 * con + 1];
2005                 a2   = iatom[3 * con + 2];
2006                 lenA = idef.iparams[type].constr.dA;
2007                 lenB = idef.iparams[type].constr.dB;
2008                 /* Skip the flexible constraints when not doing dynamics */
2009                 if (bDynamics || lenA != 0 || lenB != 0)
2010                 {
2011                     assign_constraint(li, con, a1, a2, lenA, lenB, at2con);
2012
2013                     if (li->ntask > 1 && !li->bTaskDep)
2014                     {
2015                         /* We can generate independent tasks. Check if we
2016                          * need to assign connected constraints to our task.
2017                          */
2018                         check_assign_connected(li, iatom, idef, bDynamics, a1, a2, at2con);
2019                     }
2020                     if (li->ntask > 1 && li->ncg_triangle > 0)
2021                     {
2022                         /* Ensure constraints in one triangle are assigned
2023                          * to the same task.
2024                          */
2025                         check_assign_triangle(li, iatom, idef, bDynamics, con, a1, a2, at2con);
2026                     }
2027                 }
2028             }
2029
2030             con++;
2031         }
2032
2033         li_task->b1 = li->nc;
2034
2035         if (simd_width > 1)
2036         {
2037             /* Copy the last atom pair indices and lengths for constraints
2038              * up to a multiple of simd_width, such that we can do all
2039              * SIMD operations without having to worry about end effects.
2040              */
2041             int i, last;
2042
2043             li->nc = ((li_task->b1 + simd_width - 1) / simd_width) * simd_width;
2044             last   = li_task->b1 - 1;
2045             for (i = li_task->b1; i < li->nc; i++)
2046             {
2047                 li->atoms[i]    = li->atoms[last];
2048                 li->bllen0[i]   = li->bllen0[last];
2049                 li->ddist[i]    = li->ddist[last];
2050                 li->bllen[i]    = li->bllen[last];
2051                 li->blnr[i + 1] = li->blnr[last + 1];
2052             }
2053         }
2054
2055         /* Keep track of how many constraints we assigned */
2056         li->nc_real += li_task->b1 - li_task->b0;
2057
2058         if (debug)
2059         {
2060             fprintf(debug, "LINCS task %d constraints %d - %d\n", th, li_task->b0, li_task->b1);
2061         }
2062     }
2063
2064     assert(li->nc_real == ncon_assign);
2065
2066     bool bSortMatrix;
2067
2068     /* Without DD we order the blbnb matrix to optimize memory access.
2069      * With DD the overhead of sorting is more than the gain during access.
2070      */
2071     bSortMatrix = !DOMAINDECOMP(cr);
2072
2073     li->blbnb.resize(li->ncc);
2074
2075 #pragma omp parallel for num_threads(li->ntask) schedule(static)
2076     for (int th = 0; th < li->ntask; th++)
2077     {
2078         try
2079         {
2080             Task& li_task = li->task[th];
2081
2082             if (li->ncg_triangle > 0)
2083             {
2084                 /* This is allocating too much, but it is difficult to improve */
2085                 li_task.triangle.resize(li_task.b1 - li_task.b0);
2086                 li_task.tri_bits.resize(li_task.b1 - li_task.b0);
2087             }
2088
2089             set_matrix_indices(li, li_task, at2con, bSortMatrix);
2090         }
2091         GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
2092     }
2093
2094     if (cr->dd == nullptr)
2095     {
2096         /* Since the matrix is static, we should free some memory */
2097         li->blbnb.resize(li->ncc);
2098     }
2099
2100     li->blmf.resize(li->ncc);
2101     li->blmf1.resize(li->ncc);
2102     li->tmpncc.resize(li->ncc);
2103
2104     gmx::ArrayRef<const int> nlocat_dd = dd_constraints_nlocalatoms(cr->dd);
2105     if (!nlocat_dd.empty())
2106     {
2107         /* Convert nlocat from local topology to LINCS constraint indexing */
2108         for (con = 0; con < ncon_tot; con++)
2109         {
2110             li->nlocat[li->con_index[con]] = nlocat_dd[con];
2111         }
2112     }
2113     else
2114     {
2115         li->nlocat.clear();
2116     }
2117
2118     if (debug)
2119     {
2120         fprintf(debug, "Number of constraints is %d, padded %d, couplings %d\n", li->nc_real,
2121                 li->nc, li->ncc);
2122     }
2123
2124     if (li->ntask > 1)
2125     {
2126         lincs_thread_setup(li, md.nr);
2127     }
2128
2129     set_lincs_matrix(li, md.invmass, md.lambda);
2130 }
2131
2132 //! Issues a warning when LINCS constraints cannot be satisfied.
2133 static void lincs_warning(gmx_domdec_t*                 dd,
2134                           ArrayRef<const RVec>          x,
2135                           ArrayRef<const RVec>          xprime,
2136                           t_pbc*                        pbc,
2137                           int                           ncons,
2138                           gmx::ArrayRef<const AtomPair> atoms,
2139                           gmx::ArrayRef<const real>     bllen,
2140                           real                          wangle,
2141                           int                           maxwarn,
2142                           int*                          warncount)
2143 {
2144     real wfac = std::cos(DEG2RAD * wangle);
2145
2146     fprintf(stderr,
2147             "bonds that rotated more than %g degrees:\n"
2148             " atom 1 atom 2  angle  previous, current, constraint length\n",
2149             wangle);
2150
2151     for (int b = 0; b < ncons; b++)
2152     {
2153         const int i = atoms[b].index1;
2154         const int j = atoms[b].index2;
2155         rvec      v0;
2156         rvec      v1;
2157         if (pbc)
2158         {
2159             pbc_dx_aiuc(pbc, x[i], x[j], v0);
2160             pbc_dx_aiuc(pbc, xprime[i], xprime[j], v1);
2161         }
2162         else
2163         {
2164             rvec_sub(x[i], x[j], v0);
2165             rvec_sub(xprime[i], xprime[j], v1);
2166         }
2167         real d0     = norm(v0);
2168         real d1     = norm(v1);
2169         real cosine = ::iprod(v0, v1) / (d0 * d1);
2170         if (cosine < wfac)
2171         {
2172             fprintf(stderr, " %6d %6d  %5.1f  %8.4f %8.4f    %8.4f\n", ddglatnr(dd, i),
2173                     ddglatnr(dd, j), RAD2DEG * std::acos(cosine), d0, d1, bllen[b]);
2174             if (!std::isfinite(d1))
2175             {
2176                 gmx_fatal(FARGS, "Bond length not finite.");
2177             }
2178
2179             (*warncount)++;
2180         }
2181     }
2182     if (*warncount > maxwarn)
2183     {
2184         too_many_constraint_warnings(econtLINCS, *warncount);
2185     }
2186 }
2187
2188 //! Status information about how well LINCS satisified the constraints in this domain
2189 struct LincsDeviations
2190 {
2191     //! The maximum over all bonds in this domain of the relative deviation in bond lengths
2192     real maxDeviation = 0;
2193     //! Sum over all bonds in this domain of the squared relative deviation
2194     real sumSquaredDeviation = 0;
2195     //! Index of bond with max deviation
2196     int indexOfMaxDeviation = -1;
2197     //! Number of bonds constrained in this domain
2198     int numConstraints = 0;
2199 };
2200
2201 //! Determine how well the constraints have been satisfied.
2202 static LincsDeviations makeLincsDeviations(const Lincs& lincsd, ArrayRef<const RVec> x, const t_pbc* pbc)
2203 {
2204     LincsDeviations                result;
2205     const ArrayRef<const AtomPair> atoms  = lincsd.atoms;
2206     const ArrayRef<const real>     bllen  = lincsd.bllen;
2207     const ArrayRef<const int>      nlocat = lincsd.nlocat;
2208
2209     for (int task = 0; task < lincsd.ntask; task++)
2210     {
2211         for (int b = lincsd.task[task].b0; b < lincsd.task[task].b1; b++)
2212         {
2213             rvec dx;
2214             if (pbc)
2215             {
2216                 pbc_dx_aiuc(pbc, x[atoms[b].index1], x[atoms[b].index2], dx);
2217             }
2218             else
2219             {
2220                 rvec_sub(x[atoms[b].index1], x[atoms[b].index2], dx);
2221             }
2222             real r2  = ::norm2(dx);
2223             real len = r2 * gmx::invsqrt(r2);
2224             real d   = std::abs(len / bllen[b] - 1.0_real);
2225             if (d > result.maxDeviation && (nlocat.empty() || nlocat[b]))
2226             {
2227                 result.maxDeviation        = d;
2228                 result.indexOfMaxDeviation = b;
2229             }
2230             if (nlocat.empty())
2231             {
2232                 result.sumSquaredDeviation += d * d;
2233                 result.numConstraints++;
2234             }
2235             else
2236             {
2237                 result.sumSquaredDeviation += nlocat[b] * d * d;
2238                 result.numConstraints += nlocat[b];
2239             }
2240         }
2241     }
2242
2243     if (!nlocat.empty())
2244     {
2245         result.numConstraints /= 2;
2246         result.sumSquaredDeviation *= 0.5;
2247     }
2248     return result;
2249 }
2250
2251 bool constrain_lincs(bool                            computeRmsd,
2252                      const t_inputrec&               ir,
2253                      int64_t                         step,
2254                      Lincs*                          lincsd,
2255                      const t_mdatoms&                md,
2256                      const t_commrec*                cr,
2257                      const gmx_multisim_t*           ms,
2258                      ArrayRefWithPadding<const RVec> xPadded,
2259                      ArrayRefWithPadding<RVec>       xprimePadded,
2260                      ArrayRef<RVec>                  min_proj,
2261                      const matrix                    box,
2262                      t_pbc*                          pbc,
2263                      real                            lambda,
2264                      real*                           dvdlambda,
2265                      real                            invdt,
2266                      ArrayRef<RVec>                  v,
2267                      bool                            bCalcVir,
2268                      tensor                          vir_r_m_dr,
2269                      ConstraintVariable              econq,
2270                      t_nrnb*                         nrnb,
2271                      int                             maxwarn,
2272                      int*                            warncount)
2273 {
2274     bool bOK = TRUE;
2275
2276     /* This boolean should be set by a flag passed to this routine.
2277      * We can also easily check if any constraint length is changed,
2278      * if not dH/dlambda=0 and we can also set the boolean to FALSE.
2279      */
2280     bool bCalcDHDL = (ir.efep != efepNO && dvdlambda != nullptr);
2281
2282     if (lincsd->nc == 0 && cr->dd == nullptr)
2283     {
2284         if (computeRmsd)
2285         {
2286             lincsd->rmsdData = { { 0 } };
2287         }
2288
2289         return bOK;
2290     }
2291
2292     ArrayRef<const RVec> x      = xPadded.unpaddedArrayRef();
2293     ArrayRef<RVec>       xprime = xprimePadded.unpaddedArrayRef();
2294
2295     if (econq == ConstraintVariable::Positions)
2296     {
2297         /* We can't use bCalcDHDL here, since NULL can be passed for dvdlambda
2298          * also with efep!=fepNO.
2299          */
2300         if (ir.efep != efepNO)
2301         {
2302             if (md.nMassPerturbed && lincsd->matlam != md.lambda)
2303             {
2304                 set_lincs_matrix(lincsd, md.invmass, md.lambda);
2305             }
2306
2307             for (int i = 0; i < lincsd->nc; i++)
2308             {
2309                 lincsd->bllen[i] = lincsd->bllen0[i] + lambda * lincsd->ddist[i];
2310             }
2311         }
2312
2313         if (lincsd->ncg_flex)
2314         {
2315             /* Set the flexible constraint lengths to the old lengths */
2316             if (pbc != nullptr)
2317             {
2318                 for (int i = 0; i < lincsd->nc; i++)
2319                 {
2320                     if (lincsd->bllen[i] == 0)
2321                     {
2322                         rvec dx;
2323                         pbc_dx_aiuc(pbc, x[lincsd->atoms[i].index1], x[lincsd->atoms[i].index2], dx);
2324                         lincsd->bllen[i] = norm(dx);
2325                     }
2326                 }
2327             }
2328             else
2329             {
2330                 for (int i = 0; i < lincsd->nc; i++)
2331                 {
2332                     if (lincsd->bllen[i] == 0)
2333                     {
2334                         lincsd->bllen[i] = std::sqrt(
2335                                 distance2(x[lincsd->atoms[i].index1], x[lincsd->atoms[i].index2]));
2336                     }
2337                 }
2338             }
2339         }
2340
2341         const bool printDebugOutput = ((debug != nullptr) && lincsd->nc > 0);
2342         if (printDebugOutput)
2343         {
2344             LincsDeviations deviations = makeLincsDeviations(*lincsd, xprime, pbc);
2345             fprintf(debug, "   Rel. Constraint Deviation:  RMS         MAX     between atoms\n");
2346             fprintf(debug, "       Before LINCS          %.6f    %.6f %6d %6d\n",
2347                     std::sqrt(deviations.sumSquaredDeviation / deviations.numConstraints),
2348                     deviations.maxDeviation,
2349                     ddglatnr(cr->dd, lincsd->atoms[deviations.indexOfMaxDeviation].index1),
2350                     ddglatnr(cr->dd, lincsd->atoms[deviations.indexOfMaxDeviation].index2));
2351         }
2352
2353         /* This bWarn var can be updated by multiple threads
2354          * at the same time. But as we only need to detect
2355          * if a warning occurred or not, this is not an issue.
2356          */
2357         bool bWarn = FALSE;
2358
2359         /* The OpenMP parallel region of constrain_lincs for coords */
2360 #pragma omp parallel num_threads(lincsd->ntask)
2361         {
2362             try
2363             {
2364                 int th = gmx_omp_get_thread_num();
2365
2366                 clear_mat(lincsd->task[th].vir_r_m_dr);
2367
2368                 do_lincs(xPadded, xprimePadded, box, pbc, lincsd, th, md.invmass, cr, bCalcDHDL,
2369                          ir.LincsWarnAngle, &bWarn, invdt, v, bCalcVir,
2370                          th == 0 ? vir_r_m_dr : lincsd->task[th].vir_r_m_dr);
2371             }
2372             GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
2373         }
2374
2375         if (computeRmsd || printDebugOutput || bWarn)
2376         {
2377             LincsDeviations deviations = makeLincsDeviations(*lincsd, xprime, pbc);
2378
2379             if (computeRmsd)
2380             {
2381                 // This is reduced across domains in compute_globals and
2382                 // reported to the log file.
2383                 lincsd->rmsdData[0] = deviations.numConstraints;
2384                 lincsd->rmsdData[1] = deviations.sumSquaredDeviation;
2385             }
2386             else
2387             {
2388                 // This is never read
2389                 lincsd->rmsdData = { { 0 } };
2390             }
2391             if (printDebugOutput)
2392             {
2393                 fprintf(debug, "        After LINCS          %.6f    %.6f %6d %6d\n\n",
2394                         std::sqrt(deviations.sumSquaredDeviation / deviations.numConstraints),
2395                         deviations.maxDeviation,
2396                         ddglatnr(cr->dd, lincsd->atoms[deviations.indexOfMaxDeviation].index1),
2397                         ddglatnr(cr->dd, lincsd->atoms[deviations.indexOfMaxDeviation].index2));
2398             }
2399
2400             if (bWarn)
2401             {
2402                 if (maxwarn < INT_MAX)
2403                 {
2404                     std::string simMesg;
2405                     if (isMultiSim(ms))
2406                     {
2407                         simMesg += gmx::formatString(" in simulation %d", ms->sim);
2408                     }
2409                     fprintf(stderr,
2410                             "\nStep %" PRId64
2411                             ", time %g (ps)  LINCS WARNING%s\n"
2412                             "relative constraint deviation after LINCS:\n"
2413                             "rms %.6f, max %.6f (between atoms %d and %d)\n",
2414                             step, ir.init_t + step * ir.delta_t, simMesg.c_str(),
2415                             std::sqrt(deviations.sumSquaredDeviation / deviations.numConstraints),
2416                             deviations.maxDeviation,
2417                             ddglatnr(cr->dd, lincsd->atoms[deviations.indexOfMaxDeviation].index1),
2418                             ddglatnr(cr->dd, lincsd->atoms[deviations.indexOfMaxDeviation].index2));
2419
2420                     lincs_warning(cr->dd, x, xprime, pbc, lincsd->nc, lincsd->atoms, lincsd->bllen,
2421                                   ir.LincsWarnAngle, maxwarn, warncount);
2422                 }
2423                 bOK = (deviations.maxDeviation < 0.5);
2424             }
2425         }
2426
2427         if (lincsd->ncg_flex)
2428         {
2429             for (int i = 0; (i < lincsd->nc); i++)
2430             {
2431                 if (lincsd->bllen0[i] == 0 && lincsd->ddist[i] == 0)
2432                 {
2433                     lincsd->bllen[i] = 0;
2434                 }
2435             }
2436         }
2437     }
2438     else
2439     {
2440         /* The OpenMP parallel region of constrain_lincs for derivatives */
2441 #pragma omp parallel num_threads(lincsd->ntask)
2442         {
2443             try
2444             {
2445                 int th = gmx_omp_get_thread_num();
2446
2447                 do_lincsp(xPadded, xprimePadded, min_proj, pbc, lincsd, th, md.invmass, econq,
2448                           bCalcDHDL, bCalcVir, th == 0 ? vir_r_m_dr : lincsd->task[th].vir_r_m_dr);
2449             }
2450             GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
2451         }
2452     }
2453
2454     if (bCalcDHDL)
2455     {
2456         /* Reduce the dH/dlambda contributions over the threads */
2457         real dhdlambda;
2458         int  th;
2459
2460         dhdlambda = 0;
2461         for (th = 0; th < lincsd->ntask; th++)
2462         {
2463             dhdlambda += lincsd->task[th].dhdlambda;
2464         }
2465         if (econq == ConstraintVariable::Positions)
2466         {
2467             /* dhdlambda contains dH/dlambda*dt^2, correct for this */
2468             /* TODO This should probably use invdt, so that sd integrator scaling works properly */
2469             dhdlambda /= ir.delta_t * ir.delta_t;
2470         }
2471         *dvdlambda += dhdlambda;
2472     }
2473
2474     if (bCalcVir && lincsd->ntask > 1)
2475     {
2476         for (int i = 1; i < lincsd->ntask; i++)
2477         {
2478             m_add(vir_r_m_dr, lincsd->task[i].vir_r_m_dr, vir_r_m_dr);
2479         }
2480     }
2481
2482     /* count assuming nit=1 */
2483     inc_nrnb(nrnb, eNR_LINCS, lincsd->nc_real);
2484     inc_nrnb(nrnb, eNR_LINCSMAT, (2 + lincsd->nOrder) * lincsd->ncc);
2485     if (lincsd->ntriangle > 0)
2486     {
2487         inc_nrnb(nrnb, eNR_LINCSMAT, lincsd->nOrder * lincsd->ncc_triangle);
2488     }
2489     if (!v.empty())
2490     {
2491         inc_nrnb(nrnb, eNR_CONSTR_V, lincsd->nc_real * 2);
2492     }
2493     if (bCalcVir)
2494     {
2495         inc_nrnb(nrnb, eNR_CONSTR_VIR, lincsd->nc_real);
2496     }
2497
2498     return bOK;
2499 }
2500
2501 } // namespace gmx