src/gromacs/mdlib/vsite.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   5  * Copyright (c) 2001-2004, The GROMACS development team.
   6  * Copyright (c) 2013,2014,2015,2016,2017 The GROMACS development team.
   7  * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
   8  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   9  * and including many others, as listed in the AUTHORS file in the
  10  * top-level source directory and at http://www.gromacs.org.
  11  *
  12  * GROMACS is free software; you can redistribute it and/or
  13  * modify it under the terms of the GNU Lesser General Public License
  14  * as published by the Free Software Foundation; either version 2.1
  15  * of the License, or (at your option) any later version.
  16  *
  17  * GROMACS is distributed in the hope that it will be useful,
  18  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  19  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  20  * Lesser General Public License for more details.
  21  *
  22  * You should have received a copy of the GNU Lesser General Public
  23  * License along with GROMACS; if not, see
  24  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  25  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  26  *
  27  * If you want to redistribute modifications to GROMACS, please
  28  * consider that scientific software is very special. Version
  29  * control is crucial - bugs must be traceable. We will be happy to
  30  * consider code for inclusion in the official distribution, but
  31  * derived work must not be called official GROMACS. Details are found
  32  * in the README & COPYING files - if they are missing, get the
  33  * official version at http://www.gromacs.org.
  34  *
  35  * To help us fund GROMACS development, we humbly ask that you cite
  36  * the research papers on the package. Check out http://www.gromacs.org.
  37  */
  38 #include "gmxpre.h"
  39
  40 #include "vsite.h"
  41
  42 #include <cstdio>
  43
  44 #include <algorithm>
  45 #include <memory>
  46 #include <vector>
  47
  48 #include "gromacs/domdec/domdec.h"
  49 #include "gromacs/domdec/domdec_struct.h"
  50 #include "gromacs/gmxlib/network.h"
  51 #include "gromacs/gmxlib/nrnb.h"
  52 #include "gromacs/math/functions.h"
  53 #include "gromacs/math/vec.h"
  54 #include "gromacs/mdlib/gmx_omp_nthreads.h"
  55 #include "gromacs/mdtypes/commrec.h"
  56 #include "gromacs/mdtypes/mdatom.h"
  57 #include "gromacs/pbcutil/ishift.h"
  58 #include "gromacs/pbcutil/mshift.h"
  59 #include "gromacs/pbcutil/pbc.h"
  60 #include "gromacs/timing/wallcycle.h"
  61 #include "gromacs/topology/ifunc.h"
  62 #include "gromacs/topology/mtop_util.h"
  63 #include "gromacs/topology/topology.h"
  64 #include "gromacs/utility/exceptions.h"
  65 #include "gromacs/utility/fatalerror.h"
  66 #include "gromacs/utility/gmxassert.h"
  67 #include "gromacs/utility/gmxomp.h"
  68
  69 /* The strategy used here for assigning virtual sites to (thread-)tasks
  70  * is as follows:
  71  *
  72  * We divide the atom range that vsites operate on (natoms_local with DD,
  73  * 0 - last atom involved in vsites without DD) equally over all threads.
  74  *
  75  * Vsites in the local range constructed from atoms in the local range
  76  * and/or other vsites that are fully local are assigned to a simple,
  77  * independent task.
  78  *
  79  * Vsites that are not assigned after using the above criterion get assigned
  80  * to a so called "interdependent" thread task when none of the constructing
  81  * atoms is a vsite. These tasks are called interdependent, because one task
  82  * accesses atoms assigned to a different task/thread.
  83  * Note that this option is turned off with large (local) atom counts
  84  * to avoid high memory usage.
  85  *
  86  * Any remaining vsites are assigned to a separate master thread task.
  87  */
  88
  89 using gmx::ArrayRef;
  90 using gmx::RVec;
  91
  92 /*! \brief List of atom indices belonging to a task */
  93 struct AtomIndex
  94 {
  95     //! List of atom indices
  96     std::vector<int> atom;
  97 };
  98
  99 /*! \brief Data structure for thread tasks that use constructing atoms outside their own atom range */
 100 struct InterdependentTask
 101 {
 102     //! The interaction lists, only vsite entries are used
 103     InteractionLists ilist;
 104     //! Thread/task-local force buffer
 105     std::vector<RVec> force;
 106     //! The atom indices of the vsites of our task
 107     std::vector<int> vsite;
 108     //! Flags if elements in force are spread to or not
 109     std::vector<bool> use;
 110     //! The number of entries set to true in use
 111     int nuse = 0;
 112     //! Array of atoms indices, size nthreads, covering all nuse set elements in use
 113     std::vector<AtomIndex> atomIndex;
 114     //! List of tasks (force blocks) this task spread forces to
 115     std::vector<int> spreadTask;
 116     //! List of tasks that write to this tasks force block range
 117     std::vector<int> reduceTask;
 118 };
 119
 120 /*! \brief Vsite thread task data structure */
 121 struct VsiteThread
 122 {
 123     //! Start of atom range of this task
 124     int rangeStart;
 125     //! End of atom range of this task
 126     int rangeEnd;
 127     //! The interaction lists, only vsite entries are used
 128     std::array<InteractionList, F_NRE> ilist;
 129     //! Local fshift accumulation buffer
 130     rvec fshift[SHIFTS];
 131     //! Local virial dx*df accumulation buffer
 132     matrix dxdf;
 133     //! Tells if interdependent task idTask should be used (in addition to the rest of this task), this bool has the same value on all threads
 134     bool useInterdependentTask;
 135     //! Data for vsites that involve constructing atoms in the atom range of other threads/tasks
 136     InterdependentTask idTask;
 137
 138     /*! \brief Constructor */
 139     VsiteThread()
 140     {
 141         rangeStart = -1;
 142         rangeEnd   = -1;
 143         clear_rvecs(SHIFTS, fshift);
 144         clear_mat(dxdf);
 145         useInterdependentTask = false;
 146     }
 147 };
 148
 149 /*! \brief Returns the sum of the vsite ilist sizes over all vsite types
 150  *
 151  * \param[in] ilist  The interaction list
 152  */
 153 static int vsiteIlistNrCount(ArrayRef<const InteractionList> ilist)
 154 {
 155     int nr = 0;
 156     for (int ftype = c_ftypeVsiteStart; ftype < c_ftypeVsiteEnd; ftype++)
 157     {
 158         nr += ilist[ftype].size();
 159     }
 160
 161     return nr;
 162 }
 163
 164 static int pbc_rvec_sub(const t_pbc* pbc, const rvec xi, const rvec xj, rvec dx)
 165 {
 166     if (pbc)
 167     {
 168         return pbc_dx_aiuc(pbc, xi, xj, dx);
 169     }
 170     else
 171     {
 172         rvec_sub(xi, xj, dx);
 173         return CENTRAL;
 174     }
 175 }
 176
 177 static inline real inverseNorm(const rvec x)
 178 {
 179     return gmx::invsqrt(iprod(x, x));
 180 }
 181
 182 /* Vsite construction routines */
 183
 184 static void constr_vsite2(const rvec xi, const rvec xj, rvec x, real a, const t_pbc* pbc)
 185 {
 186     real b = 1 - a;
 187     /* 1 flop */
 188
 189     if (pbc)
 190     {
 191         rvec dx;
 192         pbc_dx_aiuc(pbc, xj, xi, dx);
 193         x[XX] = xi[XX] + a * dx[XX];
 194         x[YY] = xi[YY] + a * dx[YY];
 195         x[ZZ] = xi[ZZ] + a * dx[ZZ];
 196     }
 197     else
 198     {
 199         x[XX] = b * xi[XX] + a * xj[XX];
 200         x[YY] = b * xi[YY] + a * xj[YY];
 201         x[ZZ] = b * xi[ZZ] + a * xj[ZZ];
 202         /* 9 Flops */
 203     }
 204
 205     /* TOTAL: 10 flops */
 206 }
 207
 208 static void constr_vsite2FD(const rvec xi, const rvec xj, rvec x, real a, const t_pbc* pbc)
 209 {
 210     rvec xij;
 211     pbc_rvec_sub(pbc, xj, xi, xij);
 212     /* 3 flops */
 213
 214     const real b = a * inverseNorm(xij);
 215     /* 6 + 10 flops */
 216
 217     x[XX] = xi[XX] + b * xij[XX];
 218     x[YY] = xi[YY] + b * xij[YY];
 219     x[ZZ] = xi[ZZ] + b * xij[ZZ];
 220     /* 6 Flops */
 221
 222     /* TOTAL: 25 flops */
 223 }
 224
 225 static void constr_vsite3(const rvec xi, const rvec xj, const rvec xk, rvec x, real a, real b, const t_pbc* pbc)
 226 {
 227     real c = 1 - a - b;
 228     /* 2 flops */
 229
 230     if (pbc)
 231     {
 232         rvec dxj, dxk;
 233
 234         pbc_dx_aiuc(pbc, xj, xi, dxj);
 235         pbc_dx_aiuc(pbc, xk, xi, dxk);
 236         x[XX] = xi[XX] + a * dxj[XX] + b * dxk[XX];
 237         x[YY] = xi[YY] + a * dxj[YY] + b * dxk[YY];
 238         x[ZZ] = xi[ZZ] + a * dxj[ZZ] + b * dxk[ZZ];
 239     }
 240     else
 241     {
 242         x[XX] = c * xi[XX] + a * xj[XX] + b * xk[XX];
 243         x[YY] = c * xi[YY] + a * xj[YY] + b * xk[YY];
 244         x[ZZ] = c * xi[ZZ] + a * xj[ZZ] + b * xk[ZZ];
 245         /* 15 Flops */
 246     }
 247
 248     /* TOTAL: 17 flops */
 249 }
 250
 251 static void constr_vsite3FD(const rvec xi, const rvec xj, const rvec xk, rvec x, real a, real b, const t_pbc* pbc)
 252 {
 253     rvec xij, xjk, temp;
 254     real c;
 255
 256     pbc_rvec_sub(pbc, xj, xi, xij);
 257     pbc_rvec_sub(pbc, xk, xj, xjk);
 258     /* 6 flops */
 259
 260     /* temp goes from i to a point on the line jk */
 261     temp[XX] = xij[XX] + a * xjk[XX];
 262     temp[YY] = xij[YY] + a * xjk[YY];
 263     temp[ZZ] = xij[ZZ] + a * xjk[ZZ];
 264     /* 6 flops */
 265
 266     c = b * inverseNorm(temp);
 267     /* 6 + 10 flops */
 268
 269     x[XX] = xi[XX] + c * temp[XX];
 270     x[YY] = xi[YY] + c * temp[YY];
 271     x[ZZ] = xi[ZZ] + c * temp[ZZ];
 272     /* 6 Flops */
 273
 274     /* TOTAL: 34 flops */
 275 }
 276
 277 static void constr_vsite3FAD(const rvec xi, const rvec xj, const rvec xk, rvec x, real a, real b, const t_pbc* pbc)
 278 {
 279     rvec xij, xjk, xp;
 280     real a1, b1, c1, invdij;
 281
 282     pbc_rvec_sub(pbc, xj, xi, xij);
 283     pbc_rvec_sub(pbc, xk, xj, xjk);
 284     /* 6 flops */
 285
 286     invdij = inverseNorm(xij);
 287     c1     = invdij * invdij * iprod(xij, xjk);
 288     xp[XX] = xjk[XX] - c1 * xij[XX];
 289     xp[YY] = xjk[YY] - c1 * xij[YY];
 290     xp[ZZ] = xjk[ZZ] - c1 * xij[ZZ];
 291     a1     = a * invdij;
 292     b1     = b * inverseNorm(xp);
 293     /* 45 */
 294
 295     x[XX] = xi[XX] + a1 * xij[XX] + b1 * xp[XX];
 296     x[YY] = xi[YY] + a1 * xij[YY] + b1 * xp[YY];
 297     x[ZZ] = xi[ZZ] + a1 * xij[ZZ] + b1 * xp[ZZ];
 298     /* 12 Flops */
 299
 300     /* TOTAL: 63 flops */
 301 }
 302
 303 static void
 304 constr_vsite3OUT(const rvec xi, const rvec xj, const rvec xk, rvec x, real a, real b, real c, const t_pbc* pbc)
 305 {
 306     rvec xij, xik, temp;
 307
 308     pbc_rvec_sub(pbc, xj, xi, xij);
 309     pbc_rvec_sub(pbc, xk, xi, xik);
 310     cprod(xij, xik, temp);
 311     /* 15 Flops */
 312
 313     x[XX] = xi[XX] + a * xij[XX] + b * xik[XX] + c * temp[XX];
 314     x[YY] = xi[YY] + a * xij[YY] + b * xik[YY] + c * temp[YY];
 315     x[ZZ] = xi[ZZ] + a * xij[ZZ] + b * xik[ZZ] + c * temp[ZZ];
 316     /* 18 Flops */
 317
 318     /* TOTAL: 33 flops */
 319 }
 320
 321 static void constr_vsite4FD(const rvec   xi,
 322                             const rvec   xj,
 323                             const rvec   xk,
 324                             const rvec   xl,
 325                             rvec         x,
 326                             real         a,
 327                             real         b,
 328                             real         c,
 329                             const t_pbc* pbc)
 330 {
 331     rvec xij, xjk, xjl, temp;
 332     real d;
 333
 334     pbc_rvec_sub(pbc, xj, xi, xij);
 335     pbc_rvec_sub(pbc, xk, xj, xjk);
 336     pbc_rvec_sub(pbc, xl, xj, xjl);
 337     /* 9 flops */
 338
 339     /* temp goes from i to a point on the plane jkl */
 340     temp[XX] = xij[XX] + a * xjk[XX] + b * xjl[XX];
 341     temp[YY] = xij[YY] + a * xjk[YY] + b * xjl[YY];
 342     temp[ZZ] = xij[ZZ] + a * xjk[ZZ] + b * xjl[ZZ];
 343     /* 12 flops */
 344
 345     d = c * inverseNorm(temp);
 346     /* 6 + 10 flops */
 347
 348     x[XX] = xi[XX] + d * temp[XX];
 349     x[YY] = xi[YY] + d * temp[YY];
 350     x[ZZ] = xi[ZZ] + d * temp[ZZ];
 351     /* 6 Flops */
 352
 353     /* TOTAL: 43 flops */
 354 }
 355
 356 static void constr_vsite4FDN(const rvec   xi,
 357                              const rvec   xj,
 358                              const rvec   xk,
 359                              const rvec   xl,
 360                              rvec         x,
 361                              real         a,
 362                              real         b,
 363                              real         c,
 364                              const t_pbc* pbc)
 365 {
 366     rvec xij, xik, xil, ra, rb, rja, rjb, rm;
 367     real d;
 368
 369     pbc_rvec_sub(pbc, xj, xi, xij);
 370     pbc_rvec_sub(pbc, xk, xi, xik);
 371     pbc_rvec_sub(pbc, xl, xi, xil);
 372     /* 9 flops */
 373
 374     ra[XX] = a * xik[XX];
 375     ra[YY] = a * xik[YY];
 376     ra[ZZ] = a * xik[ZZ];
 377
 378     rb[XX] = b * xil[XX];
 379     rb[YY] = b * xil[YY];
 380     rb[ZZ] = b * xil[ZZ];
 381
 382     /* 6 flops */
 383
 384     rvec_sub(ra, xij, rja);
 385     rvec_sub(rb, xij, rjb);
 386     /* 6 flops */
 387
 388     cprod(rja, rjb, rm);
 389     /* 9 flops */
 390
 391     d = c * inverseNorm(rm);
 392     /* 5+5+1 flops */
 393
 394     x[XX] = xi[XX] + d * rm[XX];
 395     x[YY] = xi[YY] + d * rm[YY];
 396     x[ZZ] = xi[ZZ] + d * rm[ZZ];
 397     /* 6 Flops */
 398
 399     /* TOTAL: 47 flops */
 400 }
 401
 402
 403 static int constr_vsiten(const t_iatom* ia, ArrayRef<const t_iparams> ip, rvec* x, const t_pbc* pbc)
 404 {
 405     rvec x1, dx;
 406     dvec dsum;
 407     int  n3, av, ai;
 408     real a;
 409
 410     n3 = 3 * ip[ia[0]].vsiten.n;
 411     av = ia[1];
 412     ai = ia[2];
 413     copy_rvec(x[ai], x1);
 414     clear_dvec(dsum);
 415     for (int i = 3; i < n3; i += 3)
 416     {
 417         ai = ia[i + 2];
 418         a  = ip[ia[i]].vsiten.a;
 419         if (pbc)
 420         {
 421             pbc_dx_aiuc(pbc, x[ai], x1, dx);
 422         }
 423         else
 424         {
 425             rvec_sub(x[ai], x1, dx);
 426         }
 427         dsum[XX] += a * dx[XX];
 428         dsum[YY] += a * dx[YY];
 429         dsum[ZZ] += a * dx[ZZ];
 430         /* 9 Flops */
 431     }
 432
 433     x[av][XX] = x1[XX] + dsum[XX];
 434     x[av][YY] = x1[YY] + dsum[YY];
 435     x[av][ZZ] = x1[ZZ] + dsum[ZZ];
 436
 437     return n3;
 438 }
 439
 440 /*! \brief PBC modes for vsite construction and spreading */
 441 enum class PbcMode
 442 {
 443     all, // Apply normal, simple PBC for all vsites
 444     none // No PBC treatment needed
 445 };
 446
 447 /*! \brief Returns the PBC mode based on the system PBC and vsite properties
 448  *
 449  * \param[in] pbcPtr  A pointer to a PBC struct or nullptr when no PBC treatment is required
 450  */
 451 static PbcMode getPbcMode(const t_pbc* pbcPtr)
 452 {
 453     if (pbcPtr == nullptr)
 454     {
 455         return PbcMode::none;
 456     }
 457     else
 458     {
 459         return PbcMode::all;
 460     }
 461 }
 462
 463 static void construct_vsites_thread(rvec                            x[],
 464                                     real                            dt,
 465                                     rvec*                           v,
 466                                     ArrayRef<const t_iparams>       ip,
 467                                     ArrayRef<const InteractionList> ilist,
 468                                     const t_pbc*                    pbc_null)
 469 {
 470     real inv_dt;
 471     if (v != nullptr)
 472     {
 473         inv_dt = 1.0 / dt;
 474     }
 475     else
 476     {
 477         inv_dt = 1.0;
 478     }
 479
 480     const PbcMode pbcMode = getPbcMode(pbc_null);
 481     /* We need another pbc pointer, as with charge groups we switch per vsite */
 482     const t_pbc* pbc_null2 = pbc_null;
 483
 484     for (int ftype = c_ftypeVsiteStart; ftype < c_ftypeVsiteEnd; ftype++)
 485     {
 486         if (ilist[ftype].empty())
 487         {
 488             continue;
 489         }
 490
 491         { // TODO remove me
 492             int nra = interaction_function[ftype].nratoms;
 493             int inc = 1 + nra;
 494             int nr  = ilist[ftype].size();
 495
 496             const t_iatom* ia = ilist[ftype].iatoms.data();
 497
 498             for (int i = 0; i < nr;)
 499             {
 500                 int tp = ia[0];
 501                 /* The vsite and constructing atoms */
 502                 int avsite = ia[1];
 503                 int ai     = ia[2];
 504                 /* Constants for constructing vsites */
 505                 real a1 = ip[tp].vsite.a;
 506                 /* Copy the old position */
 507                 rvec xv;
 508                 copy_rvec(x[avsite], xv);
 509
 510                 /* Construct the vsite depending on type */
 511                 int  aj, ak, al;
 512                 real b1, c1;
 513                 switch (ftype)
 514                 {
 515                     case F_VSITE2:
 516                         aj = ia[3];
 517                         constr_vsite2(x[ai], x[aj], x[avsite], a1, pbc_null2);
 518                         break;
 519                     case F_VSITE2FD:
 520                         aj = ia[3];
 521                         constr_vsite2FD(x[ai], x[aj], x[avsite], a1, pbc_null2);
 522                         break;
 523                     case F_VSITE3:
 524                         aj = ia[3];
 525                         ak = ia[4];
 526                         b1 = ip[tp].vsite.b;
 527                         constr_vsite3(x[ai], x[aj], x[ak], x[avsite], a1, b1, pbc_null2);
 528                         break;
 529                     case F_VSITE3FD:
 530                         aj = ia[3];
 531                         ak = ia[4];
 532                         b1 = ip[tp].vsite.b;
 533                         constr_vsite3FD(x[ai], x[aj], x[ak], x[avsite], a1, b1, pbc_null2);
 534                         break;
 535                     case F_VSITE3FAD:
 536                         aj = ia[3];
 537                         ak = ia[4];
 538                         b1 = ip[tp].vsite.b;
 539                         constr_vsite3FAD(x[ai], x[aj], x[ak], x[avsite], a1, b1, pbc_null2);
 540                         break;
 541                     case F_VSITE3OUT:
 542                         aj = ia[3];
 543                         ak = ia[4];
 544                         b1 = ip[tp].vsite.b;
 545                         c1 = ip[tp].vsite.c;
 546                         constr_vsite3OUT(x[ai], x[aj], x[ak], x[avsite], a1, b1, c1, pbc_null2);
 547                         break;
 548                     case F_VSITE4FD:
 549                         aj = ia[3];
 550                         ak = ia[4];
 551                         al = ia[5];
 552                         b1 = ip[tp].vsite.b;
 553                         c1 = ip[tp].vsite.c;
 554                         constr_vsite4FD(x[ai], x[aj], x[ak], x[al], x[avsite], a1, b1, c1, pbc_null2);
 555                         break;
 556                     case F_VSITE4FDN:
 557                         aj = ia[3];
 558                         ak = ia[4];
 559                         al = ia[5];
 560                         b1 = ip[tp].vsite.b;
 561                         c1 = ip[tp].vsite.c;
 562                         constr_vsite4FDN(x[ai], x[aj], x[ak], x[al], x[avsite], a1, b1, c1, pbc_null2);
 563                         break;
 564                     case F_VSITEN: inc = constr_vsiten(ia, ip, x, pbc_null2); break;
 565                     default:
 566                         gmx_fatal(FARGS, "No such vsite type %d in %s, line %d", ftype, __FILE__, __LINE__);
 567                 }
 568
 569                 if (pbcMode == PbcMode::all)
 570                 {
 571                     /* Keep the vsite in the same periodic image as before */
 572                     rvec dx;
 573                     int  ishift = pbc_dx_aiuc(pbc_null, x[avsite], xv, dx);
 574                     if (ishift != CENTRAL)
 575                     {
 576                         rvec_add(xv, dx, x[avsite]);
 577                     }
 578                 }
 579                 if (v != nullptr)
 580                 {
 581                     /* Calculate velocity of vsite... */
 582                     rvec vv;
 583                     rvec_sub(x[avsite], xv, vv);
 584                     svmul(inv_dt, vv, v[avsite]);
 585                 }
 586
 587                 /* Increment loop variables */
 588                 i += inc;
 589                 ia += inc;
 590             }
 591         }
 592     }
 593 }
 594
 595 void construct_vsites(const gmx_vsite_t*              vsite,
 596                       rvec                            x[],
 597                       real                            dt,
 598                       rvec*                           v,
 599                       ArrayRef<const t_iparams>       ip,
 600                       ArrayRef<const InteractionList> ilist,
 601                       PbcType                         pbcType,
 602                       gmx_bool                        bMolPBC,
 603                       const t_commrec*                cr,
 604                       const matrix                    box)
 605 {
 606     const bool useDomdec = (vsite != nullptr && vsite->useDomdec);
 607     GMX_ASSERT(!useDomdec || (cr != nullptr && DOMAINDECOMP(cr)),
 608                "When vsites are set up with domain decomposition, we need a valid commrec");
 609     // TODO: Remove this assertion when we remove charge groups
 610     GMX_ASSERT(vsite != nullptr || pbcType == PbcType::No,
 611                "Without a vsite struct we can not do PBC (in case we have charge groups)");
 612
 613     t_pbc pbc, *pbc_null;
 614
 615     /* We only need to do pbc when we have inter-cg vsites.
 616      * Note that with domain decomposition we do not need to apply PBC here
 617      * when we have at least 3 domains along each dimension. Currently we
 618      * do not optimize this case.
 619      */
 620     if (pbcType != PbcType::No && (useDomdec || bMolPBC)
 621         && !(vsite != nullptr && vsite->numInterUpdategroupVsites == 0))
 622     {
 623         /* This is wasting some CPU time as we now do this multiple times
 624          * per MD step.
 625          */
 626         ivec null_ivec;
 627         clear_ivec(null_ivec);
 628         pbc_null = set_pbc_dd(&pbc, pbcType, useDomdec ? cr->dd->numCells : null_ivec, FALSE, box);
 629     }
 630     else
 631     {
 632         pbc_null = nullptr;
 633     }
 634
 635     if (useDomdec)
 636     {
 637         dd_move_x_vsites(cr->dd, box, x);
 638     }
 639
 640     if (vsite == nullptr || vsite->nthreads == 1)
 641     {
 642         construct_vsites_thread(x, dt, v, ip, ilist, pbc_null);
 643     }
 644     else
 645     {
 646 #pragma omp parallel num_threads(vsite->nthreads)
 647         {
 648             try
 649             {
 650                 const int          th    = gmx_omp_get_thread_num();
 651                 const VsiteThread& tData = *vsite->tData[th];
 652                 GMX_ASSERT(tData.rangeStart >= 0,
 653                            "The thread data should be initialized before calling construct_vsites");
 654
 655                 construct_vsites_thread(x, dt, v, ip, tData.ilist, pbc_null);
 656                 if (tData.useInterdependentTask)
 657                 {
 658                     /* Here we don't need a barrier (unlike the spreading),
 659                      * since both tasks only construct vsites from particles,
 660                      * or local vsites, not from non-local vsites.
 661                      */
 662                     construct_vsites_thread(x, dt, v, ip, tData.idTask.ilist, pbc_null);
 663                 }
 664             }
 665             GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
 666         }
 667         /* Now we can construct the vsites that might depend on other vsites */
 668         construct_vsites_thread(x, dt, v, ip, vsite->tData[vsite->nthreads]->ilist, pbc_null);
 669     }
 670 }
 671
 672 static void spread_vsite2(const t_iatom  ia[],
 673                           real           a,
 674                           const rvec     x[],
 675                           rvec           f[],
 676                           rvec           fshift[],
 677                           const t_pbc*   pbc,
 678                           const t_graph* g)
 679 {
 680     rvec    fi, fj, dx;
 681     t_iatom av, ai, aj;
 682     ivec    di;
 683     int     siv, sij;
 684
 685     av = ia[1];
 686     ai = ia[2];
 687     aj = ia[3];
 688
 689     svmul(1 - a, f[av], fi);
 690     svmul(a, f[av], fj);
 691     /* 7 flop */
 692
 693     rvec_inc(f[ai], fi);
 694     rvec_inc(f[aj], fj);
 695     /* 6 Flops */
 696
 697     if (g)
 698     {
 699         ivec_sub(SHIFT_IVEC(g, ai), SHIFT_IVEC(g, av), di);
 700         siv = IVEC2IS(di);
 701         ivec_sub(SHIFT_IVEC(g, ai), SHIFT_IVEC(g, aj), di);
 702         sij = IVEC2IS(di);
 703     }
 704     else if (pbc)
 705     {
 706         siv = pbc_dx_aiuc(pbc, x[ai], x[av], dx);
 707         sij = pbc_dx_aiuc(pbc, x[ai], x[aj], dx);
 708     }
 709     else
 710     {
 711         siv = CENTRAL;
 712         sij = CENTRAL;
 713     }
 714
 715     if (fshift && (siv != CENTRAL || sij != CENTRAL))
 716     {
 717         rvec_inc(fshift[siv], f[av]);
 718         rvec_dec(fshift[CENTRAL], fi);
 719         rvec_dec(fshift[sij], fj);
 720     }
 721
 722     /* TOTAL: 13 flops */
 723 }
 724
 725 void constructVsitesGlobal(const gmx_mtop_t& mtop, gmx::ArrayRef<gmx::RVec> x)
 726 {
 727     GMX_ASSERT(x.ssize() >= mtop.natoms, "x should contain the whole system");
 728     GMX_ASSERT(!mtop.moleculeBlockIndices.empty(),
 729                "molblock indices are needed in constructVsitesGlobal");
 730
 731     for (size_t mb = 0; mb < mtop.molblock.size(); mb++)
 732     {
 733         const gmx_molblock_t& molb = mtop.molblock[mb];
 734         const gmx_moltype_t&  molt = mtop.moltype[molb.type];
 735         if (vsiteIlistNrCount(molt.ilist) > 0)
 736         {
 737             int atomOffset = mtop.moleculeBlockIndices[mb].globalAtomStart;
 738             for (int mol = 0; mol < molb.nmol; mol++)
 739             {
 740                 construct_vsites(nullptr, as_rvec_array(x.data()) + atomOffset, 0.0, nullptr,
 741                                  mtop.ffparams.iparams, molt.ilist, PbcType::No, TRUE, nullptr, nullptr);
 742                 atomOffset += molt.atoms.nr;
 743             }
 744         }
 745     }
 746 }
 747
 748 static void spread_vsite2FD(const t_iatom  ia[],
 749                             real           a,
 750                             const rvec     x[],
 751                             rvec           f[],
 752                             rvec           fshift[],
 753                             gmx_bool       VirCorr,
 754                             matrix         dxdf,
 755                             const t_pbc*   pbc,
 756                             const t_graph* g)
 757 {
 758     const int av = ia[1];
 759     const int ai = ia[2];
 760     const int aj = ia[3];
 761     rvec      fv;
 762     copy_rvec(f[av], fv);
 763
 764     rvec xij;
 765     int  sji = pbc_rvec_sub(pbc, x[aj], x[ai], xij);
 766     /* 6 flops */
 767
 768     const real invDistance = inverseNorm(xij);
 769     const real b           = a * invDistance;
 770     /* 4 + ?10? flops */
 771
 772     const real fproj = iprod(xij, fv) * invDistance * invDistance;
 773
 774     rvec fj;
 775     fj[XX] = b * (fv[XX] - fproj * xij[XX]);
 776     fj[YY] = b * (fv[YY] - fproj * xij[YY]);
 777     fj[ZZ] = b * (fv[ZZ] - fproj * xij[ZZ]);
 778     /* 9 */
 779
 780     /* b is already calculated in constr_vsite2FD
 781        storing b somewhere will save flops.     */
 782
 783     f[ai][XX] += fv[XX] - fj[XX];
 784     f[ai][YY] += fv[YY] - fj[YY];
 785     f[ai][ZZ] += fv[ZZ] - fj[ZZ];
 786     f[aj][XX] += fj[XX];
 787     f[aj][YY] += fj[YY];
 788     f[aj][ZZ] += fj[ZZ];
 789     /* 9 Flops */
 790
 791     if (fshift)
 792     {
 793         int svi;
 794         if (g)
 795         {
 796             ivec di;
 797             ivec_sub(SHIFT_IVEC(g, ia[1]), SHIFT_IVEC(g, ai), di);
 798             svi = IVEC2IS(di);
 799             ivec_sub(SHIFT_IVEC(g, aj), SHIFT_IVEC(g, ai), di);
 800             sji = IVEC2IS(di);
 801         }
 802         else if (pbc)
 803         {
 804             rvec xvi;
 805             svi = pbc_rvec_sub(pbc, x[av], x[ai], xvi);
 806         }
 807         else
 808         {
 809             svi = CENTRAL;
 810         }
 811
 812         if (svi != CENTRAL || sji != CENTRAL)
 813         {
 814             rvec_dec(fshift[svi], fv);
 815             fshift[CENTRAL][XX] += fv[XX] - fj[XX];
 816             fshift[CENTRAL][YY] += fv[YY] - fj[YY];
 817             fshift[CENTRAL][ZZ] += fv[ZZ] - fj[ZZ];
 818             fshift[sji][XX] += fj[XX];
 819             fshift[sji][YY] += fj[YY];
 820             fshift[sji][ZZ] += fj[ZZ];
 821         }
 822     }
 823
 824     if (VirCorr)
 825     {
 826         /* When VirCorr=TRUE, the virial for the current forces is not
 827          * calculated from the redistributed forces. This means that
 828          * the effect of non-linear virtual site constructions on the virial
 829          * needs to be added separately. This contribution can be calculated
 830          * in many ways, but the simplest and cheapest way is to use
 831          * the first constructing atom ai as a reference position in space:
 832          * subtract (xv-xi)*fv and add (xj-xi)*fj.
 833          */
 834         rvec xiv;
 835
 836         pbc_rvec_sub(pbc, x[av], x[ai], xiv);
 837
 838         for (int i = 0; i < DIM; i++)
 839         {
 840             for (int j = 0; j < DIM; j++)
 841             {
 842                 /* As xix is a linear combination of j and k, use that here */
 843                 dxdf[i][j] += -xiv[i] * fv[j] + xij[i] * fj[j];
 844             }
 845         }
 846     }
 847
 848     /* TOTAL: 38 flops */
 849 }
 850
 851 static void spread_vsite3(const t_iatom  ia[],
 852                           real           a,
 853                           real           b,
 854                           const rvec     x[],
 855                           rvec           f[],
 856                           rvec           fshift[],
 857                           const t_pbc*   pbc,
 858                           const t_graph* g)
 859 {
 860     rvec fi, fj, fk, dx;
 861     int  av, ai, aj, ak;
 862     ivec di;
 863     int  siv, sij, sik;
 864
 865     av = ia[1];
 866     ai = ia[2];
 867     aj = ia[3];
 868     ak = ia[4];
 869
 870     svmul(1 - a - b, f[av], fi);
 871     svmul(a, f[av], fj);
 872     svmul(b, f[av], fk);
 873     /* 11 flops */
 874
 875     rvec_inc(f[ai], fi);
 876     rvec_inc(f[aj], fj);
 877     rvec_inc(f[ak], fk);
 878     /* 9 Flops */
 879
 880     if (g)
 881     {
 882         ivec_sub(SHIFT_IVEC(g, ai), SHIFT_IVEC(g, ia[1]), di);
 883         siv = IVEC2IS(di);
 884         ivec_sub(SHIFT_IVEC(g, ai), SHIFT_IVEC(g, aj), di);
 885         sij = IVEC2IS(di);
 886         ivec_sub(SHIFT_IVEC(g, ai), SHIFT_IVEC(g, ak), di);
 887         sik = IVEC2IS(di);
 888     }
 889     else if (pbc)
 890     {
 891         siv = pbc_dx_aiuc(pbc, x[ai], x[av], dx);
 892         sij = pbc_dx_aiuc(pbc, x[ai], x[aj], dx);
 893         sik = pbc_dx_aiuc(pbc, x[ai], x[ak], dx);
 894     }
 895     else
 896     {
 897         siv = CENTRAL;
 898         sij = CENTRAL;
 899         sik = CENTRAL;
 900     }
 901
 902     if (fshift && (siv != CENTRAL || sij != CENTRAL || sik != CENTRAL))
 903     {
 904         rvec_inc(fshift[siv], f[av]);
 905         rvec_dec(fshift[CENTRAL], fi);
 906         rvec_dec(fshift[sij], fj);
 907         rvec_dec(fshift[sik], fk);
 908     }
 909
 910     /* TOTAL: 20 flops */
 911 }
 912
 913 static void spread_vsite3FD(const t_iatom  ia[],
 914                             real           a,
 915                             real           b,
 916                             const rvec     x[],
 917                             rvec           f[],
 918                             rvec           fshift[],
 919                             gmx_bool       VirCorr,
 920                             matrix         dxdf,
 921                             const t_pbc*   pbc,
 922                             const t_graph* g)
 923 {
 924     real    fproj, a1;
 925     rvec    xvi, xij, xjk, xix, fv, temp;
 926     t_iatom av, ai, aj, ak;
 927     int     svi, sji, skj;
 928     ivec    di;
 929
 930     av = ia[1];
 931     ai = ia[2];
 932     aj = ia[3];
 933     ak = ia[4];
 934     copy_rvec(f[av], fv);
 935
 936     sji = pbc_rvec_sub(pbc, x[aj], x[ai], xij);
 937     skj = pbc_rvec_sub(pbc, x[ak], x[aj], xjk);
 938     /* 6 flops */
 939
 940     /* xix goes from i to point x on the line jk */
 941     xix[XX] = xij[XX] + a * xjk[XX];
 942     xix[YY] = xij[YY] + a * xjk[YY];
 943     xix[ZZ] = xij[ZZ] + a * xjk[ZZ];
 944     /* 6 flops */
 945
 946     const real invDistance = inverseNorm(xix);
 947     const real c           = b * invDistance;
 948     /* 4 + ?10? flops */
 949
 950     fproj = iprod(xix, fv) * invDistance * invDistance; /* = (xix . f)/(xix . xix) */
 951
 952     temp[XX] = c * (fv[XX] - fproj * xix[XX]);
 953     temp[YY] = c * (fv[YY] - fproj * xix[YY]);
 954     temp[ZZ] = c * (fv[ZZ] - fproj * xix[ZZ]);
 955     /* 16 */
 956
 957     /* c is already calculated in constr_vsite3FD
 958        storing c somewhere will save 26 flops!     */
 959
 960     a1 = 1 - a;
 961     f[ai][XX] += fv[XX] - temp[XX];
 962     f[ai][YY] += fv[YY] - temp[YY];
 963     f[ai][ZZ] += fv[ZZ] - temp[ZZ];
 964     f[aj][XX] += a1 * temp[XX];
 965     f[aj][YY] += a1 * temp[YY];
 966     f[aj][ZZ] += a1 * temp[ZZ];
 967     f[ak][XX] += a * temp[XX];
 968     f[ak][YY] += a * temp[YY];
 969     f[ak][ZZ] += a * temp[ZZ];
 970     /* 19 Flops */
 971
 972     if (g)
 973     {
 974         ivec_sub(SHIFT_IVEC(g, ia[1]), SHIFT_IVEC(g, ai), di);
 975         svi = IVEC2IS(di);
 976         ivec_sub(SHIFT_IVEC(g, aj), SHIFT_IVEC(g, ai), di);
 977         sji = IVEC2IS(di);
 978         ivec_sub(SHIFT_IVEC(g, ak), SHIFT_IVEC(g, aj), di);
 979         skj = IVEC2IS(di);
 980     }
 981     else if (pbc)
 982     {
 983         svi = pbc_rvec_sub(pbc, x[av], x[ai], xvi);
 984     }
 985     else
 986     {
 987         svi = CENTRAL;
 988     }
 989
 990     if (fshift && (svi != CENTRAL || sji != CENTRAL || skj != CENTRAL))
 991     {
 992         rvec_dec(fshift[svi], fv);
 993         fshift[CENTRAL][XX] += fv[XX] - (1 + a) * temp[XX];
 994         fshift[CENTRAL][YY] += fv[YY] - (1 + a) * temp[YY];
 995         fshift[CENTRAL][ZZ] += fv[ZZ] - (1 + a) * temp[ZZ];
 996         fshift[sji][XX] += temp[XX];
 997         fshift[sji][YY] += temp[YY];
 998         fshift[sji][ZZ] += temp[ZZ];
 999         fshift[skj][XX] += a * temp[XX];
1000         fshift[skj][YY] += a * temp[YY];
1001         fshift[skj][ZZ] += a * temp[ZZ];
1002     }
1003
1004     if (VirCorr)
1005     {
1006         /* When VirCorr=TRUE, the virial for the current forces is not
1007          * calculated from the redistributed forces. This means that
1008          * the effect of non-linear virtual site constructions on the virial
1009          * needs to be added separately. This contribution can be calculated
1010          * in many ways, but the simplest and cheapest way is to use
1011          * the first constructing atom ai as a reference position in space:
1012          * subtract (xv-xi)*fv and add (xj-xi)*fj + (xk-xi)*fk.
1013          */
1014         rvec xiv;
1015
1016         pbc_rvec_sub(pbc, x[av], x[ai], xiv);
1017
1018         for (int i = 0; i < DIM; i++)
1019         {
1020             for (int j = 0; j < DIM; j++)
1021             {
1022                 /* As xix is a linear combination of j and k, use that here */
1023                 dxdf[i][j] += -xiv[i] * fv[j] + xix[i] * temp[j];
1024             }
1025         }
1026     }
1027
1028     /* TOTAL: 61 flops */
1029 }
1030
1031 static void spread_vsite3FAD(const t_iatom  ia[],
1032                              real           a,
1033                              real           b,
1034                              const rvec     x[],
1035                              rvec           f[],
1036                              rvec           fshift[],
1037                              gmx_bool       VirCorr,
1038                              matrix         dxdf,
1039                              const t_pbc*   pbc,
1040                              const t_graph* g)
1041 {
1042     rvec    xvi, xij, xjk, xperp, Fpij, Fppp, fv, f1, f2, f3;
1043     real    a1, b1, c1, c2, invdij, invdij2, invdp, fproj;
1044     t_iatom av, ai, aj, ak;
1045     int     svi, sji, skj, d;
1046     ivec    di;
1047
1048     av = ia[1];
1049     ai = ia[2];
1050     aj = ia[3];
1051     ak = ia[4];
1052     copy_rvec(f[ia[1]], fv);
1053
1054     sji = pbc_rvec_sub(pbc, x[aj], x[ai], xij);
1055     skj = pbc_rvec_sub(pbc, x[ak], x[aj], xjk);
1056     /* 6 flops */
1057
1058     invdij    = inverseNorm(xij);
1059     invdij2   = invdij * invdij;
1060     c1        = iprod(xij, xjk) * invdij2;
1061     xperp[XX] = xjk[XX] - c1 * xij[XX];
1062     xperp[YY] = xjk[YY] - c1 * xij[YY];
1063     xperp[ZZ] = xjk[ZZ] - c1 * xij[ZZ];
1064     /* xperp in plane ijk, perp. to ij */
1065     invdp = inverseNorm(xperp);
1066     a1    = a * invdij;
1067     b1    = b * invdp;
1068     /* 45 flops */
1069
1070     /* a1, b1 and c1 are already calculated in constr_vsite3FAD
1071        storing them somewhere will save 45 flops!     */
1072
1073     fproj = iprod(xij, fv) * invdij2;
1074     svmul(fproj, xij, Fpij);                              /* proj. f on xij */
1075     svmul(iprod(xperp, fv) * invdp * invdp, xperp, Fppp); /* proj. f on xperp */
1076     svmul(b1 * fproj, xperp, f3);
1077     /* 23 flops */
1078
1079     rvec_sub(fv, Fpij, f1); /* f1 = f - Fpij */
1080     rvec_sub(f1, Fppp, f2); /* f2 = f - Fpij - Fppp */
1081     for (d = 0; (d < DIM); d++)
1082     {
1083         f1[d] *= a1;
1084         f2[d] *= b1;
1085     }
1086     /* 12 flops */
1087
1088     c2 = 1 + c1;
1089     f[ai][XX] += fv[XX] - f1[XX] + c1 * f2[XX] + f3[XX];
1090     f[ai][YY] += fv[YY] - f1[YY] + c1 * f2[YY] + f3[YY];
1091     f[ai][ZZ] += fv[ZZ] - f1[ZZ] + c1 * f2[ZZ] + f3[ZZ];
1092     f[aj][XX] += f1[XX] - c2 * f2[XX] - f3[XX];
1093     f[aj][YY] += f1[YY] - c2 * f2[YY] - f3[YY];
1094     f[aj][ZZ] += f1[ZZ] - c2 * f2[ZZ] - f3[ZZ];
1095     f[ak][XX] += f2[XX];
1096     f[ak][YY] += f2[YY];
1097     f[ak][ZZ] += f2[ZZ];
1098     /* 30 Flops */
1099
1100     if (g)
1101     {
1102         ivec_sub(SHIFT_IVEC(g, ia[1]), SHIFT_IVEC(g, ai), di);
1103         svi = IVEC2IS(di);
1104         ivec_sub(SHIFT_IVEC(g, aj), SHIFT_IVEC(g, ai), di);
1105         sji = IVEC2IS(di);
1106         ivec_sub(SHIFT_IVEC(g, ak), SHIFT_IVEC(g, aj), di);
1107         skj = IVEC2IS(di);
1108     }
1109     else if (pbc)
1110     {
1111         svi = pbc_rvec_sub(pbc, x[av], x[ai], xvi);
1112     }
1113     else
1114     {
1115         svi = CENTRAL;
1116     }
1117
1118     if (fshift && (svi != CENTRAL || sji != CENTRAL || skj != CENTRAL))
1119     {
1120         rvec_dec(fshift[svi], fv);
1121         fshift[CENTRAL][XX] += fv[XX] - f1[XX] - (1 - c1) * f2[XX] + f3[XX];
1122         fshift[CENTRAL][YY] += fv[YY] - f1[YY] - (1 - c1) * f2[YY] + f3[YY];
1123         fshift[CENTRAL][ZZ] += fv[ZZ] - f1[ZZ] - (1 - c1) * f2[ZZ] + f3[ZZ];
1124         fshift[sji][XX] += f1[XX] - c1 * f2[XX] - f3[XX];
1125         fshift[sji][YY] += f1[YY] - c1 * f2[YY] - f3[YY];
1126         fshift[sji][ZZ] += f1[ZZ] - c1 * f2[ZZ] - f3[ZZ];
1127         fshift[skj][XX] += f2[XX];
1128         fshift[skj][YY] += f2[YY];
1129         fshift[skj][ZZ] += f2[ZZ];
1130     }
1131
1132     if (VirCorr)
1133     {
1134         rvec xiv;
1135         int  i, j;
1136
1137         pbc_rvec_sub(pbc, x[av], x[ai], xiv);
1138
1139         for (i = 0; i < DIM; i++)
1140         {
1141             for (j = 0; j < DIM; j++)
1142             {
1143                 /* Note that xik=xij+xjk, so we have to add xij*f2 */
1144                 dxdf[i][j] += -xiv[i] * fv[j] + xij[i] * (f1[j] + (1 - c2) * f2[j] - f3[j])
1145                               + xjk[i] * f2[j];
1146             }
1147         }
1148     }
1149
1150     /* TOTAL: 113 flops */
1151 }
1152
1153 static void spread_vsite3OUT(const t_iatom  ia[],
1154                              real           a,
1155                              real           b,
1156                              real           c,
1157                              const rvec     x[],
1158                              rvec           f[],
1159                              rvec           fshift[],
1160                              gmx_bool       VirCorr,
1161                              matrix         dxdf,
1162                              const t_pbc*   pbc,
1163                              const t_graph* g)
1164 {
1165     rvec xvi, xij, xik, fv, fj, fk;
1166     real cfx, cfy, cfz;
1167     int  av, ai, aj, ak;
1168     ivec di;
1169     int  svi, sji, ski;
1170
1171     av = ia[1];
1172     ai = ia[2];
1173     aj = ia[3];
1174     ak = ia[4];
1175
1176     sji = pbc_rvec_sub(pbc, x[aj], x[ai], xij);
1177     ski = pbc_rvec_sub(pbc, x[ak], x[ai], xik);
1178     /* 6 Flops */
1179
1180     copy_rvec(f[av], fv);
1181
1182     cfx = c * fv[XX];
1183     cfy = c * fv[YY];
1184     cfz = c * fv[ZZ];
1185     /* 3 Flops */
1186
1187     fj[XX] = a * fv[XX] - xik[ZZ] * cfy + xik[YY] * cfz;
1188     fj[YY] = xik[ZZ] * cfx + a * fv[YY] - xik[XX] * cfz;
1189     fj[ZZ] = -xik[YY] * cfx + xik[XX] * cfy + a * fv[ZZ];
1190
1191     fk[XX] = b * fv[XX] + xij[ZZ] * cfy - xij[YY] * cfz;
1192     fk[YY] = -xij[ZZ] * cfx + b * fv[YY] + xij[XX] * cfz;
1193     fk[ZZ] = xij[YY] * cfx - xij[XX] * cfy + b * fv[ZZ];
1194     /* 30 Flops */
1195
1196     f[ai][XX] += fv[XX] - fj[XX] - fk[XX];
1197     f[ai][YY] += fv[YY] - fj[YY] - fk[YY];
1198     f[ai][ZZ] += fv[ZZ] - fj[ZZ] - fk[ZZ];
1199     rvec_inc(f[aj], fj);
1200     rvec_inc(f[ak], fk);
1201     /* 15 Flops */
1202
1203     if (g)
1204     {
1205         ivec_sub(SHIFT_IVEC(g, ia[1]), SHIFT_IVEC(g, ai), di);
1206         svi = IVEC2IS(di);
1207         ivec_sub(SHIFT_IVEC(g, aj), SHIFT_IVEC(g, ai), di);
1208         sji = IVEC2IS(di);
1209         ivec_sub(SHIFT_IVEC(g, ak), SHIFT_IVEC(g, ai), di);
1210         ski = IVEC2IS(di);
1211     }
1212     else if (pbc)
1213     {
1214         svi = pbc_rvec_sub(pbc, x[av], x[ai], xvi);
1215     }
1216     else
1217     {
1218         svi = CENTRAL;
1219     }
1220
1221     if (fshift && (svi != CENTRAL || sji != CENTRAL || ski != CENTRAL))
1222     {
1223         rvec_dec(fshift[svi], fv);
1224         fshift[CENTRAL][XX] += fv[XX] - fj[XX] - fk[XX];
1225         fshift[CENTRAL][YY] += fv[YY] - fj[YY] - fk[YY];
1226         fshift[CENTRAL][ZZ] += fv[ZZ] - fj[ZZ] - fk[ZZ];
1227         rvec_inc(fshift[sji], fj);
1228         rvec_inc(fshift[ski], fk);
1229     }
1230
1231     if (VirCorr)
1232     {
1233         rvec xiv;
1234
1235         pbc_rvec_sub(pbc, x[av], x[ai], xiv);
1236
1237         for (int i = 0; i < DIM; i++)
1238         {
1239             for (int j = 0; j < DIM; j++)
1240             {
1241                 dxdf[i][j] += -xiv[i] * fv[j] + xij[i] * fj[j] + xik[i] * fk[j];
1242             }
1243         }
1244     }
1245
1246     /* TOTAL: 54 flops */
1247 }
1248
1249 static void spread_vsite4FD(const t_iatom  ia[],
1250                             real           a,
1251                             real           b,
1252                             real           c,
1253                             const rvec     x[],
1254                             rvec           f[],
1255                             rvec           fshift[],
1256                             gmx_bool       VirCorr,
1257                             matrix         dxdf,
1258                             const t_pbc*   pbc,
1259                             const t_graph* g)
1260 {
1261     real fproj, a1;
1262     rvec xvi, xij, xjk, xjl, xix, fv, temp;
1263     int  av, ai, aj, ak, al;
1264     ivec di;
1265     int  svi, sji, skj, slj, m;
1266
1267     av = ia[1];
1268     ai = ia[2];
1269     aj = ia[3];
1270     ak = ia[4];
1271     al = ia[5];
1272
1273     sji = pbc_rvec_sub(pbc, x[aj], x[ai], xij);
1274     skj = pbc_rvec_sub(pbc, x[ak], x[aj], xjk);
1275     slj = pbc_rvec_sub(pbc, x[al], x[aj], xjl);
1276     /* 9 flops */
1277
1278     /* xix goes from i to point x on the plane jkl */
1279     for (m = 0; m < DIM; m++)
1280     {
1281         xix[m] = xij[m] + a * xjk[m] + b * xjl[m];
1282     }
1283     /* 12 flops */
1284
1285     const real invDistance = inverseNorm(xix);
1286     const real d           = c * invDistance;
1287     /* 4 + ?10? flops */
1288
1289     copy_rvec(f[av], fv);
1290
1291     fproj = iprod(xix, fv) * invDistance * invDistance; /* = (xix . f)/(xix . xix) */
1292
1293     for (m = 0; m < DIM; m++)
1294     {
1295         temp[m] = d * (fv[m] - fproj * xix[m]);
1296     }
1297     /* 16 */
1298
1299     /* c is already calculated in constr_vsite3FD
1300        storing c somewhere will save 35 flops!     */
1301
1302     a1 = 1 - a - b;
1303     for (m = 0; m < DIM; m++)
1304     {
1305         f[ai][m] += fv[m] - temp[m];
1306         f[aj][m] += a1 * temp[m];
1307         f[ak][m] += a * temp[m];
1308         f[al][m] += b * temp[m];
1309     }
1310     /* 26 Flops */
1311
1312     if (g)
1313     {
1314         ivec_sub(SHIFT_IVEC(g, ia[1]), SHIFT_IVEC(g, ai), di);
1315         svi = IVEC2IS(di);
1316         ivec_sub(SHIFT_IVEC(g, aj), SHIFT_IVEC(g, ai), di);
1317         sji = IVEC2IS(di);
1318         ivec_sub(SHIFT_IVEC(g, ak), SHIFT_IVEC(g, aj), di);
1319         skj = IVEC2IS(di);
1320         ivec_sub(SHIFT_IVEC(g, al), SHIFT_IVEC(g, aj), di);
1321         slj = IVEC2IS(di);
1322     }
1323     else if (pbc)
1324     {
1325         svi = pbc_rvec_sub(pbc, x[av], x[ai], xvi);
1326     }
1327     else
1328     {
1329         svi = CENTRAL;
1330     }
1331
1332     if (fshift && (svi != CENTRAL || sji != CENTRAL || skj != CENTRAL || slj != CENTRAL))
1333     {
1334         rvec_dec(fshift[svi], fv);
1335         for (m = 0; m < DIM; m++)
1336         {
1337             fshift[CENTRAL][m] += fv[m] - (1 + a + b) * temp[m];
1338             fshift[sji][m] += temp[m];
1339             fshift[skj][m] += a * temp[m];
1340             fshift[slj][m] += b * temp[m];
1341         }
1342     }
1343
1344     if (VirCorr)
1345     {
1346         rvec xiv;
1347         int  i, j;
1348
1349         pbc_rvec_sub(pbc, x[av], x[ai], xiv);
1350
1351         for (i = 0; i < DIM; i++)
1352         {
1353             for (j = 0; j < DIM; j++)
1354             {
1355                 dxdf[i][j] += -xiv[i] * fv[j] + xix[i] * temp[j];
1356             }
1357         }
1358     }
1359
1360     /* TOTAL: 77 flops */
1361 }
1362
1363
1364 static void spread_vsite4FDN(const t_iatom  ia[],
1365                              real           a,
1366                              real           b,
1367                              real           c,
1368                              const rvec     x[],
1369                              rvec           f[],
1370                              rvec           fshift[],
1371                              gmx_bool       VirCorr,
1372                              matrix         dxdf,
1373                              const t_pbc*   pbc,
1374                              const t_graph* g)
1375 {
1376     rvec xvi, xij, xik, xil, ra, rb, rja, rjb, rab, rm, rt;
1377     rvec fv, fj, fk, fl;
1378     real invrm, denom;
1379     real cfx, cfy, cfz;
1380     ivec di;
1381     int  av, ai, aj, ak, al;
1382     int  svi, sij, sik, sil;
1383
1384     /* DEBUG: check atom indices */
1385     av = ia[1];
1386     ai = ia[2];
1387     aj = ia[3];
1388     ak = ia[4];
1389     al = ia[5];
1390
1391     copy_rvec(f[av], fv);
1392
1393     sij = pbc_rvec_sub(pbc, x[aj], x[ai], xij);
1394     sik = pbc_rvec_sub(pbc, x[ak], x[ai], xik);
1395     sil = pbc_rvec_sub(pbc, x[al], x[ai], xil);
1396     /* 9 flops */
1397
1398     ra[XX] = a * xik[XX];
1399     ra[YY] = a * xik[YY];
1400     ra[ZZ] = a * xik[ZZ];
1401
1402     rb[XX] = b * xil[XX];
1403     rb[YY] = b * xil[YY];
1404     rb[ZZ] = b * xil[ZZ];
1405
1406     /* 6 flops */
1407
1408     rvec_sub(ra, xij, rja);
1409     rvec_sub(rb, xij, rjb);
1410     rvec_sub(rb, ra, rab);
1411     /* 9 flops */
1412
1413     cprod(rja, rjb, rm);
1414     /* 9 flops */
1415
1416     invrm = inverseNorm(rm);
1417     denom = invrm * invrm;
1418     /* 5+5+2 flops */
1419
1420     cfx = c * invrm * fv[XX];
1421     cfy = c * invrm * fv[YY];
1422     cfz = c * invrm * fv[ZZ];
1423     /* 6 Flops */
1424
1425     cprod(rm, rab, rt);
1426     /* 9 flops */
1427
1428     rt[XX] *= denom;
1429     rt[YY] *= denom;
1430     rt[ZZ] *= denom;
1431     /* 3flops */
1432
1433     fj[XX] = (-rm[XX] * rt[XX]) * cfx + (rab[ZZ] - rm[YY] * rt[XX]) * cfy
1434              + (-rab[YY] - rm[ZZ] * rt[XX]) * cfz;
1435     fj[YY] = (-rab[ZZ] - rm[XX] * rt[YY]) * cfx + (-rm[YY] * rt[YY]) * cfy
1436              + (rab[XX] - rm[ZZ] * rt[YY]) * cfz;
1437     fj[ZZ] = (rab[YY] - rm[XX] * rt[ZZ]) * cfx + (-rab[XX] - rm[YY] * rt[ZZ]) * cfy
1438              + (-rm[ZZ] * rt[ZZ]) * cfz;
1439     /* 30 flops */
1440
1441     cprod(rjb, rm, rt);
1442     /* 9 flops */
1443
1444     rt[XX] *= denom * a;
1445     rt[YY] *= denom * a;
1446     rt[ZZ] *= denom * a;
1447     /* 3flops */
1448
1449     fk[XX] = (-rm[XX] * rt[XX]) * cfx + (-a * rjb[ZZ] - rm[YY] * rt[XX]) * cfy
1450              + (a * rjb[YY] - rm[ZZ] * rt[XX]) * cfz;
1451     fk[YY] = (a * rjb[ZZ] - rm[XX] * rt[YY]) * cfx + (-rm[YY] * rt[YY]) * cfy
1452              + (-a * rjb[XX] - rm[ZZ] * rt[YY]) * cfz;
1453     fk[ZZ] = (-a * rjb[YY] - rm[XX] * rt[ZZ]) * cfx + (a * rjb[XX] - rm[YY] * rt[ZZ]) * cfy
1454              + (-rm[ZZ] * rt[ZZ]) * cfz;
1455     /* 36 flops */
1456
1457     cprod(rm, rja, rt);
1458     /* 9 flops */
1459
1460     rt[XX] *= denom * b;
1461     rt[YY] *= denom * b;
1462     rt[ZZ] *= denom * b;
1463     /* 3flops */
1464
1465     fl[XX] = (-rm[XX] * rt[XX]) * cfx + (b * rja[ZZ] - rm[YY] * rt[XX]) * cfy
1466              + (-b * rja[YY] - rm[ZZ] * rt[XX]) * cfz;
1467     fl[YY] = (-b * rja[ZZ] - rm[XX] * rt[YY]) * cfx + (-rm[YY] * rt[YY]) * cfy
1468              + (b * rja[XX] - rm[ZZ] * rt[YY]) * cfz;
1469     fl[ZZ] = (b * rja[YY] - rm[XX] * rt[ZZ]) * cfx + (-b * rja[XX] - rm[YY] * rt[ZZ]) * cfy
1470              + (-rm[ZZ] * rt[ZZ]) * cfz;
1471     /* 36 flops */
1472
1473     f[ai][XX] += fv[XX] - fj[XX] - fk[XX] - fl[XX];
1474     f[ai][YY] += fv[YY] - fj[YY] - fk[YY] - fl[YY];
1475     f[ai][ZZ] += fv[ZZ] - fj[ZZ] - fk[ZZ] - fl[ZZ];
1476     rvec_inc(f[aj], fj);
1477     rvec_inc(f[ak], fk);
1478     rvec_inc(f[al], fl);
1479     /* 21 flops */
1480
1481     if (g)
1482     {
1483         ivec_sub(SHIFT_IVEC(g, av), SHIFT_IVEC(g, ai), di);
1484         svi = IVEC2IS(di);
1485         ivec_sub(SHIFT_IVEC(g, aj), SHIFT_IVEC(g, ai), di);
1486         sij = IVEC2IS(di);
1487         ivec_sub(SHIFT_IVEC(g, ak), SHIFT_IVEC(g, ai), di);
1488         sik = IVEC2IS(di);
1489         ivec_sub(SHIFT_IVEC(g, al), SHIFT_IVEC(g, ai), di);
1490         sil = IVEC2IS(di);
1491     }
1492     else if (pbc)
1493     {
1494         svi = pbc_rvec_sub(pbc, x[av], x[ai], xvi);
1495     }
1496     else
1497     {
1498         svi = CENTRAL;
1499     }
1500
1501     if (fshift && (svi != CENTRAL || sij != CENTRAL || sik != CENTRAL || sil != CENTRAL))
1502     {
1503         rvec_dec(fshift[svi], fv);
1504         fshift[CENTRAL][XX] += fv[XX] - fj[XX] - fk[XX] - fl[XX];
1505         fshift[CENTRAL][YY] += fv[YY] - fj[YY] - fk[YY] - fl[YY];
1506         fshift[CENTRAL][ZZ] += fv[ZZ] - fj[ZZ] - fk[ZZ] - fl[ZZ];
1507         rvec_inc(fshift[sij], fj);
1508         rvec_inc(fshift[sik], fk);
1509         rvec_inc(fshift[sil], fl);
1510     }
1511
1512     if (VirCorr)
1513     {
1514         rvec xiv;
1515         int  i, j;
1516
1517         pbc_rvec_sub(pbc, x[av], x[ai], xiv);
1518
1519         for (i = 0; i < DIM; i++)
1520         {
1521             for (j = 0; j < DIM; j++)
1522             {
1523                 dxdf[i][j] += -xiv[i] * fv[j] + xij[i] * fj[j] + xik[i] * fk[j] + xil[i] * fl[j];
1524             }
1525         }
1526     }
1527
1528     /* Total: 207 flops (Yuck!) */
1529 }
1530
1531
1532 static int spread_vsiten(const t_iatom             ia[],
1533                          ArrayRef<const t_iparams> ip,
1534                          const rvec                x[],
1535                          rvec                      f[],
1536                          rvec                      fshift[],
1537                          const t_pbc*              pbc,
1538                          const t_graph*            g)
1539 {
1540     rvec xv, dx, fi;
1541     int  n3, av, i, ai;
1542     real a;
1543     ivec di;
1544     int  siv;
1545
1546     n3 = 3 * ip[ia[0]].vsiten.n;
1547     av = ia[1];
1548     copy_rvec(x[av], xv);
1549
1550     for (i = 0; i < n3; i += 3)
1551     {
1552         ai = ia[i + 2];
1553         if (g)
1554         {
1555             ivec_sub(SHIFT_IVEC(g, ai), SHIFT_IVEC(g, av), di);
1556             siv = IVEC2IS(di);
1557         }
1558         else if (pbc)
1559         {
1560             siv = pbc_dx_aiuc(pbc, x[ai], xv, dx);
1561         }
1562         else
1563         {
1564             siv = CENTRAL;
1565         }
1566         a = ip[ia[i]].vsiten.a;
1567         svmul(a, f[av], fi);
1568         rvec_inc(f[ai], fi);
1569         if (fshift && siv != CENTRAL)
1570         {
1571             rvec_inc(fshift[siv], fi);
1572             rvec_dec(fshift[CENTRAL], fi);
1573         }
1574         /* 6 Flops */
1575     }
1576
1577     return n3;
1578 }
1579
1580
1581 static int vsite_count(ArrayRef<const InteractionList> ilist, int ftype)
1582 {
1583     if (ftype == F_VSITEN)
1584     {
1585         return ilist[ftype].size() / 3;
1586     }
1587     else
1588     {
1589         return ilist[ftype].size() / (1 + interaction_function[ftype].nratoms);
1590     }
1591 }
1592
1593 static void spread_vsite_f_thread(const rvec                      x[],
1594                                   rvec                            f[],
1595                                   rvec*                           fshift,
1596                                   gmx_bool                        VirCorr,
1597                                   matrix                          dxdf,
1598                                   ArrayRef<const t_iparams>       ip,
1599                                   ArrayRef<const InteractionList> ilist,
1600                                   const t_graph*                  g,
1601                                   const t_pbc*                    pbc_null)
1602 {
1603     const PbcMode pbcMode = getPbcMode(pbc_null);
1604     /* We need another pbc pointer, as with charge groups we switch per vsite */
1605     const t_pbc*             pbc_null2 = pbc_null;
1606     gmx::ArrayRef<const int> vsite_pbc;
1607
1608     /* this loop goes backwards to be able to build *
1609      * higher type vsites from lower types         */
1610     for (int ftype = c_ftypeVsiteEnd - 1; ftype >= c_ftypeVsiteStart; ftype--)
1611     {
1612         if (ilist[ftype].empty())
1613         {
1614             continue;
1615         }
1616
1617         { // TODO remove me
1618             int nra = interaction_function[ftype].nratoms;
1619             int inc = 1 + nra;
1620             int nr  = ilist[ftype].size();
1621
1622             const t_iatom* ia = ilist[ftype].iatoms.data();
1623
1624             if (pbcMode == PbcMode::all)
1625             {
1626                 pbc_null2 = pbc_null;
1627             }
1628
1629             for (int i = 0; i < nr;)
1630             {
1631                 int tp = ia[0];
1632
1633                 /* Constants for constructing */
1634                 real a1, b1, c1;
1635                 a1 = ip[tp].vsite.a;
1636                 /* Construct the vsite depending on type */
1637                 switch (ftype)
1638                 {
1639                     case F_VSITE2: spread_vsite2(ia, a1, x, f, fshift, pbc_null2, g); break;
1640                     case F_VSITE2FD:
1641                         spread_vsite2FD(ia, a1, x, f, fshift, VirCorr, dxdf, pbc_null2, g);
1642                         break;
1643                     case F_VSITE3:
1644                         b1 = ip[tp].vsite.b;
1645                         spread_vsite3(ia, a1, b1, x, f, fshift, pbc_null2, g);
1646                         break;
1647                     case F_VSITE3FD:
1648                         b1 = ip[tp].vsite.b;
1649                         spread_vsite3FD(ia, a1, b1, x, f, fshift, VirCorr, dxdf, pbc_null2, g);
1650                         break;
1651                     case F_VSITE3FAD:
1652                         b1 = ip[tp].vsite.b;
1653                         spread_vsite3FAD(ia, a1, b1, x, f, fshift, VirCorr, dxdf, pbc_null2, g);
1654                         break;
1655                     case F_VSITE3OUT:
1656                         b1 = ip[tp].vsite.b;
1657                         c1 = ip[tp].vsite.c;
1658                         spread_vsite3OUT(ia, a1, b1, c1, x, f, fshift, VirCorr, dxdf, pbc_null2, g);
1659                         break;
1660                     case F_VSITE4FD:
1661                         b1 = ip[tp].vsite.b;
1662                         c1 = ip[tp].vsite.c;
1663                         spread_vsite4FD(ia, a1, b1, c1, x, f, fshift, VirCorr, dxdf, pbc_null2, g);
1664                         break;
1665                     case F_VSITE4FDN:
1666                         b1 = ip[tp].vsite.b;
1667                         c1 = ip[tp].vsite.c;
1668                         spread_vsite4FDN(ia, a1, b1, c1, x, f, fshift, VirCorr, dxdf, pbc_null2, g);
1669                         break;
1670                     case F_VSITEN: inc = spread_vsiten(ia, ip, x, f, fshift, pbc_null2, g); break;
1671                     default:
1672                         gmx_fatal(FARGS, "No such vsite type %d in %s, line %d", ftype, __FILE__, __LINE__);
1673                 }
1674                 clear_rvec(f[ia[1]]);
1675
1676                 /* Increment loop variables */
1677                 i += inc;
1678                 ia += inc;
1679             }
1680         }
1681     }
1682 }
1683
1684 /*! \brief Clears the task force buffer elements that are written by task idTask */
1685 static void clearTaskForceBufferUsedElements(InterdependentTask* idTask)
1686 {
1687     int ntask = idTask->spreadTask.size();
1688     for (int ti = 0; ti < ntask; ti++)
1689     {
1690         const AtomIndex* atomList = &idTask->atomIndex[idTask->spreadTask[ti]];
1691         int              natom    = atomList->atom.size();
1692         RVec*            force    = idTask->force.data();
1693         for (int i = 0; i < natom; i++)
1694         {
1695             clear_rvec(force[atomList->atom[i]]);
1696         }
1697     }
1698 }
1699
1700 void spread_vsite_f(const gmx_vsite_t* vsite,
1701                     const rvec* gmx_restrict x,
1702                     rvec* gmx_restrict f,
1703                     rvec* gmx_restrict            fshift,
1704                     gmx_bool                      VirCorr,
1705                     matrix                        vir,
1706                     t_nrnb*                       nrnb,
1707                     const InteractionDefinitions& idef,
1708                     PbcType                       pbcType,
1709                     gmx_bool                      bMolPBC,
1710                     const t_graph*                g,
1711                     const matrix                  box,
1712                     const t_commrec*              cr,
1713                     gmx_wallcycle*                wcycle)
1714 {
1715     wallcycle_start(wcycle, ewcVSITESPREAD);
1716     const bool useDomdec = vsite->useDomdec;
1717     GMX_ASSERT(!useDomdec || (cr != nullptr && DOMAINDECOMP(cr)),
1718                "When vsites are set up with domain decomposition, we need a valid commrec");
1719
1720     t_pbc pbc, *pbc_null;
1721
1722     /* We only need to do pbc when we have inter-cg vsites */
1723     if ((useDomdec || bMolPBC) && vsite->numInterUpdategroupVsites)
1724     {
1725         /* This is wasting some CPU time as we now do this multiple times
1726          * per MD step.
1727          */
1728         pbc_null = set_pbc_dd(&pbc, pbcType, useDomdec ? cr->dd->numCells : nullptr, FALSE, box);
1729     }
1730     else
1731     {
1732         pbc_null = nullptr;
1733     }
1734
1735     if (useDomdec)
1736     {
1737         dd_clear_f_vsites(cr->dd, f);
1738     }
1739
1740     if (vsite->nthreads == 1)
1741     {
1742         matrix dxdf;
1743         if (VirCorr)
1744         {
1745             clear_mat(dxdf);
1746         }
1747         spread_vsite_f_thread(x, f, fshift, VirCorr, dxdf, idef.iparams, idef.il, g, pbc_null);
1748
1749         if (VirCorr)
1750         {
1751             for (int i = 0; i < DIM; i++)
1752             {
1753                 for (int j = 0; j < DIM; j++)
1754                 {
1755                     vir[i][j] += -0.5 * dxdf[i][j];
1756                 }
1757             }
1758         }
1759     }
1760     else
1761     {
1762         /* First spread the vsites that might depend on non-local vsites */
1763         if (VirCorr)
1764         {
1765             clear_mat(vsite->tData[vsite->nthreads]->dxdf);
1766         }
1767         spread_vsite_f_thread(x, f, fshift, VirCorr, vsite->tData[vsite->nthreads]->dxdf,
1768                               idef.iparams, vsite->tData[vsite->nthreads]->ilist, g, pbc_null);
1769
1770 #pragma omp parallel num_threads(vsite->nthreads)
1771         {
1772             try
1773             {
1774                 int          thread = gmx_omp_get_thread_num();
1775                 VsiteThread& tData  = *vsite->tData[thread];
1776
1777                 rvec* fshift_t;
1778                 if (thread == 0 || fshift == nullptr)
1779                 {
1780                     fshift_t = fshift;
1781                 }
1782                 else
1783                 {
1784                     fshift_t = tData.fshift;
1785
1786                     for (int i = 0; i < SHIFTS; i++)
1787                     {
1788                         clear_rvec(fshift_t[i]);
1789                     }
1790                 }
1791                 if (VirCorr)
1792                 {
1793                     clear_mat(tData.dxdf);
1794                 }
1795
1796                 if (tData.useInterdependentTask)
1797                 {
1798                     /* Spread the vsites that spread outside our local range.
1799                      * This is done using a thread-local force buffer force.
1800                      * First we need to copy the input vsite forces to force.
1801                      */
1802                     InterdependentTask* idTask = &tData.idTask;
1803
1804                     /* Clear the buffer elements set by our task during
1805                      * the last call to spread_vsite_f.
1806                      */
1807                     clearTaskForceBufferUsedElements(idTask);
1808
1809                     int nvsite = idTask->vsite.size();
1810                     for (int i = 0; i < nvsite; i++)
1811                     {
1812                         copy_rvec(f[idTask->vsite[i]], idTask->force[idTask->vsite[i]]);
1813                     }
1814                     spread_vsite_f_thread(x, as_rvec_array(idTask->force.data()), fshift_t, VirCorr,
1815                                           tData.dxdf, idef.iparams, tData.idTask.ilist, g, pbc_null);
1816
1817                     /* We need a barrier before reducing forces below
1818                      * that have been produced by a different thread above.
1819                      */
1820 #pragma omp barrier
1821
1822                     /* Loop over all thread task and reduce forces they
1823                      * produced on atoms that fall in our range.
1824                      * Note that atomic reduction would be a simpler solution,
1825                      * but that might not have good support on all platforms.
1826                      */
1827                     int ntask = idTask->reduceTask.size();
1828                     for (int ti = 0; ti < ntask; ti++)
1829                     {
1830                         const InterdependentTask* idt_foreign =
1831                                 &vsite->tData[idTask->reduceTask[ti]]->idTask;
1832                         const AtomIndex* atomList  = &idt_foreign->atomIndex[thread];
1833                         const RVec*      f_foreign = idt_foreign->force.data();
1834
1835                         int natom = atomList->atom.size();
1836                         for (int i = 0; i < natom; i++)
1837                         {
1838                             int ind = atomList->atom[i];
1839                             rvec_inc(f[ind], f_foreign[ind]);
1840                             /* Clearing of f_foreign is done at the next step */
1841                         }
1842                     }
1843                     /* Clear the vsite forces, both in f and force */
1844                     for (int i = 0; i < nvsite; i++)
1845                     {
1846                         int ind = tData.idTask.vsite[i];
1847                         clear_rvec(f[ind]);
1848                         clear_rvec(tData.idTask.force[ind]);
1849                     }
1850                 }
1851
1852                 /* Spread the vsites that spread locally only */
1853                 spread_vsite_f_thread(x, f, fshift_t, VirCorr, tData.dxdf, idef.iparams,
1854                                       tData.ilist, g, pbc_null);
1855             }
1856             GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
1857         }
1858
1859         if (fshift != nullptr)
1860         {
1861             for (int th = 1; th < vsite->nthreads; th++)
1862             {
1863                 for (int i = 0; i < SHIFTS; i++)
1864                 {
1865                     rvec_inc(fshift[i], vsite->tData[th]->fshift[i]);
1866                 }
1867             }
1868         }
1869
1870         if (VirCorr)
1871         {
1872             for (int th = 0; th < vsite->nthreads + 1; th++)
1873             {
1874                 /* MSVC doesn't like matrix references, so we use a pointer */
1875                 const matrix* dxdf = &vsite->tData[th]->dxdf;
1876
1877                 for (int i = 0; i < DIM; i++)
1878                 {
1879                     for (int j = 0; j < DIM; j++)
1880                     {
1881                         vir[i][j] += -0.5 * (*dxdf)[i][j];
1882                     }
1883                 }
1884             }
1885         }
1886     }
1887
1888     if (useDomdec)
1889     {
1890         dd_move_f_vsites(cr->dd, f, fshift);
1891     }
1892
1893     inc_nrnb(nrnb, eNR_VSITE2, vsite_count(idef.il, F_VSITE2));
1894     inc_nrnb(nrnb, eNR_VSITE2FD, vsite_count(idef.il, F_VSITE2FD));
1895     inc_nrnb(nrnb, eNR_VSITE3, vsite_count(idef.il, F_VSITE3));
1896     inc_nrnb(nrnb, eNR_VSITE3FD, vsite_count(idef.il, F_VSITE3FD));
1897     inc_nrnb(nrnb, eNR_VSITE3FAD, vsite_count(idef.il, F_VSITE3FAD));
1898     inc_nrnb(nrnb, eNR_VSITE3OUT, vsite_count(idef.il, F_VSITE3OUT));
1899     inc_nrnb(nrnb, eNR_VSITE4FD, vsite_count(idef.il, F_VSITE4FD));
1900     inc_nrnb(nrnb, eNR_VSITE4FDN, vsite_count(idef.il, F_VSITE4FDN));
1901     inc_nrnb(nrnb, eNR_VSITEN, vsite_count(idef.il, F_VSITEN));
1902
1903     wallcycle_stop(wcycle, ewcVSITESPREAD);
1904 }
1905
1906 /*! \brief Returns the an array with group indices for each atom
1907  *
1908  * \param[in] grouping  The paritioning of the atom range into atom groups
1909  */
1910 static std::vector<int> makeAtomToGroupMapping(const gmx::RangePartitioning& grouping)
1911 {
1912     std::vector<int> atomToGroup(grouping.fullRange().end(), 0);
1913
1914     for (int group = 0; group < grouping.numBlocks(); group++)
1915     {
1916         auto block = grouping.block(group);
1917         std::fill(atomToGroup.begin() + block.begin(), atomToGroup.begin() + block.end(), group);
1918     }
1919
1920     return atomToGroup;
1921 }
1922
1923 int countNonlinearVsites(const gmx_mtop_t& mtop)
1924 {
1925     int numNonlinearVsites = 0;
1926     for (const gmx_molblock_t& molb : mtop.molblock)
1927     {
1928         const gmx_moltype_t& molt = mtop.moltype[molb.type];
1929
1930         for (const auto& ilist : extractILists(molt.ilist, IF_VSITE))
1931         {
1932             if (ilist.functionType != F_VSITE2 && ilist.functionType != F_VSITE3
1933                 && ilist.functionType != F_VSITEN)
1934             {
1935                 numNonlinearVsites += molb.nmol * ilist.iatoms.size() / (1 + NRAL(ilist.functionType));
1936             }
1937         }
1938     }
1939
1940     return numNonlinearVsites;
1941 }
1942
1943 int countInterUpdategroupVsites(const gmx_mtop_t&                           mtop,
1944                                 gmx::ArrayRef<const gmx::RangePartitioning> updateGroupingPerMoleculetype)
1945 {
1946     int n_intercg_vsite = 0;
1947     for (const gmx_molblock_t& molb : mtop.molblock)
1948     {
1949         const gmx_moltype_t& molt = mtop.moltype[molb.type];
1950
1951         std::vector<int> atomToGroup;
1952         if (!updateGroupingPerMoleculetype.empty())
1953         {
1954             atomToGroup = makeAtomToGroupMapping(updateGroupingPerMoleculetype[molb.type]);
1955         }
1956         for (int ftype = c_ftypeVsiteStart; ftype < c_ftypeVsiteEnd; ftype++)
1957         {
1958             const int              nral = NRAL(ftype);
1959             const InteractionList& il   = molt.ilist[ftype];
1960             for (int i = 0; i < il.size(); i += 1 + nral)
1961             {
1962                 bool isInterGroup = atomToGroup.empty();
1963                 if (!isInterGroup)
1964                 {
1965                     const int group = atomToGroup[il.iatoms[1 + i]];
1966                     for (int a = 1; a < nral; a++)
1967                     {
1968                         if (atomToGroup[il.iatoms[1 + a]] != group)
1969                         {
1970                             isInterGroup = true;
1971                             break;
1972                         }
1973                     }
1974                 }
1975                 if (isInterGroup)
1976                 {
1977                     n_intercg_vsite += molb.nmol;
1978                 }
1979             }
1980         }
1981     }
1982
1983     return n_intercg_vsite;
1984 }
1985
1986 std::unique_ptr<gmx_vsite_t> initVsite(const gmx_mtop_t& mtop, const t_commrec* cr)
1987 {
1988     GMX_RELEASE_ASSERT(cr != nullptr, "We need a valid commrec");
1989
1990     std::unique_ptr<gmx_vsite_t> vsite;
1991
1992     /* check if there are vsites */
1993     int nvsite = 0;
1994     for (int ftype = 0; ftype < F_NRE; ftype++)
1995     {
1996         if (interaction_function[ftype].flags & IF_VSITE)
1997         {
1998             GMX_ASSERT(ftype >= c_ftypeVsiteStart && ftype < c_ftypeVsiteEnd,
1999                        "c_ftypeVsiteStart and/or c_ftypeVsiteEnd do not have correct values");
2000
2001             nvsite += gmx_mtop_ftype_count(&mtop, ftype);
2002         }
2003         else
2004         {
2005             GMX_ASSERT(ftype < c_ftypeVsiteStart || ftype >= c_ftypeVsiteEnd,
2006                        "c_ftypeVsiteStart and/or c_ftypeVsiteEnd do not have correct values");
2007         }
2008     }
2009
2010     if (nvsite == 0)
2011     {
2012         return vsite;
2013     }
2014
2015     vsite = std::make_unique<gmx_vsite_t>();
2016
2017     gmx::ArrayRef<const gmx::RangePartitioning> updateGroupingPerMoleculetype;
2018     if (DOMAINDECOMP(cr))
2019     {
2020         updateGroupingPerMoleculetype = getUpdateGroupingPerMoleculetype(*cr->dd);
2021     }
2022     vsite->numInterUpdategroupVsites = countInterUpdategroupVsites(mtop, updateGroupingPerMoleculetype);
2023
2024     vsite->useDomdec = (DOMAINDECOMP(cr) && cr->dd->nnodes > 1);
2025
2026     vsite->nthreads = gmx_omp_nthreads_get(emntVSITE);
2027
2028     if (vsite->nthreads > 1)
2029     {
2030         /* We need one extra thread data structure for the overlap vsites */
2031         vsite->tData.resize(vsite->nthreads + 1);
2032 #pragma omp parallel for num_threads(vsite->nthreads) schedule(static)
2033         for (int thread = 0; thread < vsite->nthreads; thread++)
2034         {
2035             try
2036             {
2037                 vsite->tData[thread] = std::make_unique<VsiteThread>();
2038
2039                 InterdependentTask& idTask = vsite->tData[thread]->idTask;
2040                 idTask.nuse                = 0;
2041                 idTask.atomIndex.resize(vsite->nthreads);
2042             }
2043             GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
2044         }
2045         if (vsite->nthreads > 1)
2046         {
2047             vsite->tData[vsite->nthreads] = std::make_unique<VsiteThread>();
2048         }
2049     }
2050
2051     return vsite;
2052 }
2053
2054 gmx_vsite_t::gmx_vsite_t() {}
2055
2056 gmx_vsite_t::~gmx_vsite_t() {}
2057
2058 static inline void flagAtom(InterdependentTask* idTask, int atom, int thread, int nthread, int natperthread)
2059 {
2060     if (!idTask->use[atom])
2061     {
2062         idTask->use[atom] = true;
2063         thread            = atom / natperthread;
2064         /* Assign all non-local atom force writes to thread 0 */
2065         if (thread >= nthread)
2066         {
2067             thread = 0;
2068         }
2069         idTask->atomIndex[thread].atom.push_back(atom);
2070     }
2071 }
2072
2073 /*\brief Here we try to assign all vsites that are in our local range.
2074  *
2075  * Our task local atom range is tData->rangeStart - tData->rangeEnd.
2076  * Vsites that depend only on local atoms, as indicated by taskIndex[]==thread,
2077  * are assigned to task tData->ilist. Vsites that depend on non-local atoms
2078  * but not on other vsites are assigned to task tData->id_task.ilist.
2079  * taskIndex[] is set for all vsites in our range, either to our local tasks
2080  * or to the single last task as taskIndex[]=2*nthreads.
2081  */
2082 static void assignVsitesToThread(VsiteThread*                    tData,
2083                                  int                             thread,
2084                                  int                             nthread,
2085                                  int                             natperthread,
2086                                  gmx::ArrayRef<int>              taskIndex,
2087                                  ArrayRef<const InteractionList> ilist,
2088                                  ArrayRef<const t_iparams>       ip,
2089                                  const unsigned short*           ptype)
2090 {
2091     for (int ftype = c_ftypeVsiteStart; ftype < c_ftypeVsiteEnd; ftype++)
2092     {
2093         tData->ilist[ftype].clear();
2094         tData->idTask.ilist[ftype].clear();
2095
2096         int        nral1 = 1 + NRAL(ftype);
2097         int        inc   = nral1;
2098         const int* iat   = ilist[ftype].iatoms.data();
2099         for (int i = 0; i < ilist[ftype].size();)
2100         {
2101             if (ftype == F_VSITEN)
2102             {
2103                 /* The 3 below is from 1+NRAL(ftype)=3 */
2104                 inc = ip[iat[i]].vsiten.n * 3;
2105             }
2106
2107             if (iat[1 + i] < tData->rangeStart || iat[1 + i] >= tData->rangeEnd)
2108             {
2109                 /* This vsite belongs to a different thread */
2110                 i += inc;
2111                 continue;
2112             }
2113
2114             /* We would like to assign this vsite to task thread,
2115              * but it might depend on atoms outside the atom range of thread
2116              * or on another vsite not assigned to task thread.
2117              */
2118             int task = thread;
2119             if (ftype != F_VSITEN)
2120             {
2121                 for (int j = i + 2; j < i + nral1; j++)
2122                 {
2123                     /* Do a range check to avoid a harmless race on taskIndex */
2124                     if (iat[j] < tData->rangeStart || iat[j] >= tData->rangeEnd || taskIndex[iat[j]] != thread)
2125                     {
2126                         if (!tData->useInterdependentTask || ptype[iat[j]] == eptVSite)
2127                         {
2128                             /* At least one constructing atom is a vsite
2129                              * that is not assigned to the same thread.
2130                              * Put this vsite into a separate task.
2131                              */
2132                             task = 2 * nthread;
2133                             break;
2134                         }
2135
2136                         /* There are constructing atoms outside our range,
2137                          * put this vsite into a second task to be executed
2138                          * on the same thread. During construction no barrier
2139                          * is needed between the two tasks on the same thread.
2140                          * During spreading we need to run this task with
2141                          * an additional thread-local intermediate force buffer
2142                          * (or atomic reduction) and a barrier between the two
2143                          * tasks.
2144                          */
2145                         task = nthread + thread;
2146                     }
2147                 }
2148             }
2149             else
2150             {
2151                 for (int j = i + 2; j < i + inc; j += 3)
2152                 {
2153                     /* Do a range check to avoid a harmless race on taskIndex */
2154                     if (iat[j] < tData->rangeStart || iat[j] >= tData->rangeEnd || taskIndex[iat[j]] != thread)
2155                     {
2156                         GMX_ASSERT(ptype[iat[j]] != eptVSite,
2157                                    "A vsite to be assigned in assignVsitesToThread has a vsite as "
2158                                    "a constructing atom that does not belong to our task, such "
2159                                    "vsites should be assigned to the single 'master' task");
2160
2161                         task = nthread + thread;
2162                     }
2163                 }
2164             }
2165
2166             /* Update this vsite's thread index entry */
2167             taskIndex[iat[1 + i]] = task;
2168
2169             if (task == thread || task == nthread + thread)
2170             {
2171                 /* Copy this vsite to the thread data struct of thread */
2172                 InteractionList* il_task;
2173                 if (task == thread)
2174                 {
2175                     il_task = &tData->ilist[ftype];
2176                 }
2177                 else
2178                 {
2179                     il_task = &tData->idTask.ilist[ftype];
2180                 }
2181                 /* Copy the vsite data to the thread-task local array */
2182                 il_task->push_back(iat[i], nral1 - 1, iat + i + 1);
2183                 if (task == nthread + thread)
2184                 {
2185                     /* This vsite write outside our own task force block.
2186                      * Put it into the interdependent task list and flag
2187                      * the atoms involved for reduction.
2188                      */
2189                     tData->idTask.vsite.push_back(iat[i + 1]);
2190                     if (ftype != F_VSITEN)
2191                     {
2192                         for (int j = i + 2; j < i + nral1; j++)
2193                         {
2194                             flagAtom(&tData->idTask, iat[j], thread, nthread, natperthread);
2195                         }
2196                     }
2197                     else
2198                     {
2199                         for (int j = i + 2; j < i + inc; j += 3)
2200                         {
2201                             flagAtom(&tData->idTask, iat[j], thread, nthread, natperthread);
2202                         }
2203                     }
2204                 }
2205             }
2206
2207             i += inc;
2208         }
2209     }
2210 }
2211
2212 /*! \brief Assign all vsites with taskIndex[]==task to task tData */
2213 static void assignVsitesToSingleTask(VsiteThread*                    tData,
2214                                      int                             task,
2215                                      gmx::ArrayRef<const int>        taskIndex,
2216                                      ArrayRef<const InteractionList> ilist,
2217                                      ArrayRef<const t_iparams>       ip)
2218 {
2219     for (int ftype = c_ftypeVsiteStart; ftype < c_ftypeVsiteEnd; ftype++)
2220     {
2221         tData->ilist[ftype].clear();
2222         tData->idTask.ilist[ftype].clear();
2223
2224         int              nral1   = 1 + NRAL(ftype);
2225         int              inc     = nral1;
2226         const int*       iat     = ilist[ftype].iatoms.data();
2227         InteractionList* il_task = &tData->ilist[ftype];
2228
2229         for (int i = 0; i < ilist[ftype].size();)
2230         {
2231             if (ftype == F_VSITEN)
2232             {
2233                 /* The 3 below is from 1+NRAL(ftype)=3 */
2234                 inc = ip[iat[i]].vsiten.n * 3;
2235             }
2236             /* Check if the vsite is assigned to our task */
2237             if (taskIndex[iat[1 + i]] == task)
2238             {
2239                 /* Copy the vsite data to the thread-task local array */
2240                 il_task->push_back(iat[i], inc - 1, iat + i + 1);
2241             }
2242
2243             i += inc;
2244         }
2245     }
2246 }
2247
2248 void split_vsites_over_threads(ArrayRef<const InteractionList> ilist,
2249                                ArrayRef<const t_iparams>       ip,
2250                                const t_mdatoms*                mdatoms,
2251                                gmx_vsite_t*                    vsite)
2252 {
2253     int vsite_atom_range, natperthread;
2254
2255     if (vsite->nthreads == 1)
2256     {
2257         /* Nothing to do */
2258         return;
2259     }
2260
2261     /* The current way of distributing the vsites over threads in primitive.
2262      * We divide the atom range 0 - natoms_in_vsite uniformly over threads,
2263      * without taking into account how the vsites are distributed.
2264      * Without domain decomposition we at least tighten the upper bound
2265      * of the range (useful for common systems such as a vsite-protein
2266      * in 3-site water).
2267      * With domain decomposition, as long as the vsites are distributed
2268      * uniformly in each domain along the major dimension, usually x,
2269      * it will also perform well.
2270      */
2271     if (!vsite->useDomdec)
2272     {
2273         vsite_atom_range = -1;
2274         for (int ftype = c_ftypeVsiteStart; ftype < c_ftypeVsiteEnd; ftype++)
2275         {
2276             { // TODO remove me
2277                 if (ftype != F_VSITEN)
2278                 {
2279                     int                 nral1 = 1 + NRAL(ftype);
2280                     ArrayRef<const int> iat   = ilist[ftype].iatoms;
2281                     for (int i = 0; i < ilist[ftype].size(); i += nral1)
2282                     {
2283                         for (int j = i + 1; j < i + nral1; j++)
2284                         {
2285                             vsite_atom_range = std::max(vsite_atom_range, iat[j]);
2286                         }
2287                     }
2288                 }
2289                 else
2290                 {
2291                     int vs_ind_end;
2292
2293                     ArrayRef<const int> iat = ilist[ftype].iatoms;
2294
2295                     int i = 0;
2296                     while (i < ilist[ftype].size())
2297                     {
2298                         /* The 3 below is from 1+NRAL(ftype)=3 */
2299                         vs_ind_end = i + ip[iat[i]].vsiten.n * 3;
2300
2301                         vsite_atom_range = std::max(vsite_atom_range, iat[i + 1]);
2302                         while (i < vs_ind_end)
2303                         {
2304                             vsite_atom_range = std::max(vsite_atom_range, iat[i + 2]);
2305                             i += 3;
2306                         }
2307                     }
2308                 }
2309             }
2310         }
2311         vsite_atom_range++;
2312         natperthread = (vsite_atom_range + vsite->nthreads - 1) / vsite->nthreads;
2313     }
2314     else
2315     {
2316         /* Any local or not local atom could be involved in virtual sites.
2317          * But since we usually have very few non-local virtual sites
2318          * (only non-local vsites that depend on local vsites),
2319          * we distribute the local atom range equally over the threads.
2320          * When assigning vsites to threads, we should take care that the last
2321          * threads also covers the non-local range.
2322          */
2323         vsite_atom_range = mdatoms->nr;
2324         natperthread     = (mdatoms->homenr + vsite->nthreads - 1) / vsite->nthreads;
2325     }
2326
2327     if (debug)
2328     {
2329         fprintf(debug, "virtual site thread dist: natoms %d, range %d, natperthread %d\n",
2330                 mdatoms->nr, vsite_atom_range, natperthread);
2331     }
2332
2333     /* To simplify the vsite assignment, we make an index which tells us
2334      * to which task particles, both non-vsites and vsites, are assigned.
2335      */
2336     vsite->taskIndex.resize(mdatoms->nr);
2337
2338     /* Initialize the task index array. Here we assign the non-vsite
2339      * particles to task=thread, so we easily figure out if vsites
2340      * depend on local and/or non-local particles in assignVsitesToThread.
2341      */
2342     gmx::ArrayRef<int> taskIndex = vsite->taskIndex;
2343     {
2344         int thread = 0;
2345         for (int i = 0; i < mdatoms->nr; i++)
2346         {
2347             if (mdatoms->ptype[i] == eptVSite)
2348             {
2349                 /* vsites are not assigned to a task yet */
2350                 taskIndex[i] = -1;
2351             }
2352             else
2353             {
2354                 /* assign non-vsite particles to task thread */
2355                 taskIndex[i] = thread;
2356             }
2357             if (i == (thread + 1) * natperthread && thread < vsite->nthreads)
2358             {
2359                 thread++;
2360             }
2361         }
2362     }
2363
2364 #pragma omp parallel num_threads(vsite->nthreads)
2365     {
2366         try
2367         {
2368             int          thread = gmx_omp_get_thread_num();
2369             VsiteThread& tData  = *vsite->tData[thread];
2370
2371             /* Clear the buffer use flags that were set before */
2372             if (tData.useInterdependentTask)
2373             {
2374                 InterdependentTask& idTask = tData.idTask;
2375
2376                 /* To avoid an extra OpenMP barrier in spread_vsite_f,
2377                  * we clear the force buffer at the next step,
2378                  * so we need to do it here as well.
2379                  */
2380                 clearTaskForceBufferUsedElements(&idTask);
2381
2382                 idTask.vsite.resize(0);
2383                 for (int t = 0; t < vsite->nthreads; t++)
2384                 {
2385                     AtomIndex& atomIndex = idTask.atomIndex[t];
2386                     int        natom     = atomIndex.atom.size();
2387                     for (int i = 0; i < natom; i++)
2388                     {
2389                         idTask.use[atomIndex.atom[i]] = false;
2390                     }
2391                     atomIndex.atom.resize(0);
2392                 }
2393                 idTask.nuse = 0;
2394             }
2395
2396             /* To avoid large f_buf allocations of #threads*vsite_atom_range
2397              * we don't use task2 with more than 200000 atoms. This doesn't
2398              * affect performance, since with such a large range relatively few
2399              * vsites will end up in the separate task.
2400              * Note that useTask2 should be the same for all threads.
2401              */
2402             tData.useInterdependentTask = (vsite_atom_range <= 200000);
2403             if (tData.useInterdependentTask)
2404             {
2405                 size_t              natoms_use_in_vsites = vsite_atom_range;
2406                 InterdependentTask& idTask               = tData.idTask;
2407                 /* To avoid resizing and re-clearing every nstlist steps,
2408                  * we never down size the force buffer.
2409                  */
2410                 if (natoms_use_in_vsites > idTask.force.size() || natoms_use_in_vsites > idTask.use.size())
2411                 {
2412                     idTask.force.resize(natoms_use_in_vsites, { 0, 0, 0 });
2413                     idTask.use.resize(natoms_use_in_vsites, false);
2414                 }
2415             }
2416
2417             /* Assign all vsites that can execute independently on threads */
2418             tData.rangeStart = thread * natperthread;
2419             if (thread < vsite->nthreads - 1)
2420             {
2421                 tData.rangeEnd = (thread + 1) * natperthread;
2422             }
2423             else
2424             {
2425                 /* The last thread should cover up to the end of the range */
2426                 tData.rangeEnd = mdatoms->nr;
2427             }
2428             assignVsitesToThread(&tData, thread, vsite->nthreads, natperthread, taskIndex, ilist,
2429                                  ip, mdatoms->ptype);
2430
2431             if (tData.useInterdependentTask)
2432             {
2433                 /* In the worst case, all tasks write to force ranges of
2434                  * all other tasks, leading to #tasks^2 scaling (this is only
2435                  * the overhead, the actual flops remain constant).
2436                  * But in most cases there is far less coupling. To improve
2437                  * scaling at high thread counts we therefore construct
2438                  * an index to only loop over the actually affected tasks.
2439                  */
2440                 InterdependentTask& idTask = tData.idTask;
2441
2442                 /* Ensure assignVsitesToThread finished on other threads */
2443 #pragma omp barrier
2444
2445                 idTask.spreadTask.resize(0);
2446                 idTask.reduceTask.resize(0);
2447                 for (int t = 0; t < vsite->nthreads; t++)
2448                 {
2449                     /* Do we write to the force buffer of task t? */
2450                     if (!idTask.atomIndex[t].atom.empty())
2451                     {
2452                         idTask.spreadTask.push_back(t);
2453                     }
2454                     /* Does task t write to our force buffer? */
2455                     if (!vsite->tData[t]->idTask.atomIndex[thread].atom.empty())
2456                     {
2457                         idTask.reduceTask.push_back(t);
2458                     }
2459                 }
2460             }
2461         }
2462         GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
2463     }
2464     /* Assign all remaining vsites, that will have taskIndex[]=2*vsite->nthreads,
2465      * to a single task that will not run in parallel with other tasks.
2466      */
2467     assignVsitesToSingleTask(vsite->tData[vsite->nthreads].get(), 2 * vsite->nthreads, taskIndex,
2468                              ilist, ip);
2469
2470     if (debug && vsite->nthreads > 1)
2471     {
2472         fprintf(debug, "virtual site useInterdependentTask %d, nuse:\n",
2473                 static_cast<int>(vsite->tData[0]->useInterdependentTask));
2474         for (int th = 0; th < vsite->nthreads + 1; th++)
2475         {
2476             fprintf(debug, " %4d", vsite->tData[th]->idTask.nuse);
2477         }
2478         fprintf(debug, "\n");
2479
2480         for (int ftype = c_ftypeVsiteStart; ftype < c_ftypeVsiteEnd; ftype++)
2481         {
2482             if (!ilist[ftype].empty())
2483             {
2484                 fprintf(debug, "%-20s thread dist:", interaction_function[ftype].longname);
2485                 for (int th = 0; th < vsite->nthreads + 1; th++)
2486                 {
2487                     fprintf(debug, " %4d %4d ", vsite->tData[th]->ilist[ftype].size(),
2488                             vsite->tData[th]->idTask.ilist[ftype].size());
2489                 }
2490                 fprintf(debug, "\n");
2491             }
2492         }
2493     }
2494
2495 #ifndef NDEBUG
2496     int nrOrig     = vsiteIlistNrCount(ilist);
2497     int nrThreaded = 0;
2498     for (int th = 0; th < vsite->nthreads + 1; th++)
2499     {
2500         nrThreaded += vsiteIlistNrCount(vsite->tData[th]->ilist)
2501                       + vsiteIlistNrCount(vsite->tData[th]->idTask.ilist);
2502     }
2503     GMX_ASSERT(nrThreaded == nrOrig,
2504                "The number of virtual sites assigned to all thread task has to match the total "
2505                "number of virtual sites");
2506 #endif
2507 }
2508
2509 void set_vsite_top(gmx_vsite_t* vsite, const gmx_localtop_t* top, const t_mdatoms* md)
2510 {
2511     if (vsite->nthreads > 1)
2512     {
2513         split_vsites_over_threads(top->idef.il, top->idef.iparams, md, vsite);
2514     }
2515 }