src/gromacs/mdlib/vsite.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   5  * Copyright (c) 2001-2004, The GROMACS development team.
   6  * Copyright (c) 2013,2014,2015,2016,2017 The GROMACS development team.
   7  * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
   8  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   9  * and including many others, as listed in the AUTHORS file in the
  10  * top-level source directory and at http://www.gromacs.org.
  11  *
  12  * GROMACS is free software; you can redistribute it and/or
  13  * modify it under the terms of the GNU Lesser General Public License
  14  * as published by the Free Software Foundation; either version 2.1
  15  * of the License, or (at your option) any later version.
  16  *
  17  * GROMACS is distributed in the hope that it will be useful,
  18  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  19  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  20  * Lesser General Public License for more details.
  21  *
  22  * You should have received a copy of the GNU Lesser General Public
  23  * License along with GROMACS; if not, see
  24  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  25  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  26  *
  27  * If you want to redistribute modifications to GROMACS, please
  28  * consider that scientific software is very special. Version
  29  * control is crucial - bugs must be traceable. We will be happy to
  30  * consider code for inclusion in the official distribution, but
  31  * derived work must not be called official GROMACS. Details are found
  32  * in the README & COPYING files - if they are missing, get the
  33  * official version at http://www.gromacs.org.
  34  *
  35  * To help us fund GROMACS development, we humbly ask that you cite
  36  * the research papers on the package. Check out http://www.gromacs.org.
  37  */
  38 #include "gmxpre.h"
  39
  40 #include "vsite.h"
  41
  42 #include <cstdio>
  43
  44 #include <algorithm>
  45 #include <memory>
  46 #include <vector>
  47
  48 #include "gromacs/domdec/domdec.h"
  49 #include "gromacs/domdec/domdec_struct.h"
  50 #include "gromacs/gmxlib/network.h"
  51 #include "gromacs/gmxlib/nrnb.h"
  52 #include "gromacs/math/functions.h"
  53 #include "gromacs/math/vec.h"
  54 #include "gromacs/mdlib/gmx_omp_nthreads.h"
  55 #include "gromacs/mdtypes/commrec.h"
  56 #include "gromacs/mdtypes/mdatom.h"
  57 #include "gromacs/pbcutil/ishift.h"
  58 #include "gromacs/pbcutil/pbc.h"
  59 #include "gromacs/timing/wallcycle.h"
  60 #include "gromacs/topology/ifunc.h"
  61 #include "gromacs/topology/mtop_util.h"
  62 #include "gromacs/topology/topology.h"
  63 #include "gromacs/utility/exceptions.h"
  64 #include "gromacs/utility/fatalerror.h"
  65 #include "gromacs/utility/gmxassert.h"
  66 #include "gromacs/utility/gmxomp.h"
  67
  68 /* The strategy used here for assigning virtual sites to (thread-)tasks
  69  * is as follows:
  70  *
  71  * We divide the atom range that vsites operate on (natoms_local with DD,
  72  * 0 - last atom involved in vsites without DD) equally over all threads.
  73  *
  74  * Vsites in the local range constructed from atoms in the local range
  75  * and/or other vsites that are fully local are assigned to a simple,
  76  * independent task.
  77  *
  78  * Vsites that are not assigned after using the above criterion get assigned
  79  * to a so called "interdependent" thread task when none of the constructing
  80  * atoms is a vsite. These tasks are called interdependent, because one task
  81  * accesses atoms assigned to a different task/thread.
  82  * Note that this option is turned off with large (local) atom counts
  83  * to avoid high memory usage.
  84  *
  85  * Any remaining vsites are assigned to a separate master thread task.
  86  */
  87
  88 using gmx::ArrayRef;
  89 using gmx::RVec;
  90
  91 /*! \brief List of atom indices belonging to a task */
  92 struct AtomIndex
  93 {
  94     //! List of atom indices
  95     std::vector<int> atom;
  96 };
  97
  98 /*! \brief Data structure for thread tasks that use constructing atoms outside their own atom range */
  99 struct InterdependentTask
 100 {
 101     //! The interaction lists, only vsite entries are used
 102     InteractionLists ilist;
 103     //! Thread/task-local force buffer
 104     std::vector<RVec> force;
 105     //! The atom indices of the vsites of our task
 106     std::vector<int> vsite;
 107     //! Flags if elements in force are spread to or not
 108     std::vector<bool> use;
 109     //! The number of entries set to true in use
 110     int nuse = 0;
 111     //! Array of atoms indices, size nthreads, covering all nuse set elements in use
 112     std::vector<AtomIndex> atomIndex;
 113     //! List of tasks (force blocks) this task spread forces to
 114     std::vector<int> spreadTask;
 115     //! List of tasks that write to this tasks force block range
 116     std::vector<int> reduceTask;
 117 };
 118
 119 /*! \brief Vsite thread task data structure */
 120 struct VsiteThread
 121 {
 122     //! Start of atom range of this task
 123     int rangeStart;
 124     //! End of atom range of this task
 125     int rangeEnd;
 126     //! The interaction lists, only vsite entries are used
 127     std::array<InteractionList, F_NRE> ilist;
 128     //! Local fshift accumulation buffer
 129     rvec fshift[SHIFTS];
 130     //! Local virial dx*df accumulation buffer
 131     matrix dxdf;
 132     //! Tells if interdependent task idTask should be used (in addition to the rest of this task), this bool has the same value on all threads
 133     bool useInterdependentTask;
 134     //! Data for vsites that involve constructing atoms in the atom range of other threads/tasks
 135     InterdependentTask idTask;
 136
 137     /*! \brief Constructor */
 138     VsiteThread()
 139     {
 140         rangeStart = -1;
 141         rangeEnd   = -1;
 142         clear_rvecs(SHIFTS, fshift);
 143         clear_mat(dxdf);
 144         useInterdependentTask = false;
 145     }
 146 };
 147
 148 /*! \brief Returns the sum of the vsite ilist sizes over all vsite types
 149  *
 150  * \param[in] ilist  The interaction list
 151  */
 152 static int vsiteIlistNrCount(ArrayRef<const InteractionList> ilist)
 153 {
 154     int nr = 0;
 155     for (int ftype = c_ftypeVsiteStart; ftype < c_ftypeVsiteEnd; ftype++)
 156     {
 157         nr += ilist[ftype].size();
 158     }
 159
 160     return nr;
 161 }
 162
 163 static int pbc_rvec_sub(const t_pbc* pbc, const rvec xi, const rvec xj, rvec dx)
 164 {
 165     if (pbc)
 166     {
 167         return pbc_dx_aiuc(pbc, xi, xj, dx);
 168     }
 169     else
 170     {
 171         rvec_sub(xi, xj, dx);
 172         return CENTRAL;
 173     }
 174 }
 175
 176 static inline real inverseNorm(const rvec x)
 177 {
 178     return gmx::invsqrt(iprod(x, x));
 179 }
 180
 181 /* Vsite construction routines */
 182
 183 static void constr_vsite2(const rvec xi, const rvec xj, rvec x, real a, const t_pbc* pbc)
 184 {
 185     real b = 1 - a;
 186     /* 1 flop */
 187
 188     if (pbc)
 189     {
 190         rvec dx;
 191         pbc_dx_aiuc(pbc, xj, xi, dx);
 192         x[XX] = xi[XX] + a * dx[XX];
 193         x[YY] = xi[YY] + a * dx[YY];
 194         x[ZZ] = xi[ZZ] + a * dx[ZZ];
 195     }
 196     else
 197     {
 198         x[XX] = b * xi[XX] + a * xj[XX];
 199         x[YY] = b * xi[YY] + a * xj[YY];
 200         x[ZZ] = b * xi[ZZ] + a * xj[ZZ];
 201         /* 9 Flops */
 202     }
 203
 204     /* TOTAL: 10 flops */
 205 }
 206
 207 static void constr_vsite2FD(const rvec xi, const rvec xj, rvec x, real a, const t_pbc* pbc)
 208 {
 209     rvec xij;
 210     pbc_rvec_sub(pbc, xj, xi, xij);
 211     /* 3 flops */
 212
 213     const real b = a * inverseNorm(xij);
 214     /* 6 + 10 flops */
 215
 216     x[XX] = xi[XX] + b * xij[XX];
 217     x[YY] = xi[YY] + b * xij[YY];
 218     x[ZZ] = xi[ZZ] + b * xij[ZZ];
 219     /* 6 Flops */
 220
 221     /* TOTAL: 25 flops */
 222 }
 223
 224 static void constr_vsite3(const rvec xi, const rvec xj, const rvec xk, rvec x, real a, real b, const t_pbc* pbc)
 225 {
 226     real c = 1 - a - b;
 227     /* 2 flops */
 228
 229     if (pbc)
 230     {
 231         rvec dxj, dxk;
 232
 233         pbc_dx_aiuc(pbc, xj, xi, dxj);
 234         pbc_dx_aiuc(pbc, xk, xi, dxk);
 235         x[XX] = xi[XX] + a * dxj[XX] + b * dxk[XX];
 236         x[YY] = xi[YY] + a * dxj[YY] + b * dxk[YY];
 237         x[ZZ] = xi[ZZ] + a * dxj[ZZ] + b * dxk[ZZ];
 238     }
 239     else
 240     {
 241         x[XX] = c * xi[XX] + a * xj[XX] + b * xk[XX];
 242         x[YY] = c * xi[YY] + a * xj[YY] + b * xk[YY];
 243         x[ZZ] = c * xi[ZZ] + a * xj[ZZ] + b * xk[ZZ];
 244         /* 15 Flops */
 245     }
 246
 247     /* TOTAL: 17 flops */
 248 }
 249
 250 static void constr_vsite3FD(const rvec xi, const rvec xj, const rvec xk, rvec x, real a, real b, const t_pbc* pbc)
 251 {
 252     rvec xij, xjk, temp;
 253     real c;
 254
 255     pbc_rvec_sub(pbc, xj, xi, xij);
 256     pbc_rvec_sub(pbc, xk, xj, xjk);
 257     /* 6 flops */
 258
 259     /* temp goes from i to a point on the line jk */
 260     temp[XX] = xij[XX] + a * xjk[XX];
 261     temp[YY] = xij[YY] + a * xjk[YY];
 262     temp[ZZ] = xij[ZZ] + a * xjk[ZZ];
 263     /* 6 flops */
 264
 265     c = b * inverseNorm(temp);
 266     /* 6 + 10 flops */
 267
 268     x[XX] = xi[XX] + c * temp[XX];
 269     x[YY] = xi[YY] + c * temp[YY];
 270     x[ZZ] = xi[ZZ] + c * temp[ZZ];
 271     /* 6 Flops */
 272
 273     /* TOTAL: 34 flops */
 274 }
 275
 276 static void constr_vsite3FAD(const rvec xi, const rvec xj, const rvec xk, rvec x, real a, real b, const t_pbc* pbc)
 277 {
 278     rvec xij, xjk, xp;
 279     real a1, b1, c1, invdij;
 280
 281     pbc_rvec_sub(pbc, xj, xi, xij);
 282     pbc_rvec_sub(pbc, xk, xj, xjk);
 283     /* 6 flops */
 284
 285     invdij = inverseNorm(xij);
 286     c1     = invdij * invdij * iprod(xij, xjk);
 287     xp[XX] = xjk[XX] - c1 * xij[XX];
 288     xp[YY] = xjk[YY] - c1 * xij[YY];
 289     xp[ZZ] = xjk[ZZ] - c1 * xij[ZZ];
 290     a1     = a * invdij;
 291     b1     = b * inverseNorm(xp);
 292     /* 45 */
 293
 294     x[XX] = xi[XX] + a1 * xij[XX] + b1 * xp[XX];
 295     x[YY] = xi[YY] + a1 * xij[YY] + b1 * xp[YY];
 296     x[ZZ] = xi[ZZ] + a1 * xij[ZZ] + b1 * xp[ZZ];
 297     /* 12 Flops */
 298
 299     /* TOTAL: 63 flops */
 300 }
 301
 302 static void
 303 constr_vsite3OUT(const rvec xi, const rvec xj, const rvec xk, rvec x, real a, real b, real c, const t_pbc* pbc)
 304 {
 305     rvec xij, xik, temp;
 306
 307     pbc_rvec_sub(pbc, xj, xi, xij);
 308     pbc_rvec_sub(pbc, xk, xi, xik);
 309     cprod(xij, xik, temp);
 310     /* 15 Flops */
 311
 312     x[XX] = xi[XX] + a * xij[XX] + b * xik[XX] + c * temp[XX];
 313     x[YY] = xi[YY] + a * xij[YY] + b * xik[YY] + c * temp[YY];
 314     x[ZZ] = xi[ZZ] + a * xij[ZZ] + b * xik[ZZ] + c * temp[ZZ];
 315     /* 18 Flops */
 316
 317     /* TOTAL: 33 flops */
 318 }
 319
 320 static void constr_vsite4FD(const rvec   xi,
 321                             const rvec   xj,
 322                             const rvec   xk,
 323                             const rvec   xl,
 324                             rvec         x,
 325                             real         a,
 326                             real         b,
 327                             real         c,
 328                             const t_pbc* pbc)
 329 {
 330     rvec xij, xjk, xjl, temp;
 331     real d;
 332
 333     pbc_rvec_sub(pbc, xj, xi, xij);
 334     pbc_rvec_sub(pbc, xk, xj, xjk);
 335     pbc_rvec_sub(pbc, xl, xj, xjl);
 336     /* 9 flops */
 337
 338     /* temp goes from i to a point on the plane jkl */
 339     temp[XX] = xij[XX] + a * xjk[XX] + b * xjl[XX];
 340     temp[YY] = xij[YY] + a * xjk[YY] + b * xjl[YY];
 341     temp[ZZ] = xij[ZZ] + a * xjk[ZZ] + b * xjl[ZZ];
 342     /* 12 flops */
 343
 344     d = c * inverseNorm(temp);
 345     /* 6 + 10 flops */
 346
 347     x[XX] = xi[XX] + d * temp[XX];
 348     x[YY] = xi[YY] + d * temp[YY];
 349     x[ZZ] = xi[ZZ] + d * temp[ZZ];
 350     /* 6 Flops */
 351
 352     /* TOTAL: 43 flops */
 353 }
 354
 355 static void constr_vsite4FDN(const rvec   xi,
 356                              const rvec   xj,
 357                              const rvec   xk,
 358                              const rvec   xl,
 359                              rvec         x,
 360                              real         a,
 361                              real         b,
 362                              real         c,
 363                              const t_pbc* pbc)
 364 {
 365     rvec xij, xik, xil, ra, rb, rja, rjb, rm;
 366     real d;
 367
 368     pbc_rvec_sub(pbc, xj, xi, xij);
 369     pbc_rvec_sub(pbc, xk, xi, xik);
 370     pbc_rvec_sub(pbc, xl, xi, xil);
 371     /* 9 flops */
 372
 373     ra[XX] = a * xik[XX];
 374     ra[YY] = a * xik[YY];
 375     ra[ZZ] = a * xik[ZZ];
 376
 377     rb[XX] = b * xil[XX];
 378     rb[YY] = b * xil[YY];
 379     rb[ZZ] = b * xil[ZZ];
 380
 381     /* 6 flops */
 382
 383     rvec_sub(ra, xij, rja);
 384     rvec_sub(rb, xij, rjb);
 385     /* 6 flops */
 386
 387     cprod(rja, rjb, rm);
 388     /* 9 flops */
 389
 390     d = c * inverseNorm(rm);
 391     /* 5+5+1 flops */
 392
 393     x[XX] = xi[XX] + d * rm[XX];
 394     x[YY] = xi[YY] + d * rm[YY];
 395     x[ZZ] = xi[ZZ] + d * rm[ZZ];
 396     /* 6 Flops */
 397
 398     /* TOTAL: 47 flops */
 399 }
 400
 401
 402 static int constr_vsiten(const t_iatom* ia, ArrayRef<const t_iparams> ip, rvec* x, const t_pbc* pbc)
 403 {
 404     rvec x1, dx;
 405     dvec dsum;
 406     int  n3, av, ai;
 407     real a;
 408
 409     n3 = 3 * ip[ia[0]].vsiten.n;
 410     av = ia[1];
 411     ai = ia[2];
 412     copy_rvec(x[ai], x1);
 413     clear_dvec(dsum);
 414     for (int i = 3; i < n3; i += 3)
 415     {
 416         ai = ia[i + 2];
 417         a  = ip[ia[i]].vsiten.a;
 418         if (pbc)
 419         {
 420             pbc_dx_aiuc(pbc, x[ai], x1, dx);
 421         }
 422         else
 423         {
 424             rvec_sub(x[ai], x1, dx);
 425         }
 426         dsum[XX] += a * dx[XX];
 427         dsum[YY] += a * dx[YY];
 428         dsum[ZZ] += a * dx[ZZ];
 429         /* 9 Flops */
 430     }
 431
 432     x[av][XX] = x1[XX] + dsum[XX];
 433     x[av][YY] = x1[YY] + dsum[YY];
 434     x[av][ZZ] = x1[ZZ] + dsum[ZZ];
 435
 436     return n3;
 437 }
 438
 439 /*! \brief PBC modes for vsite construction and spreading */
 440 enum class PbcMode
 441 {
 442     all, // Apply normal, simple PBC for all vsites
 443     none // No PBC treatment needed
 444 };
 445
 446 /*! \brief Returns the PBC mode based on the system PBC and vsite properties
 447  *
 448  * \param[in] pbcPtr  A pointer to a PBC struct or nullptr when no PBC treatment is required
 449  */
 450 static PbcMode getPbcMode(const t_pbc* pbcPtr)
 451 {
 452     if (pbcPtr == nullptr)
 453     {
 454         return PbcMode::none;
 455     }
 456     else
 457     {
 458         return PbcMode::all;
 459     }
 460 }
 461
 462 static void construct_vsites_thread(rvec                            x[],
 463                                     real                            dt,
 464                                     rvec*                           v,
 465                                     ArrayRef<const t_iparams>       ip,
 466                                     ArrayRef<const InteractionList> ilist,
 467                                     const t_pbc*                    pbc_null)
 468 {
 469     real inv_dt;
 470     if (v != nullptr)
 471     {
 472         inv_dt = 1.0 / dt;
 473     }
 474     else
 475     {
 476         inv_dt = 1.0;
 477     }
 478
 479     const PbcMode pbcMode = getPbcMode(pbc_null);
 480     /* We need another pbc pointer, as with charge groups we switch per vsite */
 481     const t_pbc* pbc_null2 = pbc_null;
 482
 483     for (int ftype = c_ftypeVsiteStart; ftype < c_ftypeVsiteEnd; ftype++)
 484     {
 485         if (ilist[ftype].empty())
 486         {
 487             continue;
 488         }
 489
 490         { // TODO remove me
 491             int nra = interaction_function[ftype].nratoms;
 492             int inc = 1 + nra;
 493             int nr  = ilist[ftype].size();
 494
 495             const t_iatom* ia = ilist[ftype].iatoms.data();
 496
 497             for (int i = 0; i < nr;)
 498             {
 499                 int tp = ia[0];
 500                 /* The vsite and constructing atoms */
 501                 int avsite = ia[1];
 502                 int ai     = ia[2];
 503                 /* Constants for constructing vsites */
 504                 real a1 = ip[tp].vsite.a;
 505                 /* Copy the old position */
 506                 rvec xv;
 507                 copy_rvec(x[avsite], xv);
 508
 509                 /* Construct the vsite depending on type */
 510                 int  aj, ak, al;
 511                 real b1, c1;
 512                 switch (ftype)
 513                 {
 514                     case F_VSITE2:
 515                         aj = ia[3];
 516                         constr_vsite2(x[ai], x[aj], x[avsite], a1, pbc_null2);
 517                         break;
 518                     case F_VSITE2FD:
 519                         aj = ia[3];
 520                         constr_vsite2FD(x[ai], x[aj], x[avsite], a1, pbc_null2);
 521                         break;
 522                     case F_VSITE3:
 523                         aj = ia[3];
 524                         ak = ia[4];
 525                         b1 = ip[tp].vsite.b;
 526                         constr_vsite3(x[ai], x[aj], x[ak], x[avsite], a1, b1, pbc_null2);
 527                         break;
 528                     case F_VSITE3FD:
 529                         aj = ia[3];
 530                         ak = ia[4];
 531                         b1 = ip[tp].vsite.b;
 532                         constr_vsite3FD(x[ai], x[aj], x[ak], x[avsite], a1, b1, pbc_null2);
 533                         break;
 534                     case F_VSITE3FAD:
 535                         aj = ia[3];
 536                         ak = ia[4];
 537                         b1 = ip[tp].vsite.b;
 538                         constr_vsite3FAD(x[ai], x[aj], x[ak], x[avsite], a1, b1, pbc_null2);
 539                         break;
 540                     case F_VSITE3OUT:
 541                         aj = ia[3];
 542                         ak = ia[4];
 543                         b1 = ip[tp].vsite.b;
 544                         c1 = ip[tp].vsite.c;
 545                         constr_vsite3OUT(x[ai], x[aj], x[ak], x[avsite], a1, b1, c1, pbc_null2);
 546                         break;
 547                     case F_VSITE4FD:
 548                         aj = ia[3];
 549                         ak = ia[4];
 550                         al = ia[5];
 551                         b1 = ip[tp].vsite.b;
 552                         c1 = ip[tp].vsite.c;
 553                         constr_vsite4FD(x[ai], x[aj], x[ak], x[al], x[avsite], a1, b1, c1, pbc_null2);
 554                         break;
 555                     case F_VSITE4FDN:
 556                         aj = ia[3];
 557                         ak = ia[4];
 558                         al = ia[5];
 559                         b1 = ip[tp].vsite.b;
 560                         c1 = ip[tp].vsite.c;
 561                         constr_vsite4FDN(x[ai], x[aj], x[ak], x[al], x[avsite], a1, b1, c1, pbc_null2);
 562                         break;
 563                     case F_VSITEN: inc = constr_vsiten(ia, ip, x, pbc_null2); break;
 564                     default:
 565                         gmx_fatal(FARGS, "No such vsite type %d in %s, line %d", ftype, __FILE__, __LINE__);
 566                 }
 567
 568                 if (pbcMode == PbcMode::all)
 569                 {
 570                     /* Keep the vsite in the same periodic image as before */
 571                     rvec dx;
 572                     int  ishift = pbc_dx_aiuc(pbc_null, x[avsite], xv, dx);
 573                     if (ishift != CENTRAL)
 574                     {
 575                         rvec_add(xv, dx, x[avsite]);
 576                     }
 577                 }
 578                 if (v != nullptr)
 579                 {
 580                     /* Calculate velocity of vsite... */
 581                     rvec vv;
 582                     rvec_sub(x[avsite], xv, vv);
 583                     svmul(inv_dt, vv, v[avsite]);
 584                 }
 585
 586                 /* Increment loop variables */
 587                 i += inc;
 588                 ia += inc;
 589             }
 590         }
 591     }
 592 }
 593
 594 void construct_vsites(const gmx_vsite_t*              vsite,
 595                       rvec                            x[],
 596                       real                            dt,
 597                       rvec*                           v,
 598                       ArrayRef<const t_iparams>       ip,
 599                       ArrayRef<const InteractionList> ilist,
 600                       PbcType                         pbcType,
 601                       gmx_bool                        bMolPBC,
 602                       const t_commrec*                cr,
 603                       const matrix                    box)
 604 {
 605     const bool useDomdec = (vsite != nullptr && vsite->useDomdec);
 606     GMX_ASSERT(!useDomdec || (cr != nullptr && DOMAINDECOMP(cr)),
 607                "When vsites are set up with domain decomposition, we need a valid commrec");
 608     // TODO: Remove this assertion when we remove charge groups
 609     GMX_ASSERT(vsite != nullptr || pbcType == PbcType::No,
 610                "Without a vsite struct we can not do PBC (in case we have charge groups)");
 611
 612     t_pbc pbc, *pbc_null;
 613
 614     /* We only need to do pbc when we have inter-cg vsites.
 615      * Note that with domain decomposition we do not need to apply PBC here
 616      * when we have at least 3 domains along each dimension. Currently we
 617      * do not optimize this case.
 618      */
 619     if (pbcType != PbcType::No && (useDomdec || bMolPBC)
 620         && !(vsite != nullptr && vsite->numInterUpdategroupVsites == 0))
 621     {
 622         /* This is wasting some CPU time as we now do this multiple times
 623          * per MD step.
 624          */
 625         ivec null_ivec;
 626         clear_ivec(null_ivec);
 627         pbc_null = set_pbc_dd(&pbc, pbcType, useDomdec ? cr->dd->numCells : null_ivec, FALSE, box);
 628     }
 629     else
 630     {
 631         pbc_null = nullptr;
 632     }
 633
 634     if (useDomdec)
 635     {
 636         dd_move_x_vsites(cr->dd, box, x);
 637     }
 638
 639     if (vsite == nullptr || vsite->nthreads == 1)
 640     {
 641         construct_vsites_thread(x, dt, v, ip, ilist, pbc_null);
 642     }
 643     else
 644     {
 645 #pragma omp parallel num_threads(vsite->nthreads)
 646         {
 647             try
 648             {
 649                 const int          th    = gmx_omp_get_thread_num();
 650                 const VsiteThread& tData = *vsite->tData[th];
 651                 GMX_ASSERT(tData.rangeStart >= 0,
 652                            "The thread data should be initialized before calling construct_vsites");
 653
 654                 construct_vsites_thread(x, dt, v, ip, tData.ilist, pbc_null);
 655                 if (tData.useInterdependentTask)
 656                 {
 657                     /* Here we don't need a barrier (unlike the spreading),
 658                      * since both tasks only construct vsites from particles,
 659                      * or local vsites, not from non-local vsites.
 660                      */
 661                     construct_vsites_thread(x, dt, v, ip, tData.idTask.ilist, pbc_null);
 662                 }
 663             }
 664             GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
 665         }
 666         /* Now we can construct the vsites that might depend on other vsites */
 667         construct_vsites_thread(x, dt, v, ip, vsite->tData[vsite->nthreads]->ilist, pbc_null);
 668     }
 669 }
 670
 671 static void spread_vsite2(const t_iatom ia[], real a, const rvec x[], rvec f[], rvec fshift[], const t_pbc* pbc)
 672 {
 673     rvec    fi, fj, dx;
 674     t_iatom av, ai, aj;
 675     int     siv, sij;
 676
 677     av = ia[1];
 678     ai = ia[2];
 679     aj = ia[3];
 680
 681     svmul(1 - a, f[av], fi);
 682     svmul(a, f[av], fj);
 683     /* 7 flop */
 684
 685     rvec_inc(f[ai], fi);
 686     rvec_inc(f[aj], fj);
 687     /* 6 Flops */
 688
 689     if (pbc)
 690     {
 691         siv = pbc_dx_aiuc(pbc, x[ai], x[av], dx);
 692         sij = pbc_dx_aiuc(pbc, x[ai], x[aj], dx);
 693     }
 694     else
 695     {
 696         siv = CENTRAL;
 697         sij = CENTRAL;
 698     }
 699
 700     if (fshift && (siv != CENTRAL || sij != CENTRAL))
 701     {
 702         rvec_inc(fshift[siv], f[av]);
 703         rvec_dec(fshift[CENTRAL], fi);
 704         rvec_dec(fshift[sij], fj);
 705     }
 706
 707     /* TOTAL: 13 flops */
 708 }
 709
 710 void constructVsitesGlobal(const gmx_mtop_t& mtop, gmx::ArrayRef<gmx::RVec> x)
 711 {
 712     GMX_ASSERT(x.ssize() >= mtop.natoms, "x should contain the whole system");
 713     GMX_ASSERT(!mtop.moleculeBlockIndices.empty(),
 714                "molblock indices are needed in constructVsitesGlobal");
 715
 716     for (size_t mb = 0; mb < mtop.molblock.size(); mb++)
 717     {
 718         const gmx_molblock_t& molb = mtop.molblock[mb];
 719         const gmx_moltype_t&  molt = mtop.moltype[molb.type];
 720         if (vsiteIlistNrCount(molt.ilist) > 0)
 721         {
 722             int atomOffset = mtop.moleculeBlockIndices[mb].globalAtomStart;
 723             for (int mol = 0; mol < molb.nmol; mol++)
 724             {
 725                 construct_vsites(nullptr, as_rvec_array(x.data()) + atomOffset, 0.0, nullptr,
 726                                  mtop.ffparams.iparams, molt.ilist, PbcType::No, TRUE, nullptr, nullptr);
 727                 atomOffset += molt.atoms.nr;
 728             }
 729         }
 730     }
 731 }
 732
 733 static void spread_vsite2FD(const t_iatom ia[],
 734                             real          a,
 735                             const rvec    x[],
 736                             rvec          f[],
 737                             rvec          fshift[],
 738                             gmx_bool      VirCorr,
 739                             matrix        dxdf,
 740                             const t_pbc*  pbc)
 741 {
 742     const int av = ia[1];
 743     const int ai = ia[2];
 744     const int aj = ia[3];
 745     rvec      fv;
 746     copy_rvec(f[av], fv);
 747
 748     rvec xij;
 749     int  sji = pbc_rvec_sub(pbc, x[aj], x[ai], xij);
 750     /* 6 flops */
 751
 752     const real invDistance = inverseNorm(xij);
 753     const real b           = a * invDistance;
 754     /* 4 + ?10? flops */
 755
 756     const real fproj = iprod(xij, fv) * invDistance * invDistance;
 757
 758     rvec fj;
 759     fj[XX] = b * (fv[XX] - fproj * xij[XX]);
 760     fj[YY] = b * (fv[YY] - fproj * xij[YY]);
 761     fj[ZZ] = b * (fv[ZZ] - fproj * xij[ZZ]);
 762     /* 9 */
 763
 764     /* b is already calculated in constr_vsite2FD
 765        storing b somewhere will save flops.     */
 766
 767     f[ai][XX] += fv[XX] - fj[XX];
 768     f[ai][YY] += fv[YY] - fj[YY];
 769     f[ai][ZZ] += fv[ZZ] - fj[ZZ];
 770     f[aj][XX] += fj[XX];
 771     f[aj][YY] += fj[YY];
 772     f[aj][ZZ] += fj[ZZ];
 773     /* 9 Flops */
 774
 775     if (fshift)
 776     {
 777         int svi;
 778         if (pbc)
 779         {
 780             rvec xvi;
 781             svi = pbc_rvec_sub(pbc, x[av], x[ai], xvi);
 782         }
 783         else
 784         {
 785             svi = CENTRAL;
 786         }
 787
 788         if (svi != CENTRAL || sji != CENTRAL)
 789         {
 790             rvec_dec(fshift[svi], fv);
 791             fshift[CENTRAL][XX] += fv[XX] - fj[XX];
 792             fshift[CENTRAL][YY] += fv[YY] - fj[YY];
 793             fshift[CENTRAL][ZZ] += fv[ZZ] - fj[ZZ];
 794             fshift[sji][XX] += fj[XX];
 795             fshift[sji][YY] += fj[YY];
 796             fshift[sji][ZZ] += fj[ZZ];
 797         }
 798     }
 799
 800     if (VirCorr)
 801     {
 802         /* When VirCorr=TRUE, the virial for the current forces is not
 803          * calculated from the redistributed forces. This means that
 804          * the effect of non-linear virtual site constructions on the virial
 805          * needs to be added separately. This contribution can be calculated
 806          * in many ways, but the simplest and cheapest way is to use
 807          * the first constructing atom ai as a reference position in space:
 808          * subtract (xv-xi)*fv and add (xj-xi)*fj.
 809          */
 810         rvec xiv;
 811
 812         pbc_rvec_sub(pbc, x[av], x[ai], xiv);
 813
 814         for (int i = 0; i < DIM; i++)
 815         {
 816             for (int j = 0; j < DIM; j++)
 817             {
 818                 /* As xix is a linear combination of j and k, use that here */
 819                 dxdf[i][j] += -xiv[i] * fv[j] + xij[i] * fj[j];
 820             }
 821         }
 822     }
 823
 824     /* TOTAL: 38 flops */
 825 }
 826
 827 static void spread_vsite3(const t_iatom ia[], real a, real b, const rvec x[], rvec f[], rvec fshift[], const t_pbc* pbc)
 828 {
 829     rvec fi, fj, fk, dx;
 830     int  av, ai, aj, ak;
 831     int  siv, sij, sik;
 832
 833     av = ia[1];
 834     ai = ia[2];
 835     aj = ia[3];
 836     ak = ia[4];
 837
 838     svmul(1 - a - b, f[av], fi);
 839     svmul(a, f[av], fj);
 840     svmul(b, f[av], fk);
 841     /* 11 flops */
 842
 843     rvec_inc(f[ai], fi);
 844     rvec_inc(f[aj], fj);
 845     rvec_inc(f[ak], fk);
 846     /* 9 Flops */
 847
 848     if (pbc)
 849     {
 850         siv = pbc_dx_aiuc(pbc, x[ai], x[av], dx);
 851         sij = pbc_dx_aiuc(pbc, x[ai], x[aj], dx);
 852         sik = pbc_dx_aiuc(pbc, x[ai], x[ak], dx);
 853     }
 854     else
 855     {
 856         siv = CENTRAL;
 857         sij = CENTRAL;
 858         sik = CENTRAL;
 859     }
 860
 861     if (fshift && (siv != CENTRAL || sij != CENTRAL || sik != CENTRAL))
 862     {
 863         rvec_inc(fshift[siv], f[av]);
 864         rvec_dec(fshift[CENTRAL], fi);
 865         rvec_dec(fshift[sij], fj);
 866         rvec_dec(fshift[sik], fk);
 867     }
 868
 869     /* TOTAL: 20 flops */
 870 }
 871
 872 static void spread_vsite3FD(const t_iatom ia[],
 873                             real          a,
 874                             real          b,
 875                             const rvec    x[],
 876                             rvec          f[],
 877                             rvec          fshift[],
 878                             gmx_bool      VirCorr,
 879                             matrix        dxdf,
 880                             const t_pbc*  pbc)
 881 {
 882     real    fproj, a1;
 883     rvec    xvi, xij, xjk, xix, fv, temp;
 884     t_iatom av, ai, aj, ak;
 885     int     svi, sji, skj;
 886
 887     av = ia[1];
 888     ai = ia[2];
 889     aj = ia[3];
 890     ak = ia[4];
 891     copy_rvec(f[av], fv);
 892
 893     sji = pbc_rvec_sub(pbc, x[aj], x[ai], xij);
 894     skj = pbc_rvec_sub(pbc, x[ak], x[aj], xjk);
 895     /* 6 flops */
 896
 897     /* xix goes from i to point x on the line jk */
 898     xix[XX] = xij[XX] + a * xjk[XX];
 899     xix[YY] = xij[YY] + a * xjk[YY];
 900     xix[ZZ] = xij[ZZ] + a * xjk[ZZ];
 901     /* 6 flops */
 902
 903     const real invDistance = inverseNorm(xix);
 904     const real c           = b * invDistance;
 905     /* 4 + ?10? flops */
 906
 907     fproj = iprod(xix, fv) * invDistance * invDistance; /* = (xix . f)/(xix . xix) */
 908
 909     temp[XX] = c * (fv[XX] - fproj * xix[XX]);
 910     temp[YY] = c * (fv[YY] - fproj * xix[YY]);
 911     temp[ZZ] = c * (fv[ZZ] - fproj * xix[ZZ]);
 912     /* 16 */
 913
 914     /* c is already calculated in constr_vsite3FD
 915        storing c somewhere will save 26 flops!     */
 916
 917     a1 = 1 - a;
 918     f[ai][XX] += fv[XX] - temp[XX];
 919     f[ai][YY] += fv[YY] - temp[YY];
 920     f[ai][ZZ] += fv[ZZ] - temp[ZZ];
 921     f[aj][XX] += a1 * temp[XX];
 922     f[aj][YY] += a1 * temp[YY];
 923     f[aj][ZZ] += a1 * temp[ZZ];
 924     f[ak][XX] += a * temp[XX];
 925     f[ak][YY] += a * temp[YY];
 926     f[ak][ZZ] += a * temp[ZZ];
 927     /* 19 Flops */
 928
 929     if (pbc)
 930     {
 931         svi = pbc_rvec_sub(pbc, x[av], x[ai], xvi);
 932     }
 933     else
 934     {
 935         svi = CENTRAL;
 936     }
 937
 938     if (fshift && (svi != CENTRAL || sji != CENTRAL || skj != CENTRAL))
 939     {
 940         rvec_dec(fshift[svi], fv);
 941         fshift[CENTRAL][XX] += fv[XX] - (1 + a) * temp[XX];
 942         fshift[CENTRAL][YY] += fv[YY] - (1 + a) * temp[YY];
 943         fshift[CENTRAL][ZZ] += fv[ZZ] - (1 + a) * temp[ZZ];
 944         fshift[sji][XX] += temp[XX];
 945         fshift[sji][YY] += temp[YY];
 946         fshift[sji][ZZ] += temp[ZZ];
 947         fshift[skj][XX] += a * temp[XX];
 948         fshift[skj][YY] += a * temp[YY];
 949         fshift[skj][ZZ] += a * temp[ZZ];
 950     }
 951
 952     if (VirCorr)
 953     {
 954         /* When VirCorr=TRUE, the virial for the current forces is not
 955          * calculated from the redistributed forces. This means that
 956          * the effect of non-linear virtual site constructions on the virial
 957          * needs to be added separately. This contribution can be calculated
 958          * in many ways, but the simplest and cheapest way is to use
 959          * the first constructing atom ai as a reference position in space:
 960          * subtract (xv-xi)*fv and add (xj-xi)*fj + (xk-xi)*fk.
 961          */
 962         rvec xiv;
 963
 964         pbc_rvec_sub(pbc, x[av], x[ai], xiv);
 965
 966         for (int i = 0; i < DIM; i++)
 967         {
 968             for (int j = 0; j < DIM; j++)
 969             {
 970                 /* As xix is a linear combination of j and k, use that here */
 971                 dxdf[i][j] += -xiv[i] * fv[j] + xix[i] * temp[j];
 972             }
 973         }
 974     }
 975
 976     /* TOTAL: 61 flops */
 977 }
 978
 979 static void spread_vsite3FAD(const t_iatom ia[],
 980                              real          a,
 981                              real          b,
 982                              const rvec    x[],
 983                              rvec          f[],
 984                              rvec          fshift[],
 985                              gmx_bool      VirCorr,
 986                              matrix        dxdf,
 987                              const t_pbc*  pbc)
 988 {
 989     rvec    xvi, xij, xjk, xperp, Fpij, Fppp, fv, f1, f2, f3;
 990     real    a1, b1, c1, c2, invdij, invdij2, invdp, fproj;
 991     t_iatom av, ai, aj, ak;
 992     int     svi, sji, skj, d;
 993
 994     av = ia[1];
 995     ai = ia[2];
 996     aj = ia[3];
 997     ak = ia[4];
 998     copy_rvec(f[ia[1]], fv);
 999
1000     sji = pbc_rvec_sub(pbc, x[aj], x[ai], xij);
1001     skj = pbc_rvec_sub(pbc, x[ak], x[aj], xjk);
1002     /* 6 flops */
1003
1004     invdij    = inverseNorm(xij);
1005     invdij2   = invdij * invdij;
1006     c1        = iprod(xij, xjk) * invdij2;
1007     xperp[XX] = xjk[XX] - c1 * xij[XX];
1008     xperp[YY] = xjk[YY] - c1 * xij[YY];
1009     xperp[ZZ] = xjk[ZZ] - c1 * xij[ZZ];
1010     /* xperp in plane ijk, perp. to ij */
1011     invdp = inverseNorm(xperp);
1012     a1    = a * invdij;
1013     b1    = b * invdp;
1014     /* 45 flops */
1015
1016     /* a1, b1 and c1 are already calculated in constr_vsite3FAD
1017        storing them somewhere will save 45 flops!     */
1018
1019     fproj = iprod(xij, fv) * invdij2;
1020     svmul(fproj, xij, Fpij);                              /* proj. f on xij */
1021     svmul(iprod(xperp, fv) * invdp * invdp, xperp, Fppp); /* proj. f on xperp */
1022     svmul(b1 * fproj, xperp, f3);
1023     /* 23 flops */
1024
1025     rvec_sub(fv, Fpij, f1); /* f1 = f - Fpij */
1026     rvec_sub(f1, Fppp, f2); /* f2 = f - Fpij - Fppp */
1027     for (d = 0; (d < DIM); d++)
1028     {
1029         f1[d] *= a1;
1030         f2[d] *= b1;
1031     }
1032     /* 12 flops */
1033
1034     c2 = 1 + c1;
1035     f[ai][XX] += fv[XX] - f1[XX] + c1 * f2[XX] + f3[XX];
1036     f[ai][YY] += fv[YY] - f1[YY] + c1 * f2[YY] + f3[YY];
1037     f[ai][ZZ] += fv[ZZ] - f1[ZZ] + c1 * f2[ZZ] + f3[ZZ];
1038     f[aj][XX] += f1[XX] - c2 * f2[XX] - f3[XX];
1039     f[aj][YY] += f1[YY] - c2 * f2[YY] - f3[YY];
1040     f[aj][ZZ] += f1[ZZ] - c2 * f2[ZZ] - f3[ZZ];
1041     f[ak][XX] += f2[XX];
1042     f[ak][YY] += f2[YY];
1043     f[ak][ZZ] += f2[ZZ];
1044     /* 30 Flops */
1045
1046     if (pbc)
1047     {
1048         svi = pbc_rvec_sub(pbc, x[av], x[ai], xvi);
1049     }
1050     else
1051     {
1052         svi = CENTRAL;
1053     }
1054
1055     if (fshift && (svi != CENTRAL || sji != CENTRAL || skj != CENTRAL))
1056     {
1057         rvec_dec(fshift[svi], fv);
1058         fshift[CENTRAL][XX] += fv[XX] - f1[XX] - (1 - c1) * f2[XX] + f3[XX];
1059         fshift[CENTRAL][YY] += fv[YY] - f1[YY] - (1 - c1) * f2[YY] + f3[YY];
1060         fshift[CENTRAL][ZZ] += fv[ZZ] - f1[ZZ] - (1 - c1) * f2[ZZ] + f3[ZZ];
1061         fshift[sji][XX] += f1[XX] - c1 * f2[XX] - f3[XX];
1062         fshift[sji][YY] += f1[YY] - c1 * f2[YY] - f3[YY];
1063         fshift[sji][ZZ] += f1[ZZ] - c1 * f2[ZZ] - f3[ZZ];
1064         fshift[skj][XX] += f2[XX];
1065         fshift[skj][YY] += f2[YY];
1066         fshift[skj][ZZ] += f2[ZZ];
1067     }
1068
1069     if (VirCorr)
1070     {
1071         rvec xiv;
1072         int  i, j;
1073
1074         pbc_rvec_sub(pbc, x[av], x[ai], xiv);
1075
1076         for (i = 0; i < DIM; i++)
1077         {
1078             for (j = 0; j < DIM; j++)
1079             {
1080                 /* Note that xik=xij+xjk, so we have to add xij*f2 */
1081                 dxdf[i][j] += -xiv[i] * fv[j] + xij[i] * (f1[j] + (1 - c2) * f2[j] - f3[j])
1082                               + xjk[i] * f2[j];
1083             }
1084         }
1085     }
1086
1087     /* TOTAL: 113 flops */
1088 }
1089
1090 static void spread_vsite3OUT(const t_iatom ia[],
1091                              real          a,
1092                              real          b,
1093                              real          c,
1094                              const rvec    x[],
1095                              rvec          f[],
1096                              rvec          fshift[],
1097                              gmx_bool      VirCorr,
1098                              matrix        dxdf,
1099                              const t_pbc*  pbc)
1100 {
1101     rvec xvi, xij, xik, fv, fj, fk;
1102     real cfx, cfy, cfz;
1103     int  av, ai, aj, ak;
1104     int  svi, sji, ski;
1105
1106     av = ia[1];
1107     ai = ia[2];
1108     aj = ia[3];
1109     ak = ia[4];
1110
1111     sji = pbc_rvec_sub(pbc, x[aj], x[ai], xij);
1112     ski = pbc_rvec_sub(pbc, x[ak], x[ai], xik);
1113     /* 6 Flops */
1114
1115     copy_rvec(f[av], fv);
1116
1117     cfx = c * fv[XX];
1118     cfy = c * fv[YY];
1119     cfz = c * fv[ZZ];
1120     /* 3 Flops */
1121
1122     fj[XX] = a * fv[XX] - xik[ZZ] * cfy + xik[YY] * cfz;
1123     fj[YY] = xik[ZZ] * cfx + a * fv[YY] - xik[XX] * cfz;
1124     fj[ZZ] = -xik[YY] * cfx + xik[XX] * cfy + a * fv[ZZ];
1125
1126     fk[XX] = b * fv[XX] + xij[ZZ] * cfy - xij[YY] * cfz;
1127     fk[YY] = -xij[ZZ] * cfx + b * fv[YY] + xij[XX] * cfz;
1128     fk[ZZ] = xij[YY] * cfx - xij[XX] * cfy + b * fv[ZZ];
1129     /* 30 Flops */
1130
1131     f[ai][XX] += fv[XX] - fj[XX] - fk[XX];
1132     f[ai][YY] += fv[YY] - fj[YY] - fk[YY];
1133     f[ai][ZZ] += fv[ZZ] - fj[ZZ] - fk[ZZ];
1134     rvec_inc(f[aj], fj);
1135     rvec_inc(f[ak], fk);
1136     /* 15 Flops */
1137
1138     if (pbc)
1139     {
1140         svi = pbc_rvec_sub(pbc, x[av], x[ai], xvi);
1141     }
1142     else
1143     {
1144         svi = CENTRAL;
1145     }
1146
1147     if (fshift && (svi != CENTRAL || sji != CENTRAL || ski != CENTRAL))
1148     {
1149         rvec_dec(fshift[svi], fv);
1150         fshift[CENTRAL][XX] += fv[XX] - fj[XX] - fk[XX];
1151         fshift[CENTRAL][YY] += fv[YY] - fj[YY] - fk[YY];
1152         fshift[CENTRAL][ZZ] += fv[ZZ] - fj[ZZ] - fk[ZZ];
1153         rvec_inc(fshift[sji], fj);
1154         rvec_inc(fshift[ski], fk);
1155     }
1156
1157     if (VirCorr)
1158     {
1159         rvec xiv;
1160
1161         pbc_rvec_sub(pbc, x[av], x[ai], xiv);
1162
1163         for (int i = 0; i < DIM; i++)
1164         {
1165             for (int j = 0; j < DIM; j++)
1166             {
1167                 dxdf[i][j] += -xiv[i] * fv[j] + xij[i] * fj[j] + xik[i] * fk[j];
1168             }
1169         }
1170     }
1171
1172     /* TOTAL: 54 flops */
1173 }
1174
1175 static void spread_vsite4FD(const t_iatom ia[],
1176                             real          a,
1177                             real          b,
1178                             real          c,
1179                             const rvec    x[],
1180                             rvec          f[],
1181                             rvec          fshift[],
1182                             gmx_bool      VirCorr,
1183                             matrix        dxdf,
1184                             const t_pbc*  pbc)
1185 {
1186     real fproj, a1;
1187     rvec xvi, xij, xjk, xjl, xix, fv, temp;
1188     int  av, ai, aj, ak, al;
1189     int  svi, sji, skj, slj, m;
1190
1191     av = ia[1];
1192     ai = ia[2];
1193     aj = ia[3];
1194     ak = ia[4];
1195     al = ia[5];
1196
1197     sji = pbc_rvec_sub(pbc, x[aj], x[ai], xij);
1198     skj = pbc_rvec_sub(pbc, x[ak], x[aj], xjk);
1199     slj = pbc_rvec_sub(pbc, x[al], x[aj], xjl);
1200     /* 9 flops */
1201
1202     /* xix goes from i to point x on the plane jkl */
1203     for (m = 0; m < DIM; m++)
1204     {
1205         xix[m] = xij[m] + a * xjk[m] + b * xjl[m];
1206     }
1207     /* 12 flops */
1208
1209     const real invDistance = inverseNorm(xix);
1210     const real d           = c * invDistance;
1211     /* 4 + ?10? flops */
1212
1213     copy_rvec(f[av], fv);
1214
1215     fproj = iprod(xix, fv) * invDistance * invDistance; /* = (xix . f)/(xix . xix) */
1216
1217     for (m = 0; m < DIM; m++)
1218     {
1219         temp[m] = d * (fv[m] - fproj * xix[m]);
1220     }
1221     /* 16 */
1222
1223     /* c is already calculated in constr_vsite3FD
1224        storing c somewhere will save 35 flops!     */
1225
1226     a1 = 1 - a - b;
1227     for (m = 0; m < DIM; m++)
1228     {
1229         f[ai][m] += fv[m] - temp[m];
1230         f[aj][m] += a1 * temp[m];
1231         f[ak][m] += a * temp[m];
1232         f[al][m] += b * temp[m];
1233     }
1234     /* 26 Flops */
1235
1236     if (pbc)
1237     {
1238         svi = pbc_rvec_sub(pbc, x[av], x[ai], xvi);
1239     }
1240     else
1241     {
1242         svi = CENTRAL;
1243     }
1244
1245     if (fshift && (svi != CENTRAL || sji != CENTRAL || skj != CENTRAL || slj != CENTRAL))
1246     {
1247         rvec_dec(fshift[svi], fv);
1248         for (m = 0; m < DIM; m++)
1249         {
1250             fshift[CENTRAL][m] += fv[m] - (1 + a + b) * temp[m];
1251             fshift[sji][m] += temp[m];
1252             fshift[skj][m] += a * temp[m];
1253             fshift[slj][m] += b * temp[m];
1254         }
1255     }
1256
1257     if (VirCorr)
1258     {
1259         rvec xiv;
1260         int  i, j;
1261
1262         pbc_rvec_sub(pbc, x[av], x[ai], xiv);
1263
1264         for (i = 0; i < DIM; i++)
1265         {
1266             for (j = 0; j < DIM; j++)
1267             {
1268                 dxdf[i][j] += -xiv[i] * fv[j] + xix[i] * temp[j];
1269             }
1270         }
1271     }
1272
1273     /* TOTAL: 77 flops */
1274 }
1275
1276
1277 static void spread_vsite4FDN(const t_iatom ia[],
1278                              real          a,
1279                              real          b,
1280                              real          c,
1281                              const rvec    x[],
1282                              rvec          f[],
1283                              rvec          fshift[],
1284                              gmx_bool      VirCorr,
1285                              matrix        dxdf,
1286                              const t_pbc*  pbc)
1287 {
1288     rvec xvi, xij, xik, xil, ra, rb, rja, rjb, rab, rm, rt;
1289     rvec fv, fj, fk, fl;
1290     real invrm, denom;
1291     real cfx, cfy, cfz;
1292     int  av, ai, aj, ak, al;
1293     int  svi, sij, sik, sil;
1294
1295     /* DEBUG: check atom indices */
1296     av = ia[1];
1297     ai = ia[2];
1298     aj = ia[3];
1299     ak = ia[4];
1300     al = ia[5];
1301
1302     copy_rvec(f[av], fv);
1303
1304     sij = pbc_rvec_sub(pbc, x[aj], x[ai], xij);
1305     sik = pbc_rvec_sub(pbc, x[ak], x[ai], xik);
1306     sil = pbc_rvec_sub(pbc, x[al], x[ai], xil);
1307     /* 9 flops */
1308
1309     ra[XX] = a * xik[XX];
1310     ra[YY] = a * xik[YY];
1311     ra[ZZ] = a * xik[ZZ];
1312
1313     rb[XX] = b * xil[XX];
1314     rb[YY] = b * xil[YY];
1315     rb[ZZ] = b * xil[ZZ];
1316
1317     /* 6 flops */
1318
1319     rvec_sub(ra, xij, rja);
1320     rvec_sub(rb, xij, rjb);
1321     rvec_sub(rb, ra, rab);
1322     /* 9 flops */
1323
1324     cprod(rja, rjb, rm);
1325     /* 9 flops */
1326
1327     invrm = inverseNorm(rm);
1328     denom = invrm * invrm;
1329     /* 5+5+2 flops */
1330
1331     cfx = c * invrm * fv[XX];
1332     cfy = c * invrm * fv[YY];
1333     cfz = c * invrm * fv[ZZ];
1334     /* 6 Flops */
1335
1336     cprod(rm, rab, rt);
1337     /* 9 flops */
1338
1339     rt[XX] *= denom;
1340     rt[YY] *= denom;
1341     rt[ZZ] *= denom;
1342     /* 3flops */
1343
1344     fj[XX] = (-rm[XX] * rt[XX]) * cfx + (rab[ZZ] - rm[YY] * rt[XX]) * cfy
1345              + (-rab[YY] - rm[ZZ] * rt[XX]) * cfz;
1346     fj[YY] = (-rab[ZZ] - rm[XX] * rt[YY]) * cfx + (-rm[YY] * rt[YY]) * cfy
1347              + (rab[XX] - rm[ZZ] * rt[YY]) * cfz;
1348     fj[ZZ] = (rab[YY] - rm[XX] * rt[ZZ]) * cfx + (-rab[XX] - rm[YY] * rt[ZZ]) * cfy
1349              + (-rm[ZZ] * rt[ZZ]) * cfz;
1350     /* 30 flops */
1351
1352     cprod(rjb, rm, rt);
1353     /* 9 flops */
1354
1355     rt[XX] *= denom * a;
1356     rt[YY] *= denom * a;
1357     rt[ZZ] *= denom * a;
1358     /* 3flops */
1359
1360     fk[XX] = (-rm[XX] * rt[XX]) * cfx + (-a * rjb[ZZ] - rm[YY] * rt[XX]) * cfy
1361              + (a * rjb[YY] - rm[ZZ] * rt[XX]) * cfz;
1362     fk[YY] = (a * rjb[ZZ] - rm[XX] * rt[YY]) * cfx + (-rm[YY] * rt[YY]) * cfy
1363              + (-a * rjb[XX] - rm[ZZ] * rt[YY]) * cfz;
1364     fk[ZZ] = (-a * rjb[YY] - rm[XX] * rt[ZZ]) * cfx + (a * rjb[XX] - rm[YY] * rt[ZZ]) * cfy
1365              + (-rm[ZZ] * rt[ZZ]) * cfz;
1366     /* 36 flops */
1367
1368     cprod(rm, rja, rt);
1369     /* 9 flops */
1370
1371     rt[XX] *= denom * b;
1372     rt[YY] *= denom * b;
1373     rt[ZZ] *= denom * b;
1374     /* 3flops */
1375
1376     fl[XX] = (-rm[XX] * rt[XX]) * cfx + (b * rja[ZZ] - rm[YY] * rt[XX]) * cfy
1377              + (-b * rja[YY] - rm[ZZ] * rt[XX]) * cfz;
1378     fl[YY] = (-b * rja[ZZ] - rm[XX] * rt[YY]) * cfx + (-rm[YY] * rt[YY]) * cfy
1379              + (b * rja[XX] - rm[ZZ] * rt[YY]) * cfz;
1380     fl[ZZ] = (b * rja[YY] - rm[XX] * rt[ZZ]) * cfx + (-b * rja[XX] - rm[YY] * rt[ZZ]) * cfy
1381              + (-rm[ZZ] * rt[ZZ]) * cfz;
1382     /* 36 flops */
1383
1384     f[ai][XX] += fv[XX] - fj[XX] - fk[XX] - fl[XX];
1385     f[ai][YY] += fv[YY] - fj[YY] - fk[YY] - fl[YY];
1386     f[ai][ZZ] += fv[ZZ] - fj[ZZ] - fk[ZZ] - fl[ZZ];
1387     rvec_inc(f[aj], fj);
1388     rvec_inc(f[ak], fk);
1389     rvec_inc(f[al], fl);
1390     /* 21 flops */
1391
1392     if (pbc)
1393     {
1394         svi = pbc_rvec_sub(pbc, x[av], x[ai], xvi);
1395     }
1396     else
1397     {
1398         svi = CENTRAL;
1399     }
1400
1401     if (fshift && (svi != CENTRAL || sij != CENTRAL || sik != CENTRAL || sil != CENTRAL))
1402     {
1403         rvec_dec(fshift[svi], fv);
1404         fshift[CENTRAL][XX] += fv[XX] - fj[XX] - fk[XX] - fl[XX];
1405         fshift[CENTRAL][YY] += fv[YY] - fj[YY] - fk[YY] - fl[YY];
1406         fshift[CENTRAL][ZZ] += fv[ZZ] - fj[ZZ] - fk[ZZ] - fl[ZZ];
1407         rvec_inc(fshift[sij], fj);
1408         rvec_inc(fshift[sik], fk);
1409         rvec_inc(fshift[sil], fl);
1410     }
1411
1412     if (VirCorr)
1413     {
1414         rvec xiv;
1415         int  i, j;
1416
1417         pbc_rvec_sub(pbc, x[av], x[ai], xiv);
1418
1419         for (i = 0; i < DIM; i++)
1420         {
1421             for (j = 0; j < DIM; j++)
1422             {
1423                 dxdf[i][j] += -xiv[i] * fv[j] + xij[i] * fj[j] + xik[i] * fk[j] + xil[i] * fl[j];
1424             }
1425         }
1426     }
1427
1428     /* Total: 207 flops (Yuck!) */
1429 }
1430
1431
1432 static int spread_vsiten(const t_iatom             ia[],
1433                          ArrayRef<const t_iparams> ip,
1434                          const rvec                x[],
1435                          rvec                      f[],
1436                          rvec                      fshift[],
1437                          const t_pbc*              pbc)
1438 {
1439     rvec xv, dx, fi;
1440     int  n3, av, i, ai;
1441     real a;
1442     int  siv;
1443
1444     n3 = 3 * ip[ia[0]].vsiten.n;
1445     av = ia[1];
1446     copy_rvec(x[av], xv);
1447
1448     for (i = 0; i < n3; i += 3)
1449     {
1450         ai = ia[i + 2];
1451         if (pbc)
1452         {
1453             siv = pbc_dx_aiuc(pbc, x[ai], xv, dx);
1454         }
1455         else
1456         {
1457             siv = CENTRAL;
1458         }
1459         a = ip[ia[i]].vsiten.a;
1460         svmul(a, f[av], fi);
1461         rvec_inc(f[ai], fi);
1462         if (fshift && siv != CENTRAL)
1463         {
1464             rvec_inc(fshift[siv], fi);
1465             rvec_dec(fshift[CENTRAL], fi);
1466         }
1467         /* 6 Flops */
1468     }
1469
1470     return n3;
1471 }
1472
1473
1474 static int vsite_count(ArrayRef<const InteractionList> ilist, int ftype)
1475 {
1476     if (ftype == F_VSITEN)
1477     {
1478         return ilist[ftype].size() / 3;
1479     }
1480     else
1481     {
1482         return ilist[ftype].size() / (1 + interaction_function[ftype].nratoms);
1483     }
1484 }
1485
1486 static void spread_vsite_f_thread(const rvec                      x[],
1487                                   rvec                            f[],
1488                                   rvec*                           fshift,
1489                                   gmx_bool                        VirCorr,
1490                                   matrix                          dxdf,
1491                                   ArrayRef<const t_iparams>       ip,
1492                                   ArrayRef<const InteractionList> ilist,
1493                                   const t_pbc*                    pbc_null)
1494 {
1495     const PbcMode pbcMode = getPbcMode(pbc_null);
1496     /* We need another pbc pointer, as with charge groups we switch per vsite */
1497     const t_pbc*             pbc_null2 = pbc_null;
1498     gmx::ArrayRef<const int> vsite_pbc;
1499
1500     /* this loop goes backwards to be able to build *
1501      * higher type vsites from lower types         */
1502     for (int ftype = c_ftypeVsiteEnd - 1; ftype >= c_ftypeVsiteStart; ftype--)
1503     {
1504         if (ilist[ftype].empty())
1505         {
1506             continue;
1507         }
1508
1509         { // TODO remove me
1510             int nra = interaction_function[ftype].nratoms;
1511             int inc = 1 + nra;
1512             int nr  = ilist[ftype].size();
1513
1514             const t_iatom* ia = ilist[ftype].iatoms.data();
1515
1516             if (pbcMode == PbcMode::all)
1517             {
1518                 pbc_null2 = pbc_null;
1519             }
1520
1521             for (int i = 0; i < nr;)
1522             {
1523                 int tp = ia[0];
1524
1525                 /* Constants for constructing */
1526                 real a1, b1, c1;
1527                 a1 = ip[tp].vsite.a;
1528                 /* Construct the vsite depending on type */
1529                 switch (ftype)
1530                 {
1531                     case F_VSITE2: spread_vsite2(ia, a1, x, f, fshift, pbc_null2); break;
1532                     case F_VSITE2FD:
1533                         spread_vsite2FD(ia, a1, x, f, fshift, VirCorr, dxdf, pbc_null2);
1534                         break;
1535                     case F_VSITE3:
1536                         b1 = ip[tp].vsite.b;
1537                         spread_vsite3(ia, a1, b1, x, f, fshift, pbc_null2);
1538                         break;
1539                     case F_VSITE3FD:
1540                         b1 = ip[tp].vsite.b;
1541                         spread_vsite3FD(ia, a1, b1, x, f, fshift, VirCorr, dxdf, pbc_null2);
1542                         break;
1543                     case F_VSITE3FAD:
1544                         b1 = ip[tp].vsite.b;
1545                         spread_vsite3FAD(ia, a1, b1, x, f, fshift, VirCorr, dxdf, pbc_null2);
1546                         break;
1547                     case F_VSITE3OUT:
1548                         b1 = ip[tp].vsite.b;
1549                         c1 = ip[tp].vsite.c;
1550                         spread_vsite3OUT(ia, a1, b1, c1, x, f, fshift, VirCorr, dxdf, pbc_null2);
1551                         break;
1552                     case F_VSITE4FD:
1553                         b1 = ip[tp].vsite.b;
1554                         c1 = ip[tp].vsite.c;
1555                         spread_vsite4FD(ia, a1, b1, c1, x, f, fshift, VirCorr, dxdf, pbc_null2);
1556                         break;
1557                     case F_VSITE4FDN:
1558                         b1 = ip[tp].vsite.b;
1559                         c1 = ip[tp].vsite.c;
1560                         spread_vsite4FDN(ia, a1, b1, c1, x, f, fshift, VirCorr, dxdf, pbc_null2);
1561                         break;
1562                     case F_VSITEN: inc = spread_vsiten(ia, ip, x, f, fshift, pbc_null2); break;
1563                     default:
1564                         gmx_fatal(FARGS, "No such vsite type %d in %s, line %d", ftype, __FILE__, __LINE__);
1565                 }
1566                 clear_rvec(f[ia[1]]);
1567
1568                 /* Increment loop variables */
1569                 i += inc;
1570                 ia += inc;
1571             }
1572         }
1573     }
1574 }
1575
1576 /*! \brief Clears the task force buffer elements that are written by task idTask */
1577 static void clearTaskForceBufferUsedElements(InterdependentTask* idTask)
1578 {
1579     int ntask = idTask->spreadTask.size();
1580     for (int ti = 0; ti < ntask; ti++)
1581     {
1582         const AtomIndex* atomList = &idTask->atomIndex[idTask->spreadTask[ti]];
1583         int              natom    = atomList->atom.size();
1584         RVec*            force    = idTask->force.data();
1585         for (int i = 0; i < natom; i++)
1586         {
1587             clear_rvec(force[atomList->atom[i]]);
1588         }
1589     }
1590 }
1591
1592 void spread_vsite_f(const gmx_vsite_t* vsite,
1593                     const rvec* gmx_restrict x,
1594                     rvec* gmx_restrict f,
1595                     rvec* gmx_restrict            fshift,
1596                     gmx_bool                      VirCorr,
1597                     matrix                        vir,
1598                     t_nrnb*                       nrnb,
1599                     const InteractionDefinitions& idef,
1600                     PbcType                       pbcType,
1601                     gmx_bool                      bMolPBC,
1602                     const matrix                  box,
1603                     const t_commrec*              cr,
1604                     gmx_wallcycle*                wcycle)
1605 {
1606     wallcycle_start(wcycle, ewcVSITESPREAD);
1607     const bool useDomdec = vsite->useDomdec;
1608     GMX_ASSERT(!useDomdec || (cr != nullptr && DOMAINDECOMP(cr)),
1609                "When vsites are set up with domain decomposition, we need a valid commrec");
1610
1611     t_pbc pbc, *pbc_null;
1612
1613     /* We only need to do pbc when we have inter-cg vsites */
1614     if ((useDomdec || bMolPBC) && vsite->numInterUpdategroupVsites)
1615     {
1616         /* This is wasting some CPU time as we now do this multiple times
1617          * per MD step.
1618          */
1619         pbc_null = set_pbc_dd(&pbc, pbcType, useDomdec ? cr->dd->numCells : nullptr, FALSE, box);
1620     }
1621     else
1622     {
1623         pbc_null = nullptr;
1624     }
1625
1626     if (useDomdec)
1627     {
1628         dd_clear_f_vsites(cr->dd, f);
1629     }
1630
1631     if (vsite->nthreads == 1)
1632     {
1633         matrix dxdf;
1634         if (VirCorr)
1635         {
1636             clear_mat(dxdf);
1637         }
1638         spread_vsite_f_thread(x, f, fshift, VirCorr, dxdf, idef.iparams, idef.il, pbc_null);
1639
1640         if (VirCorr)
1641         {
1642             for (int i = 0; i < DIM; i++)
1643             {
1644                 for (int j = 0; j < DIM; j++)
1645                 {
1646                     vir[i][j] += -0.5 * dxdf[i][j];
1647                 }
1648             }
1649         }
1650     }
1651     else
1652     {
1653         /* First spread the vsites that might depend on non-local vsites */
1654         if (VirCorr)
1655         {
1656             clear_mat(vsite->tData[vsite->nthreads]->dxdf);
1657         }
1658         spread_vsite_f_thread(x, f, fshift, VirCorr, vsite->tData[vsite->nthreads]->dxdf,
1659                               idef.iparams, vsite->tData[vsite->nthreads]->ilist, pbc_null);
1660
1661 #pragma omp parallel num_threads(vsite->nthreads)
1662         {
1663             try
1664             {
1665                 int          thread = gmx_omp_get_thread_num();
1666                 VsiteThread& tData  = *vsite->tData[thread];
1667
1668                 rvec* fshift_t;
1669                 if (thread == 0 || fshift == nullptr)
1670                 {
1671                     fshift_t = fshift;
1672                 }
1673                 else
1674                 {
1675                     fshift_t = tData.fshift;
1676
1677                     for (int i = 0; i < SHIFTS; i++)
1678                     {
1679                         clear_rvec(fshift_t[i]);
1680                     }
1681                 }
1682                 if (VirCorr)
1683                 {
1684                     clear_mat(tData.dxdf);
1685                 }
1686
1687                 if (tData.useInterdependentTask)
1688                 {
1689                     /* Spread the vsites that spread outside our local range.
1690                      * This is done using a thread-local force buffer force.
1691                      * First we need to copy the input vsite forces to force.
1692                      */
1693                     InterdependentTask* idTask = &tData.idTask;
1694
1695                     /* Clear the buffer elements set by our task during
1696                      * the last call to spread_vsite_f.
1697                      */
1698                     clearTaskForceBufferUsedElements(idTask);
1699
1700                     int nvsite = idTask->vsite.size();
1701                     for (int i = 0; i < nvsite; i++)
1702                     {
1703                         copy_rvec(f[idTask->vsite[i]], idTask->force[idTask->vsite[i]]);
1704                     }
1705                     spread_vsite_f_thread(x, as_rvec_array(idTask->force.data()), fshift_t, VirCorr,
1706                                           tData.dxdf, idef.iparams, tData.idTask.ilist, pbc_null);
1707
1708                     /* We need a barrier before reducing forces below
1709                      * that have been produced by a different thread above.
1710                      */
1711 #pragma omp barrier
1712
1713                     /* Loop over all thread task and reduce forces they
1714                      * produced on atoms that fall in our range.
1715                      * Note that atomic reduction would be a simpler solution,
1716                      * but that might not have good support on all platforms.
1717                      */
1718                     int ntask = idTask->reduceTask.size();
1719                     for (int ti = 0; ti < ntask; ti++)
1720                     {
1721                         const InterdependentTask* idt_foreign =
1722                                 &vsite->tData[idTask->reduceTask[ti]]->idTask;
1723                         const AtomIndex* atomList  = &idt_foreign->atomIndex[thread];
1724                         const RVec*      f_foreign = idt_foreign->force.data();
1725
1726                         int natom = atomList->atom.size();
1727                         for (int i = 0; i < natom; i++)
1728                         {
1729                             int ind = atomList->atom[i];
1730                             rvec_inc(f[ind], f_foreign[ind]);
1731                             /* Clearing of f_foreign is done at the next step */
1732                         }
1733                     }
1734                     /* Clear the vsite forces, both in f and force */
1735                     for (int i = 0; i < nvsite; i++)
1736                     {
1737                         int ind = tData.idTask.vsite[i];
1738                         clear_rvec(f[ind]);
1739                         clear_rvec(tData.idTask.force[ind]);
1740                     }
1741                 }
1742
1743                 /* Spread the vsites that spread locally only */
1744                 spread_vsite_f_thread(x, f, fshift_t, VirCorr, tData.dxdf, idef.iparams,
1745                                       tData.ilist, pbc_null);
1746             }
1747             GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
1748         }
1749
1750         if (fshift != nullptr)
1751         {
1752             for (int th = 1; th < vsite->nthreads; th++)
1753             {
1754                 for (int i = 0; i < SHIFTS; i++)
1755                 {
1756                     rvec_inc(fshift[i], vsite->tData[th]->fshift[i]);
1757                 }
1758             }
1759         }
1760
1761         if (VirCorr)
1762         {
1763             for (int th = 0; th < vsite->nthreads + 1; th++)
1764             {
1765                 /* MSVC doesn't like matrix references, so we use a pointer */
1766                 const matrix* dxdf = &vsite->tData[th]->dxdf;
1767
1768                 for (int i = 0; i < DIM; i++)
1769                 {
1770                     for (int j = 0; j < DIM; j++)
1771                     {
1772                         vir[i][j] += -0.5 * (*dxdf)[i][j];
1773                     }
1774                 }
1775             }
1776         }
1777     }
1778
1779     if (useDomdec)
1780     {
1781         dd_move_f_vsites(cr->dd, f, fshift);
1782     }
1783
1784     inc_nrnb(nrnb, eNR_VSITE2, vsite_count(idef.il, F_VSITE2));
1785     inc_nrnb(nrnb, eNR_VSITE2FD, vsite_count(idef.il, F_VSITE2FD));
1786     inc_nrnb(nrnb, eNR_VSITE3, vsite_count(idef.il, F_VSITE3));
1787     inc_nrnb(nrnb, eNR_VSITE3FD, vsite_count(idef.il, F_VSITE3FD));
1788     inc_nrnb(nrnb, eNR_VSITE3FAD, vsite_count(idef.il, F_VSITE3FAD));
1789     inc_nrnb(nrnb, eNR_VSITE3OUT, vsite_count(idef.il, F_VSITE3OUT));
1790     inc_nrnb(nrnb, eNR_VSITE4FD, vsite_count(idef.il, F_VSITE4FD));
1791     inc_nrnb(nrnb, eNR_VSITE4FDN, vsite_count(idef.il, F_VSITE4FDN));
1792     inc_nrnb(nrnb, eNR_VSITEN, vsite_count(idef.il, F_VSITEN));
1793
1794     wallcycle_stop(wcycle, ewcVSITESPREAD);
1795 }
1796
1797 /*! \brief Returns the an array with group indices for each atom
1798  *
1799  * \param[in] grouping  The paritioning of the atom range into atom groups
1800  */
1801 static std::vector<int> makeAtomToGroupMapping(const gmx::RangePartitioning& grouping)
1802 {
1803     std::vector<int> atomToGroup(grouping.fullRange().end(), 0);
1804
1805     for (int group = 0; group < grouping.numBlocks(); group++)
1806     {
1807         auto block = grouping.block(group);
1808         std::fill(atomToGroup.begin() + block.begin(), atomToGroup.begin() + block.end(), group);
1809     }
1810
1811     return atomToGroup;
1812 }
1813
1814 int countNonlinearVsites(const gmx_mtop_t& mtop)
1815 {
1816     int numNonlinearVsites = 0;
1817     for (const gmx_molblock_t& molb : mtop.molblock)
1818     {
1819         const gmx_moltype_t& molt = mtop.moltype[molb.type];
1820
1821         for (const auto& ilist : extractILists(molt.ilist, IF_VSITE))
1822         {
1823             if (ilist.functionType != F_VSITE2 && ilist.functionType != F_VSITE3
1824                 && ilist.functionType != F_VSITEN)
1825             {
1826                 numNonlinearVsites += molb.nmol * ilist.iatoms.size() / (1 + NRAL(ilist.functionType));
1827             }
1828         }
1829     }
1830
1831     return numNonlinearVsites;
1832 }
1833
1834 int countInterUpdategroupVsites(const gmx_mtop_t&                           mtop,
1835                                 gmx::ArrayRef<const gmx::RangePartitioning> updateGroupingPerMoleculetype)
1836 {
1837     int n_intercg_vsite = 0;
1838     for (const gmx_molblock_t& molb : mtop.molblock)
1839     {
1840         const gmx_moltype_t& molt = mtop.moltype[molb.type];
1841
1842         std::vector<int> atomToGroup;
1843         if (!updateGroupingPerMoleculetype.empty())
1844         {
1845             atomToGroup = makeAtomToGroupMapping(updateGroupingPerMoleculetype[molb.type]);
1846         }
1847         for (int ftype = c_ftypeVsiteStart; ftype < c_ftypeVsiteEnd; ftype++)
1848         {
1849             const int              nral = NRAL(ftype);
1850             const InteractionList& il   = molt.ilist[ftype];
1851             for (int i = 0; i < il.size(); i += 1 + nral)
1852             {
1853                 bool isInterGroup = atomToGroup.empty();
1854                 if (!isInterGroup)
1855                 {
1856                     const int group = atomToGroup[il.iatoms[1 + i]];
1857                     for (int a = 1; a < nral; a++)
1858                     {
1859                         if (atomToGroup[il.iatoms[1 + a]] != group)
1860                         {
1861                             isInterGroup = true;
1862                             break;
1863                         }
1864                     }
1865                 }
1866                 if (isInterGroup)
1867                 {
1868                     n_intercg_vsite += molb.nmol;
1869                 }
1870             }
1871         }
1872     }
1873
1874     return n_intercg_vsite;
1875 }
1876
1877 std::unique_ptr<gmx_vsite_t> initVsite(const gmx_mtop_t& mtop, const t_commrec* cr)
1878 {
1879     GMX_RELEASE_ASSERT(cr != nullptr, "We need a valid commrec");
1880
1881     std::unique_ptr<gmx_vsite_t> vsite;
1882
1883     /* check if there are vsites */
1884     int nvsite = 0;
1885     for (int ftype = 0; ftype < F_NRE; ftype++)
1886     {
1887         if (interaction_function[ftype].flags & IF_VSITE)
1888         {
1889             GMX_ASSERT(ftype >= c_ftypeVsiteStart && ftype < c_ftypeVsiteEnd,
1890                        "c_ftypeVsiteStart and/or c_ftypeVsiteEnd do not have correct values");
1891
1892             nvsite += gmx_mtop_ftype_count(&mtop, ftype);
1893         }
1894         else
1895         {
1896             GMX_ASSERT(ftype < c_ftypeVsiteStart || ftype >= c_ftypeVsiteEnd,
1897                        "c_ftypeVsiteStart and/or c_ftypeVsiteEnd do not have correct values");
1898         }
1899     }
1900
1901     if (nvsite == 0)
1902     {
1903         return vsite;
1904     }
1905
1906     vsite = std::make_unique<gmx_vsite_t>();
1907
1908     gmx::ArrayRef<const gmx::RangePartitioning> updateGroupingPerMoleculetype;
1909     if (DOMAINDECOMP(cr))
1910     {
1911         updateGroupingPerMoleculetype = getUpdateGroupingPerMoleculetype(*cr->dd);
1912     }
1913     vsite->numInterUpdategroupVsites = countInterUpdategroupVsites(mtop, updateGroupingPerMoleculetype);
1914
1915     vsite->useDomdec = (DOMAINDECOMP(cr) && cr->dd->nnodes > 1);
1916
1917     vsite->nthreads = gmx_omp_nthreads_get(emntVSITE);
1918
1919     if (vsite->nthreads > 1)
1920     {
1921         /* We need one extra thread data structure for the overlap vsites */
1922         vsite->tData.resize(vsite->nthreads + 1);
1923 #pragma omp parallel for num_threads(vsite->nthreads) schedule(static)
1924         for (int thread = 0; thread < vsite->nthreads; thread++)
1925         {
1926             try
1927             {
1928                 vsite->tData[thread] = std::make_unique<VsiteThread>();
1929
1930                 InterdependentTask& idTask = vsite->tData[thread]->idTask;
1931                 idTask.nuse                = 0;
1932                 idTask.atomIndex.resize(vsite->nthreads);
1933             }
1934             GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
1935         }
1936         if (vsite->nthreads > 1)
1937         {
1938             vsite->tData[vsite->nthreads] = std::make_unique<VsiteThread>();
1939         }
1940     }
1941
1942     return vsite;
1943 }
1944
1945 gmx_vsite_t::gmx_vsite_t() {}
1946
1947 gmx_vsite_t::~gmx_vsite_t() {}
1948
1949 static inline void flagAtom(InterdependentTask* idTask, int atom, int thread, int nthread, int natperthread)
1950 {
1951     if (!idTask->use[atom])
1952     {
1953         idTask->use[atom] = true;
1954         thread            = atom / natperthread;
1955         /* Assign all non-local atom force writes to thread 0 */
1956         if (thread >= nthread)
1957         {
1958             thread = 0;
1959         }
1960         idTask->atomIndex[thread].atom.push_back(atom);
1961     }
1962 }
1963
1964 /*\brief Here we try to assign all vsites that are in our local range.
1965  *
1966  * Our task local atom range is tData->rangeStart - tData->rangeEnd.
1967  * Vsites that depend only on local atoms, as indicated by taskIndex[]==thread,
1968  * are assigned to task tData->ilist. Vsites that depend on non-local atoms
1969  * but not on other vsites are assigned to task tData->id_task.ilist.
1970  * taskIndex[] is set for all vsites in our range, either to our local tasks
1971  * or to the single last task as taskIndex[]=2*nthreads.
1972  */
1973 static void assignVsitesToThread(VsiteThread*                    tData,
1974                                  int                             thread,
1975                                  int                             nthread,
1976                                  int                             natperthread,
1977                                  gmx::ArrayRef<int>              taskIndex,
1978                                  ArrayRef<const InteractionList> ilist,
1979                                  ArrayRef<const t_iparams>       ip,
1980                                  const unsigned short*           ptype)
1981 {
1982     for (int ftype = c_ftypeVsiteStart; ftype < c_ftypeVsiteEnd; ftype++)
1983     {
1984         tData->ilist[ftype].clear();
1985         tData->idTask.ilist[ftype].clear();
1986
1987         int        nral1 = 1 + NRAL(ftype);
1988         int        inc   = nral1;
1989         const int* iat   = ilist[ftype].iatoms.data();
1990         for (int i = 0; i < ilist[ftype].size();)
1991         {
1992             if (ftype == F_VSITEN)
1993             {
1994                 /* The 3 below is from 1+NRAL(ftype)=3 */
1995                 inc = ip[iat[i]].vsiten.n * 3;
1996             }
1997
1998             if (iat[1 + i] < tData->rangeStart || iat[1 + i] >= tData->rangeEnd)
1999             {
2000                 /* This vsite belongs to a different thread */
2001                 i += inc;
2002                 continue;
2003             }
2004
2005             /* We would like to assign this vsite to task thread,
2006              * but it might depend on atoms outside the atom range of thread
2007              * or on another vsite not assigned to task thread.
2008              */
2009             int task = thread;
2010             if (ftype != F_VSITEN)
2011             {
2012                 for (int j = i + 2; j < i + nral1; j++)
2013                 {
2014                     /* Do a range check to avoid a harmless race on taskIndex */
2015                     if (iat[j] < tData->rangeStart || iat[j] >= tData->rangeEnd || taskIndex[iat[j]] != thread)
2016                     {
2017                         if (!tData->useInterdependentTask || ptype[iat[j]] == eptVSite)
2018                         {
2019                             /* At least one constructing atom is a vsite
2020                              * that is not assigned to the same thread.
2021                              * Put this vsite into a separate task.
2022                              */
2023                             task = 2 * nthread;
2024                             break;
2025                         }
2026
2027                         /* There are constructing atoms outside our range,
2028                          * put this vsite into a second task to be executed
2029                          * on the same thread. During construction no barrier
2030                          * is needed between the two tasks on the same thread.
2031                          * During spreading we need to run this task with
2032                          * an additional thread-local intermediate force buffer
2033                          * (or atomic reduction) and a barrier between the two
2034                          * tasks.
2035                          */
2036                         task = nthread + thread;
2037                     }
2038                 }
2039             }
2040             else
2041             {
2042                 for (int j = i + 2; j < i + inc; j += 3)
2043                 {
2044                     /* Do a range check to avoid a harmless race on taskIndex */
2045                     if (iat[j] < tData->rangeStart || iat[j] >= tData->rangeEnd || taskIndex[iat[j]] != thread)
2046                     {
2047                         GMX_ASSERT(ptype[iat[j]] != eptVSite,
2048                                    "A vsite to be assigned in assignVsitesToThread has a vsite as "
2049                                    "a constructing atom that does not belong to our task, such "
2050                                    "vsites should be assigned to the single 'master' task");
2051
2052                         task = nthread + thread;
2053                     }
2054                 }
2055             }
2056
2057             /* Update this vsite's thread index entry */
2058             taskIndex[iat[1 + i]] = task;
2059
2060             if (task == thread || task == nthread + thread)
2061             {
2062                 /* Copy this vsite to the thread data struct of thread */
2063                 InteractionList* il_task;
2064                 if (task == thread)
2065                 {
2066                     il_task = &tData->ilist[ftype];
2067                 }
2068                 else
2069                 {
2070                     il_task = &tData->idTask.ilist[ftype];
2071                 }
2072                 /* Copy the vsite data to the thread-task local array */
2073                 il_task->push_back(iat[i], nral1 - 1, iat + i + 1);
2074                 if (task == nthread + thread)
2075                 {
2076                     /* This vsite write outside our own task force block.
2077                      * Put it into the interdependent task list and flag
2078                      * the atoms involved for reduction.
2079                      */
2080                     tData->idTask.vsite.push_back(iat[i + 1]);
2081                     if (ftype != F_VSITEN)
2082                     {
2083                         for (int j = i + 2; j < i + nral1; j++)
2084                         {
2085                             flagAtom(&tData->idTask, iat[j], thread, nthread, natperthread);
2086                         }
2087                     }
2088                     else
2089                     {
2090                         for (int j = i + 2; j < i + inc; j += 3)
2091                         {
2092                             flagAtom(&tData->idTask, iat[j], thread, nthread, natperthread);
2093                         }
2094                     }
2095                 }
2096             }
2097
2098             i += inc;
2099         }
2100     }
2101 }
2102
2103 /*! \brief Assign all vsites with taskIndex[]==task to task tData */
2104 static void assignVsitesToSingleTask(VsiteThread*                    tData,
2105                                      int                             task,
2106                                      gmx::ArrayRef<const int>        taskIndex,
2107                                      ArrayRef<const InteractionList> ilist,
2108                                      ArrayRef<const t_iparams>       ip)
2109 {
2110     for (int ftype = c_ftypeVsiteStart; ftype < c_ftypeVsiteEnd; ftype++)
2111     {
2112         tData->ilist[ftype].clear();
2113         tData->idTask.ilist[ftype].clear();
2114
2115         int              nral1   = 1 + NRAL(ftype);
2116         int              inc     = nral1;
2117         const int*       iat     = ilist[ftype].iatoms.data();
2118         InteractionList* il_task = &tData->ilist[ftype];
2119
2120         for (int i = 0; i < ilist[ftype].size();)
2121         {
2122             if (ftype == F_VSITEN)
2123             {
2124                 /* The 3 below is from 1+NRAL(ftype)=3 */
2125                 inc = ip[iat[i]].vsiten.n * 3;
2126             }
2127             /* Check if the vsite is assigned to our task */
2128             if (taskIndex[iat[1 + i]] == task)
2129             {
2130                 /* Copy the vsite data to the thread-task local array */
2131                 il_task->push_back(iat[i], inc - 1, iat + i + 1);
2132             }
2133
2134             i += inc;
2135         }
2136     }
2137 }
2138
2139 void split_vsites_over_threads(ArrayRef<const InteractionList> ilist,
2140                                ArrayRef<const t_iparams>       ip,
2141                                const t_mdatoms*                mdatoms,
2142                                gmx_vsite_t*                    vsite)
2143 {
2144     int vsite_atom_range, natperthread;
2145
2146     if (vsite->nthreads == 1)
2147     {
2148         /* Nothing to do */
2149         return;
2150     }
2151
2152     /* The current way of distributing the vsites over threads in primitive.
2153      * We divide the atom range 0 - natoms_in_vsite uniformly over threads,
2154      * without taking into account how the vsites are distributed.
2155      * Without domain decomposition we at least tighten the upper bound
2156      * of the range (useful for common systems such as a vsite-protein
2157      * in 3-site water).
2158      * With domain decomposition, as long as the vsites are distributed
2159      * uniformly in each domain along the major dimension, usually x,
2160      * it will also perform well.
2161      */
2162     if (!vsite->useDomdec)
2163     {
2164         vsite_atom_range = -1;
2165         for (int ftype = c_ftypeVsiteStart; ftype < c_ftypeVsiteEnd; ftype++)
2166         {
2167             { // TODO remove me
2168                 if (ftype != F_VSITEN)
2169                 {
2170                     int                 nral1 = 1 + NRAL(ftype);
2171                     ArrayRef<const int> iat   = ilist[ftype].iatoms;
2172                     for (int i = 0; i < ilist[ftype].size(); i += nral1)
2173                     {
2174                         for (int j = i + 1; j < i + nral1; j++)
2175                         {
2176                             vsite_atom_range = std::max(vsite_atom_range, iat[j]);
2177                         }
2178                     }
2179                 }
2180                 else
2181                 {
2182                     int vs_ind_end;
2183
2184                     ArrayRef<const int> iat = ilist[ftype].iatoms;
2185
2186                     int i = 0;
2187                     while (i < ilist[ftype].size())
2188                     {
2189                         /* The 3 below is from 1+NRAL(ftype)=3 */
2190                         vs_ind_end = i + ip[iat[i]].vsiten.n * 3;
2191
2192                         vsite_atom_range = std::max(vsite_atom_range, iat[i + 1]);
2193                         while (i < vs_ind_end)
2194                         {
2195                             vsite_atom_range = std::max(vsite_atom_range, iat[i + 2]);
2196                             i += 3;
2197                         }
2198                     }
2199                 }
2200             }
2201         }
2202         vsite_atom_range++;
2203         natperthread = (vsite_atom_range + vsite->nthreads - 1) / vsite->nthreads;
2204     }
2205     else
2206     {
2207         /* Any local or not local atom could be involved in virtual sites.
2208          * But since we usually have very few non-local virtual sites
2209          * (only non-local vsites that depend on local vsites),
2210          * we distribute the local atom range equally over the threads.
2211          * When assigning vsites to threads, we should take care that the last
2212          * threads also covers the non-local range.
2213          */
2214         vsite_atom_range = mdatoms->nr;
2215         natperthread     = (mdatoms->homenr + vsite->nthreads - 1) / vsite->nthreads;
2216     }
2217
2218     if (debug)
2219     {
2220         fprintf(debug, "virtual site thread dist: natoms %d, range %d, natperthread %d\n",
2221                 mdatoms->nr, vsite_atom_range, natperthread);
2222     }
2223
2224     /* To simplify the vsite assignment, we make an index which tells us
2225      * to which task particles, both non-vsites and vsites, are assigned.
2226      */
2227     vsite->taskIndex.resize(mdatoms->nr);
2228
2229     /* Initialize the task index array. Here we assign the non-vsite
2230      * particles to task=thread, so we easily figure out if vsites
2231      * depend on local and/or non-local particles in assignVsitesToThread.
2232      */
2233     gmx::ArrayRef<int> taskIndex = vsite->taskIndex;
2234     {
2235         int thread = 0;
2236         for (int i = 0; i < mdatoms->nr; i++)
2237         {
2238             if (mdatoms->ptype[i] == eptVSite)
2239             {
2240                 /* vsites are not assigned to a task yet */
2241                 taskIndex[i] = -1;
2242             }
2243             else
2244             {
2245                 /* assign non-vsite particles to task thread */
2246                 taskIndex[i] = thread;
2247             }
2248             if (i == (thread + 1) * natperthread && thread < vsite->nthreads)
2249             {
2250                 thread++;
2251             }
2252         }
2253     }
2254
2255 #pragma omp parallel num_threads(vsite->nthreads)
2256     {
2257         try
2258         {
2259             int          thread = gmx_omp_get_thread_num();
2260             VsiteThread& tData  = *vsite->tData[thread];
2261
2262             /* Clear the buffer use flags that were set before */
2263             if (tData.useInterdependentTask)
2264             {
2265                 InterdependentTask& idTask = tData.idTask;
2266
2267                 /* To avoid an extra OpenMP barrier in spread_vsite_f,
2268                  * we clear the force buffer at the next step,
2269                  * so we need to do it here as well.
2270                  */
2271                 clearTaskForceBufferUsedElements(&idTask);
2272
2273                 idTask.vsite.resize(0);
2274                 for (int t = 0; t < vsite->nthreads; t++)
2275                 {
2276                     AtomIndex& atomIndex = idTask.atomIndex[t];
2277                     int        natom     = atomIndex.atom.size();
2278                     for (int i = 0; i < natom; i++)
2279                     {
2280                         idTask.use[atomIndex.atom[i]] = false;
2281                     }
2282                     atomIndex.atom.resize(0);
2283                 }
2284                 idTask.nuse = 0;
2285             }
2286
2287             /* To avoid large f_buf allocations of #threads*vsite_atom_range
2288              * we don't use task2 with more than 200000 atoms. This doesn't
2289              * affect performance, since with such a large range relatively few
2290              * vsites will end up in the separate task.
2291              * Note that useTask2 should be the same for all threads.
2292              */
2293             tData.useInterdependentTask = (vsite_atom_range <= 200000);
2294             if (tData.useInterdependentTask)
2295             {
2296                 size_t              natoms_use_in_vsites = vsite_atom_range;
2297                 InterdependentTask& idTask               = tData.idTask;
2298                 /* To avoid resizing and re-clearing every nstlist steps,
2299                  * we never down size the force buffer.
2300                  */
2301                 if (natoms_use_in_vsites > idTask.force.size() || natoms_use_in_vsites > idTask.use.size())
2302                 {
2303                     idTask.force.resize(natoms_use_in_vsites, { 0, 0, 0 });
2304                     idTask.use.resize(natoms_use_in_vsites, false);
2305                 }
2306             }
2307
2308             /* Assign all vsites that can execute independently on threads */
2309             tData.rangeStart = thread * natperthread;
2310             if (thread < vsite->nthreads - 1)
2311             {
2312                 tData.rangeEnd = (thread + 1) * natperthread;
2313             }
2314             else
2315             {
2316                 /* The last thread should cover up to the end of the range */
2317                 tData.rangeEnd = mdatoms->nr;
2318             }
2319             assignVsitesToThread(&tData, thread, vsite->nthreads, natperthread, taskIndex, ilist,
2320                                  ip, mdatoms->ptype);
2321
2322             if (tData.useInterdependentTask)
2323             {
2324                 /* In the worst case, all tasks write to force ranges of
2325                  * all other tasks, leading to #tasks^2 scaling (this is only
2326                  * the overhead, the actual flops remain constant).
2327                  * But in most cases there is far less coupling. To improve
2328                  * scaling at high thread counts we therefore construct
2329                  * an index to only loop over the actually affected tasks.
2330                  */
2331                 InterdependentTask& idTask = tData.idTask;
2332
2333                 /* Ensure assignVsitesToThread finished on other threads */
2334 #pragma omp barrier
2335
2336                 idTask.spreadTask.resize(0);
2337                 idTask.reduceTask.resize(0);
2338                 for (int t = 0; t < vsite->nthreads; t++)
2339                 {
2340                     /* Do we write to the force buffer of task t? */
2341                     if (!idTask.atomIndex[t].atom.empty())
2342                     {
2343                         idTask.spreadTask.push_back(t);
2344                     }
2345                     /* Does task t write to our force buffer? */
2346                     if (!vsite->tData[t]->idTask.atomIndex[thread].atom.empty())
2347                     {
2348                         idTask.reduceTask.push_back(t);
2349                     }
2350                 }
2351             }
2352         }
2353         GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
2354     }
2355     /* Assign all remaining vsites, that will have taskIndex[]=2*vsite->nthreads,
2356      * to a single task that will not run in parallel with other tasks.
2357      */
2358     assignVsitesToSingleTask(vsite->tData[vsite->nthreads].get(), 2 * vsite->nthreads, taskIndex,
2359                              ilist, ip);
2360
2361     if (debug && vsite->nthreads > 1)
2362     {
2363         fprintf(debug, "virtual site useInterdependentTask %d, nuse:\n",
2364                 static_cast<int>(vsite->tData[0]->useInterdependentTask));
2365         for (int th = 0; th < vsite->nthreads + 1; th++)
2366         {
2367             fprintf(debug, " %4d", vsite->tData[th]->idTask.nuse);
2368         }
2369         fprintf(debug, "\n");
2370
2371         for (int ftype = c_ftypeVsiteStart; ftype < c_ftypeVsiteEnd; ftype++)
2372         {
2373             if (!ilist[ftype].empty())
2374             {
2375                 fprintf(debug, "%-20s thread dist:", interaction_function[ftype].longname);
2376                 for (int th = 0; th < vsite->nthreads + 1; th++)
2377                 {
2378                     fprintf(debug, " %4d %4d ", vsite->tData[th]->ilist[ftype].size(),
2379                             vsite->tData[th]->idTask.ilist[ftype].size());
2380                 }
2381                 fprintf(debug, "\n");
2382             }
2383         }
2384     }
2385
2386 #ifndef NDEBUG
2387     int nrOrig     = vsiteIlistNrCount(ilist);
2388     int nrThreaded = 0;
2389     for (int th = 0; th < vsite->nthreads + 1; th++)
2390     {
2391         nrThreaded += vsiteIlistNrCount(vsite->tData[th]->ilist)
2392                       + vsiteIlistNrCount(vsite->tData[th]->idTask.ilist);
2393     }
2394     GMX_ASSERT(nrThreaded == nrOrig,
2395                "The number of virtual sites assigned to all thread task has to match the total "
2396                "number of virtual sites");
2397 #endif
2398 }
2399
2400 void set_vsite_top(gmx_vsite_t* vsite, const gmx_localtop_t* top, const t_mdatoms* md)
2401 {
2402     if (vsite->nthreads > 1)
2403     {
2404         split_vsites_over_threads(top->idef.il, top->idef.iparams, md, vsite);
2405     }
2406 }