src/gromacs/mdlib/vsite.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   5  * Copyright (c) 2001-2004, The GROMACS development team.
   6  * Copyright (c) 2013,2014,2015,2016,2017,2018, by the GROMACS development team, led by
   7  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   8  * and including many others, as listed in the AUTHORS file in the
   9  * top-level source directory and at http://www.gromacs.org.
  10  *
  11  * GROMACS is free software; you can redistribute it and/or
  12  * modify it under the terms of the GNU Lesser General Public License
  13  * as published by the Free Software Foundation; either version 2.1
  14  * of the License, or (at your option) any later version.
  15  *
  16  * GROMACS is distributed in the hope that it will be useful,
  17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  19  * Lesser General Public License for more details.
  20  *
  21  * You should have received a copy of the GNU Lesser General Public
  22  * License along with GROMACS; if not, see
  23  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  24  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  25  *
  26  * If you want to redistribute modifications to GROMACS, please
  27  * consider that scientific software is very special. Version
  28  * control is crucial - bugs must be traceable. We will be happy to
  29  * consider code for inclusion in the official distribution, but
  30  * derived work must not be called official GROMACS. Details are found
  31  * in the README & COPYING files - if they are missing, get the
  32  * official version at http://www.gromacs.org.
  33  *
  34  * To help us fund GROMACS development, we humbly ask that you cite
  35  * the research papers on the package. Check out http://www.gromacs.org.
  36  */
  37 #include "gmxpre.h"
  38
  39 #include "vsite.h"
  40
  41 #include <cstdio>
  42
  43 #include <algorithm>
  44 #include <vector>
  45
  46 #include "gromacs/compat/make_unique.h"
  47 #include "gromacs/domdec/domdec.h"
  48 #include "gromacs/domdec/domdec_struct.h"
  49 #include "gromacs/gmxlib/network.h"
  50 #include "gromacs/gmxlib/nrnb.h"
  51 #include "gromacs/math/functions.h"
  52 #include "gromacs/math/vec.h"
  53 #include "gromacs/mdlib/gmx_omp_nthreads.h"
  54 #include "gromacs/mdtypes/commrec.h"
  55 #include "gromacs/mdtypes/mdatom.h"
  56 #include "gromacs/pbcutil/ishift.h"
  57 #include "gromacs/pbcutil/mshift.h"
  58 #include "gromacs/pbcutil/pbc.h"
  59 #include "gromacs/timing/wallcycle.h"
  60 #include "gromacs/topology/ifunc.h"
  61 #include "gromacs/topology/mtop_util.h"
  62 #include "gromacs/topology/topology.h"
  63 #include "gromacs/utility/exceptions.h"
  64 #include "gromacs/utility/fatalerror.h"
  65 #include "gromacs/utility/gmxassert.h"
  66 #include "gromacs/utility/gmxomp.h"
  67
  68 /* The strategy used here for assigning virtual sites to (thread-)tasks
  69  * is as follows:
  70  *
  71  * We divide the atom range that vsites operate on (natoms_local with DD,
  72  * 0 - last atom involved in vsites without DD) equally over all threads.
  73  *
  74  * Vsites in the local range constructed from atoms in the local range
  75  * and/or other vsites that are fully local are assigned to a simple,
  76  * independent task.
  77  *
  78  * Vsites that are not assigned after using the above criterion get assigned
  79  * to a so called "interdependent" thread task when none of the constructing
  80  * atoms is a vsite. These tasks are called interdependent, because one task
  81  * accesses atoms assigned to a different task/thread.
  82  * Note that this option is turned off with large (local) atom counts
  83  * to avoid high memory usage.
  84  *
  85  * Any remaining vsites are assigned to a separate master thread task.
  86  */
  87
  88 using gmx::RVec;
  89
  90 static void init_ilist(t_ilist *ilist)
  91 {
  92     for (int i = 0; i < F_NRE; i++)
  93     {
  94         ilist[i].nr     = 0;
  95         ilist[i].nalloc = 0;
  96         ilist[i].iatoms = nullptr;
  97     }
  98 }
  99
 100 /*! \brief List of atom indices belonging to a task */
 101 struct AtomIndex {
 102     //! List of atom indices
 103     std::vector<int> atom;
 104 };
 105
 106 /*! \brief Data structure for thread tasks that use constructing atoms outside their own atom range */
 107 struct InterdependentTask
 108 {
 109     //! The interaction lists, only vsite entries are used
 110     t_ilist                ilist[F_NRE];
 111     //! Thread/task-local force buffer
 112     std::vector<RVec>      force;
 113     //! The atom indices of the vsites of our task
 114     std::vector<int>       vsite;
 115     //! Flags if elements in force are spread to or not
 116     std::vector<bool>      use;
 117     //! The number of entries set to true in use
 118     int                    nuse;
 119     //! Array of atoms indices, size nthreads, covering all nuse set elements in use
 120     std::vector<AtomIndex> atomIndex;
 121     //! List of tasks (force blocks) this task spread forces to
 122     std::vector<int>       spreadTask;
 123     //! List of tasks that write to this tasks force block range
 124     std::vector<int>       reduceTask;
 125
 126     InterdependentTask()
 127     {
 128         init_ilist(ilist);
 129         nuse = 0;
 130     }
 131 };
 132
 133 /*! \brief Vsite thread task data structure */
 134 struct VsiteThread
 135 {
 136     //! Start of atom range of this task
 137     int                rangeStart;
 138     //! End of atom range of this task
 139     int                rangeEnd;
 140     //! The interaction lists, only vsite entries are used
 141     t_ilist            ilist[F_NRE];
 142     //! Local fshift accumulation buffer
 143     rvec               fshift[SHIFTS];
 144     //! Local virial dx*df accumulation buffer
 145     matrix             dxdf;
 146     //! Tells if interdependent task idTask should be used (in addition to the rest of this task), this bool has the same value on all threads
 147     bool               useInterdependentTask;
 148     //! Data for vsites that involve constructing atoms in the atom range of other threads/tasks
 149     InterdependentTask idTask;
 150
 151     /*! \brief Constructor */
 152     VsiteThread()
 153     {
 154         rangeStart            = -1;
 155         rangeEnd              = -1;
 156         init_ilist(ilist);
 157         clear_rvecs(SHIFTS, fshift);
 158         clear_mat(dxdf);
 159         useInterdependentTask = false;
 160     }
 161 };
 162
 163 /*! \brief Returns the sum of the vsite ilist sizes over all vsite types
 164  *
 165  * \param[in] ilist  The interaction list
 166  */
 167 static int vsiteIlistNrCount(const t_ilist *ilist)
 168 {
 169     int nr = 0;
 170     for (int ftype = c_ftypeVsiteStart; ftype < c_ftypeVsiteEnd; ftype++)
 171     {
 172         nr += ilist[ftype].nr;
 173     }
 174
 175     return nr;
 176 }
 177
 178 static int pbc_rvec_sub(const t_pbc *pbc, const rvec xi, const rvec xj, rvec dx)
 179 {
 180     if (pbc)
 181     {
 182         return pbc_dx_aiuc(pbc, xi, xj, dx);
 183     }
 184     else
 185     {
 186         rvec_sub(xi, xj, dx);
 187         return CENTRAL;
 188     }
 189 }
 190
 191 /* Vsite construction routines */
 192
 193 static void constr_vsite2(const rvec xi, const rvec xj, rvec x, real a, const t_pbc *pbc)
 194 {
 195     real b = 1 - a;
 196     /* 1 flop */
 197
 198     if (pbc)
 199     {
 200         rvec dx;
 201         pbc_dx_aiuc(pbc, xj, xi, dx);
 202         x[XX] = xi[XX] + a*dx[XX];
 203         x[YY] = xi[YY] + a*dx[YY];
 204         x[ZZ] = xi[ZZ] + a*dx[ZZ];
 205     }
 206     else
 207     {
 208         x[XX] = b*xi[XX] + a*xj[XX];
 209         x[YY] = b*xi[YY] + a*xj[YY];
 210         x[ZZ] = b*xi[ZZ] + a*xj[ZZ];
 211         /* 9 Flops */
 212     }
 213
 214     /* TOTAL: 10 flops */
 215 }
 216
 217 static void constr_vsite3(const rvec xi, const rvec xj, const rvec xk, rvec x, real a, real b,
 218                           const t_pbc *pbc)
 219 {
 220     real c = 1 - a - b;
 221     /* 2 flops */
 222
 223     if (pbc)
 224     {
 225         rvec dxj, dxk;
 226
 227         pbc_dx_aiuc(pbc, xj, xi, dxj);
 228         pbc_dx_aiuc(pbc, xk, xi, dxk);
 229         x[XX] = xi[XX] + a*dxj[XX] + b*dxk[XX];
 230         x[YY] = xi[YY] + a*dxj[YY] + b*dxk[YY];
 231         x[ZZ] = xi[ZZ] + a*dxj[ZZ] + b*dxk[ZZ];
 232     }
 233     else
 234     {
 235         x[XX] = c*xi[XX] + a*xj[XX] + b*xk[XX];
 236         x[YY] = c*xi[YY] + a*xj[YY] + b*xk[YY];
 237         x[ZZ] = c*xi[ZZ] + a*xj[ZZ] + b*xk[ZZ];
 238         /* 15 Flops */
 239     }
 240
 241     /* TOTAL: 17 flops */
 242 }
 243
 244 static void constr_vsite3FD(const rvec xi, const rvec xj, const rvec xk, rvec x, real a, real b,
 245                             const t_pbc *pbc)
 246 {
 247     rvec xij, xjk, temp;
 248     real c;
 249
 250     pbc_rvec_sub(pbc, xj, xi, xij);
 251     pbc_rvec_sub(pbc, xk, xj, xjk);
 252     /* 6 flops */
 253
 254     /* temp goes from i to a point on the line jk */
 255     temp[XX] = xij[XX] + a*xjk[XX];
 256     temp[YY] = xij[YY] + a*xjk[YY];
 257     temp[ZZ] = xij[ZZ] + a*xjk[ZZ];
 258     /* 6 flops */
 259
 260     c = b*gmx::invsqrt(iprod(temp, temp));
 261     /* 6 + 10 flops */
 262
 263     x[XX] = xi[XX] + c*temp[XX];
 264     x[YY] = xi[YY] + c*temp[YY];
 265     x[ZZ] = xi[ZZ] + c*temp[ZZ];
 266     /* 6 Flops */
 267
 268     /* TOTAL: 34 flops */
 269 }
 270
 271 static void constr_vsite3FAD(const rvec xi, const rvec xj, const rvec xk, rvec x, real a, real b, const t_pbc *pbc)
 272 {
 273     rvec xij, xjk, xp;
 274     real a1, b1, c1, invdij;
 275
 276     pbc_rvec_sub(pbc, xj, xi, xij);
 277     pbc_rvec_sub(pbc, xk, xj, xjk);
 278     /* 6 flops */
 279
 280     invdij = gmx::invsqrt(iprod(xij, xij));
 281     c1     = invdij * invdij * iprod(xij, xjk);
 282     xp[XX] = xjk[XX] - c1*xij[XX];
 283     xp[YY] = xjk[YY] - c1*xij[YY];
 284     xp[ZZ] = xjk[ZZ] - c1*xij[ZZ];
 285     a1     = a*invdij;
 286     b1     = b*gmx::invsqrt(iprod(xp, xp));
 287     /* 45 */
 288
 289     x[XX] = xi[XX] + a1*xij[XX] + b1*xp[XX];
 290     x[YY] = xi[YY] + a1*xij[YY] + b1*xp[YY];
 291     x[ZZ] = xi[ZZ] + a1*xij[ZZ] + b1*xp[ZZ];
 292     /* 12 Flops */
 293
 294     /* TOTAL: 63 flops */
 295 }
 296
 297 static void constr_vsite3OUT(const rvec xi, const rvec xj, const rvec xk, rvec x,
 298                              real a, real b, real c, const t_pbc *pbc)
 299 {
 300     rvec xij, xik, temp;
 301
 302     pbc_rvec_sub(pbc, xj, xi, xij);
 303     pbc_rvec_sub(pbc, xk, xi, xik);
 304     cprod(xij, xik, temp);
 305     /* 15 Flops */
 306
 307     x[XX] = xi[XX] + a*xij[XX] + b*xik[XX] + c*temp[XX];
 308     x[YY] = xi[YY] + a*xij[YY] + b*xik[YY] + c*temp[YY];
 309     x[ZZ] = xi[ZZ] + a*xij[ZZ] + b*xik[ZZ] + c*temp[ZZ];
 310     /* 18 Flops */
 311
 312     /* TOTAL: 33 flops */
 313 }
 314
 315 static void constr_vsite4FD(const rvec xi, const rvec xj, const rvec xk, const rvec xl, rvec x,
 316                             real a, real b, real c, const t_pbc *pbc)
 317 {
 318     rvec xij, xjk, xjl, temp;
 319     real d;
 320
 321     pbc_rvec_sub(pbc, xj, xi, xij);
 322     pbc_rvec_sub(pbc, xk, xj, xjk);
 323     pbc_rvec_sub(pbc, xl, xj, xjl);
 324     /* 9 flops */
 325
 326     /* temp goes from i to a point on the plane jkl */
 327     temp[XX] = xij[XX] + a*xjk[XX] + b*xjl[XX];
 328     temp[YY] = xij[YY] + a*xjk[YY] + b*xjl[YY];
 329     temp[ZZ] = xij[ZZ] + a*xjk[ZZ] + b*xjl[ZZ];
 330     /* 12 flops */
 331
 332     d = c*gmx::invsqrt(iprod(temp, temp));
 333     /* 6 + 10 flops */
 334
 335     x[XX] = xi[XX] + d*temp[XX];
 336     x[YY] = xi[YY] + d*temp[YY];
 337     x[ZZ] = xi[ZZ] + d*temp[ZZ];
 338     /* 6 Flops */
 339
 340     /* TOTAL: 43 flops */
 341 }
 342
 343 static void constr_vsite4FDN(const rvec xi, const rvec xj, const rvec xk, const rvec xl, rvec x,
 344                              real a, real b, real c, const t_pbc *pbc)
 345 {
 346     rvec xij, xik, xil, ra, rb, rja, rjb, rm;
 347     real d;
 348
 349     pbc_rvec_sub(pbc, xj, xi, xij);
 350     pbc_rvec_sub(pbc, xk, xi, xik);
 351     pbc_rvec_sub(pbc, xl, xi, xil);
 352     /* 9 flops */
 353
 354     ra[XX] = a*xik[XX];
 355     ra[YY] = a*xik[YY];
 356     ra[ZZ] = a*xik[ZZ];
 357
 358     rb[XX] = b*xil[XX];
 359     rb[YY] = b*xil[YY];
 360     rb[ZZ] = b*xil[ZZ];
 361
 362     /* 6 flops */
 363
 364     rvec_sub(ra, xij, rja);
 365     rvec_sub(rb, xij, rjb);
 366     /* 6 flops */
 367
 368     cprod(rja, rjb, rm);
 369     /* 9 flops */
 370
 371     d = c*gmx::invsqrt(norm2(rm));
 372     /* 5+5+1 flops */
 373
 374     x[XX] = xi[XX] + d*rm[XX];
 375     x[YY] = xi[YY] + d*rm[YY];
 376     x[ZZ] = xi[ZZ] + d*rm[ZZ];
 377     /* 6 Flops */
 378
 379     /* TOTAL: 47 flops */
 380 }
 381
 382
 383 static int constr_vsiten(const t_iatom *ia, const t_iparams ip[],
 384                          rvec *x, const t_pbc *pbc)
 385 {
 386     rvec x1, dx;
 387     dvec dsum;
 388     int  n3, av, ai;
 389     real a;
 390
 391     n3 = 3*ip[ia[0]].vsiten.n;
 392     av = ia[1];
 393     ai = ia[2];
 394     copy_rvec(x[ai], x1);
 395     clear_dvec(dsum);
 396     for (int i = 3; i < n3; i += 3)
 397     {
 398         ai = ia[i+2];
 399         a  = ip[ia[i]].vsiten.a;
 400         if (pbc)
 401         {
 402             pbc_dx_aiuc(pbc, x[ai], x1, dx);
 403         }
 404         else
 405         {
 406             rvec_sub(x[ai], x1, dx);
 407         }
 408         dsum[XX] += a*dx[XX];
 409         dsum[YY] += a*dx[YY];
 410         dsum[ZZ] += a*dx[ZZ];
 411         /* 9 Flops */
 412     }
 413
 414     x[av][XX] = x1[XX] + dsum[XX];
 415     x[av][YY] = x1[YY] + dsum[YY];
 416     x[av][ZZ] = x1[ZZ] + dsum[ZZ];
 417
 418     return n3;
 419 }
 420
 421 /*! \brief PBC modes for vsite construction and spreading */
 422 enum class PbcMode
 423 {
 424     all,         // Apply normal, simple PBC for all vsites
 425     chargeGroup, // Keep vsite in the same periodic image as the rest of it's charge group
 426     none         // No PBC treatment needed
 427 };
 428
 429 /*! \brief Returns the PBC mode based on the system PBC and vsite properties
 430  *
 431  * \param[in] pbcPtr  A pointer to a PBC struct or nullptr when no PBC treatment is required
 432  * \param[in] vsite   A pointer to the vsite struct, can be nullptr
 433  */
 434 static PbcMode getPbcMode(const t_pbc       *pbcPtr,
 435                           const gmx_vsite_t *vsite)
 436 {
 437     if (pbcPtr == nullptr)
 438     {
 439         return PbcMode::none;
 440     }
 441     else if (vsite != nullptr && vsite->bHaveChargeGroups)
 442     {
 443         return PbcMode::chargeGroup;
 444     }
 445     else
 446     {
 447         return PbcMode::all;
 448     }
 449 }
 450
 451 static void construct_vsites_thread(const gmx_vsite_t *vsite,
 452                                     rvec x[],
 453                                     real dt, rvec *v,
 454                                     const t_iparams ip[], const t_ilist ilist[],
 455                                     const t_pbc *pbc_null)
 456 {
 457     real         inv_dt;
 458     if (v != nullptr)
 459     {
 460         inv_dt = 1.0/dt;
 461     }
 462     else
 463     {
 464         inv_dt = 1.0;
 465     }
 466
 467     const PbcMode            pbcMode   = getPbcMode(pbc_null, vsite);
 468     /* We need another pbc pointer, as with charge groups we switch per vsite */
 469     const t_pbc             *pbc_null2 = pbc_null;
 470     gmx::ArrayRef<const int> vsite_pbc;
 471
 472     for (int ftype = c_ftypeVsiteStart; ftype < c_ftypeVsiteEnd; ftype++)
 473     {
 474         if (ilist[ftype].nr == 0)
 475         {
 476             continue;
 477         }
 478
 479         {   // TODO remove me
 480             int            nra = interaction_function[ftype].nratoms;
 481             int            inc = 1 + nra;
 482             int            nr  = ilist[ftype].nr;
 483
 484             const t_iatom *ia = ilist[ftype].iatoms;
 485
 486             if (pbcMode == PbcMode::chargeGroup)
 487             {
 488                 vsite_pbc = (*vsite->vsite_pbc_loc)[ftype - c_ftypeVsiteStart];
 489             }
 490
 491             for (int i = 0; i < nr; )
 492             {
 493                 int  tp     = ia[0];
 494                 /* The vsite and constructing atoms */
 495                 int  avsite = ia[1];
 496                 int  ai     = ia[2];
 497                 /* Constants for constructing vsites */
 498                 real a1     = ip[tp].vsite.a;
 499                 /* Check what kind of pbc we need to use */
 500                 int  pbc_atom;
 501                 rvec xpbc;
 502                 if (pbcMode == PbcMode::all)
 503                 {
 504                     /* No charge groups, vsite follows its own pbc */
 505                     pbc_atom = avsite;
 506                     copy_rvec(x[avsite], xpbc);
 507                 }
 508                 else if (pbcMode == PbcMode::chargeGroup)
 509                 {
 510                     pbc_atom = vsite_pbc[i/(1 + nra)];
 511                     if (pbc_atom > -2)
 512                     {
 513                         if (pbc_atom >= 0)
 514                         {
 515                             /* We need to copy the coordinates here,
 516                              * single for single atom cg's pbc_atom
 517                              * is the vsite itself.
 518                              */
 519                             copy_rvec(x[pbc_atom], xpbc);
 520                         }
 521                         pbc_null2 = pbc_null;
 522                     }
 523                     else
 524                     {
 525                         pbc_null2 = nullptr;
 526                     }
 527                 }
 528                 else
 529                 {
 530                     pbc_atom = -2;
 531                 }
 532                 /* Copy the old position */
 533                 rvec xv;
 534                 copy_rvec(x[avsite], xv);
 535
 536                 /* Construct the vsite depending on type */
 537                 int  aj, ak, al;
 538                 real b1, c1;
 539                 switch (ftype)
 540                 {
 541                     case F_VSITE2:
 542                         aj = ia[3];
 543                         constr_vsite2(x[ai], x[aj], x[avsite], a1, pbc_null2);
 544                         break;
 545                     case F_VSITE3:
 546                         aj = ia[3];
 547                         ak = ia[4];
 548                         b1 = ip[tp].vsite.b;
 549                         constr_vsite3(x[ai], x[aj], x[ak], x[avsite], a1, b1, pbc_null2);
 550                         break;
 551                     case F_VSITE3FD:
 552                         aj = ia[3];
 553                         ak = ia[4];
 554                         b1 = ip[tp].vsite.b;
 555                         constr_vsite3FD(x[ai], x[aj], x[ak], x[avsite], a1, b1, pbc_null2);
 556                         break;
 557                     case F_VSITE3FAD:
 558                         aj = ia[3];
 559                         ak = ia[4];
 560                         b1 = ip[tp].vsite.b;
 561                         constr_vsite3FAD(x[ai], x[aj], x[ak], x[avsite], a1, b1, pbc_null2);
 562                         break;
 563                     case F_VSITE3OUT:
 564                         aj = ia[3];
 565                         ak = ia[4];
 566                         b1 = ip[tp].vsite.b;
 567                         c1 = ip[tp].vsite.c;
 568                         constr_vsite3OUT(x[ai], x[aj], x[ak], x[avsite], a1, b1, c1, pbc_null2);
 569                         break;
 570                     case F_VSITE4FD:
 571                         aj = ia[3];
 572                         ak = ia[4];
 573                         al = ia[5];
 574                         b1 = ip[tp].vsite.b;
 575                         c1 = ip[tp].vsite.c;
 576                         constr_vsite4FD(x[ai], x[aj], x[ak], x[al], x[avsite], a1, b1, c1,
 577                                         pbc_null2);
 578                         break;
 579                     case F_VSITE4FDN:
 580                         aj = ia[3];
 581                         ak = ia[4];
 582                         al = ia[5];
 583                         b1 = ip[tp].vsite.b;
 584                         c1 = ip[tp].vsite.c;
 585                         constr_vsite4FDN(x[ai], x[aj], x[ak], x[al], x[avsite], a1, b1, c1,
 586                                          pbc_null2);
 587                         break;
 588                     case F_VSITEN:
 589                         inc = constr_vsiten(ia, ip, x, pbc_null2);
 590                         break;
 591                     default:
 592                         gmx_fatal(FARGS, "No such vsite type %d in %s, line %d",
 593                                   ftype, __FILE__, __LINE__);
 594                 }
 595
 596                 if (pbc_atom >= 0)
 597                 {
 598                     /* Match the pbc of this vsite to the rest of its charge group */
 599                     rvec dx;
 600                     int  ishift = pbc_dx_aiuc(pbc_null, x[avsite], xpbc, dx);
 601                     if (ishift != CENTRAL)
 602                     {
 603                         rvec_add(xpbc, dx, x[avsite]);
 604                     }
 605                 }
 606                 if (v != nullptr)
 607                 {
 608                     /* Calculate velocity of vsite... */
 609                     rvec vv;
 610                     rvec_sub(x[avsite], xv, vv);
 611                     svmul(inv_dt, vv, v[avsite]);
 612                 }
 613
 614                 /* Increment loop variables */
 615                 i  += inc;
 616                 ia += inc;
 617             }
 618         }
 619     }
 620 }
 621
 622 void construct_vsites(const gmx_vsite_t *vsite,
 623                       rvec x[],
 624                       real dt, rvec *v,
 625                       const t_iparams ip[], const t_ilist ilist[],
 626                       int ePBC, gmx_bool bMolPBC,
 627                       const t_commrec *cr,
 628                       const matrix box)
 629 {
 630     const bool useDomdec = (vsite != nullptr && vsite->useDomdec);
 631     GMX_ASSERT(!useDomdec || (cr != nullptr && DOMAINDECOMP(cr)), "When vsites are set up with domain decomposition, we need a valid commrec");
 632     // TODO: Remove this assertion when we remove charge groups
 633     GMX_ASSERT(vsite != nullptr || ePBC == epbcNONE, "Without a vsite struct we can not do PBC (in case we have charge groups)");
 634
 635     t_pbc     pbc, *pbc_null;
 636
 637     /* We only need to do pbc when we have inter-cg vsites.
 638      * Note that with domain decomposition we do not need to apply PBC here
 639      * when we have at least 3 domains along each dimension. Currently we
 640      * do not optimize this case.
 641      */
 642     if (ePBC != epbcNONE && (useDomdec || bMolPBC) &&
 643         !(vsite != nullptr && vsite->n_intercg_vsite == 0))
 644     {
 645         /* This is wasting some CPU time as we now do this multiple times
 646          * per MD step.
 647          */
 648         ivec null_ivec;
 649         clear_ivec(null_ivec);
 650         pbc_null = set_pbc_dd(&pbc, ePBC,
 651                               useDomdec ? cr->dd->nc : null_ivec,
 652                               FALSE, box);
 653     }
 654     else
 655     {
 656         pbc_null = nullptr;
 657     }
 658
 659     if (useDomdec)
 660     {
 661         dd_move_x_vsites(cr->dd, box, x);
 662     }
 663
 664     if (vsite == nullptr || vsite->nthreads == 1)
 665     {
 666         construct_vsites_thread(vsite,
 667                                 x, dt, v,
 668                                 ip, ilist,
 669                                 pbc_null);
 670     }
 671     else
 672     {
 673 #pragma omp parallel num_threads(vsite->nthreads)
 674         {
 675             try
 676             {
 677                 const int          th    = gmx_omp_get_thread_num();
 678                 const VsiteThread &tData = *vsite->tData[th];
 679                 GMX_ASSERT(tData.rangeStart >= 0, "The thread data should be initialized before calling construct_vsites");
 680
 681                 construct_vsites_thread(vsite,
 682                                         x, dt, v,
 683                                         ip, tData.ilist,
 684                                         pbc_null);
 685                 if (tData.useInterdependentTask)
 686                 {
 687                     /* Here we don't need a barrier (unlike the spreading),
 688                      * since both tasks only construct vsites from particles,
 689                      * or local vsites, not from non-local vsites.
 690                      */
 691                     construct_vsites_thread(vsite,
 692                                             x, dt, v,
 693                                             ip, tData.idTask.ilist,
 694                                             pbc_null);
 695                 }
 696             }
 697             GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
 698         }
 699         /* Now we can construct the vsites that might depend on other vsites */
 700         construct_vsites_thread(vsite,
 701                                 x, dt, v,
 702                                 ip, vsite->tData[vsite->nthreads]->ilist,
 703                                 pbc_null);
 704     }
 705 }
 706
 707 static void spread_vsite2(const t_iatom ia[], real a,
 708                           const rvec x[],
 709                           rvec f[], rvec fshift[],
 710                           const t_pbc *pbc, const t_graph *g)
 711 {
 712     rvec    fi, fj, dx;
 713     t_iatom av, ai, aj;
 714     ivec    di;
 715     int     siv, sij;
 716
 717     av = ia[1];
 718     ai = ia[2];
 719     aj = ia[3];
 720
 721     svmul(1 - a, f[av], fi);
 722     svmul(    a, f[av], fj);
 723     /* 7 flop */
 724
 725     rvec_inc(f[ai], fi);
 726     rvec_inc(f[aj], fj);
 727     /* 6 Flops */
 728
 729     if (g)
 730     {
 731         ivec_sub(SHIFT_IVEC(g, ai), SHIFT_IVEC(g, av), di);
 732         siv = IVEC2IS(di);
 733         ivec_sub(SHIFT_IVEC(g, ai), SHIFT_IVEC(g, aj), di);
 734         sij = IVEC2IS(di);
 735     }
 736     else if (pbc)
 737     {
 738         siv = pbc_dx_aiuc(pbc, x[ai], x[av], dx);
 739         sij = pbc_dx_aiuc(pbc, x[ai], x[aj], dx);
 740     }
 741     else
 742     {
 743         siv = CENTRAL;
 744         sij = CENTRAL;
 745     }
 746
 747     if (fshift && (siv != CENTRAL || sij != CENTRAL))
 748     {
 749         rvec_inc(fshift[siv], f[av]);
 750         rvec_dec(fshift[CENTRAL], fi);
 751         rvec_dec(fshift[sij], fj);
 752     }
 753
 754     /* TOTAL: 13 flops */
 755 }
 756
 757 void constructVsitesGlobal(const gmx_mtop_t         &mtop,
 758                            gmx::ArrayRef<gmx::RVec>  x)
 759 {
 760     GMX_ASSERT(x.size() >= static_cast<gmx::index>(mtop.natoms), "x should contain the whole system");
 761     GMX_ASSERT(!mtop.moleculeBlockIndices.empty(), "molblock indices are needed in constructVsitesGlobal");
 762
 763     for (size_t mb = 0; mb < mtop.molblock.size(); mb++)
 764     {
 765         const gmx_molblock_t  &molb = mtop.molblock[mb];
 766         const gmx_moltype_t   &molt = mtop.moltype[molb.type];
 767         if (vsiteIlistNrCount(molt.ilist) > 0)
 768         {
 769             int atomOffset = mtop.moleculeBlockIndices[mb].globalAtomStart;
 770             for (int mol = 0; mol < molb.nmol; mol++)
 771             {
 772                 construct_vsites(nullptr, as_rvec_array(x.data()) + atomOffset,
 773                                  0.0, nullptr,
 774                                  mtop.ffparams.iparams, molt.ilist,
 775                                  epbcNONE, TRUE, nullptr, nullptr);
 776                 atomOffset += molt.atoms.nr;
 777             }
 778         }
 779     }
 780 }
 781
 782 static void spread_vsite3(const t_iatom ia[], real a, real b,
 783                           const rvec x[],
 784                           rvec f[], rvec fshift[],
 785                           const t_pbc *pbc, const t_graph *g)
 786 {
 787     rvec    fi, fj, fk, dx;
 788     int     av, ai, aj, ak;
 789     ivec    di;
 790     int     siv, sij, sik;
 791
 792     av = ia[1];
 793     ai = ia[2];
 794     aj = ia[3];
 795     ak = ia[4];
 796
 797     svmul(1 - a - b, f[av], fi);
 798     svmul(        a, f[av], fj);
 799     svmul(        b, f[av], fk);
 800     /* 11 flops */
 801
 802     rvec_inc(f[ai], fi);
 803     rvec_inc(f[aj], fj);
 804     rvec_inc(f[ak], fk);
 805     /* 9 Flops */
 806
 807     if (g)
 808     {
 809         ivec_sub(SHIFT_IVEC(g, ai), SHIFT_IVEC(g, ia[1]), di);
 810         siv = IVEC2IS(di);
 811         ivec_sub(SHIFT_IVEC(g, ai), SHIFT_IVEC(g, aj), di);
 812         sij = IVEC2IS(di);
 813         ivec_sub(SHIFT_IVEC(g, ai), SHIFT_IVEC(g, ak), di);
 814         sik = IVEC2IS(di);
 815     }
 816     else if (pbc)
 817     {
 818         siv = pbc_dx_aiuc(pbc, x[ai], x[av], dx);
 819         sij = pbc_dx_aiuc(pbc, x[ai], x[aj], dx);
 820         sik = pbc_dx_aiuc(pbc, x[ai], x[ak], dx);
 821     }
 822     else
 823     {
 824         siv = CENTRAL;
 825         sij = CENTRAL;
 826         sik = CENTRAL;
 827     }
 828
 829     if (fshift && (siv != CENTRAL || sij != CENTRAL || sik != CENTRAL))
 830     {
 831         rvec_inc(fshift[siv], f[av]);
 832         rvec_dec(fshift[CENTRAL], fi);
 833         rvec_dec(fshift[sij], fj);
 834         rvec_dec(fshift[sik], fk);
 835     }
 836
 837     /* TOTAL: 20 flops */
 838 }
 839
 840 static void spread_vsite3FD(const t_iatom ia[], real a, real b,
 841                             const rvec x[],
 842                             rvec f[], rvec fshift[],
 843                             gmx_bool VirCorr, matrix dxdf,
 844                             const t_pbc *pbc, const t_graph *g)
 845 {
 846     real    c, invl, fproj, a1;
 847     rvec    xvi, xij, xjk, xix, fv, temp;
 848     t_iatom av, ai, aj, ak;
 849     int     svi, sji, skj;
 850     ivec    di;
 851
 852     av = ia[1];
 853     ai = ia[2];
 854     aj = ia[3];
 855     ak = ia[4];
 856     copy_rvec(f[av], fv);
 857
 858     sji = pbc_rvec_sub(pbc, x[aj], x[ai], xij);
 859     skj = pbc_rvec_sub(pbc, x[ak], x[aj], xjk);
 860     /* 6 flops */
 861
 862     /* xix goes from i to point x on the line jk */
 863     xix[XX] = xij[XX]+a*xjk[XX];
 864     xix[YY] = xij[YY]+a*xjk[YY];
 865     xix[ZZ] = xij[ZZ]+a*xjk[ZZ];
 866     /* 6 flops */
 867
 868     invl = gmx::invsqrt(iprod(xix, xix));
 869     c    = b*invl;
 870     /* 4 + ?10? flops */
 871
 872     fproj = iprod(xix, fv)*invl*invl; /* = (xix . f)/(xix . xix) */
 873
 874     temp[XX] = c*(fv[XX]-fproj*xix[XX]);
 875     temp[YY] = c*(fv[YY]-fproj*xix[YY]);
 876     temp[ZZ] = c*(fv[ZZ]-fproj*xix[ZZ]);
 877     /* 16 */
 878
 879     /* c is already calculated in constr_vsite3FD
 880        storing c somewhere will save 26 flops!     */
 881
 882     a1         = 1 - a;
 883     f[ai][XX] += fv[XX] - temp[XX];
 884     f[ai][YY] += fv[YY] - temp[YY];
 885     f[ai][ZZ] += fv[ZZ] - temp[ZZ];
 886     f[aj][XX] += a1*temp[XX];
 887     f[aj][YY] += a1*temp[YY];
 888     f[aj][ZZ] += a1*temp[ZZ];
 889     f[ak][XX] += a*temp[XX];
 890     f[ak][YY] += a*temp[YY];
 891     f[ak][ZZ] += a*temp[ZZ];
 892     /* 19 Flops */
 893
 894     if (g)
 895     {
 896         ivec_sub(SHIFT_IVEC(g, ia[1]), SHIFT_IVEC(g, ai), di);
 897         svi = IVEC2IS(di);
 898         ivec_sub(SHIFT_IVEC(g, aj), SHIFT_IVEC(g, ai), di);
 899         sji = IVEC2IS(di);
 900         ivec_sub(SHIFT_IVEC(g, ak), SHIFT_IVEC(g, aj), di);
 901         skj = IVEC2IS(di);
 902     }
 903     else if (pbc)
 904     {
 905         svi = pbc_rvec_sub(pbc, x[av], x[ai], xvi);
 906     }
 907     else
 908     {
 909         svi = CENTRAL;
 910     }
 911
 912     if (fshift && (svi != CENTRAL || sji != CENTRAL || skj != CENTRAL))
 913     {
 914         rvec_dec(fshift[svi], fv);
 915         fshift[CENTRAL][XX] += fv[XX] - (1 + a)*temp[XX];
 916         fshift[CENTRAL][YY] += fv[YY] - (1 + a)*temp[YY];
 917         fshift[CENTRAL][ZZ] += fv[ZZ] - (1 + a)*temp[ZZ];
 918         fshift[    sji][XX] += temp[XX];
 919         fshift[    sji][YY] += temp[YY];
 920         fshift[    sji][ZZ] += temp[ZZ];
 921         fshift[    skj][XX] += a*temp[XX];
 922         fshift[    skj][YY] += a*temp[YY];
 923         fshift[    skj][ZZ] += a*temp[ZZ];
 924     }
 925
 926     if (VirCorr)
 927     {
 928         /* When VirCorr=TRUE, the virial for the current forces is not
 929          * calculated from the redistributed forces. This means that
 930          * the effect of non-linear virtual site constructions on the virial
 931          * needs to be added separately. This contribution can be calculated
 932          * in many ways, but the simplest and cheapest way is to use
 933          * the first constructing atom ai as a reference position in space:
 934          * subtract (xv-xi)*fv and add (xj-xi)*fj + (xk-xi)*fk.
 935          */
 936         rvec xiv;
 937
 938         pbc_rvec_sub(pbc, x[av], x[ai], xiv);
 939
 940         for (int i = 0; i < DIM; i++)
 941         {
 942             for (int j = 0; j < DIM; j++)
 943             {
 944                 /* As xix is a linear combination of j and k, use that here */
 945                 dxdf[i][j] += -xiv[i]*fv[j] + xix[i]*temp[j];
 946             }
 947         }
 948     }
 949
 950     /* TOTAL: 61 flops */
 951 }
 952
 953 static void spread_vsite3FAD(const t_iatom ia[], real a, real b,
 954                              const rvec x[],
 955                              rvec f[], rvec fshift[],
 956                              gmx_bool VirCorr, matrix dxdf,
 957                              const t_pbc *pbc, const t_graph *g)
 958 {
 959     rvec    xvi, xij, xjk, xperp, Fpij, Fppp, fv, f1, f2, f3;
 960     real    a1, b1, c1, c2, invdij, invdij2, invdp, fproj;
 961     t_iatom av, ai, aj, ak;
 962     int     svi, sji, skj, d;
 963     ivec    di;
 964
 965     av = ia[1];
 966     ai = ia[2];
 967     aj = ia[3];
 968     ak = ia[4];
 969     copy_rvec(f[ia[1]], fv);
 970
 971     sji = pbc_rvec_sub(pbc, x[aj], x[ai], xij);
 972     skj = pbc_rvec_sub(pbc, x[ak], x[aj], xjk);
 973     /* 6 flops */
 974
 975     invdij    = gmx::invsqrt(iprod(xij, xij));
 976     invdij2   = invdij * invdij;
 977     c1        = iprod(xij, xjk) * invdij2;
 978     xperp[XX] = xjk[XX] - c1*xij[XX];
 979     xperp[YY] = xjk[YY] - c1*xij[YY];
 980     xperp[ZZ] = xjk[ZZ] - c1*xij[ZZ];
 981     /* xperp in plane ijk, perp. to ij */
 982     invdp = gmx::invsqrt(iprod(xperp, xperp));
 983     a1    = a*invdij;
 984     b1    = b*invdp;
 985     /* 45 flops */
 986
 987     /* a1, b1 and c1 are already calculated in constr_vsite3FAD
 988        storing them somewhere will save 45 flops!     */
 989
 990     fproj = iprod(xij, fv)*invdij2;
 991     svmul(fproj,                      xij,  Fpij);    /* proj. f on xij */
 992     svmul(iprod(xperp, fv)*invdp*invdp, xperp, Fppp); /* proj. f on xperp */
 993     svmul(b1*fproj,                   xperp, f3);
 994     /* 23 flops */
 995
 996     rvec_sub(fv, Fpij, f1); /* f1 = f - Fpij */
 997     rvec_sub(f1, Fppp, f2); /* f2 = f - Fpij - Fppp */
 998     for (d = 0; (d < DIM); d++)
 999     {
1000         f1[d] *= a1;
1001         f2[d] *= b1;
1002     }
1003     /* 12 flops */
1004
1005     c2         = 1 + c1;
1006     f[ai][XX] += fv[XX] - f1[XX] + c1*f2[XX] + f3[XX];
1007     f[ai][YY] += fv[YY] - f1[YY] + c1*f2[YY] + f3[YY];
1008     f[ai][ZZ] += fv[ZZ] - f1[ZZ] + c1*f2[ZZ] + f3[ZZ];
1009     f[aj][XX] +=          f1[XX] - c2*f2[XX] - f3[XX];
1010     f[aj][YY] +=          f1[YY] - c2*f2[YY] - f3[YY];
1011     f[aj][ZZ] +=          f1[ZZ] - c2*f2[ZZ] - f3[ZZ];
1012     f[ak][XX] +=                      f2[XX];
1013     f[ak][YY] +=                      f2[YY];
1014     f[ak][ZZ] +=                      f2[ZZ];
1015     /* 30 Flops */
1016
1017     if (g)
1018     {
1019         ivec_sub(SHIFT_IVEC(g, ia[1]), SHIFT_IVEC(g, ai), di);
1020         svi = IVEC2IS(di);
1021         ivec_sub(SHIFT_IVEC(g, aj), SHIFT_IVEC(g, ai), di);
1022         sji = IVEC2IS(di);
1023         ivec_sub(SHIFT_IVEC(g, ak), SHIFT_IVEC(g, aj), di);
1024         skj = IVEC2IS(di);
1025     }
1026     else if (pbc)
1027     {
1028         svi = pbc_rvec_sub(pbc, x[av], x[ai], xvi);
1029     }
1030     else
1031     {
1032         svi = CENTRAL;
1033     }
1034
1035     if (fshift && (svi != CENTRAL || sji != CENTRAL || skj != CENTRAL))
1036     {
1037         rvec_dec(fshift[svi], fv);
1038         fshift[CENTRAL][XX] += fv[XX] - f1[XX] - (1-c1)*f2[XX] + f3[XX];
1039         fshift[CENTRAL][YY] += fv[YY] - f1[YY] - (1-c1)*f2[YY] + f3[YY];
1040         fshift[CENTRAL][ZZ] += fv[ZZ] - f1[ZZ] - (1-c1)*f2[ZZ] + f3[ZZ];
1041         fshift[    sji][XX] +=          f1[XX] -    c1 *f2[XX] - f3[XX];
1042         fshift[    sji][YY] +=          f1[YY] -    c1 *f2[YY] - f3[YY];
1043         fshift[    sji][ZZ] +=          f1[ZZ] -    c1 *f2[ZZ] - f3[ZZ];
1044         fshift[    skj][XX] +=                          f2[XX];
1045         fshift[    skj][YY] +=                          f2[YY];
1046         fshift[    skj][ZZ] +=                          f2[ZZ];
1047     }
1048
1049     if (VirCorr)
1050     {
1051         rvec xiv;
1052         int  i, j;
1053
1054         pbc_rvec_sub(pbc, x[av], x[ai], xiv);
1055
1056         for (i = 0; i < DIM; i++)
1057         {
1058             for (j = 0; j < DIM; j++)
1059             {
1060                 /* Note that xik=xij+xjk, so we have to add xij*f2 */
1061                 dxdf[i][j] +=
1062                     -xiv[i]*fv[j]
1063                     + xij[i]*(f1[j] + (1 - c2)*f2[j] - f3[j])
1064                     + xjk[i]*f2[j];
1065             }
1066         }
1067     }
1068
1069     /* TOTAL: 113 flops */
1070 }
1071
1072 static void spread_vsite3OUT(const t_iatom ia[], real a, real b, real c,
1073                              const rvec x[],
1074                              rvec f[], rvec fshift[],
1075                              gmx_bool VirCorr, matrix dxdf,
1076                              const t_pbc *pbc, const t_graph *g)
1077 {
1078     rvec    xvi, xij, xik, fv, fj, fk;
1079     real    cfx, cfy, cfz;
1080     int     av, ai, aj, ak;
1081     ivec    di;
1082     int     svi, sji, ski;
1083
1084     av = ia[1];
1085     ai = ia[2];
1086     aj = ia[3];
1087     ak = ia[4];
1088
1089     sji = pbc_rvec_sub(pbc, x[aj], x[ai], xij);
1090     ski = pbc_rvec_sub(pbc, x[ak], x[ai], xik);
1091     /* 6 Flops */
1092
1093     copy_rvec(f[av], fv);
1094
1095     cfx = c*fv[XX];
1096     cfy = c*fv[YY];
1097     cfz = c*fv[ZZ];
1098     /* 3 Flops */
1099
1100     fj[XX] = a*fv[XX]     -  xik[ZZ]*cfy +  xik[YY]*cfz;
1101     fj[YY] =  xik[ZZ]*cfx + a*fv[YY]     -  xik[XX]*cfz;
1102     fj[ZZ] = -xik[YY]*cfx +  xik[XX]*cfy + a*fv[ZZ];
1103
1104     fk[XX] = b*fv[XX]     +  xij[ZZ]*cfy -  xij[YY]*cfz;
1105     fk[YY] = -xij[ZZ]*cfx + b*fv[YY]     +  xij[XX]*cfz;
1106     fk[ZZ] =  xij[YY]*cfx -  xij[XX]*cfy + b*fv[ZZ];
1107     /* 30 Flops */
1108
1109     f[ai][XX] += fv[XX] - fj[XX] - fk[XX];
1110     f[ai][YY] += fv[YY] - fj[YY] - fk[YY];
1111     f[ai][ZZ] += fv[ZZ] - fj[ZZ] - fk[ZZ];
1112     rvec_inc(f[aj], fj);
1113     rvec_inc(f[ak], fk);
1114     /* 15 Flops */
1115
1116     if (g)
1117     {
1118         ivec_sub(SHIFT_IVEC(g, ia[1]), SHIFT_IVEC(g, ai), di);
1119         svi = IVEC2IS(di);
1120         ivec_sub(SHIFT_IVEC(g, aj), SHIFT_IVEC(g, ai), di);
1121         sji = IVEC2IS(di);
1122         ivec_sub(SHIFT_IVEC(g, ak), SHIFT_IVEC(g, ai), di);
1123         ski = IVEC2IS(di);
1124     }
1125     else if (pbc)
1126     {
1127         svi = pbc_rvec_sub(pbc, x[av], x[ai], xvi);
1128     }
1129     else
1130     {
1131         svi = CENTRAL;
1132     }
1133
1134     if (fshift && (svi != CENTRAL || sji != CENTRAL || ski != CENTRAL))
1135     {
1136         rvec_dec(fshift[svi], fv);
1137         fshift[CENTRAL][XX] += fv[XX] - fj[XX] - fk[XX];
1138         fshift[CENTRAL][YY] += fv[YY] - fj[YY] - fk[YY];
1139         fshift[CENTRAL][ZZ] += fv[ZZ] - fj[ZZ] - fk[ZZ];
1140         rvec_inc(fshift[sji], fj);
1141         rvec_inc(fshift[ski], fk);
1142     }
1143
1144     if (VirCorr)
1145     {
1146         rvec xiv;
1147
1148         pbc_rvec_sub(pbc, x[av], x[ai], xiv);
1149
1150         for (int i = 0; i < DIM; i++)
1151         {
1152             for (int j = 0; j < DIM; j++)
1153             {
1154                 dxdf[i][j] += -xiv[i]*fv[j] + xij[i]*fj[j] + xik[i]*fk[j];
1155             }
1156         }
1157     }
1158
1159     /* TOTAL: 54 flops */
1160 }
1161
1162 static void spread_vsite4FD(const t_iatom ia[], real a, real b, real c,
1163                             const rvec x[],
1164                             rvec f[], rvec fshift[],
1165                             gmx_bool VirCorr, matrix dxdf,
1166                             const t_pbc *pbc, const t_graph *g)
1167 {
1168     real    d, invl, fproj, a1;
1169     rvec    xvi, xij, xjk, xjl, xix, fv, temp;
1170     int     av, ai, aj, ak, al;
1171     ivec    di;
1172     int     svi, sji, skj, slj, m;
1173
1174     av = ia[1];
1175     ai = ia[2];
1176     aj = ia[3];
1177     ak = ia[4];
1178     al = ia[5];
1179
1180     sji = pbc_rvec_sub(pbc, x[aj], x[ai], xij);
1181     skj = pbc_rvec_sub(pbc, x[ak], x[aj], xjk);
1182     slj = pbc_rvec_sub(pbc, x[al], x[aj], xjl);
1183     /* 9 flops */
1184
1185     /* xix goes from i to point x on the plane jkl */
1186     for (m = 0; m < DIM; m++)
1187     {
1188         xix[m] = xij[m] + a*xjk[m] + b*xjl[m];
1189     }
1190     /* 12 flops */
1191
1192     invl = gmx::invsqrt(iprod(xix, xix));
1193     d    = c*invl;
1194     /* 4 + ?10? flops */
1195
1196     copy_rvec(f[av], fv);
1197
1198     fproj = iprod(xix, fv)*invl*invl; /* = (xix . f)/(xix . xix) */
1199
1200     for (m = 0; m < DIM; m++)
1201     {
1202         temp[m] = d*(fv[m] - fproj*xix[m]);
1203     }
1204     /* 16 */
1205
1206     /* c is already calculated in constr_vsite3FD
1207        storing c somewhere will save 35 flops!     */
1208
1209     a1 = 1 - a - b;
1210     for (m = 0; m < DIM; m++)
1211     {
1212         f[ai][m] += fv[m] - temp[m];
1213         f[aj][m] += a1*temp[m];
1214         f[ak][m] += a*temp[m];
1215         f[al][m] += b*temp[m];
1216     }
1217     /* 26 Flops */
1218
1219     if (g)
1220     {
1221         ivec_sub(SHIFT_IVEC(g, ia[1]), SHIFT_IVEC(g, ai), di);
1222         svi = IVEC2IS(di);
1223         ivec_sub(SHIFT_IVEC(g, aj), SHIFT_IVEC(g, ai), di);
1224         sji = IVEC2IS(di);
1225         ivec_sub(SHIFT_IVEC(g, ak), SHIFT_IVEC(g, aj), di);
1226         skj = IVEC2IS(di);
1227         ivec_sub(SHIFT_IVEC(g, al), SHIFT_IVEC(g, aj), di);
1228         slj = IVEC2IS(di);
1229     }
1230     else if (pbc)
1231     {
1232         svi = pbc_rvec_sub(pbc, x[av], x[ai], xvi);
1233     }
1234     else
1235     {
1236         svi = CENTRAL;
1237     }
1238
1239     if (fshift &&
1240         (svi != CENTRAL || sji != CENTRAL || skj != CENTRAL || slj != CENTRAL))
1241     {
1242         rvec_dec(fshift[svi], fv);
1243         for (m = 0; m < DIM; m++)
1244         {
1245             fshift[CENTRAL][m] += fv[m] - (1 + a + b)*temp[m];
1246             fshift[    sji][m] += temp[m];
1247             fshift[    skj][m] += a*temp[m];
1248             fshift[    slj][m] += b*temp[m];
1249         }
1250     }
1251
1252     if (VirCorr)
1253     {
1254         rvec xiv;
1255         int  i, j;
1256
1257         pbc_rvec_sub(pbc, x[av], x[ai], xiv);
1258
1259         for (i = 0; i < DIM; i++)
1260         {
1261             for (j = 0; j < DIM; j++)
1262             {
1263                 dxdf[i][j] += -xiv[i]*fv[j] + xix[i]*temp[j];
1264             }
1265         }
1266     }
1267
1268     /* TOTAL: 77 flops */
1269 }
1270
1271
1272 static void spread_vsite4FDN(const t_iatom ia[], real a, real b, real c,
1273                              const rvec x[],
1274                              rvec f[], rvec fshift[],
1275                              gmx_bool VirCorr, matrix dxdf,
1276                              const t_pbc *pbc, const t_graph *g)
1277 {
1278     rvec xvi, xij, xik, xil, ra, rb, rja, rjb, rab, rm, rt;
1279     rvec fv, fj, fk, fl;
1280     real invrm, denom;
1281     real cfx, cfy, cfz;
1282     ivec di;
1283     int  av, ai, aj, ak, al;
1284     int  svi, sij, sik, sil;
1285
1286     /* DEBUG: check atom indices */
1287     av = ia[1];
1288     ai = ia[2];
1289     aj = ia[3];
1290     ak = ia[4];
1291     al = ia[5];
1292
1293     copy_rvec(f[av], fv);
1294
1295     sij = pbc_rvec_sub(pbc, x[aj], x[ai], xij);
1296     sik = pbc_rvec_sub(pbc, x[ak], x[ai], xik);
1297     sil = pbc_rvec_sub(pbc, x[al], x[ai], xil);
1298     /* 9 flops */
1299
1300     ra[XX] = a*xik[XX];
1301     ra[YY] = a*xik[YY];
1302     ra[ZZ] = a*xik[ZZ];
1303
1304     rb[XX] = b*xil[XX];
1305     rb[YY] = b*xil[YY];
1306     rb[ZZ] = b*xil[ZZ];
1307
1308     /* 6 flops */
1309
1310     rvec_sub(ra, xij, rja);
1311     rvec_sub(rb, xij, rjb);
1312     rvec_sub(rb, ra, rab);
1313     /* 9 flops */
1314
1315     cprod(rja, rjb, rm);
1316     /* 9 flops */
1317
1318     invrm = gmx::invsqrt(norm2(rm));
1319     denom = invrm*invrm;
1320     /* 5+5+2 flops */
1321
1322     cfx = c*invrm*fv[XX];
1323     cfy = c*invrm*fv[YY];
1324     cfz = c*invrm*fv[ZZ];
1325     /* 6 Flops */
1326
1327     cprod(rm, rab, rt);
1328     /* 9 flops */
1329
1330     rt[XX] *= denom;
1331     rt[YY] *= denom;
1332     rt[ZZ] *= denom;
1333     /* 3flops */
1334
1335     fj[XX] = (        -rm[XX]*rt[XX]) * cfx + ( rab[ZZ]-rm[YY]*rt[XX]) * cfy + (-rab[YY]-rm[ZZ]*rt[XX]) * cfz;
1336     fj[YY] = (-rab[ZZ]-rm[XX]*rt[YY]) * cfx + (        -rm[YY]*rt[YY]) * cfy + ( rab[XX]-rm[ZZ]*rt[YY]) * cfz;
1337     fj[ZZ] = ( rab[YY]-rm[XX]*rt[ZZ]) * cfx + (-rab[XX]-rm[YY]*rt[ZZ]) * cfy + (        -rm[ZZ]*rt[ZZ]) * cfz;
1338     /* 30 flops */
1339
1340     cprod(rjb, rm, rt);
1341     /* 9 flops */
1342
1343     rt[XX] *= denom*a;
1344     rt[YY] *= denom*a;
1345     rt[ZZ] *= denom*a;
1346     /* 3flops */
1347
1348     fk[XX] = (          -rm[XX]*rt[XX]) * cfx + (-a*rjb[ZZ]-rm[YY]*rt[XX]) * cfy + ( a*rjb[YY]-rm[ZZ]*rt[XX]) * cfz;
1349     fk[YY] = ( a*rjb[ZZ]-rm[XX]*rt[YY]) * cfx + (          -rm[YY]*rt[YY]) * cfy + (-a*rjb[XX]-rm[ZZ]*rt[YY]) * cfz;
1350     fk[ZZ] = (-a*rjb[YY]-rm[XX]*rt[ZZ]) * cfx + ( a*rjb[XX]-rm[YY]*rt[ZZ]) * cfy + (          -rm[ZZ]*rt[ZZ]) * cfz;
1351     /* 36 flops */
1352
1353     cprod(rm, rja, rt);
1354     /* 9 flops */
1355
1356     rt[XX] *= denom*b;
1357     rt[YY] *= denom*b;
1358     rt[ZZ] *= denom*b;
1359     /* 3flops */
1360
1361     fl[XX] = (          -rm[XX]*rt[XX]) * cfx + ( b*rja[ZZ]-rm[YY]*rt[XX]) * cfy + (-b*rja[YY]-rm[ZZ]*rt[XX]) * cfz;
1362     fl[YY] = (-b*rja[ZZ]-rm[XX]*rt[YY]) * cfx + (          -rm[YY]*rt[YY]) * cfy + ( b*rja[XX]-rm[ZZ]*rt[YY]) * cfz;
1363     fl[ZZ] = ( b*rja[YY]-rm[XX]*rt[ZZ]) * cfx + (-b*rja[XX]-rm[YY]*rt[ZZ]) * cfy + (          -rm[ZZ]*rt[ZZ]) * cfz;
1364     /* 36 flops */
1365
1366     f[ai][XX] += fv[XX] - fj[XX] - fk[XX] - fl[XX];
1367     f[ai][YY] += fv[YY] - fj[YY] - fk[YY] - fl[YY];
1368     f[ai][ZZ] += fv[ZZ] - fj[ZZ] - fk[ZZ] - fl[ZZ];
1369     rvec_inc(f[aj], fj);
1370     rvec_inc(f[ak], fk);
1371     rvec_inc(f[al], fl);
1372     /* 21 flops */
1373
1374     if (g)
1375     {
1376         ivec_sub(SHIFT_IVEC(g, av), SHIFT_IVEC(g, ai), di);
1377         svi = IVEC2IS(di);
1378         ivec_sub(SHIFT_IVEC(g, aj), SHIFT_IVEC(g, ai), di);
1379         sij = IVEC2IS(di);
1380         ivec_sub(SHIFT_IVEC(g, ak), SHIFT_IVEC(g, ai), di);
1381         sik = IVEC2IS(di);
1382         ivec_sub(SHIFT_IVEC(g, al), SHIFT_IVEC(g, ai), di);
1383         sil = IVEC2IS(di);
1384     }
1385     else if (pbc)
1386     {
1387         svi = pbc_rvec_sub(pbc, x[av], x[ai], xvi);
1388     }
1389     else
1390     {
1391         svi = CENTRAL;
1392     }
1393
1394     if (fshift && (svi != CENTRAL || sij != CENTRAL || sik != CENTRAL || sil != CENTRAL))
1395     {
1396         rvec_dec(fshift[svi], fv);
1397         fshift[CENTRAL][XX] += fv[XX] - fj[XX] - fk[XX] - fl[XX];
1398         fshift[CENTRAL][YY] += fv[YY] - fj[YY] - fk[YY] - fl[YY];
1399         fshift[CENTRAL][ZZ] += fv[ZZ] - fj[ZZ] - fk[ZZ] - fl[ZZ];
1400         rvec_inc(fshift[sij], fj);
1401         rvec_inc(fshift[sik], fk);
1402         rvec_inc(fshift[sil], fl);
1403     }
1404
1405     if (VirCorr)
1406     {
1407         rvec xiv;
1408         int  i, j;
1409
1410         pbc_rvec_sub(pbc, x[av], x[ai], xiv);
1411
1412         for (i = 0; i < DIM; i++)
1413         {
1414             for (j = 0; j < DIM; j++)
1415             {
1416                 dxdf[i][j] += -xiv[i]*fv[j] + xij[i]*fj[j] + xik[i]*fk[j] + xil[i]*fl[j];
1417             }
1418         }
1419     }
1420
1421     /* Total: 207 flops (Yuck!) */
1422 }
1423
1424
1425 static int spread_vsiten(const t_iatom ia[], const t_iparams ip[],
1426                          const rvec x[],
1427                          rvec f[], rvec fshift[],
1428                          const t_pbc *pbc, const t_graph *g)
1429 {
1430     rvec xv, dx, fi;
1431     int  n3, av, i, ai;
1432     real a;
1433     ivec di;
1434     int  siv;
1435
1436     n3 = 3*ip[ia[0]].vsiten.n;
1437     av = ia[1];
1438     copy_rvec(x[av], xv);
1439
1440     for (i = 0; i < n3; i += 3)
1441     {
1442         ai = ia[i+2];
1443         if (g)
1444         {
1445             ivec_sub(SHIFT_IVEC(g, ai), SHIFT_IVEC(g, av), di);
1446             siv = IVEC2IS(di);
1447         }
1448         else if (pbc)
1449         {
1450             siv = pbc_dx_aiuc(pbc, x[ai], xv, dx);
1451         }
1452         else
1453         {
1454             siv = CENTRAL;
1455         }
1456         a = ip[ia[i]].vsiten.a;
1457         svmul(a, f[av], fi);
1458         rvec_inc(f[ai], fi);
1459         if (fshift && siv != CENTRAL)
1460         {
1461             rvec_inc(fshift[siv], fi);
1462             rvec_dec(fshift[CENTRAL], fi);
1463         }
1464         /* 6 Flops */
1465     }
1466
1467     return n3;
1468 }
1469
1470
1471 static int vsite_count(const t_ilist *ilist, int ftype)
1472 {
1473     if (ftype == F_VSITEN)
1474     {
1475         return ilist[ftype].nr/3;
1476     }
1477     else
1478     {
1479         return ilist[ftype].nr/(1 + interaction_function[ftype].nratoms);
1480     }
1481 }
1482
1483 static void spread_vsite_f_thread(const gmx_vsite_t *vsite,
1484                                   const rvec x[],
1485                                   rvec f[], rvec *fshift,
1486                                   gmx_bool VirCorr, matrix dxdf,
1487                                   t_iparams ip[], const t_ilist ilist[],
1488                                   const t_graph *g, const t_pbc *pbc_null)
1489 {
1490     const PbcMode            pbcMode   = getPbcMode(pbc_null, vsite);
1491     /* We need another pbc pointer, as with charge groups we switch per vsite */
1492     const t_pbc             *pbc_null2 = pbc_null;
1493     gmx::ArrayRef<const int> vsite_pbc;
1494
1495     /* this loop goes backwards to be able to build *
1496      * higher type vsites from lower types         */
1497     for (int ftype = c_ftypeVsiteEnd - 1; ftype >= c_ftypeVsiteStart; ftype--)
1498     {
1499         if (ilist[ftype].nr == 0)
1500         {
1501             continue;
1502         }
1503
1504         {   // TODO remove me
1505             int            nra = interaction_function[ftype].nratoms;
1506             int            inc = 1 + nra;
1507             int            nr  = ilist[ftype].nr;
1508
1509             const t_iatom *ia = ilist[ftype].iatoms;
1510
1511             if (pbcMode == PbcMode::all)
1512             {
1513                 pbc_null2 = pbc_null;
1514             }
1515             else if (pbcMode == PbcMode::chargeGroup)
1516             {
1517                 if (vsite->vsite_pbc_loc)
1518                 {
1519                     vsite_pbc = (*vsite->vsite_pbc_loc)[ftype - c_ftypeVsiteStart];
1520                 }
1521             }
1522
1523             for (int i = 0; i < nr; )
1524             {
1525                 if (!vsite_pbc.empty())
1526                 {
1527                     if (vsite_pbc[i/(1 + nra)] > -2)
1528                     {
1529                         pbc_null2 = pbc_null;
1530                     }
1531                     else
1532                     {
1533                         pbc_null2 = nullptr;
1534                     }
1535                 }
1536
1537                 int tp = ia[0];
1538
1539                 /* Constants for constructing */
1540                 real a1, b1, c1;
1541                 a1 = ip[tp].vsite.a;
1542                 /* Construct the vsite depending on type */
1543                 switch (ftype)
1544                 {
1545                     case F_VSITE2:
1546                         spread_vsite2(ia, a1, x, f, fshift, pbc_null2, g);
1547                         break;
1548                     case F_VSITE3:
1549                         b1 = ip[tp].vsite.b;
1550                         spread_vsite3(ia, a1, b1, x, f, fshift, pbc_null2, g);
1551                         break;
1552                     case F_VSITE3FD:
1553                         b1 = ip[tp].vsite.b;
1554                         spread_vsite3FD(ia, a1, b1, x, f, fshift, VirCorr, dxdf, pbc_null2, g);
1555                         break;
1556                     case F_VSITE3FAD:
1557                         b1 = ip[tp].vsite.b;
1558                         spread_vsite3FAD(ia, a1, b1, x, f, fshift, VirCorr, dxdf, pbc_null2, g);
1559                         break;
1560                     case F_VSITE3OUT:
1561                         b1 = ip[tp].vsite.b;
1562                         c1 = ip[tp].vsite.c;
1563                         spread_vsite3OUT(ia, a1, b1, c1, x, f, fshift, VirCorr, dxdf, pbc_null2, g);
1564                         break;
1565                     case F_VSITE4FD:
1566                         b1 = ip[tp].vsite.b;
1567                         c1 = ip[tp].vsite.c;
1568                         spread_vsite4FD(ia, a1, b1, c1, x, f, fshift, VirCorr, dxdf, pbc_null2, g);
1569                         break;
1570                     case F_VSITE4FDN:
1571                         b1 = ip[tp].vsite.b;
1572                         c1 = ip[tp].vsite.c;
1573                         spread_vsite4FDN(ia, a1, b1, c1, x, f, fshift, VirCorr, dxdf, pbc_null2, g);
1574                         break;
1575                     case F_VSITEN:
1576                         inc = spread_vsiten(ia, ip, x, f, fshift, pbc_null2, g);
1577                         break;
1578                     default:
1579                         gmx_fatal(FARGS, "No such vsite type %d in %s, line %d",
1580                                   ftype, __FILE__, __LINE__);
1581                 }
1582                 clear_rvec(f[ia[1]]);
1583
1584                 /* Increment loop variables */
1585                 i  += inc;
1586                 ia += inc;
1587             }
1588         }
1589     }
1590 }
1591
1592 /*! \brief Clears the task force buffer elements that are written by task idTask */
1593 static void clearTaskForceBufferUsedElements(InterdependentTask *idTask)
1594 {
1595     int ntask = idTask->spreadTask.size();
1596     for (int ti = 0; ti < ntask; ti++)
1597     {
1598         const AtomIndex *atomList = &idTask->atomIndex[idTask->spreadTask[ti]];
1599         int              natom    = atomList->atom.size();
1600         RVec            *force    = idTask->force.data();
1601         for (int i = 0; i < natom; i++)
1602         {
1603             clear_rvec(force[atomList->atom[i]]);
1604         }
1605     }
1606 }
1607
1608 void spread_vsite_f(const gmx_vsite_t *vsite,
1609                     const rvec * gmx_restrict x,
1610                     rvec * gmx_restrict f, rvec * gmx_restrict fshift,
1611                     gmx_bool VirCorr, matrix vir,
1612                     t_nrnb *nrnb, const t_idef *idef,
1613                     int ePBC, gmx_bool bMolPBC, const t_graph *g, const matrix box,
1614                     const t_commrec *cr, gmx_wallcycle *wcycle)
1615 {
1616     wallcycle_start(wcycle, ewcVSITESPREAD);
1617     const bool useDomdec = vsite->useDomdec;
1618     GMX_ASSERT(!useDomdec || (cr != nullptr && DOMAINDECOMP(cr)), "When vsites are set up with domain decomposition, we need a valid commrec");
1619
1620     t_pbc pbc, *pbc_null;
1621
1622     /* We only need to do pbc when we have inter-cg vsites */
1623     if ((useDomdec || bMolPBC) && vsite->n_intercg_vsite)
1624     {
1625         /* This is wasting some CPU time as we now do this multiple times
1626          * per MD step.
1627          */
1628         pbc_null = set_pbc_dd(&pbc, ePBC, useDomdec ? cr->dd->nc : nullptr, FALSE, box);
1629     }
1630     else
1631     {
1632         pbc_null = nullptr;
1633     }
1634
1635     if (useDomdec)
1636     {
1637         dd_clear_f_vsites(cr->dd, f);
1638     }
1639
1640     if (vsite->nthreads == 1)
1641     {
1642         matrix dxdf;
1643         if (VirCorr)
1644         {
1645             clear_mat(dxdf);
1646         }
1647         spread_vsite_f_thread(vsite,
1648                               x, f, fshift,
1649                               VirCorr, dxdf,
1650                               idef->iparams, idef->il,
1651                               g, pbc_null);
1652
1653         if (VirCorr)
1654         {
1655             for (int i = 0; i < DIM; i++)
1656             {
1657                 for (int j = 0; j < DIM; j++)
1658                 {
1659                     vir[i][j] += -0.5*dxdf[i][j];
1660                 }
1661             }
1662         }
1663     }
1664     else
1665     {
1666         /* First spread the vsites that might depend on non-local vsites */
1667         if (VirCorr)
1668         {
1669             clear_mat(vsite->tData[vsite->nthreads]->dxdf);
1670         }
1671         spread_vsite_f_thread(vsite,
1672                               x, f, fshift,
1673                               VirCorr, vsite->tData[vsite->nthreads]->dxdf,
1674                               idef->iparams,
1675                               vsite->tData[vsite->nthreads]->ilist,
1676                               g, pbc_null);
1677
1678 #pragma omp parallel num_threads(vsite->nthreads)
1679         {
1680             try
1681             {
1682                 int          thread = gmx_omp_get_thread_num();
1683                 VsiteThread &tData  = *vsite->tData[thread];
1684
1685                 rvec        *fshift_t;
1686                 if (thread == 0 || fshift == nullptr)
1687                 {
1688                     fshift_t = fshift;
1689                 }
1690                 else
1691                 {
1692                     fshift_t = tData.fshift;
1693
1694                     for (int i = 0; i < SHIFTS; i++)
1695                     {
1696                         clear_rvec(fshift_t[i]);
1697                     }
1698                 }
1699                 if (VirCorr)
1700                 {
1701                     clear_mat(tData.dxdf);
1702                 }
1703
1704                 if (tData.useInterdependentTask)
1705                 {
1706                     /* Spread the vsites that spread outside our local range.
1707                      * This is done using a thread-local force buffer force.
1708                      * First we need to copy the input vsite forces to force.
1709                      */
1710                     InterdependentTask *idTask = &tData.idTask;
1711
1712                     /* Clear the buffer elements set by our task during
1713                      * the last call to spread_vsite_f.
1714                      */
1715                     clearTaskForceBufferUsedElements(idTask);
1716
1717                     int nvsite = idTask->vsite.size();
1718                     for (int i = 0; i < nvsite; i++)
1719                     {
1720                         copy_rvec(f[idTask->vsite[i]],
1721                                   idTask->force[idTask->vsite[i]]);
1722                     }
1723                     spread_vsite_f_thread(vsite,
1724                                           x, as_rvec_array(idTask->force.data()), fshift_t,
1725                                           VirCorr, tData.dxdf,
1726                                           idef->iparams,
1727                                           tData.idTask.ilist,
1728                                           g, pbc_null);
1729
1730                     /* We need a barrier before reducing forces below
1731                      * that have been produced by a different thread above.
1732                      */
1733 #pragma omp barrier
1734
1735                     /* Loop over all thread task and reduce forces they
1736                      * produced on atoms that fall in our range.
1737                      * Note that atomic reduction would be a simpler solution,
1738                      * but that might not have good support on all platforms.
1739                      */
1740                     int ntask = idTask->reduceTask.size();
1741                     for (int ti = 0; ti < ntask; ti++)
1742                     {
1743                         const InterdependentTask *idt_foreign = &vsite->tData[idTask->reduceTask[ti]]->idTask;
1744                         const AtomIndex          *atomList    = &idt_foreign->atomIndex[thread];
1745                         const RVec               *f_foreign   = idt_foreign->force.data();
1746
1747                         int natom = atomList->atom.size();
1748                         for (int i = 0; i < natom; i++)
1749                         {
1750                             int ind = atomList->atom[i];
1751                             rvec_inc(f[ind], f_foreign[ind]);
1752                             /* Clearing of f_foreign is done at the next step */
1753                         }
1754                     }
1755                     /* Clear the vsite forces, both in f and force */
1756                     for (int i = 0; i < nvsite; i++)
1757                     {
1758                         int ind = tData.idTask.vsite[i];
1759                         clear_rvec(f[ind]);
1760                         clear_rvec(tData.idTask.force[ind]);
1761                     }
1762                 }
1763
1764                 /* Spread the vsites that spread locally only */
1765                 spread_vsite_f_thread(vsite,
1766                                       x, f, fshift_t,
1767                                       VirCorr, tData.dxdf,
1768                                       idef->iparams,
1769                                       tData.ilist,
1770                                       g, pbc_null);
1771             }
1772             GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
1773         }
1774
1775         if (fshift != nullptr)
1776         {
1777             for (int th = 1; th < vsite->nthreads; th++)
1778             {
1779                 for (int i = 0; i < SHIFTS; i++)
1780                 {
1781                     rvec_inc(fshift[i], vsite->tData[th]->fshift[i]);
1782                 }
1783             }
1784         }
1785
1786         if (VirCorr)
1787         {
1788             for (int th = 0; th < vsite->nthreads + 1; th++)
1789             {
1790                 /* MSVC doesn't like matrix references, so we use a pointer */
1791                 const matrix *dxdf = &vsite->tData[th]->dxdf;
1792
1793                 for (int i = 0; i < DIM; i++)
1794                 {
1795                     for (int j = 0; j < DIM; j++)
1796                     {
1797                         vir[i][j] += -0.5*(*dxdf)[i][j];
1798                     }
1799                 }
1800             }
1801         }
1802     }
1803
1804     if (useDomdec)
1805     {
1806         dd_move_f_vsites(cr->dd, f, fshift);
1807     }
1808
1809     inc_nrnb(nrnb, eNR_VSITE2,   vsite_count(idef->il, F_VSITE2));
1810     inc_nrnb(nrnb, eNR_VSITE3,   vsite_count(idef->il, F_VSITE3));
1811     inc_nrnb(nrnb, eNR_VSITE3FD, vsite_count(idef->il, F_VSITE3FD));
1812     inc_nrnb(nrnb, eNR_VSITE3FAD, vsite_count(idef->il, F_VSITE3FAD));
1813     inc_nrnb(nrnb, eNR_VSITE3OUT, vsite_count(idef->il, F_VSITE3OUT));
1814     inc_nrnb(nrnb, eNR_VSITE4FD, vsite_count(idef->il, F_VSITE4FD));
1815     inc_nrnb(nrnb, eNR_VSITE4FDN, vsite_count(idef->il, F_VSITE4FDN));
1816     inc_nrnb(nrnb, eNR_VSITEN,   vsite_count(idef->il, F_VSITEN));
1817
1818     wallcycle_stop(wcycle, ewcVSITESPREAD);
1819 }
1820
1821 /*! \brief Returns the an array with charge-group indices for each atom
1822  *
1823  * \param[in] chargeGroups  The charge group block struct
1824  */
1825 static std::vector<int> atom2cg(const t_block &chargeGroups)
1826 {
1827     std::vector<int> a2cg(chargeGroups.index[chargeGroups.nr], 0);
1828
1829     for (int chargeGroup = 0; chargeGroup < chargeGroups.nr; chargeGroup++)
1830     {
1831         std::fill(a2cg.begin() + chargeGroups.index[chargeGroup],
1832                   a2cg.begin() + chargeGroups.index[chargeGroup + 1],
1833                   chargeGroup);
1834     }
1835
1836     return a2cg;
1837 }
1838
1839 int count_intercg_vsites(const gmx_mtop_t *mtop)
1840 {
1841     int n_intercg_vsite = 0;
1842     for (const gmx_molblock_t &molb : mtop->molblock)
1843     {
1844         const gmx_moltype_t &molt = mtop->moltype[molb.type];
1845
1846         std::vector<int>     a2cg = atom2cg(molt.cgs);
1847         for (int ftype = c_ftypeVsiteStart; ftype < c_ftypeVsiteEnd; ftype++)
1848         {
1849             int            nral = NRAL(ftype);
1850             const t_ilist &il   = molt.ilist[ftype];
1851             const t_iatom *ia   = il.iatoms;
1852             for (int i = 0; i < il.nr; i += 1 + nral)
1853             {
1854                 int cg = a2cg[ia[1+i]];
1855                 for (int a = 1; a < nral; a++)
1856                 {
1857                     if (a2cg[ia[1+a]] != cg)
1858                     {
1859                         n_intercg_vsite += molb.nmol;
1860                         break;
1861                     }
1862                 }
1863             }
1864         }
1865     }
1866
1867     return n_intercg_vsite;
1868 }
1869
1870 static VsitePbc
1871 get_vsite_pbc(const t_iparams *iparams, const t_ilist *ilist,
1872               const t_atom *atom, const t_mdatoms *md,
1873               const t_block &cgs)
1874 {
1875     /* Make an atom to charge group index */
1876     std::vector<int> a2cg = atom2cg(cgs);
1877
1878     /* Make an array that tells if the pbc of an atom is set */
1879     std::vector<bool> pbc_set(cgs.index[cgs.nr], false);
1880     /* PBC is set for all non vsites */
1881     for (int a = 0; a < cgs.index[cgs.nr]; a++)
1882     {
1883         if ((atom && atom[a].ptype != eptVSite) ||
1884             (md   && md->ptype[a]  != eptVSite))
1885         {
1886             pbc_set[a] = true;
1887         }
1888     }
1889
1890     VsitePbc vsite_pbc;
1891
1892     for (int ftype = c_ftypeVsiteStart; ftype < c_ftypeVsiteEnd; ftype++)
1893     {
1894         {   // TODO remove me
1895             int               nral = NRAL(ftype);
1896             const t_ilist    *il   = &ilist[ftype];
1897             const t_iatom    *ia   = il->iatoms;
1898
1899             std::vector<int> &vsite_pbc_f = vsite_pbc[ftype - F_VSITE2];
1900             vsite_pbc_f.resize(il->nr/(1 + nral));
1901
1902             int i = 0;
1903             while (i < il->nr)
1904             {
1905                 int     vsi   = i/(1 + nral);
1906                 t_iatom vsite = ia[i+1];
1907                 int     cg_v  = a2cg[vsite];
1908                 /* A value of -2 signals that this vsite and its contructing
1909                  * atoms are all within the same cg, so no pbc is required.
1910                  */
1911                 vsite_pbc_f[vsi] = -2;
1912                 /* Check if constructing atoms are outside the vsite's cg */
1913                 int nc3 = 0;
1914                 if (ftype == F_VSITEN)
1915                 {
1916                     nc3 = 3*iparams[ia[i]].vsiten.n;
1917                     for (int j = 0; j < nc3; j += 3)
1918                     {
1919                         if (a2cg[ia[i+j+2]] != cg_v)
1920                         {
1921                             vsite_pbc_f[vsi] = -1;
1922                         }
1923                     }
1924                 }
1925                 else
1926                 {
1927                     for (int a = 1; a < nral; a++)
1928                     {
1929                         if (a2cg[ia[i+1+a]] != cg_v)
1930                         {
1931                             vsite_pbc_f[vsi] = -1;
1932                         }
1933                     }
1934                 }
1935                 if (vsite_pbc_f[vsi] == -1)
1936                 {
1937                     /* Check if this is the first processed atom of a vsite only cg */
1938                     gmx_bool bViteOnlyCG_and_FirstAtom = TRUE;
1939                     for (int a = cgs.index[cg_v]; a < cgs.index[cg_v + 1]; a++)
1940                     {
1941                         /* Non-vsites already have pbc set, so simply check for pbc_set */
1942                         if (pbc_set[a])
1943                         {
1944                             bViteOnlyCG_and_FirstAtom = FALSE;
1945                             break;
1946                         }
1947                     }
1948                     if (bViteOnlyCG_and_FirstAtom)
1949                     {
1950                         /* First processed atom of a vsite only charge group.
1951                          * The pbc of the input coordinates to construct_vsites
1952                          * should be preserved.
1953                          */
1954                         vsite_pbc_f[vsi] = vsite;
1955                     }
1956                     else if (cg_v != a2cg[ia[1+i+1]])
1957                     {
1958                         /* This vsite has a different charge group index
1959                          * than it's first constructing atom
1960                          * and the charge group has more than one atom,
1961                          * search for the first normal particle
1962                          * or vsite that already had its pbc defined.
1963                          * If nothing is found, use full pbc for this vsite.
1964                          */
1965                         for (int a = cgs.index[cg_v]; a < cgs.index[cg_v + 1]; a++)
1966                         {
1967                             if (a != vsite && pbc_set[a])
1968                             {
1969                                 vsite_pbc_f[vsi] = a;
1970                                 if (gmx_debug_at)
1971                                 {
1972                                     fprintf(debug, "vsite %d match pbc with atom %d\n",
1973                                             vsite+1, a+1);
1974                                 }
1975                                 break;
1976                             }
1977                         }
1978                         if (gmx_debug_at)
1979                         {
1980                             fprintf(debug, "vsite atom %d  cg %d - %d pbc atom %d\n",
1981                                     vsite+1, cgs.index[cg_v] + 1, cgs.index[cg_v + 1],
1982                                     vsite_pbc_f[vsi] + 1);
1983                         }
1984                     }
1985                 }
1986                 if (ftype == F_VSITEN)
1987                 {
1988                     /* The other entries in vsite_pbc_f are not used for center vsites */
1989                     i += nc3;
1990                 }
1991                 else
1992                 {
1993                     i += 1 + nral;
1994                 }
1995
1996                 /* This vsite now has its pbc defined */
1997                 pbc_set[vsite] = true;
1998             }
1999         }
2000     }
2001
2002     return vsite_pbc;
2003 }
2004
2005
2006 std::unique_ptr<gmx_vsite_t>
2007 initVsite(const gmx_mtop_t &mtop,
2008           const t_commrec  *cr)
2009 {
2010     GMX_RELEASE_ASSERT(cr != nullptr, "We need a valid commrec");
2011
2012     std::unique_ptr<gmx_vsite_t> vsite;
2013
2014     /* check if there are vsites */
2015     int nvsite = 0;
2016     for (int ftype = 0; ftype < F_NRE; ftype++)
2017     {
2018         if (interaction_function[ftype].flags & IF_VSITE)
2019         {
2020             GMX_ASSERT(ftype >= c_ftypeVsiteStart && ftype < c_ftypeVsiteEnd, "c_ftypeVsiteStart and/or c_ftypeVsiteEnd do not have correct values");
2021
2022             nvsite += gmx_mtop_ftype_count(&mtop, ftype);
2023         }
2024         else
2025         {
2026             GMX_ASSERT(ftype < c_ftypeVsiteStart || ftype >= c_ftypeVsiteEnd, "c_ftypeVsiteStart and/or c_ftypeVsiteEnd do not have correct values");
2027         }
2028     }
2029
2030     if (nvsite == 0)
2031     {
2032         return vsite;
2033     }
2034
2035     vsite = gmx::compat::make_unique<gmx_vsite_t>();
2036
2037     vsite->n_intercg_vsite   = count_intercg_vsites(&mtop);
2038
2039     vsite->bHaveChargeGroups = (ncg_mtop(&mtop) < mtop.natoms);
2040
2041     vsite->useDomdec         = (DOMAINDECOMP(cr) && cr->dd->nnodes > 1);
2042
2043     /* If we don't have charge groups, the vsite follows its own pbc.
2044      *
2045      * With charge groups, each vsite needs to follow the pbc of the charge
2046      * group. Thus we need to keep track of PBC. Here we assume that without
2047      * domain decomposition all molecules are whole (which will not be
2048      * the case with periodic molecules).
2049      */
2050     if (vsite->bHaveChargeGroups &&
2051         vsite->n_intercg_vsite > 0 &&
2052         DOMAINDECOMP(cr))
2053     {
2054         vsite->vsite_pbc_molt.resize(mtop.moltype.size());
2055         for (size_t mt = 0; mt < mtop.moltype.size(); mt++)
2056         {
2057             const gmx_moltype_t &molt = mtop.moltype[mt];
2058             vsite->vsite_pbc_molt[mt] = get_vsite_pbc(mtop.ffparams.iparams,
2059                                                       molt.ilist,
2060                                                       molt.atoms.atom, nullptr,
2061                                                       molt.cgs);
2062         }
2063
2064         vsite->vsite_pbc_loc = gmx::compat::make_unique<VsitePbc>();
2065     }
2066
2067     vsite->nthreads = gmx_omp_nthreads_get(emntVSITE);
2068
2069     if (vsite->nthreads > 1)
2070     {
2071         /* We need one extra thread data structure for the overlap vsites */
2072         vsite->tData.resize(vsite->nthreads + 1);
2073 #pragma omp parallel for num_threads(vsite->nthreads) schedule(static)
2074         for (int thread = 0; thread < vsite->nthreads; thread++)
2075         {
2076             try
2077             {
2078                 vsite->tData[thread] = gmx::compat::make_unique<VsiteThread>();
2079
2080                 InterdependentTask &idTask = vsite->tData[thread]->idTask;
2081                 idTask.nuse                = 0;
2082                 idTask.atomIndex.resize(vsite->nthreads);
2083             }
2084             GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
2085         }
2086         if (vsite->nthreads > 1)
2087         {
2088             vsite->tData[vsite->nthreads] = gmx::compat::make_unique<VsiteThread>();
2089         }
2090     }
2091
2092     return vsite;
2093 }
2094
2095 gmx_vsite_t::gmx_vsite_t()
2096 {
2097 }
2098
2099 gmx_vsite_t::~gmx_vsite_t()
2100 {
2101 }
2102
2103 static inline void flagAtom(InterdependentTask *idTask, int atom,
2104                             int thread, int nthread, int natperthread)
2105 {
2106     if (!idTask->use[atom])
2107     {
2108         idTask->use[atom] = true;
2109         thread            = atom/natperthread;
2110         /* Assign all non-local atom force writes to thread 0 */
2111         if (thread >= nthread)
2112         {
2113             thread        = 0;
2114         }
2115         idTask->atomIndex[thread].atom.push_back(atom);
2116     }
2117 }
2118
2119 /*\brief Here we try to assign all vsites that are in our local range.
2120  *
2121  * Our task local atom range is tData->rangeStart - tData->rangeEnd.
2122  * Vsites that depend only on local atoms, as indicated by taskIndex[]==thread,
2123  * are assigned to task tData->ilist. Vsites that depend on non-local atoms
2124  * but not on other vsites are assigned to task tData->id_task.ilist.
2125  * taskIndex[] is set for all vsites in our range, either to our local tasks
2126  * or to the single last task as taskIndex[]=2*nthreads.
2127  */
2128 static void assignVsitesToThread(VsiteThread           *tData,
2129                                  int                    thread,
2130                                  int                    nthread,
2131                                  int                    natperthread,
2132                                  gmx::ArrayRef<int>     taskIndex,
2133                                  const t_ilist         *ilist,
2134                                  const t_iparams       *ip,
2135                                  const unsigned short  *ptype)
2136 {
2137     for (int ftype = c_ftypeVsiteStart; ftype < c_ftypeVsiteEnd; ftype++)
2138     {
2139         tData->ilist[ftype].nr        = 0;
2140         tData->idTask.ilist[ftype].nr = 0;
2141
2142         int      nral1 = 1 + NRAL(ftype);
2143         int      inc   = nral1;
2144         t_iatom *iat   = ilist[ftype].iatoms;
2145         for (int i = 0; i < ilist[ftype].nr; )
2146         {
2147             if (ftype == F_VSITEN)
2148             {
2149                 /* The 3 below is from 1+NRAL(ftype)=3 */
2150                 inc = ip[iat[i]].vsiten.n*3;
2151             }
2152
2153             if (iat[1 + i] <  tData->rangeStart ||
2154                 iat[1 + i] >= tData->rangeEnd)
2155             {
2156                 /* This vsite belongs to a different thread */
2157                 i += inc;
2158                 continue;
2159             }
2160
2161             /* We would like to assign this vsite to task thread,
2162              * but it might depend on atoms outside the atom range of thread
2163              * or on another vsite not assigned to task thread.
2164              */
2165             int task = thread;
2166             if (ftype != F_VSITEN)
2167             {
2168                 for (int j = i + 2; j < i + nral1; j++)
2169                 {
2170                     /* Do a range check to avoid a harmless race on taskIndex */
2171                     if (iat[j] <  tData->rangeStart ||
2172                         iat[j] >= tData->rangeEnd ||
2173                         taskIndex[iat[j]] != thread)
2174                     {
2175                         if (!tData->useInterdependentTask ||
2176                             ptype[iat[j]] == eptVSite)
2177                         {
2178                             /* At least one constructing atom is a vsite
2179                              * that is not assigned to the same thread.
2180                              * Put this vsite into a separate task.
2181                              */
2182                             task = 2*nthread;
2183                             break;
2184                         }
2185
2186                         /* There are constructing atoms outside our range,
2187                          * put this vsite into a second task to be executed
2188                          * on the same thread. During construction no barrier
2189                          * is needed between the two tasks on the same thread.
2190                          * During spreading we need to run this task with
2191                          * an additional thread-local intermediate force buffer
2192                          * (or atomic reduction) and a barrier between the two
2193                          * tasks.
2194                          */
2195                         task = nthread + thread;
2196                     }
2197                 }
2198             }
2199             else
2200             {
2201                 for (int j = i + 2; j < i + inc; j += 3)
2202                 {
2203                     /* Do a range check to avoid a harmless race on taskIndex */
2204                     if (iat[j] <  tData->rangeStart ||
2205                         iat[j] >= tData->rangeEnd ||
2206                         taskIndex[iat[j]] != thread)
2207                     {
2208                         GMX_ASSERT(ptype[iat[j]] != eptVSite, "A vsite to be assigned in assignVsitesToThread has a vsite as a constructing atom that does not belong to our task, such vsites should be assigned to the single 'master' task");
2209
2210                         task = nthread + thread;
2211                     }
2212                 }
2213             }
2214
2215             /* Update this vsite's thread index entry */
2216             taskIndex[iat[1+i]] = task;
2217
2218             if (task == thread || task == nthread + thread)
2219             {
2220                 /* Copy this vsite to the thread data struct of thread */
2221                 t_ilist *il_task;
2222                 if (task == thread)
2223                 {
2224                     il_task = &tData->ilist[ftype];
2225                 }
2226                 else
2227                 {
2228                     il_task = &tData->idTask.ilist[ftype];
2229                 }
2230                 /* Ensure we have sufficient memory allocated */
2231                 if (il_task->nr + inc > il_task->nalloc)
2232                 {
2233                     il_task->nalloc = over_alloc_large(il_task->nr + inc);
2234                     srenew(il_task->iatoms, il_task->nalloc);
2235                 }
2236                 /* Copy the vsite data to the thread-task local array */
2237                 for (int j = i; j < i + inc; j++)
2238                 {
2239                     il_task->iatoms[il_task->nr++] = iat[j];
2240                 }
2241                 if (task == nthread + thread)
2242                 {
2243                     /* This vsite write outside our own task force block.
2244                      * Put it into the interdependent task list and flag
2245                      * the atoms involved for reduction.
2246                      */
2247                     tData->idTask.vsite.push_back(iat[i + 1]);
2248                     if (ftype != F_VSITEN)
2249                     {
2250                         for (int j = i + 2; j < i + nral1; j++)
2251                         {
2252                             flagAtom(&tData->idTask, iat[j],
2253                                      thread, nthread, natperthread);
2254                         }
2255                     }
2256                     else
2257                     {
2258                         for (int j = i + 2; j < i + inc; j += 3)
2259                         {
2260                             flagAtom(&tData->idTask, iat[j],
2261                                      thread, nthread, natperthread);
2262                         }
2263                     }
2264                 }
2265             }
2266
2267             i += inc;
2268         }
2269     }
2270 }
2271
2272 /*! \brief Assign all vsites with taskIndex[]==task to task tData */
2273 static void assignVsitesToSingleTask(VsiteThread              *tData,
2274                                      int                       task,
2275                                      gmx::ArrayRef<const int>  taskIndex,
2276                                      const t_ilist            *ilist,
2277                                      const t_iparams          *ip)
2278 {
2279     for (int ftype = c_ftypeVsiteStart; ftype < c_ftypeVsiteEnd; ftype++)
2280     {
2281         tData->ilist[ftype].nr        = 0;
2282         tData->idTask.ilist[ftype].nr = 0;
2283
2284         int      nral1   = 1 + NRAL(ftype);
2285         int      inc     = nral1;
2286         t_iatom *iat     = ilist[ftype].iatoms;
2287         t_ilist *il_task = &tData->ilist[ftype];
2288
2289         for (int i = 0; i < ilist[ftype].nr; )
2290         {
2291             if (ftype == F_VSITEN)
2292             {
2293                 /* The 3 below is from 1+NRAL(ftype)=3 */
2294                 inc = ip[iat[i]].vsiten.n*3;
2295             }
2296             /* Check if the vsite is assigned to our task */
2297             if (taskIndex[iat[1 + i]] == task)
2298             {
2299                 /* Ensure we have sufficient memory allocated */
2300                 if (il_task->nr + inc > il_task->nalloc)
2301                 {
2302                     il_task->nalloc = over_alloc_large(il_task->nr + inc);
2303                     srenew(il_task->iatoms, il_task->nalloc);
2304                 }
2305                 /* Copy the vsite data to the thread-task local array */
2306                 for (int j = i; j < i + inc; j++)
2307                 {
2308                     il_task->iatoms[il_task->nr++] = iat[j];
2309                 }
2310             }
2311
2312             i += inc;
2313         }
2314     }
2315 }
2316
2317 void split_vsites_over_threads(const t_ilist   *ilist,
2318                                const t_iparams *ip,
2319                                const t_mdatoms *mdatoms,
2320                                gmx_vsite_t     *vsite)
2321 {
2322     int      vsite_atom_range, natperthread;
2323
2324     if (vsite->nthreads == 1)
2325     {
2326         /* Nothing to do */
2327         return;
2328     }
2329
2330     /* The current way of distributing the vsites over threads in primitive.
2331      * We divide the atom range 0 - natoms_in_vsite uniformly over threads,
2332      * without taking into account how the vsites are distributed.
2333      * Without domain decomposition we at least tighten the upper bound
2334      * of the range (useful for common systems such as a vsite-protein
2335      * in 3-site water).
2336      * With domain decomposition, as long as the vsites are distributed
2337      * uniformly in each domain along the major dimension, usually x,
2338      * it will also perform well.
2339      */
2340     if (!vsite->useDomdec)
2341     {
2342         vsite_atom_range = -1;
2343         for (int ftype = c_ftypeVsiteStart; ftype < c_ftypeVsiteEnd; ftype++)
2344         {
2345             {   // TODO remove me
2346                 if (ftype != F_VSITEN)
2347                 {
2348                     int            nral1 = 1 + NRAL(ftype);
2349                     const t_iatom *iat   = ilist[ftype].iatoms;
2350                     for (int i = 0; i < ilist[ftype].nr; i += nral1)
2351                     {
2352                         for (int j = i + 1; j < i + nral1; j++)
2353                         {
2354                             vsite_atom_range = std::max(vsite_atom_range, iat[j]);
2355                         }
2356                     }
2357                 }
2358                 else
2359                 {
2360                     int            vs_ind_end;
2361
2362                     const t_iatom *iat = ilist[ftype].iatoms;
2363
2364                     int            i = 0;
2365                     while (i < ilist[ftype].nr)
2366                     {
2367                         /* The 3 below is from 1+NRAL(ftype)=3 */
2368                         vs_ind_end = i + ip[iat[i]].vsiten.n*3;
2369
2370                         vsite_atom_range = std::max(vsite_atom_range, iat[i+1]);
2371                         while (i < vs_ind_end)
2372                         {
2373                             vsite_atom_range = std::max(vsite_atom_range, iat[i+2]);
2374                             i               += 3;
2375                         }
2376                     }
2377                 }
2378             }
2379         }
2380         vsite_atom_range++;
2381         natperthread     = (vsite_atom_range + vsite->nthreads - 1)/vsite->nthreads;
2382     }
2383     else
2384     {
2385         /* Any local or not local atom could be involved in virtual sites.
2386          * But since we usually have very few non-local virtual sites
2387          * (only non-local vsites that depend on local vsites),
2388          * we distribute the local atom range equally over the threads.
2389          * When assigning vsites to threads, we should take care that the last
2390          * threads also covers the non-local range.
2391          */
2392         vsite_atom_range = mdatoms->nr;
2393         natperthread     = (mdatoms->homenr + vsite->nthreads - 1)/vsite->nthreads;
2394     }
2395
2396     if (debug)
2397     {
2398         fprintf(debug, "virtual site thread dist: natoms %d, range %d, natperthread %d\n", mdatoms->nr, vsite_atom_range, natperthread);
2399     }
2400
2401     /* To simplify the vsite assignment, we make an index which tells us
2402      * to which task particles, both non-vsites and vsites, are assigned.
2403      */
2404     vsite->taskIndex.resize(mdatoms->nr);
2405
2406     /* Initialize the task index array. Here we assign the non-vsite
2407      * particles to task=thread, so we easily figure out if vsites
2408      * depend on local and/or non-local particles in assignVsitesToThread.
2409      */
2410     gmx::ArrayRef<int> taskIndex = vsite->taskIndex;
2411     {
2412         int  thread = 0;
2413         for (int i = 0; i < mdatoms->nr; i++)
2414         {
2415             if (mdatoms->ptype[i] == eptVSite)
2416             {
2417                 /* vsites are not assigned to a task yet */
2418                 taskIndex[i] = -1;
2419             }
2420             else
2421             {
2422                 /* assign non-vsite particles to task thread */
2423                 taskIndex[i] = thread;
2424             }
2425             if (i == (thread + 1)*natperthread && thread < vsite->nthreads)
2426             {
2427                 thread++;
2428             }
2429         }
2430     }
2431
2432 #pragma omp parallel num_threads(vsite->nthreads)
2433     {
2434         try
2435         {
2436             int          thread = gmx_omp_get_thread_num();
2437             VsiteThread &tData  = *vsite->tData[thread];
2438
2439             /* Clear the buffer use flags that were set before */
2440             if (tData.useInterdependentTask)
2441             {
2442                 InterdependentTask &idTask = tData.idTask;
2443
2444                 /* To avoid an extra OpenMP barrier in spread_vsite_f,
2445                  * we clear the force buffer at the next step,
2446                  * so we need to do it here as well.
2447                  */
2448                 clearTaskForceBufferUsedElements(&idTask);
2449
2450                 idTask.vsite.resize(0);
2451                 for (int t = 0; t < vsite->nthreads; t++)
2452                 {
2453                     AtomIndex &atomIndex = idTask.atomIndex[t];
2454                     int        natom     = atomIndex.atom.size();
2455                     for (int i = 0; i < natom; i++)
2456                     {
2457                         idTask.use[atomIndex.atom[i]] = false;
2458                     }
2459                     atomIndex.atom.resize(0);
2460                 }
2461                 idTask.nuse = 0;
2462             }
2463
2464             /* To avoid large f_buf allocations of #threads*vsite_atom_range
2465              * we don't use task2 with more than 200000 atoms. This doesn't
2466              * affect performance, since with such a large range relatively few
2467              * vsites will end up in the separate task.
2468              * Note that useTask2 should be the same for all threads.
2469              */
2470             tData.useInterdependentTask = (vsite_atom_range <= 200000);
2471             if (tData.useInterdependentTask)
2472             {
2473                 size_t              natoms_use_in_vsites = vsite_atom_range;
2474                 InterdependentTask &idTask               = tData.idTask;
2475                 /* To avoid resizing and re-clearing every nstlist steps,
2476                  * we never down size the force buffer.
2477                  */
2478                 if (natoms_use_in_vsites > idTask.force.size() ||
2479                     natoms_use_in_vsites > idTask.use.size())
2480                 {
2481                     idTask.force.resize(natoms_use_in_vsites, { 0, 0, 0 });
2482                     idTask.use.resize(natoms_use_in_vsites, false);
2483                 }
2484             }
2485
2486             /* Assign all vsites that can execute independently on threads */
2487             tData.rangeStart     =  thread     *natperthread;
2488             if (thread < vsite->nthreads - 1)
2489             {
2490                 tData.rangeEnd   = (thread + 1)*natperthread;
2491             }
2492             else
2493             {
2494                 /* The last thread should cover up to the end of the range */
2495                 tData.rangeEnd   = mdatoms->nr;
2496             }
2497             assignVsitesToThread(&tData,
2498                                  thread, vsite->nthreads,
2499                                  natperthread,
2500                                  taskIndex,
2501                                  ilist, ip, mdatoms->ptype);
2502
2503             if (tData.useInterdependentTask)
2504             {
2505                 /* In the worst case, all tasks write to force ranges of
2506                  * all other tasks, leading to #tasks^2 scaling (this is only
2507                  * the overhead, the actual flops remain constant).
2508                  * But in most cases there is far less coupling. To improve
2509                  * scaling at high thread counts we therefore construct
2510                  * an index to only loop over the actually affected tasks.
2511                  */
2512                 InterdependentTask &idTask = tData.idTask;
2513
2514                 /* Ensure assignVsitesToThread finished on other threads */
2515 #pragma omp barrier
2516
2517                 idTask.spreadTask.resize(0);
2518                 idTask.reduceTask.resize(0);
2519                 for (int t = 0; t < vsite->nthreads; t++)
2520                 {
2521                     /* Do we write to the force buffer of task t? */
2522                     if (!idTask.atomIndex[t].atom.empty())
2523                     {
2524                         idTask.spreadTask.push_back(t);
2525                     }
2526                     /* Does task t write to our force buffer? */
2527                     if (!vsite->tData[t]->idTask.atomIndex[thread].atom.empty())
2528                     {
2529                         idTask.reduceTask.push_back(t);
2530                     }
2531                 }
2532             }
2533         }
2534         GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
2535     }
2536     /* Assign all remaining vsites, that will have taskIndex[]=2*vsite->nthreads,
2537      * to a single task that will not run in parallel with other tasks.
2538      */
2539     assignVsitesToSingleTask(vsite->tData[vsite->nthreads].get(),
2540                              2*vsite->nthreads,
2541                              taskIndex,
2542                              ilist, ip);
2543
2544     if (debug && vsite->nthreads > 1)
2545     {
2546         fprintf(debug, "virtual site useInterdependentTask %d, nuse:\n",
2547                 static_cast<int>(vsite->tData[0]->useInterdependentTask));
2548         for (int th = 0; th < vsite->nthreads + 1; th++)
2549         {
2550             fprintf(debug, " %4d", vsite->tData[th]->idTask.nuse);
2551         }
2552         fprintf(debug, "\n");
2553
2554         for (int ftype = c_ftypeVsiteStart; ftype < c_ftypeVsiteEnd; ftype++)
2555         {
2556             if (ilist[ftype].nr > 0)
2557             {
2558                 fprintf(debug, "%-20s thread dist:",
2559                         interaction_function[ftype].longname);
2560                 for (int th = 0; th < vsite->nthreads + 1; th++)
2561                 {
2562                     fprintf(debug, " %4d %4d ",
2563                             vsite->tData[th]->ilist[ftype].nr,
2564                             vsite->tData[th]->idTask.ilist[ftype].nr);
2565                 }
2566                 fprintf(debug, "\n");
2567             }
2568         }
2569     }
2570
2571 #ifndef NDEBUG
2572     int nrOrig     = vsiteIlistNrCount(ilist);
2573     int nrThreaded = 0;
2574     for (int th = 0; th < vsite->nthreads + 1; th++)
2575     {
2576         nrThreaded +=
2577             vsiteIlistNrCount(vsite->tData[th]->ilist) +
2578             vsiteIlistNrCount(vsite->tData[th]->idTask.ilist);
2579     }
2580     GMX_ASSERT(nrThreaded == nrOrig, "The number of virtual sites assigned to all thread task has to match the total number of virtual sites");
2581 #endif
2582 }
2583
2584 void set_vsite_top(gmx_vsite_t          *vsite,
2585                    const gmx_localtop_t *top,
2586                    const t_mdatoms      *md)
2587 {
2588     if (vsite->n_intercg_vsite > 0 && vsite->bHaveChargeGroups)
2589     {
2590         vsite->vsite_pbc_loc  = gmx::compat::make_unique<VsitePbc>();
2591         *vsite->vsite_pbc_loc = get_vsite_pbc(top->idef.iparams,
2592                                               top->idef.il, nullptr, md,
2593                                               top->cgs);
2594     }
2595
2596     if (vsite->nthreads > 1)
2597     {
2598         if (vsite->bHaveChargeGroups)
2599         {
2600             gmx_fatal(FARGS, "The combination of threading, virtual sites and charge groups is not implemented");
2601         }
2602
2603         split_vsites_over_threads(top->idef.il, top->idef.iparams,
2604                                   md, vsite);
2605     }
2606 }