src/gromacs/mdlib/vsite.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   5  * Copyright (c) 2001-2004, The GROMACS development team.
   6  * Copyright (c) 2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by
   7  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   8  * and including many others, as listed in the AUTHORS file in the
   9  * top-level source directory and at http://www.gromacs.org.
  10  *
  11  * GROMACS is free software; you can redistribute it and/or
  12  * modify it under the terms of the GNU Lesser General Public License
  13  * as published by the Free Software Foundation; either version 2.1
  14  * of the License, or (at your option) any later version.
  15  *
  16  * GROMACS is distributed in the hope that it will be useful,
  17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  19  * Lesser General Public License for more details.
  20  *
  21  * You should have received a copy of the GNU Lesser General Public
  22  * License along with GROMACS; if not, see
  23  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  24  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  25  *
  26  * If you want to redistribute modifications to GROMACS, please
  27  * consider that scientific software is very special. Version
  28  * control is crucial - bugs must be traceable. We will be happy to
  29  * consider code for inclusion in the official distribution, but
  30  * derived work must not be called official GROMACS. Details are found
  31  * in the README & COPYING files - if they are missing, get the
  32  * official version at http://www.gromacs.org.
  33  *
  34  * To help us fund GROMACS development, we humbly ask that you cite
  35  * the research papers on the package. Check out http://www.gromacs.org.
  36  */
  37 #include "gmxpre.h"
  38
  39 #include "vsite.h"
  40
  41 #include <cstdio>
  42
  43 #include <algorithm>
  44 #include <memory>
  45 #include <vector>
  46
  47 #include "gromacs/domdec/domdec.h"
  48 #include "gromacs/domdec/domdec_struct.h"
  49 #include "gromacs/gmxlib/network.h"
  50 #include "gromacs/gmxlib/nrnb.h"
  51 #include "gromacs/math/functions.h"
  52 #include "gromacs/math/vec.h"
  53 #include "gromacs/mdlib/gmx_omp_nthreads.h"
  54 #include "gromacs/mdtypes/commrec.h"
  55 #include "gromacs/mdtypes/mdatom.h"
  56 #include "gromacs/pbcutil/ishift.h"
  57 #include "gromacs/pbcutil/mshift.h"
  58 #include "gromacs/pbcutil/pbc.h"
  59 #include "gromacs/timing/wallcycle.h"
  60 #include "gromacs/topology/ifunc.h"
  61 #include "gromacs/topology/mtop_util.h"
  62 #include "gromacs/topology/topology.h"
  63 #include "gromacs/utility/exceptions.h"
  64 #include "gromacs/utility/fatalerror.h"
  65 #include "gromacs/utility/gmxassert.h"
  66 #include "gromacs/utility/gmxomp.h"
  67
  68 /* The strategy used here for assigning virtual sites to (thread-)tasks
  69  * is as follows:
  70  *
  71  * We divide the atom range that vsites operate on (natoms_local with DD,
  72  * 0 - last atom involved in vsites without DD) equally over all threads.
  73  *
  74  * Vsites in the local range constructed from atoms in the local range
  75  * and/or other vsites that are fully local are assigned to a simple,
  76  * independent task.
  77  *
  78  * Vsites that are not assigned after using the above criterion get assigned
  79  * to a so called "interdependent" thread task when none of the constructing
  80  * atoms is a vsite. These tasks are called interdependent, because one task
  81  * accesses atoms assigned to a different task/thread.
  82  * Note that this option is turned off with large (local) atom counts
  83  * to avoid high memory usage.
  84  *
  85  * Any remaining vsites are assigned to a separate master thread task.
  86  */
  87
  88 using gmx::RVec;
  89
  90 static void init_ilist(t_ilist *ilist)
  91 {
  92     for (int i = 0; i < F_NRE; i++)
  93     {
  94         ilist[i].nr     = 0;
  95         ilist[i].nalloc = 0;
  96         ilist[i].iatoms = nullptr;
  97     }
  98 }
  99
 100 /*! \brief List of atom indices belonging to a task */
 101 struct AtomIndex {
 102     //! List of atom indices
 103     std::vector<int> atom;
 104 };
 105
 106 /*! \brief Data structure for thread tasks that use constructing atoms outside their own atom range */
 107 struct InterdependentTask
 108 {
 109     //! The interaction lists, only vsite entries are used
 110     t_ilist                ilist[F_NRE];
 111     //! Thread/task-local force buffer
 112     std::vector<RVec>      force;
 113     //! The atom indices of the vsites of our task
 114     std::vector<int>       vsite;
 115     //! Flags if elements in force are spread to or not
 116     std::vector<bool>      use;
 117     //! The number of entries set to true in use
 118     int                    nuse;
 119     //! Array of atoms indices, size nthreads, covering all nuse set elements in use
 120     std::vector<AtomIndex> atomIndex;
 121     //! List of tasks (force blocks) this task spread forces to
 122     std::vector<int>       spreadTask;
 123     //! List of tasks that write to this tasks force block range
 124     std::vector<int>       reduceTask;
 125
 126     InterdependentTask()
 127     {
 128         init_ilist(ilist);
 129         nuse = 0;
 130     }
 131 };
 132
 133 /*! \brief Vsite thread task data structure */
 134 struct VsiteThread
 135 {
 136     //! Start of atom range of this task
 137     int                rangeStart;
 138     //! End of atom range of this task
 139     int                rangeEnd;
 140     //! The interaction lists, only vsite entries are used
 141     t_ilist            ilist[F_NRE];
 142     //! Local fshift accumulation buffer
 143     rvec               fshift[SHIFTS];
 144     //! Local virial dx*df accumulation buffer
 145     matrix             dxdf;
 146     //! Tells if interdependent task idTask should be used (in addition to the rest of this task), this bool has the same value on all threads
 147     bool               useInterdependentTask;
 148     //! Data for vsites that involve constructing atoms in the atom range of other threads/tasks
 149     InterdependentTask idTask;
 150
 151     /*! \brief Constructor */
 152     VsiteThread()
 153     {
 154         rangeStart            = -1;
 155         rangeEnd              = -1;
 156         init_ilist(ilist);
 157         clear_rvecs(SHIFTS, fshift);
 158         clear_mat(dxdf);
 159         useInterdependentTask = false;
 160     }
 161 };
 162
 163 /*! \brief Returns the sum of the vsite ilist sizes over all vsite types
 164  *
 165  * \param[in] ilist  The interaction list
 166  */
 167 template <typename T>
 168 static int vsiteIlistNrCount(const T *ilist)
 169 {
 170     int nr = 0;
 171     for (int ftype = c_ftypeVsiteStart; ftype < c_ftypeVsiteEnd; ftype++)
 172     {
 173         nr += ilist[ftype].size();
 174     }
 175
 176     return nr;
 177 }
 178
 179 static int pbc_rvec_sub(const t_pbc *pbc, const rvec xi, const rvec xj, rvec dx)
 180 {
 181     if (pbc)
 182     {
 183         return pbc_dx_aiuc(pbc, xi, xj, dx);
 184     }
 185     else
 186     {
 187         rvec_sub(xi, xj, dx);
 188         return CENTRAL;
 189     }
 190 }
 191
 192 /* Vsite construction routines */
 193
 194 static void constr_vsite2(const rvec xi, const rvec xj, rvec x, real a, const t_pbc *pbc)
 195 {
 196     real b = 1 - a;
 197     /* 1 flop */
 198
 199     if (pbc)
 200     {
 201         rvec dx;
 202         pbc_dx_aiuc(pbc, xj, xi, dx);
 203         x[XX] = xi[XX] + a*dx[XX];
 204         x[YY] = xi[YY] + a*dx[YY];
 205         x[ZZ] = xi[ZZ] + a*dx[ZZ];
 206     }
 207     else
 208     {
 209         x[XX] = b*xi[XX] + a*xj[XX];
 210         x[YY] = b*xi[YY] + a*xj[YY];
 211         x[ZZ] = b*xi[ZZ] + a*xj[ZZ];
 212         /* 9 Flops */
 213     }
 214
 215     /* TOTAL: 10 flops */
 216 }
 217
 218 static void constr_vsite3(const rvec xi, const rvec xj, const rvec xk, rvec x, real a, real b,
 219                           const t_pbc *pbc)
 220 {
 221     real c = 1 - a - b;
 222     /* 2 flops */
 223
 224     if (pbc)
 225     {
 226         rvec dxj, dxk;
 227
 228         pbc_dx_aiuc(pbc, xj, xi, dxj);
 229         pbc_dx_aiuc(pbc, xk, xi, dxk);
 230         x[XX] = xi[XX] + a*dxj[XX] + b*dxk[XX];
 231         x[YY] = xi[YY] + a*dxj[YY] + b*dxk[YY];
 232         x[ZZ] = xi[ZZ] + a*dxj[ZZ] + b*dxk[ZZ];
 233     }
 234     else
 235     {
 236         x[XX] = c*xi[XX] + a*xj[XX] + b*xk[XX];
 237         x[YY] = c*xi[YY] + a*xj[YY] + b*xk[YY];
 238         x[ZZ] = c*xi[ZZ] + a*xj[ZZ] + b*xk[ZZ];
 239         /* 15 Flops */
 240     }
 241
 242     /* TOTAL: 17 flops */
 243 }
 244
 245 static void constr_vsite3FD(const rvec xi, const rvec xj, const rvec xk, rvec x, real a, real b,
 246                             const t_pbc *pbc)
 247 {
 248     rvec xij, xjk, temp;
 249     real c;
 250
 251     pbc_rvec_sub(pbc, xj, xi, xij);
 252     pbc_rvec_sub(pbc, xk, xj, xjk);
 253     /* 6 flops */
 254
 255     /* temp goes from i to a point on the line jk */
 256     temp[XX] = xij[XX] + a*xjk[XX];
 257     temp[YY] = xij[YY] + a*xjk[YY];
 258     temp[ZZ] = xij[ZZ] + a*xjk[ZZ];
 259     /* 6 flops */
 260
 261     c = b*gmx::invsqrt(iprod(temp, temp));
 262     /* 6 + 10 flops */
 263
 264     x[XX] = xi[XX] + c*temp[XX];
 265     x[YY] = xi[YY] + c*temp[YY];
 266     x[ZZ] = xi[ZZ] + c*temp[ZZ];
 267     /* 6 Flops */
 268
 269     /* TOTAL: 34 flops */
 270 }
 271
 272 static void constr_vsite3FAD(const rvec xi, const rvec xj, const rvec xk, rvec x, real a, real b, const t_pbc *pbc)
 273 {
 274     rvec xij, xjk, xp;
 275     real a1, b1, c1, invdij;
 276
 277     pbc_rvec_sub(pbc, xj, xi, xij);
 278     pbc_rvec_sub(pbc, xk, xj, xjk);
 279     /* 6 flops */
 280
 281     invdij = gmx::invsqrt(iprod(xij, xij));
 282     c1     = invdij * invdij * iprod(xij, xjk);
 283     xp[XX] = xjk[XX] - c1*xij[XX];
 284     xp[YY] = xjk[YY] - c1*xij[YY];
 285     xp[ZZ] = xjk[ZZ] - c1*xij[ZZ];
 286     a1     = a*invdij;
 287     b1     = b*gmx::invsqrt(iprod(xp, xp));
 288     /* 45 */
 289
 290     x[XX] = xi[XX] + a1*xij[XX] + b1*xp[XX];
 291     x[YY] = xi[YY] + a1*xij[YY] + b1*xp[YY];
 292     x[ZZ] = xi[ZZ] + a1*xij[ZZ] + b1*xp[ZZ];
 293     /* 12 Flops */
 294
 295     /* TOTAL: 63 flops */
 296 }
 297
 298 static void constr_vsite3OUT(const rvec xi, const rvec xj, const rvec xk, rvec x,
 299                              real a, real b, real c, const t_pbc *pbc)
 300 {
 301     rvec xij, xik, temp;
 302
 303     pbc_rvec_sub(pbc, xj, xi, xij);
 304     pbc_rvec_sub(pbc, xk, xi, xik);
 305     cprod(xij, xik, temp);
 306     /* 15 Flops */
 307
 308     x[XX] = xi[XX] + a*xij[XX] + b*xik[XX] + c*temp[XX];
 309     x[YY] = xi[YY] + a*xij[YY] + b*xik[YY] + c*temp[YY];
 310     x[ZZ] = xi[ZZ] + a*xij[ZZ] + b*xik[ZZ] + c*temp[ZZ];
 311     /* 18 Flops */
 312
 313     /* TOTAL: 33 flops */
 314 }
 315
 316 static void constr_vsite4FD(const rvec xi, const rvec xj, const rvec xk, const rvec xl, rvec x,
 317                             real a, real b, real c, const t_pbc *pbc)
 318 {
 319     rvec xij, xjk, xjl, temp;
 320     real d;
 321
 322     pbc_rvec_sub(pbc, xj, xi, xij);
 323     pbc_rvec_sub(pbc, xk, xj, xjk);
 324     pbc_rvec_sub(pbc, xl, xj, xjl);
 325     /* 9 flops */
 326
 327     /* temp goes from i to a point on the plane jkl */
 328     temp[XX] = xij[XX] + a*xjk[XX] + b*xjl[XX];
 329     temp[YY] = xij[YY] + a*xjk[YY] + b*xjl[YY];
 330     temp[ZZ] = xij[ZZ] + a*xjk[ZZ] + b*xjl[ZZ];
 331     /* 12 flops */
 332
 333     d = c*gmx::invsqrt(iprod(temp, temp));
 334     /* 6 + 10 flops */
 335
 336     x[XX] = xi[XX] + d*temp[XX];
 337     x[YY] = xi[YY] + d*temp[YY];
 338     x[ZZ] = xi[ZZ] + d*temp[ZZ];
 339     /* 6 Flops */
 340
 341     /* TOTAL: 43 flops */
 342 }
 343
 344 static void constr_vsite4FDN(const rvec xi, const rvec xj, const rvec xk, const rvec xl, rvec x,
 345                              real a, real b, real c, const t_pbc *pbc)
 346 {
 347     rvec xij, xik, xil, ra, rb, rja, rjb, rm;
 348     real d;
 349
 350     pbc_rvec_sub(pbc, xj, xi, xij);
 351     pbc_rvec_sub(pbc, xk, xi, xik);
 352     pbc_rvec_sub(pbc, xl, xi, xil);
 353     /* 9 flops */
 354
 355     ra[XX] = a*xik[XX];
 356     ra[YY] = a*xik[YY];
 357     ra[ZZ] = a*xik[ZZ];
 358
 359     rb[XX] = b*xil[XX];
 360     rb[YY] = b*xil[YY];
 361     rb[ZZ] = b*xil[ZZ];
 362
 363     /* 6 flops */
 364
 365     rvec_sub(ra, xij, rja);
 366     rvec_sub(rb, xij, rjb);
 367     /* 6 flops */
 368
 369     cprod(rja, rjb, rm);
 370     /* 9 flops */
 371
 372     d = c*gmx::invsqrt(norm2(rm));
 373     /* 5+5+1 flops */
 374
 375     x[XX] = xi[XX] + d*rm[XX];
 376     x[YY] = xi[YY] + d*rm[YY];
 377     x[ZZ] = xi[ZZ] + d*rm[ZZ];
 378     /* 6 Flops */
 379
 380     /* TOTAL: 47 flops */
 381 }
 382
 383
 384 static int constr_vsiten(const t_iatom *ia, const t_iparams ip[],
 385                          rvec *x, const t_pbc *pbc)
 386 {
 387     rvec x1, dx;
 388     dvec dsum;
 389     int  n3, av, ai;
 390     real a;
 391
 392     n3 = 3*ip[ia[0]].vsiten.n;
 393     av = ia[1];
 394     ai = ia[2];
 395     copy_rvec(x[ai], x1);
 396     clear_dvec(dsum);
 397     for (int i = 3; i < n3; i += 3)
 398     {
 399         ai = ia[i+2];
 400         a  = ip[ia[i]].vsiten.a;
 401         if (pbc)
 402         {
 403             pbc_dx_aiuc(pbc, x[ai], x1, dx);
 404         }
 405         else
 406         {
 407             rvec_sub(x[ai], x1, dx);
 408         }
 409         dsum[XX] += a*dx[XX];
 410         dsum[YY] += a*dx[YY];
 411         dsum[ZZ] += a*dx[ZZ];
 412         /* 9 Flops */
 413     }
 414
 415     x[av][XX] = x1[XX] + dsum[XX];
 416     x[av][YY] = x1[YY] + dsum[YY];
 417     x[av][ZZ] = x1[ZZ] + dsum[ZZ];
 418
 419     return n3;
 420 }
 421
 422 /*! \brief PBC modes for vsite construction and spreading */
 423 enum class PbcMode
 424 {
 425     all,         // Apply normal, simple PBC for all vsites
 426     none         // No PBC treatment needed
 427 };
 428
 429 /*! \brief Returns the PBC mode based on the system PBC and vsite properties
 430  *
 431  * \param[in] pbcPtr  A pointer to a PBC struct or nullptr when no PBC treatment is required
 432  */
 433 static PbcMode getPbcMode(const t_pbc *pbcPtr)
 434 {
 435     if (pbcPtr == nullptr)
 436     {
 437         return PbcMode::none;
 438     }
 439     else
 440     {
 441         return PbcMode::all;
 442     }
 443 }
 444
 445 static void construct_vsites_thread(rvec x[],
 446                                     real dt, rvec *v,
 447                                     const t_iparams ip[], const t_ilist ilist[],
 448                                     const t_pbc *pbc_null)
 449 {
 450     real         inv_dt;
 451     if (v != nullptr)
 452     {
 453         inv_dt = 1.0/dt;
 454     }
 455     else
 456     {
 457         inv_dt = 1.0;
 458     }
 459
 460     const PbcMode            pbcMode   = getPbcMode(pbc_null);
 461     /* We need another pbc pointer, as with charge groups we switch per vsite */
 462     const t_pbc             *pbc_null2 = pbc_null;
 463
 464     for (int ftype = c_ftypeVsiteStart; ftype < c_ftypeVsiteEnd; ftype++)
 465     {
 466         if (ilist[ftype].nr == 0)
 467         {
 468             continue;
 469         }
 470
 471         {   // TODO remove me
 472             int            nra = interaction_function[ftype].nratoms;
 473             int            inc = 1 + nra;
 474             int            nr  = ilist[ftype].nr;
 475
 476             const t_iatom *ia = ilist[ftype].iatoms;
 477
 478             for (int i = 0; i < nr; )
 479             {
 480                 int  tp     = ia[0];
 481                 /* The vsite and constructing atoms */
 482                 int  avsite = ia[1];
 483                 int  ai     = ia[2];
 484                 /* Constants for constructing vsites */
 485                 real a1     = ip[tp].vsite.a;
 486                 /* Copy the old position */
 487                 rvec xv;
 488                 copy_rvec(x[avsite], xv);
 489
 490                 /* Construct the vsite depending on type */
 491                 int  aj, ak, al;
 492                 real b1, c1;
 493                 switch (ftype)
 494                 {
 495                     case F_VSITE2:
 496                         aj = ia[3];
 497                         constr_vsite2(x[ai], x[aj], x[avsite], a1, pbc_null2);
 498                         break;
 499                     case F_VSITE3:
 500                         aj = ia[3];
 501                         ak = ia[4];
 502                         b1 = ip[tp].vsite.b;
 503                         constr_vsite3(x[ai], x[aj], x[ak], x[avsite], a1, b1, pbc_null2);
 504                         break;
 505                     case F_VSITE3FD:
 506                         aj = ia[3];
 507                         ak = ia[4];
 508                         b1 = ip[tp].vsite.b;
 509                         constr_vsite3FD(x[ai], x[aj], x[ak], x[avsite], a1, b1, pbc_null2);
 510                         break;
 511                     case F_VSITE3FAD:
 512                         aj = ia[3];
 513                         ak = ia[4];
 514                         b1 = ip[tp].vsite.b;
 515                         constr_vsite3FAD(x[ai], x[aj], x[ak], x[avsite], a1, b1, pbc_null2);
 516                         break;
 517                     case F_VSITE3OUT:
 518                         aj = ia[3];
 519                         ak = ia[4];
 520                         b1 = ip[tp].vsite.b;
 521                         c1 = ip[tp].vsite.c;
 522                         constr_vsite3OUT(x[ai], x[aj], x[ak], x[avsite], a1, b1, c1, pbc_null2);
 523                         break;
 524                     case F_VSITE4FD:
 525                         aj = ia[3];
 526                         ak = ia[4];
 527                         al = ia[5];
 528                         b1 = ip[tp].vsite.b;
 529                         c1 = ip[tp].vsite.c;
 530                         constr_vsite4FD(x[ai], x[aj], x[ak], x[al], x[avsite], a1, b1, c1,
 531                                         pbc_null2);
 532                         break;
 533                     case F_VSITE4FDN:
 534                         aj = ia[3];
 535                         ak = ia[4];
 536                         al = ia[5];
 537                         b1 = ip[tp].vsite.b;
 538                         c1 = ip[tp].vsite.c;
 539                         constr_vsite4FDN(x[ai], x[aj], x[ak], x[al], x[avsite], a1, b1, c1,
 540                                          pbc_null2);
 541                         break;
 542                     case F_VSITEN:
 543                         inc = constr_vsiten(ia, ip, x, pbc_null2);
 544                         break;
 545                     default:
 546                         gmx_fatal(FARGS, "No such vsite type %d in %s, line %d",
 547                                   ftype, __FILE__, __LINE__);
 548                 }
 549
 550                 if (pbcMode == PbcMode::all)
 551                 {
 552                     /* Keep the vsite in the same periodic image as before */
 553                     rvec dx;
 554                     int  ishift = pbc_dx_aiuc(pbc_null, x[avsite], xv, dx);
 555                     if (ishift != CENTRAL)
 556                     {
 557                         rvec_add(xv, dx, x[avsite]);
 558                     }
 559                 }
 560                 if (v != nullptr)
 561                 {
 562                     /* Calculate velocity of vsite... */
 563                     rvec vv;
 564                     rvec_sub(x[avsite], xv, vv);
 565                     svmul(inv_dt, vv, v[avsite]);
 566                 }
 567
 568                 /* Increment loop variables */
 569                 i  += inc;
 570                 ia += inc;
 571             }
 572         }
 573     }
 574 }
 575
 576 void construct_vsites(const gmx_vsite_t *vsite,
 577                       rvec x[],
 578                       real dt, rvec *v,
 579                       const t_iparams ip[], const t_ilist ilist[],
 580                       int ePBC, gmx_bool bMolPBC,
 581                       const t_commrec *cr,
 582                       const matrix box)
 583 {
 584     const bool useDomdec = (vsite != nullptr && vsite->useDomdec);
 585     GMX_ASSERT(!useDomdec || (cr != nullptr && DOMAINDECOMP(cr)), "When vsites are set up with domain decomposition, we need a valid commrec");
 586     // TODO: Remove this assertion when we remove charge groups
 587     GMX_ASSERT(vsite != nullptr || ePBC == epbcNONE, "Without a vsite struct we can not do PBC (in case we have charge groups)");
 588
 589     t_pbc     pbc, *pbc_null;
 590
 591     /* We only need to do pbc when we have inter-cg vsites.
 592      * Note that with domain decomposition we do not need to apply PBC here
 593      * when we have at least 3 domains along each dimension. Currently we
 594      * do not optimize this case.
 595      */
 596     if (ePBC != epbcNONE && (useDomdec || bMolPBC) &&
 597         !(vsite != nullptr && vsite->numInterUpdategroupVsites == 0))
 598     {
 599         /* This is wasting some CPU time as we now do this multiple times
 600          * per MD step.
 601          */
 602         ivec null_ivec;
 603         clear_ivec(null_ivec);
 604         pbc_null = set_pbc_dd(&pbc, ePBC,
 605                               useDomdec ? cr->dd->nc : null_ivec,
 606                               FALSE, box);
 607     }
 608     else
 609     {
 610         pbc_null = nullptr;
 611     }
 612
 613     if (useDomdec)
 614     {
 615         dd_move_x_vsites(cr->dd, box, x);
 616     }
 617
 618     if (vsite == nullptr || vsite->nthreads == 1)
 619     {
 620         construct_vsites_thread(x, dt, v,
 621                                 ip, ilist,
 622                                 pbc_null);
 623     }
 624     else
 625     {
 626 #pragma omp parallel num_threads(vsite->nthreads)
 627         {
 628             try
 629             {
 630                 const int          th    = gmx_omp_get_thread_num();
 631                 const VsiteThread &tData = *vsite->tData[th];
 632                 GMX_ASSERT(tData.rangeStart >= 0, "The thread data should be initialized before calling construct_vsites");
 633
 634                 construct_vsites_thread(x, dt, v,
 635                                         ip, tData.ilist,
 636                                         pbc_null);
 637                 if (tData.useInterdependentTask)
 638                 {
 639                     /* Here we don't need a barrier (unlike the spreading),
 640                      * since both tasks only construct vsites from particles,
 641                      * or local vsites, not from non-local vsites.
 642                      */
 643                     construct_vsites_thread(x, dt, v,
 644                                             ip, tData.idTask.ilist,
 645                                             pbc_null);
 646                 }
 647             }
 648             GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
 649         }
 650         /* Now we can construct the vsites that might depend on other vsites */
 651         construct_vsites_thread(x, dt, v,
 652                                 ip, vsite->tData[vsite->nthreads]->ilist,
 653                                 pbc_null);
 654     }
 655 }
 656
 657 static void spread_vsite2(const t_iatom ia[], real a,
 658                           const rvec x[],
 659                           rvec f[], rvec fshift[],
 660                           const t_pbc *pbc, const t_graph *g)
 661 {
 662     rvec    fi, fj, dx;
 663     t_iatom av, ai, aj;
 664     ivec    di;
 665     int     siv, sij;
 666
 667     av = ia[1];
 668     ai = ia[2];
 669     aj = ia[3];
 670
 671     svmul(1 - a, f[av], fi);
 672     svmul(    a, f[av], fj);
 673     /* 7 flop */
 674
 675     rvec_inc(f[ai], fi);
 676     rvec_inc(f[aj], fj);
 677     /* 6 Flops */
 678
 679     if (g)
 680     {
 681         ivec_sub(SHIFT_IVEC(g, ai), SHIFT_IVEC(g, av), di);
 682         siv = IVEC2IS(di);
 683         ivec_sub(SHIFT_IVEC(g, ai), SHIFT_IVEC(g, aj), di);
 684         sij = IVEC2IS(di);
 685     }
 686     else if (pbc)
 687     {
 688         siv = pbc_dx_aiuc(pbc, x[ai], x[av], dx);
 689         sij = pbc_dx_aiuc(pbc, x[ai], x[aj], dx);
 690     }
 691     else
 692     {
 693         siv = CENTRAL;
 694         sij = CENTRAL;
 695     }
 696
 697     if (fshift && (siv != CENTRAL || sij != CENTRAL))
 698     {
 699         rvec_inc(fshift[siv], f[av]);
 700         rvec_dec(fshift[CENTRAL], fi);
 701         rvec_dec(fshift[sij], fj);
 702     }
 703
 704     /* TOTAL: 13 flops */
 705 }
 706
 707 void constructVsitesGlobal(const gmx_mtop_t         &mtop,
 708                            gmx::ArrayRef<gmx::RVec>  x)
 709 {
 710     GMX_ASSERT(x.ssize() >= mtop.natoms, "x should contain the whole system");
 711     GMX_ASSERT(!mtop.moleculeBlockIndices.empty(), "molblock indices are needed in constructVsitesGlobal");
 712
 713     for (size_t mb = 0; mb < mtop.molblock.size(); mb++)
 714     {
 715         const gmx_molblock_t  &molb = mtop.molblock[mb];
 716         const gmx_moltype_t   &molt = mtop.moltype[molb.type];
 717         if (vsiteIlistNrCount(molt.ilist.data()) > 0)
 718         {
 719             int atomOffset = mtop.moleculeBlockIndices[mb].globalAtomStart;
 720             for (int mol = 0; mol < molb.nmol; mol++)
 721             {
 722                 t_ilist ilist[F_NRE];
 723                 for (int ftype = c_ftypeVsiteStart; ftype < c_ftypeVsiteEnd; ftype++)
 724                 {
 725                     ilist[ftype].nr     = molt.ilist[ftype].size();
 726                     ilist[ftype].iatoms = const_cast<t_iatom *>(molt.ilist[ftype].iatoms.data());
 727                 }
 728
 729                 construct_vsites(nullptr, as_rvec_array(x.data()) + atomOffset,
 730                                  0.0, nullptr,
 731                                  mtop.ffparams.iparams.data(), ilist,
 732                                  epbcNONE, TRUE, nullptr, nullptr);
 733                 atomOffset += molt.atoms.nr;
 734             }
 735         }
 736     }
 737 }
 738
 739 static void spread_vsite3(const t_iatom ia[], real a, real b,
 740                           const rvec x[],
 741                           rvec f[], rvec fshift[],
 742                           const t_pbc *pbc, const t_graph *g)
 743 {
 744     rvec    fi, fj, fk, dx;
 745     int     av, ai, aj, ak;
 746     ivec    di;
 747     int     siv, sij, sik;
 748
 749     av = ia[1];
 750     ai = ia[2];
 751     aj = ia[3];
 752     ak = ia[4];
 753
 754     svmul(1 - a - b, f[av], fi);
 755     svmul(        a, f[av], fj);
 756     svmul(        b, f[av], fk);
 757     /* 11 flops */
 758
 759     rvec_inc(f[ai], fi);
 760     rvec_inc(f[aj], fj);
 761     rvec_inc(f[ak], fk);
 762     /* 9 Flops */
 763
 764     if (g)
 765     {
 766         ivec_sub(SHIFT_IVEC(g, ai), SHIFT_IVEC(g, ia[1]), di);
 767         siv = IVEC2IS(di);
 768         ivec_sub(SHIFT_IVEC(g, ai), SHIFT_IVEC(g, aj), di);
 769         sij = IVEC2IS(di);
 770         ivec_sub(SHIFT_IVEC(g, ai), SHIFT_IVEC(g, ak), di);
 771         sik = IVEC2IS(di);
 772     }
 773     else if (pbc)
 774     {
 775         siv = pbc_dx_aiuc(pbc, x[ai], x[av], dx);
 776         sij = pbc_dx_aiuc(pbc, x[ai], x[aj], dx);
 777         sik = pbc_dx_aiuc(pbc, x[ai], x[ak], dx);
 778     }
 779     else
 780     {
 781         siv = CENTRAL;
 782         sij = CENTRAL;
 783         sik = CENTRAL;
 784     }
 785
 786     if (fshift && (siv != CENTRAL || sij != CENTRAL || sik != CENTRAL))
 787     {
 788         rvec_inc(fshift[siv], f[av]);
 789         rvec_dec(fshift[CENTRAL], fi);
 790         rvec_dec(fshift[sij], fj);
 791         rvec_dec(fshift[sik], fk);
 792     }
 793
 794     /* TOTAL: 20 flops */
 795 }
 796
 797 static void spread_vsite3FD(const t_iatom ia[], real a, real b,
 798                             const rvec x[],
 799                             rvec f[], rvec fshift[],
 800                             gmx_bool VirCorr, matrix dxdf,
 801                             const t_pbc *pbc, const t_graph *g)
 802 {
 803     real    c, invl, fproj, a1;
 804     rvec    xvi, xij, xjk, xix, fv, temp;
 805     t_iatom av, ai, aj, ak;
 806     int     svi, sji, skj;
 807     ivec    di;
 808
 809     av = ia[1];
 810     ai = ia[2];
 811     aj = ia[3];
 812     ak = ia[4];
 813     copy_rvec(f[av], fv);
 814
 815     sji = pbc_rvec_sub(pbc, x[aj], x[ai], xij);
 816     skj = pbc_rvec_sub(pbc, x[ak], x[aj], xjk);
 817     /* 6 flops */
 818
 819     /* xix goes from i to point x on the line jk */
 820     xix[XX] = xij[XX]+a*xjk[XX];
 821     xix[YY] = xij[YY]+a*xjk[YY];
 822     xix[ZZ] = xij[ZZ]+a*xjk[ZZ];
 823     /* 6 flops */
 824
 825     invl = gmx::invsqrt(iprod(xix, xix));
 826     c    = b*invl;
 827     /* 4 + ?10? flops */
 828
 829     fproj = iprod(xix, fv)*invl*invl; /* = (xix . f)/(xix . xix) */
 830
 831     temp[XX] = c*(fv[XX]-fproj*xix[XX]);
 832     temp[YY] = c*(fv[YY]-fproj*xix[YY]);
 833     temp[ZZ] = c*(fv[ZZ]-fproj*xix[ZZ]);
 834     /* 16 */
 835
 836     /* c is already calculated in constr_vsite3FD
 837        storing c somewhere will save 26 flops!     */
 838
 839     a1         = 1 - a;
 840     f[ai][XX] += fv[XX] - temp[XX];
 841     f[ai][YY] += fv[YY] - temp[YY];
 842     f[ai][ZZ] += fv[ZZ] - temp[ZZ];
 843     f[aj][XX] += a1*temp[XX];
 844     f[aj][YY] += a1*temp[YY];
 845     f[aj][ZZ] += a1*temp[ZZ];
 846     f[ak][XX] += a*temp[XX];
 847     f[ak][YY] += a*temp[YY];
 848     f[ak][ZZ] += a*temp[ZZ];
 849     /* 19 Flops */
 850
 851     if (g)
 852     {
 853         ivec_sub(SHIFT_IVEC(g, ia[1]), SHIFT_IVEC(g, ai), di);
 854         svi = IVEC2IS(di);
 855         ivec_sub(SHIFT_IVEC(g, aj), SHIFT_IVEC(g, ai), di);
 856         sji = IVEC2IS(di);
 857         ivec_sub(SHIFT_IVEC(g, ak), SHIFT_IVEC(g, aj), di);
 858         skj = IVEC2IS(di);
 859     }
 860     else if (pbc)
 861     {
 862         svi = pbc_rvec_sub(pbc, x[av], x[ai], xvi);
 863     }
 864     else
 865     {
 866         svi = CENTRAL;
 867     }
 868
 869     if (fshift && (svi != CENTRAL || sji != CENTRAL || skj != CENTRAL))
 870     {
 871         rvec_dec(fshift[svi], fv);
 872         fshift[CENTRAL][XX] += fv[XX] - (1 + a)*temp[XX];
 873         fshift[CENTRAL][YY] += fv[YY] - (1 + a)*temp[YY];
 874         fshift[CENTRAL][ZZ] += fv[ZZ] - (1 + a)*temp[ZZ];
 875         fshift[    sji][XX] += temp[XX];
 876         fshift[    sji][YY] += temp[YY];
 877         fshift[    sji][ZZ] += temp[ZZ];
 878         fshift[    skj][XX] += a*temp[XX];
 879         fshift[    skj][YY] += a*temp[YY];
 880         fshift[    skj][ZZ] += a*temp[ZZ];
 881     }
 882
 883     if (VirCorr)
 884     {
 885         /* When VirCorr=TRUE, the virial for the current forces is not
 886          * calculated from the redistributed forces. This means that
 887          * the effect of non-linear virtual site constructions on the virial
 888          * needs to be added separately. This contribution can be calculated
 889          * in many ways, but the simplest and cheapest way is to use
 890          * the first constructing atom ai as a reference position in space:
 891          * subtract (xv-xi)*fv and add (xj-xi)*fj + (xk-xi)*fk.
 892          */
 893         rvec xiv;
 894
 895         pbc_rvec_sub(pbc, x[av], x[ai], xiv);
 896
 897         for (int i = 0; i < DIM; i++)
 898         {
 899             for (int j = 0; j < DIM; j++)
 900             {
 901                 /* As xix is a linear combination of j and k, use that here */
 902                 dxdf[i][j] += -xiv[i]*fv[j] + xix[i]*temp[j];
 903             }
 904         }
 905     }
 906
 907     /* TOTAL: 61 flops */
 908 }
 909
 910 static void spread_vsite3FAD(const t_iatom ia[], real a, real b,
 911                              const rvec x[],
 912                              rvec f[], rvec fshift[],
 913                              gmx_bool VirCorr, matrix dxdf,
 914                              const t_pbc *pbc, const t_graph *g)
 915 {
 916     rvec    xvi, xij, xjk, xperp, Fpij, Fppp, fv, f1, f2, f3;
 917     real    a1, b1, c1, c2, invdij, invdij2, invdp, fproj;
 918     t_iatom av, ai, aj, ak;
 919     int     svi, sji, skj, d;
 920     ivec    di;
 921
 922     av = ia[1];
 923     ai = ia[2];
 924     aj = ia[3];
 925     ak = ia[4];
 926     copy_rvec(f[ia[1]], fv);
 927
 928     sji = pbc_rvec_sub(pbc, x[aj], x[ai], xij);
 929     skj = pbc_rvec_sub(pbc, x[ak], x[aj], xjk);
 930     /* 6 flops */
 931
 932     invdij    = gmx::invsqrt(iprod(xij, xij));
 933     invdij2   = invdij * invdij;
 934     c1        = iprod(xij, xjk) * invdij2;
 935     xperp[XX] = xjk[XX] - c1*xij[XX];
 936     xperp[YY] = xjk[YY] - c1*xij[YY];
 937     xperp[ZZ] = xjk[ZZ] - c1*xij[ZZ];
 938     /* xperp in plane ijk, perp. to ij */
 939     invdp = gmx::invsqrt(iprod(xperp, xperp));
 940     a1    = a*invdij;
 941     b1    = b*invdp;
 942     /* 45 flops */
 943
 944     /* a1, b1 and c1 are already calculated in constr_vsite3FAD
 945        storing them somewhere will save 45 flops!     */
 946
 947     fproj = iprod(xij, fv)*invdij2;
 948     svmul(fproj,                      xij,  Fpij);    /* proj. f on xij */
 949     svmul(iprod(xperp, fv)*invdp*invdp, xperp, Fppp); /* proj. f on xperp */
 950     svmul(b1*fproj,                   xperp, f3);
 951     /* 23 flops */
 952
 953     rvec_sub(fv, Fpij, f1); /* f1 = f - Fpij */
 954     rvec_sub(f1, Fppp, f2); /* f2 = f - Fpij - Fppp */
 955     for (d = 0; (d < DIM); d++)
 956     {
 957         f1[d] *= a1;
 958         f2[d] *= b1;
 959     }
 960     /* 12 flops */
 961
 962     c2         = 1 + c1;
 963     f[ai][XX] += fv[XX] - f1[XX] + c1*f2[XX] + f3[XX];
 964     f[ai][YY] += fv[YY] - f1[YY] + c1*f2[YY] + f3[YY];
 965     f[ai][ZZ] += fv[ZZ] - f1[ZZ] + c1*f2[ZZ] + f3[ZZ];
 966     f[aj][XX] +=          f1[XX] - c2*f2[XX] - f3[XX];
 967     f[aj][YY] +=          f1[YY] - c2*f2[YY] - f3[YY];
 968     f[aj][ZZ] +=          f1[ZZ] - c2*f2[ZZ] - f3[ZZ];
 969     f[ak][XX] +=                      f2[XX];
 970     f[ak][YY] +=                      f2[YY];
 971     f[ak][ZZ] +=                      f2[ZZ];
 972     /* 30 Flops */
 973
 974     if (g)
 975     {
 976         ivec_sub(SHIFT_IVEC(g, ia[1]), SHIFT_IVEC(g, ai), di);
 977         svi = IVEC2IS(di);
 978         ivec_sub(SHIFT_IVEC(g, aj), SHIFT_IVEC(g, ai), di);
 979         sji = IVEC2IS(di);
 980         ivec_sub(SHIFT_IVEC(g, ak), SHIFT_IVEC(g, aj), di);
 981         skj = IVEC2IS(di);
 982     }
 983     else if (pbc)
 984     {
 985         svi = pbc_rvec_sub(pbc, x[av], x[ai], xvi);
 986     }
 987     else
 988     {
 989         svi = CENTRAL;
 990     }
 991
 992     if (fshift && (svi != CENTRAL || sji != CENTRAL || skj != CENTRAL))
 993     {
 994         rvec_dec(fshift[svi], fv);
 995         fshift[CENTRAL][XX] += fv[XX] - f1[XX] - (1-c1)*f2[XX] + f3[XX];
 996         fshift[CENTRAL][YY] += fv[YY] - f1[YY] - (1-c1)*f2[YY] + f3[YY];
 997         fshift[CENTRAL][ZZ] += fv[ZZ] - f1[ZZ] - (1-c1)*f2[ZZ] + f3[ZZ];
 998         fshift[    sji][XX] +=          f1[XX] -    c1 *f2[XX] - f3[XX];
 999         fshift[    sji][YY] +=          f1[YY] -    c1 *f2[YY] - f3[YY];
1000         fshift[    sji][ZZ] +=          f1[ZZ] -    c1 *f2[ZZ] - f3[ZZ];
1001         fshift[    skj][XX] +=                          f2[XX];
1002         fshift[    skj][YY] +=                          f2[YY];
1003         fshift[    skj][ZZ] +=                          f2[ZZ];
1004     }
1005
1006     if (VirCorr)
1007     {
1008         rvec xiv;
1009         int  i, j;
1010
1011         pbc_rvec_sub(pbc, x[av], x[ai], xiv);
1012
1013         for (i = 0; i < DIM; i++)
1014         {
1015             for (j = 0; j < DIM; j++)
1016             {
1017                 /* Note that xik=xij+xjk, so we have to add xij*f2 */
1018                 dxdf[i][j] +=
1019                     -xiv[i]*fv[j]
1020                     + xij[i]*(f1[j] + (1 - c2)*f2[j] - f3[j])
1021                     + xjk[i]*f2[j];
1022             }
1023         }
1024     }
1025
1026     /* TOTAL: 113 flops */
1027 }
1028
1029 static void spread_vsite3OUT(const t_iatom ia[], real a, real b, real c,
1030                              const rvec x[],
1031                              rvec f[], rvec fshift[],
1032                              gmx_bool VirCorr, matrix dxdf,
1033                              const t_pbc *pbc, const t_graph *g)
1034 {
1035     rvec    xvi, xij, xik, fv, fj, fk;
1036     real    cfx, cfy, cfz;
1037     int     av, ai, aj, ak;
1038     ivec    di;
1039     int     svi, sji, ski;
1040
1041     av = ia[1];
1042     ai = ia[2];
1043     aj = ia[3];
1044     ak = ia[4];
1045
1046     sji = pbc_rvec_sub(pbc, x[aj], x[ai], xij);
1047     ski = pbc_rvec_sub(pbc, x[ak], x[ai], xik);
1048     /* 6 Flops */
1049
1050     copy_rvec(f[av], fv);
1051
1052     cfx = c*fv[XX];
1053     cfy = c*fv[YY];
1054     cfz = c*fv[ZZ];
1055     /* 3 Flops */
1056
1057     fj[XX] = a*fv[XX]     -  xik[ZZ]*cfy +  xik[YY]*cfz;
1058     fj[YY] =  xik[ZZ]*cfx + a*fv[YY]     -  xik[XX]*cfz;
1059     fj[ZZ] = -xik[YY]*cfx +  xik[XX]*cfy + a*fv[ZZ];
1060
1061     fk[XX] = b*fv[XX]     +  xij[ZZ]*cfy -  xij[YY]*cfz;
1062     fk[YY] = -xij[ZZ]*cfx + b*fv[YY]     +  xij[XX]*cfz;
1063     fk[ZZ] =  xij[YY]*cfx -  xij[XX]*cfy + b*fv[ZZ];
1064     /* 30 Flops */
1065
1066     f[ai][XX] += fv[XX] - fj[XX] - fk[XX];
1067     f[ai][YY] += fv[YY] - fj[YY] - fk[YY];
1068     f[ai][ZZ] += fv[ZZ] - fj[ZZ] - fk[ZZ];
1069     rvec_inc(f[aj], fj);
1070     rvec_inc(f[ak], fk);
1071     /* 15 Flops */
1072
1073     if (g)
1074     {
1075         ivec_sub(SHIFT_IVEC(g, ia[1]), SHIFT_IVEC(g, ai), di);
1076         svi = IVEC2IS(di);
1077         ivec_sub(SHIFT_IVEC(g, aj), SHIFT_IVEC(g, ai), di);
1078         sji = IVEC2IS(di);
1079         ivec_sub(SHIFT_IVEC(g, ak), SHIFT_IVEC(g, ai), di);
1080         ski = IVEC2IS(di);
1081     }
1082     else if (pbc)
1083     {
1084         svi = pbc_rvec_sub(pbc, x[av], x[ai], xvi);
1085     }
1086     else
1087     {
1088         svi = CENTRAL;
1089     }
1090
1091     if (fshift && (svi != CENTRAL || sji != CENTRAL || ski != CENTRAL))
1092     {
1093         rvec_dec(fshift[svi], fv);
1094         fshift[CENTRAL][XX] += fv[XX] - fj[XX] - fk[XX];
1095         fshift[CENTRAL][YY] += fv[YY] - fj[YY] - fk[YY];
1096         fshift[CENTRAL][ZZ] += fv[ZZ] - fj[ZZ] - fk[ZZ];
1097         rvec_inc(fshift[sji], fj);
1098         rvec_inc(fshift[ski], fk);
1099     }
1100
1101     if (VirCorr)
1102     {
1103         rvec xiv;
1104
1105         pbc_rvec_sub(pbc, x[av], x[ai], xiv);
1106
1107         for (int i = 0; i < DIM; i++)
1108         {
1109             for (int j = 0; j < DIM; j++)
1110             {
1111                 dxdf[i][j] += -xiv[i]*fv[j] + xij[i]*fj[j] + xik[i]*fk[j];
1112             }
1113         }
1114     }
1115
1116     /* TOTAL: 54 flops */
1117 }
1118
1119 static void spread_vsite4FD(const t_iatom ia[], real a, real b, real c,
1120                             const rvec x[],
1121                             rvec f[], rvec fshift[],
1122                             gmx_bool VirCorr, matrix dxdf,
1123                             const t_pbc *pbc, const t_graph *g)
1124 {
1125     real    d, invl, fproj, a1;
1126     rvec    xvi, xij, xjk, xjl, xix, fv, temp;
1127     int     av, ai, aj, ak, al;
1128     ivec    di;
1129     int     svi, sji, skj, slj, m;
1130
1131     av = ia[1];
1132     ai = ia[2];
1133     aj = ia[3];
1134     ak = ia[4];
1135     al = ia[5];
1136
1137     sji = pbc_rvec_sub(pbc, x[aj], x[ai], xij);
1138     skj = pbc_rvec_sub(pbc, x[ak], x[aj], xjk);
1139     slj = pbc_rvec_sub(pbc, x[al], x[aj], xjl);
1140     /* 9 flops */
1141
1142     /* xix goes from i to point x on the plane jkl */
1143     for (m = 0; m < DIM; m++)
1144     {
1145         xix[m] = xij[m] + a*xjk[m] + b*xjl[m];
1146     }
1147     /* 12 flops */
1148
1149     invl = gmx::invsqrt(iprod(xix, xix));
1150     d    = c*invl;
1151     /* 4 + ?10? flops */
1152
1153     copy_rvec(f[av], fv);
1154
1155     fproj = iprod(xix, fv)*invl*invl; /* = (xix . f)/(xix . xix) */
1156
1157     for (m = 0; m < DIM; m++)
1158     {
1159         temp[m] = d*(fv[m] - fproj*xix[m]);
1160     }
1161     /* 16 */
1162
1163     /* c is already calculated in constr_vsite3FD
1164        storing c somewhere will save 35 flops!     */
1165
1166     a1 = 1 - a - b;
1167     for (m = 0; m < DIM; m++)
1168     {
1169         f[ai][m] += fv[m] - temp[m];
1170         f[aj][m] += a1*temp[m];
1171         f[ak][m] += a*temp[m];
1172         f[al][m] += b*temp[m];
1173     }
1174     /* 26 Flops */
1175
1176     if (g)
1177     {
1178         ivec_sub(SHIFT_IVEC(g, ia[1]), SHIFT_IVEC(g, ai), di);
1179         svi = IVEC2IS(di);
1180         ivec_sub(SHIFT_IVEC(g, aj), SHIFT_IVEC(g, ai), di);
1181         sji = IVEC2IS(di);
1182         ivec_sub(SHIFT_IVEC(g, ak), SHIFT_IVEC(g, aj), di);
1183         skj = IVEC2IS(di);
1184         ivec_sub(SHIFT_IVEC(g, al), SHIFT_IVEC(g, aj), di);
1185         slj = IVEC2IS(di);
1186     }
1187     else if (pbc)
1188     {
1189         svi = pbc_rvec_sub(pbc, x[av], x[ai], xvi);
1190     }
1191     else
1192     {
1193         svi = CENTRAL;
1194     }
1195
1196     if (fshift &&
1197         (svi != CENTRAL || sji != CENTRAL || skj != CENTRAL || slj != CENTRAL))
1198     {
1199         rvec_dec(fshift[svi], fv);
1200         for (m = 0; m < DIM; m++)
1201         {
1202             fshift[CENTRAL][m] += fv[m] - (1 + a + b)*temp[m];
1203             fshift[    sji][m] += temp[m];
1204             fshift[    skj][m] += a*temp[m];
1205             fshift[    slj][m] += b*temp[m];
1206         }
1207     }
1208
1209     if (VirCorr)
1210     {
1211         rvec xiv;
1212         int  i, j;
1213
1214         pbc_rvec_sub(pbc, x[av], x[ai], xiv);
1215
1216         for (i = 0; i < DIM; i++)
1217         {
1218             for (j = 0; j < DIM; j++)
1219             {
1220                 dxdf[i][j] += -xiv[i]*fv[j] + xix[i]*temp[j];
1221             }
1222         }
1223     }
1224
1225     /* TOTAL: 77 flops */
1226 }
1227
1228
1229 static void spread_vsite4FDN(const t_iatom ia[], real a, real b, real c,
1230                              const rvec x[],
1231                              rvec f[], rvec fshift[],
1232                              gmx_bool VirCorr, matrix dxdf,
1233                              const t_pbc *pbc, const t_graph *g)
1234 {
1235     rvec xvi, xij, xik, xil, ra, rb, rja, rjb, rab, rm, rt;
1236     rvec fv, fj, fk, fl;
1237     real invrm, denom;
1238     real cfx, cfy, cfz;
1239     ivec di;
1240     int  av, ai, aj, ak, al;
1241     int  svi, sij, sik, sil;
1242
1243     /* DEBUG: check atom indices */
1244     av = ia[1];
1245     ai = ia[2];
1246     aj = ia[3];
1247     ak = ia[4];
1248     al = ia[5];
1249
1250     copy_rvec(f[av], fv);
1251
1252     sij = pbc_rvec_sub(pbc, x[aj], x[ai], xij);
1253     sik = pbc_rvec_sub(pbc, x[ak], x[ai], xik);
1254     sil = pbc_rvec_sub(pbc, x[al], x[ai], xil);
1255     /* 9 flops */
1256
1257     ra[XX] = a*xik[XX];
1258     ra[YY] = a*xik[YY];
1259     ra[ZZ] = a*xik[ZZ];
1260
1261     rb[XX] = b*xil[XX];
1262     rb[YY] = b*xil[YY];
1263     rb[ZZ] = b*xil[ZZ];
1264
1265     /* 6 flops */
1266
1267     rvec_sub(ra, xij, rja);
1268     rvec_sub(rb, xij, rjb);
1269     rvec_sub(rb, ra, rab);
1270     /* 9 flops */
1271
1272     cprod(rja, rjb, rm);
1273     /* 9 flops */
1274
1275     invrm = gmx::invsqrt(norm2(rm));
1276     denom = invrm*invrm;
1277     /* 5+5+2 flops */
1278
1279     cfx = c*invrm*fv[XX];
1280     cfy = c*invrm*fv[YY];
1281     cfz = c*invrm*fv[ZZ];
1282     /* 6 Flops */
1283
1284     cprod(rm, rab, rt);
1285     /* 9 flops */
1286
1287     rt[XX] *= denom;
1288     rt[YY] *= denom;
1289     rt[ZZ] *= denom;
1290     /* 3flops */
1291
1292     fj[XX] = (        -rm[XX]*rt[XX]) * cfx + ( rab[ZZ]-rm[YY]*rt[XX]) * cfy + (-rab[YY]-rm[ZZ]*rt[XX]) * cfz;
1293     fj[YY] = (-rab[ZZ]-rm[XX]*rt[YY]) * cfx + (        -rm[YY]*rt[YY]) * cfy + ( rab[XX]-rm[ZZ]*rt[YY]) * cfz;
1294     fj[ZZ] = ( rab[YY]-rm[XX]*rt[ZZ]) * cfx + (-rab[XX]-rm[YY]*rt[ZZ]) * cfy + (        -rm[ZZ]*rt[ZZ]) * cfz;
1295     /* 30 flops */
1296
1297     cprod(rjb, rm, rt);
1298     /* 9 flops */
1299
1300     rt[XX] *= denom*a;
1301     rt[YY] *= denom*a;
1302     rt[ZZ] *= denom*a;
1303     /* 3flops */
1304
1305     fk[XX] = (          -rm[XX]*rt[XX]) * cfx + (-a*rjb[ZZ]-rm[YY]*rt[XX]) * cfy + ( a*rjb[YY]-rm[ZZ]*rt[XX]) * cfz;
1306     fk[YY] = ( a*rjb[ZZ]-rm[XX]*rt[YY]) * cfx + (          -rm[YY]*rt[YY]) * cfy + (-a*rjb[XX]-rm[ZZ]*rt[YY]) * cfz;
1307     fk[ZZ] = (-a*rjb[YY]-rm[XX]*rt[ZZ]) * cfx + ( a*rjb[XX]-rm[YY]*rt[ZZ]) * cfy + (          -rm[ZZ]*rt[ZZ]) * cfz;
1308     /* 36 flops */
1309
1310     cprod(rm, rja, rt);
1311     /* 9 flops */
1312
1313     rt[XX] *= denom*b;
1314     rt[YY] *= denom*b;
1315     rt[ZZ] *= denom*b;
1316     /* 3flops */
1317
1318     fl[XX] = (          -rm[XX]*rt[XX]) * cfx + ( b*rja[ZZ]-rm[YY]*rt[XX]) * cfy + (-b*rja[YY]-rm[ZZ]*rt[XX]) * cfz;
1319     fl[YY] = (-b*rja[ZZ]-rm[XX]*rt[YY]) * cfx + (          -rm[YY]*rt[YY]) * cfy + ( b*rja[XX]-rm[ZZ]*rt[YY]) * cfz;
1320     fl[ZZ] = ( b*rja[YY]-rm[XX]*rt[ZZ]) * cfx + (-b*rja[XX]-rm[YY]*rt[ZZ]) * cfy + (          -rm[ZZ]*rt[ZZ]) * cfz;
1321     /* 36 flops */
1322
1323     f[ai][XX] += fv[XX] - fj[XX] - fk[XX] - fl[XX];
1324     f[ai][YY] += fv[YY] - fj[YY] - fk[YY] - fl[YY];
1325     f[ai][ZZ] += fv[ZZ] - fj[ZZ] - fk[ZZ] - fl[ZZ];
1326     rvec_inc(f[aj], fj);
1327     rvec_inc(f[ak], fk);
1328     rvec_inc(f[al], fl);
1329     /* 21 flops */
1330
1331     if (g)
1332     {
1333         ivec_sub(SHIFT_IVEC(g, av), SHIFT_IVEC(g, ai), di);
1334         svi = IVEC2IS(di);
1335         ivec_sub(SHIFT_IVEC(g, aj), SHIFT_IVEC(g, ai), di);
1336         sij = IVEC2IS(di);
1337         ivec_sub(SHIFT_IVEC(g, ak), SHIFT_IVEC(g, ai), di);
1338         sik = IVEC2IS(di);
1339         ivec_sub(SHIFT_IVEC(g, al), SHIFT_IVEC(g, ai), di);
1340         sil = IVEC2IS(di);
1341     }
1342     else if (pbc)
1343     {
1344         svi = pbc_rvec_sub(pbc, x[av], x[ai], xvi);
1345     }
1346     else
1347     {
1348         svi = CENTRAL;
1349     }
1350
1351     if (fshift && (svi != CENTRAL || sij != CENTRAL || sik != CENTRAL || sil != CENTRAL))
1352     {
1353         rvec_dec(fshift[svi], fv);
1354         fshift[CENTRAL][XX] += fv[XX] - fj[XX] - fk[XX] - fl[XX];
1355         fshift[CENTRAL][YY] += fv[YY] - fj[YY] - fk[YY] - fl[YY];
1356         fshift[CENTRAL][ZZ] += fv[ZZ] - fj[ZZ] - fk[ZZ] - fl[ZZ];
1357         rvec_inc(fshift[sij], fj);
1358         rvec_inc(fshift[sik], fk);
1359         rvec_inc(fshift[sil], fl);
1360     }
1361
1362     if (VirCorr)
1363     {
1364         rvec xiv;
1365         int  i, j;
1366
1367         pbc_rvec_sub(pbc, x[av], x[ai], xiv);
1368
1369         for (i = 0; i < DIM; i++)
1370         {
1371             for (j = 0; j < DIM; j++)
1372             {
1373                 dxdf[i][j] += -xiv[i]*fv[j] + xij[i]*fj[j] + xik[i]*fk[j] + xil[i]*fl[j];
1374             }
1375         }
1376     }
1377
1378     /* Total: 207 flops (Yuck!) */
1379 }
1380
1381
1382 static int spread_vsiten(const t_iatom ia[], const t_iparams ip[],
1383                          const rvec x[],
1384                          rvec f[], rvec fshift[],
1385                          const t_pbc *pbc, const t_graph *g)
1386 {
1387     rvec xv, dx, fi;
1388     int  n3, av, i, ai;
1389     real a;
1390     ivec di;
1391     int  siv;
1392
1393     n3 = 3*ip[ia[0]].vsiten.n;
1394     av = ia[1];
1395     copy_rvec(x[av], xv);
1396
1397     for (i = 0; i < n3; i += 3)
1398     {
1399         ai = ia[i+2];
1400         if (g)
1401         {
1402             ivec_sub(SHIFT_IVEC(g, ai), SHIFT_IVEC(g, av), di);
1403             siv = IVEC2IS(di);
1404         }
1405         else if (pbc)
1406         {
1407             siv = pbc_dx_aiuc(pbc, x[ai], xv, dx);
1408         }
1409         else
1410         {
1411             siv = CENTRAL;
1412         }
1413         a = ip[ia[i]].vsiten.a;
1414         svmul(a, f[av], fi);
1415         rvec_inc(f[ai], fi);
1416         if (fshift && siv != CENTRAL)
1417         {
1418             rvec_inc(fshift[siv], fi);
1419             rvec_dec(fshift[CENTRAL], fi);
1420         }
1421         /* 6 Flops */
1422     }
1423
1424     return n3;
1425 }
1426
1427
1428 static int vsite_count(const t_ilist *ilist, int ftype)
1429 {
1430     if (ftype == F_VSITEN)
1431     {
1432         return ilist[ftype].nr/3;
1433     }
1434     else
1435     {
1436         return ilist[ftype].nr/(1 + interaction_function[ftype].nratoms);
1437     }
1438 }
1439
1440 static void spread_vsite_f_thread(const rvec x[],
1441                                   rvec f[], rvec *fshift,
1442                                   gmx_bool VirCorr, matrix dxdf,
1443                                   t_iparams ip[], const t_ilist ilist[],
1444                                   const t_graph *g, const t_pbc *pbc_null)
1445 {
1446     const PbcMode            pbcMode   = getPbcMode(pbc_null);
1447     /* We need another pbc pointer, as with charge groups we switch per vsite */
1448     const t_pbc             *pbc_null2 = pbc_null;
1449     gmx::ArrayRef<const int> vsite_pbc;
1450
1451     /* this loop goes backwards to be able to build *
1452      * higher type vsites from lower types         */
1453     for (int ftype = c_ftypeVsiteEnd - 1; ftype >= c_ftypeVsiteStart; ftype--)
1454     {
1455         if (ilist[ftype].nr == 0)
1456         {
1457             continue;
1458         }
1459
1460         {   // TODO remove me
1461             int            nra = interaction_function[ftype].nratoms;
1462             int            inc = 1 + nra;
1463             int            nr  = ilist[ftype].nr;
1464
1465             const t_iatom *ia = ilist[ftype].iatoms;
1466
1467             if (pbcMode == PbcMode::all)
1468             {
1469                 pbc_null2 = pbc_null;
1470             }
1471
1472             for (int i = 0; i < nr; )
1473             {
1474                 int tp = ia[0];
1475
1476                 /* Constants for constructing */
1477                 real a1, b1, c1;
1478                 a1 = ip[tp].vsite.a;
1479                 /* Construct the vsite depending on type */
1480                 switch (ftype)
1481                 {
1482                     case F_VSITE2:
1483                         spread_vsite2(ia, a1, x, f, fshift, pbc_null2, g);
1484                         break;
1485                     case F_VSITE3:
1486                         b1 = ip[tp].vsite.b;
1487                         spread_vsite3(ia, a1, b1, x, f, fshift, pbc_null2, g);
1488                         break;
1489                     case F_VSITE3FD:
1490                         b1 = ip[tp].vsite.b;
1491                         spread_vsite3FD(ia, a1, b1, x, f, fshift, VirCorr, dxdf, pbc_null2, g);
1492                         break;
1493                     case F_VSITE3FAD:
1494                         b1 = ip[tp].vsite.b;
1495                         spread_vsite3FAD(ia, a1, b1, x, f, fshift, VirCorr, dxdf, pbc_null2, g);
1496                         break;
1497                     case F_VSITE3OUT:
1498                         b1 = ip[tp].vsite.b;
1499                         c1 = ip[tp].vsite.c;
1500                         spread_vsite3OUT(ia, a1, b1, c1, x, f, fshift, VirCorr, dxdf, pbc_null2, g);
1501                         break;
1502                     case F_VSITE4FD:
1503                         b1 = ip[tp].vsite.b;
1504                         c1 = ip[tp].vsite.c;
1505                         spread_vsite4FD(ia, a1, b1, c1, x, f, fshift, VirCorr, dxdf, pbc_null2, g);
1506                         break;
1507                     case F_VSITE4FDN:
1508                         b1 = ip[tp].vsite.b;
1509                         c1 = ip[tp].vsite.c;
1510                         spread_vsite4FDN(ia, a1, b1, c1, x, f, fshift, VirCorr, dxdf, pbc_null2, g);
1511                         break;
1512                     case F_VSITEN:
1513                         inc = spread_vsiten(ia, ip, x, f, fshift, pbc_null2, g);
1514                         break;
1515                     default:
1516                         gmx_fatal(FARGS, "No such vsite type %d in %s, line %d",
1517                                   ftype, __FILE__, __LINE__);
1518                 }
1519                 clear_rvec(f[ia[1]]);
1520
1521                 /* Increment loop variables */
1522                 i  += inc;
1523                 ia += inc;
1524             }
1525         }
1526     }
1527 }
1528
1529 /*! \brief Clears the task force buffer elements that are written by task idTask */
1530 static void clearTaskForceBufferUsedElements(InterdependentTask *idTask)
1531 {
1532     int ntask = idTask->spreadTask.size();
1533     for (int ti = 0; ti < ntask; ti++)
1534     {
1535         const AtomIndex *atomList = &idTask->atomIndex[idTask->spreadTask[ti]];
1536         int              natom    = atomList->atom.size();
1537         RVec            *force    = idTask->force.data();
1538         for (int i = 0; i < natom; i++)
1539         {
1540             clear_rvec(force[atomList->atom[i]]);
1541         }
1542     }
1543 }
1544
1545 void spread_vsite_f(const gmx_vsite_t *vsite,
1546                     const rvec * gmx_restrict x,
1547                     rvec * gmx_restrict f, rvec * gmx_restrict fshift,
1548                     gmx_bool VirCorr, matrix vir,
1549                     t_nrnb *nrnb, const t_idef *idef,
1550                     int ePBC, gmx_bool bMolPBC, const t_graph *g, const matrix box,
1551                     const t_commrec *cr, gmx_wallcycle *wcycle)
1552 {
1553     wallcycle_start(wcycle, ewcVSITESPREAD);
1554     const bool useDomdec = vsite->useDomdec;
1555     GMX_ASSERT(!useDomdec || (cr != nullptr && DOMAINDECOMP(cr)), "When vsites are set up with domain decomposition, we need a valid commrec");
1556
1557     t_pbc pbc, *pbc_null;
1558
1559     /* We only need to do pbc when we have inter-cg vsites */
1560     if ((useDomdec || bMolPBC) && vsite->numInterUpdategroupVsites)
1561     {
1562         /* This is wasting some CPU time as we now do this multiple times
1563          * per MD step.
1564          */
1565         pbc_null = set_pbc_dd(&pbc, ePBC, useDomdec ? cr->dd->nc : nullptr, FALSE, box);
1566     }
1567     else
1568     {
1569         pbc_null = nullptr;
1570     }
1571
1572     if (useDomdec)
1573     {
1574         dd_clear_f_vsites(cr->dd, f);
1575     }
1576
1577     if (vsite->nthreads == 1)
1578     {
1579         matrix dxdf;
1580         if (VirCorr)
1581         {
1582             clear_mat(dxdf);
1583         }
1584         spread_vsite_f_thread(x, f, fshift,
1585                               VirCorr, dxdf,
1586                               idef->iparams, idef->il,
1587                               g, pbc_null);
1588
1589         if (VirCorr)
1590         {
1591             for (int i = 0; i < DIM; i++)
1592             {
1593                 for (int j = 0; j < DIM; j++)
1594                 {
1595                     vir[i][j] += -0.5*dxdf[i][j];
1596                 }
1597             }
1598         }
1599     }
1600     else
1601     {
1602         /* First spread the vsites that might depend on non-local vsites */
1603         if (VirCorr)
1604         {
1605             clear_mat(vsite->tData[vsite->nthreads]->dxdf);
1606         }
1607         spread_vsite_f_thread(x, f, fshift,
1608                               VirCorr, vsite->tData[vsite->nthreads]->dxdf,
1609                               idef->iparams,
1610                               vsite->tData[vsite->nthreads]->ilist,
1611                               g, pbc_null);
1612
1613 #pragma omp parallel num_threads(vsite->nthreads)
1614         {
1615             try
1616             {
1617                 int          thread = gmx_omp_get_thread_num();
1618                 VsiteThread &tData  = *vsite->tData[thread];
1619
1620                 rvec        *fshift_t;
1621                 if (thread == 0 || fshift == nullptr)
1622                 {
1623                     fshift_t = fshift;
1624                 }
1625                 else
1626                 {
1627                     fshift_t = tData.fshift;
1628
1629                     for (int i = 0; i < SHIFTS; i++)
1630                     {
1631                         clear_rvec(fshift_t[i]);
1632                     }
1633                 }
1634                 if (VirCorr)
1635                 {
1636                     clear_mat(tData.dxdf);
1637                 }
1638
1639                 if (tData.useInterdependentTask)
1640                 {
1641                     /* Spread the vsites that spread outside our local range.
1642                      * This is done using a thread-local force buffer force.
1643                      * First we need to copy the input vsite forces to force.
1644                      */
1645                     InterdependentTask *idTask = &tData.idTask;
1646
1647                     /* Clear the buffer elements set by our task during
1648                      * the last call to spread_vsite_f.
1649                      */
1650                     clearTaskForceBufferUsedElements(idTask);
1651
1652                     int nvsite = idTask->vsite.size();
1653                     for (int i = 0; i < nvsite; i++)
1654                     {
1655                         copy_rvec(f[idTask->vsite[i]],
1656                                   idTask->force[idTask->vsite[i]]);
1657                     }
1658                     spread_vsite_f_thread(x, as_rvec_array(idTask->force.data()), fshift_t,
1659                                           VirCorr, tData.dxdf,
1660                                           idef->iparams,
1661                                           tData.idTask.ilist,
1662                                           g, pbc_null);
1663
1664                     /* We need a barrier before reducing forces below
1665                      * that have been produced by a different thread above.
1666                      */
1667 #pragma omp barrier
1668
1669                     /* Loop over all thread task and reduce forces they
1670                      * produced on atoms that fall in our range.
1671                      * Note that atomic reduction would be a simpler solution,
1672                      * but that might not have good support on all platforms.
1673                      */
1674                     int ntask = idTask->reduceTask.size();
1675                     for (int ti = 0; ti < ntask; ti++)
1676                     {
1677                         const InterdependentTask *idt_foreign = &vsite->tData[idTask->reduceTask[ti]]->idTask;
1678                         const AtomIndex          *atomList    = &idt_foreign->atomIndex[thread];
1679                         const RVec               *f_foreign   = idt_foreign->force.data();
1680
1681                         int natom = atomList->atom.size();
1682                         for (int i = 0; i < natom; i++)
1683                         {
1684                             int ind = atomList->atom[i];
1685                             rvec_inc(f[ind], f_foreign[ind]);
1686                             /* Clearing of f_foreign is done at the next step */
1687                         }
1688                     }
1689                     /* Clear the vsite forces, both in f and force */
1690                     for (int i = 0; i < nvsite; i++)
1691                     {
1692                         int ind = tData.idTask.vsite[i];
1693                         clear_rvec(f[ind]);
1694                         clear_rvec(tData.idTask.force[ind]);
1695                     }
1696                 }
1697
1698                 /* Spread the vsites that spread locally only */
1699                 spread_vsite_f_thread(x, f, fshift_t,
1700                                       VirCorr, tData.dxdf,
1701                                       idef->iparams,
1702                                       tData.ilist,
1703                                       g, pbc_null);
1704             }
1705             GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
1706         }
1707
1708         if (fshift != nullptr)
1709         {
1710             for (int th = 1; th < vsite->nthreads; th++)
1711             {
1712                 for (int i = 0; i < SHIFTS; i++)
1713                 {
1714                     rvec_inc(fshift[i], vsite->tData[th]->fshift[i]);
1715                 }
1716             }
1717         }
1718
1719         if (VirCorr)
1720         {
1721             for (int th = 0; th < vsite->nthreads + 1; th++)
1722             {
1723                 /* MSVC doesn't like matrix references, so we use a pointer */
1724                 const matrix *dxdf = &vsite->tData[th]->dxdf;
1725
1726                 for (int i = 0; i < DIM; i++)
1727                 {
1728                     for (int j = 0; j < DIM; j++)
1729                     {
1730                         vir[i][j] += -0.5*(*dxdf)[i][j];
1731                     }
1732                 }
1733             }
1734         }
1735     }
1736
1737     if (useDomdec)
1738     {
1739         dd_move_f_vsites(cr->dd, f, fshift);
1740     }
1741
1742     inc_nrnb(nrnb, eNR_VSITE2,   vsite_count(idef->il, F_VSITE2));
1743     inc_nrnb(nrnb, eNR_VSITE3,   vsite_count(idef->il, F_VSITE3));
1744     inc_nrnb(nrnb, eNR_VSITE3FD, vsite_count(idef->il, F_VSITE3FD));
1745     inc_nrnb(nrnb, eNR_VSITE3FAD, vsite_count(idef->il, F_VSITE3FAD));
1746     inc_nrnb(nrnb, eNR_VSITE3OUT, vsite_count(idef->il, F_VSITE3OUT));
1747     inc_nrnb(nrnb, eNR_VSITE4FD, vsite_count(idef->il, F_VSITE4FD));
1748     inc_nrnb(nrnb, eNR_VSITE4FDN, vsite_count(idef->il, F_VSITE4FDN));
1749     inc_nrnb(nrnb, eNR_VSITEN,   vsite_count(idef->il, F_VSITEN));
1750
1751     wallcycle_stop(wcycle, ewcVSITESPREAD);
1752 }
1753
1754 /*! \brief Returns the an array with group indices for each atom
1755  *
1756  * \param[in] grouping  The paritioning of the atom range into atom groups
1757  */
1758 static std::vector<int> makeAtomToGroupMapping(const gmx::RangePartitioning &grouping)
1759 {
1760     std::vector<int> atomToGroup(grouping.fullRange().end(), 0);
1761
1762     for (int group = 0; group < grouping.numBlocks(); group++)
1763     {
1764         auto block = grouping.block(group);
1765         std::fill(atomToGroup.begin() + block.begin(),
1766                   atomToGroup.begin() + block.end(),
1767                   group);
1768     }
1769
1770     return atomToGroup;
1771 }
1772
1773 int countNonlinearVsites(const gmx_mtop_t &mtop)
1774 {
1775     int numNonlinearVsites = 0;
1776     for (const gmx_molblock_t &molb : mtop.molblock)
1777     {
1778         const gmx_moltype_t &molt = mtop.moltype[molb.type];
1779
1780         for (const auto &ilist : extractILists(molt.ilist, IF_VSITE))
1781         {
1782             if (ilist.functionType != F_VSITE2 &&
1783                 ilist.functionType != F_VSITE3 &&
1784                 ilist.functionType != F_VSITEN)
1785             {
1786                 numNonlinearVsites += molb.nmol*ilist.iatoms.size()/(1 + NRAL(ilist.functionType));
1787             }
1788         }
1789     }
1790
1791     return numNonlinearVsites;
1792 }
1793
1794 int countInterUpdategroupVsites(const gmx_mtop_t                            &mtop,
1795                                 gmx::ArrayRef<const gmx::RangePartitioning>  updateGroupingPerMoleculetype)
1796 {
1797     int n_intercg_vsite = 0;
1798     for (const gmx_molblock_t &molb : mtop.molblock)
1799     {
1800         const gmx_moltype_t &molt = mtop.moltype[molb.type];
1801
1802         std::vector<int>     atomToGroup;
1803         if (!updateGroupingPerMoleculetype.empty())
1804         {
1805             atomToGroup = makeAtomToGroupMapping(updateGroupingPerMoleculetype[molb.type]);
1806         }
1807         for (int ftype = c_ftypeVsiteStart; ftype < c_ftypeVsiteEnd; ftype++)
1808         {
1809             const int              nral = NRAL(ftype);
1810             const InteractionList &il   = molt.ilist[ftype];
1811             for (int i = 0; i < il.size(); i += 1 + nral)
1812             {
1813                 bool isInterGroup = atomToGroup.empty();
1814                 if (!isInterGroup)
1815                 {
1816                     const int group = atomToGroup[il.iatoms[1 + i]];
1817                     for (int a = 1; a < nral; a++)
1818                     {
1819                         if (atomToGroup[il.iatoms[1 + a]] != group)
1820                         {
1821                             isInterGroup = true;
1822                             break;
1823                         }
1824                     }
1825                 }
1826                 if (isInterGroup)
1827                 {
1828                     n_intercg_vsite += molb.nmol;
1829                 }
1830             }
1831         }
1832     }
1833
1834     return n_intercg_vsite;
1835 }
1836
1837 std::unique_ptr<gmx_vsite_t>
1838 initVsite(const gmx_mtop_t &mtop,
1839           const t_commrec  *cr)
1840 {
1841     GMX_RELEASE_ASSERT(cr != nullptr, "We need a valid commrec");
1842
1843     std::unique_ptr<gmx_vsite_t> vsite;
1844
1845     /* check if there are vsites */
1846     int nvsite = 0;
1847     for (int ftype = 0; ftype < F_NRE; ftype++)
1848     {
1849         if (interaction_function[ftype].flags & IF_VSITE)
1850         {
1851             GMX_ASSERT(ftype >= c_ftypeVsiteStart && ftype < c_ftypeVsiteEnd, "c_ftypeVsiteStart and/or c_ftypeVsiteEnd do not have correct values");
1852
1853             nvsite += gmx_mtop_ftype_count(&mtop, ftype);
1854         }
1855         else
1856         {
1857             GMX_ASSERT(ftype < c_ftypeVsiteStart || ftype >= c_ftypeVsiteEnd, "c_ftypeVsiteStart and/or c_ftypeVsiteEnd do not have correct values");
1858         }
1859     }
1860
1861     if (nvsite == 0)
1862     {
1863         return vsite;
1864     }
1865
1866     vsite = std::make_unique<gmx_vsite_t>();
1867
1868     gmx::ArrayRef<const gmx::RangePartitioning> updateGroupingPerMoleculetype;
1869     if (DOMAINDECOMP(cr))
1870     {
1871         updateGroupingPerMoleculetype = getUpdateGroupingPerMoleculetype(*cr->dd);
1872     }
1873     vsite->numInterUpdategroupVsites = countInterUpdategroupVsites(mtop, updateGroupingPerMoleculetype);
1874
1875     vsite->useDomdec                 = (DOMAINDECOMP(cr) && cr->dd->nnodes > 1);
1876
1877     vsite->nthreads                  = gmx_omp_nthreads_get(emntVSITE);
1878
1879     if (vsite->nthreads > 1)
1880     {
1881         /* We need one extra thread data structure for the overlap vsites */
1882         vsite->tData.resize(vsite->nthreads + 1);
1883 #pragma omp parallel for num_threads(vsite->nthreads) schedule(static)
1884         for (int thread = 0; thread < vsite->nthreads; thread++)
1885         {
1886             try
1887             {
1888                 vsite->tData[thread] = std::make_unique<VsiteThread>();
1889
1890                 InterdependentTask &idTask = vsite->tData[thread]->idTask;
1891                 idTask.nuse                = 0;
1892                 idTask.atomIndex.resize(vsite->nthreads);
1893             }
1894             GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
1895         }
1896         if (vsite->nthreads > 1)
1897         {
1898             vsite->tData[vsite->nthreads] = std::make_unique<VsiteThread>();
1899         }
1900     }
1901
1902     return vsite;
1903 }
1904
1905 gmx_vsite_t::gmx_vsite_t()
1906 {
1907 }
1908
1909 gmx_vsite_t::~gmx_vsite_t()
1910 {
1911 }
1912
1913 static inline void flagAtom(InterdependentTask *idTask, int atom,
1914                             int thread, int nthread, int natperthread)
1915 {
1916     if (!idTask->use[atom])
1917     {
1918         idTask->use[atom] = true;
1919         thread            = atom/natperthread;
1920         /* Assign all non-local atom force writes to thread 0 */
1921         if (thread >= nthread)
1922         {
1923             thread        = 0;
1924         }
1925         idTask->atomIndex[thread].atom.push_back(atom);
1926     }
1927 }
1928
1929 /*\brief Here we try to assign all vsites that are in our local range.
1930  *
1931  * Our task local atom range is tData->rangeStart - tData->rangeEnd.
1932  * Vsites that depend only on local atoms, as indicated by taskIndex[]==thread,
1933  * are assigned to task tData->ilist. Vsites that depend on non-local atoms
1934  * but not on other vsites are assigned to task tData->id_task.ilist.
1935  * taskIndex[] is set for all vsites in our range, either to our local tasks
1936  * or to the single last task as taskIndex[]=2*nthreads.
1937  */
1938 static void assignVsitesToThread(VsiteThread           *tData,
1939                                  int                    thread,
1940                                  int                    nthread,
1941                                  int                    natperthread,
1942                                  gmx::ArrayRef<int>     taskIndex,
1943                                  const t_ilist         *ilist,
1944                                  const t_iparams       *ip,
1945                                  const unsigned short  *ptype)
1946 {
1947     for (int ftype = c_ftypeVsiteStart; ftype < c_ftypeVsiteEnd; ftype++)
1948     {
1949         tData->ilist[ftype].nr        = 0;
1950         tData->idTask.ilist[ftype].nr = 0;
1951
1952         int      nral1 = 1 + NRAL(ftype);
1953         int      inc   = nral1;
1954         t_iatom *iat   = ilist[ftype].iatoms;
1955         for (int i = 0; i < ilist[ftype].nr; )
1956         {
1957             if (ftype == F_VSITEN)
1958             {
1959                 /* The 3 below is from 1+NRAL(ftype)=3 */
1960                 inc = ip[iat[i]].vsiten.n*3;
1961             }
1962
1963             if (iat[1 + i] <  tData->rangeStart ||
1964                 iat[1 + i] >= tData->rangeEnd)
1965             {
1966                 /* This vsite belongs to a different thread */
1967                 i += inc;
1968                 continue;
1969             }
1970
1971             /* We would like to assign this vsite to task thread,
1972              * but it might depend on atoms outside the atom range of thread
1973              * or on another vsite not assigned to task thread.
1974              */
1975             int task = thread;
1976             if (ftype != F_VSITEN)
1977             {
1978                 for (int j = i + 2; j < i + nral1; j++)
1979                 {
1980                     /* Do a range check to avoid a harmless race on taskIndex */
1981                     if (iat[j] <  tData->rangeStart ||
1982                         iat[j] >= tData->rangeEnd ||
1983                         taskIndex[iat[j]] != thread)
1984                     {
1985                         if (!tData->useInterdependentTask ||
1986                             ptype[iat[j]] == eptVSite)
1987                         {
1988                             /* At least one constructing atom is a vsite
1989                              * that is not assigned to the same thread.
1990                              * Put this vsite into a separate task.
1991                              */
1992                             task = 2*nthread;
1993                             break;
1994                         }
1995
1996                         /* There are constructing atoms outside our range,
1997                          * put this vsite into a second task to be executed
1998                          * on the same thread. During construction no barrier
1999                          * is needed between the two tasks on the same thread.
2000                          * During spreading we need to run this task with
2001                          * an additional thread-local intermediate force buffer
2002                          * (or atomic reduction) and a barrier between the two
2003                          * tasks.
2004                          */
2005                         task = nthread + thread;
2006                     }
2007                 }
2008             }
2009             else
2010             {
2011                 for (int j = i + 2; j < i + inc; j += 3)
2012                 {
2013                     /* Do a range check to avoid a harmless race on taskIndex */
2014                     if (iat[j] <  tData->rangeStart ||
2015                         iat[j] >= tData->rangeEnd ||
2016                         taskIndex[iat[j]] != thread)
2017                     {
2018                         GMX_ASSERT(ptype[iat[j]] != eptVSite, "A vsite to be assigned in assignVsitesToThread has a vsite as a constructing atom that does not belong to our task, such vsites should be assigned to the single 'master' task");
2019
2020                         task = nthread + thread;
2021                     }
2022                 }
2023             }
2024
2025             /* Update this vsite's thread index entry */
2026             taskIndex[iat[1+i]] = task;
2027
2028             if (task == thread || task == nthread + thread)
2029             {
2030                 /* Copy this vsite to the thread data struct of thread */
2031                 t_ilist *il_task;
2032                 if (task == thread)
2033                 {
2034                     il_task = &tData->ilist[ftype];
2035                 }
2036                 else
2037                 {
2038                     il_task = &tData->idTask.ilist[ftype];
2039                 }
2040                 /* Ensure we have sufficient memory allocated */
2041                 if (il_task->nr + inc > il_task->nalloc)
2042                 {
2043                     il_task->nalloc = over_alloc_large(il_task->nr + inc);
2044                     srenew(il_task->iatoms, il_task->nalloc);
2045                 }
2046                 /* Copy the vsite data to the thread-task local array */
2047                 for (int j = i; j < i + inc; j++)
2048                 {
2049                     il_task->iatoms[il_task->nr++] = iat[j];
2050                 }
2051                 if (task == nthread + thread)
2052                 {
2053                     /* This vsite write outside our own task force block.
2054                      * Put it into the interdependent task list and flag
2055                      * the atoms involved for reduction.
2056                      */
2057                     tData->idTask.vsite.push_back(iat[i + 1]);
2058                     if (ftype != F_VSITEN)
2059                     {
2060                         for (int j = i + 2; j < i + nral1; j++)
2061                         {
2062                             flagAtom(&tData->idTask, iat[j],
2063                                      thread, nthread, natperthread);
2064                         }
2065                     }
2066                     else
2067                     {
2068                         for (int j = i + 2; j < i + inc; j += 3)
2069                         {
2070                             flagAtom(&tData->idTask, iat[j],
2071                                      thread, nthread, natperthread);
2072                         }
2073                     }
2074                 }
2075             }
2076
2077             i += inc;
2078         }
2079     }
2080 }
2081
2082 /*! \brief Assign all vsites with taskIndex[]==task to task tData */
2083 static void assignVsitesToSingleTask(VsiteThread              *tData,
2084                                      int                       task,
2085                                      gmx::ArrayRef<const int>  taskIndex,
2086                                      const t_ilist            *ilist,
2087                                      const t_iparams          *ip)
2088 {
2089     for (int ftype = c_ftypeVsiteStart; ftype < c_ftypeVsiteEnd; ftype++)
2090     {
2091         tData->ilist[ftype].nr        = 0;
2092         tData->idTask.ilist[ftype].nr = 0;
2093
2094         int      nral1   = 1 + NRAL(ftype);
2095         int      inc     = nral1;
2096         t_iatom *iat     = ilist[ftype].iatoms;
2097         t_ilist *il_task = &tData->ilist[ftype];
2098
2099         for (int i = 0; i < ilist[ftype].nr; )
2100         {
2101             if (ftype == F_VSITEN)
2102             {
2103                 /* The 3 below is from 1+NRAL(ftype)=3 */
2104                 inc = ip[iat[i]].vsiten.n*3;
2105             }
2106             /* Check if the vsite is assigned to our task */
2107             if (taskIndex[iat[1 + i]] == task)
2108             {
2109                 /* Ensure we have sufficient memory allocated */
2110                 if (il_task->nr + inc > il_task->nalloc)
2111                 {
2112                     il_task->nalloc = over_alloc_large(il_task->nr + inc);
2113                     srenew(il_task->iatoms, il_task->nalloc);
2114                 }
2115                 /* Copy the vsite data to the thread-task local array */
2116                 for (int j = i; j < i + inc; j++)
2117                 {
2118                     il_task->iatoms[il_task->nr++] = iat[j];
2119                 }
2120             }
2121
2122             i += inc;
2123         }
2124     }
2125 }
2126
2127 void split_vsites_over_threads(const t_ilist   *ilist,
2128                                const t_iparams *ip,
2129                                const t_mdatoms *mdatoms,
2130                                gmx_vsite_t     *vsite)
2131 {
2132     int      vsite_atom_range, natperthread;
2133
2134     if (vsite->nthreads == 1)
2135     {
2136         /* Nothing to do */
2137         return;
2138     }
2139
2140     /* The current way of distributing the vsites over threads in primitive.
2141      * We divide the atom range 0 - natoms_in_vsite uniformly over threads,
2142      * without taking into account how the vsites are distributed.
2143      * Without domain decomposition we at least tighten the upper bound
2144      * of the range (useful for common systems such as a vsite-protein
2145      * in 3-site water).
2146      * With domain decomposition, as long as the vsites are distributed
2147      * uniformly in each domain along the major dimension, usually x,
2148      * it will also perform well.
2149      */
2150     if (!vsite->useDomdec)
2151     {
2152         vsite_atom_range = -1;
2153         for (int ftype = c_ftypeVsiteStart; ftype < c_ftypeVsiteEnd; ftype++)
2154         {
2155             {   // TODO remove me
2156                 if (ftype != F_VSITEN)
2157                 {
2158                     int            nral1 = 1 + NRAL(ftype);
2159                     const t_iatom *iat   = ilist[ftype].iatoms;
2160                     for (int i = 0; i < ilist[ftype].nr; i += nral1)
2161                     {
2162                         for (int j = i + 1; j < i + nral1; j++)
2163                         {
2164                             vsite_atom_range = std::max(vsite_atom_range, iat[j]);
2165                         }
2166                     }
2167                 }
2168                 else
2169                 {
2170                     int            vs_ind_end;
2171
2172                     const t_iatom *iat = ilist[ftype].iatoms;
2173
2174                     int            i = 0;
2175                     while (i < ilist[ftype].nr)
2176                     {
2177                         /* The 3 below is from 1+NRAL(ftype)=3 */
2178                         vs_ind_end = i + ip[iat[i]].vsiten.n*3;
2179
2180                         vsite_atom_range = std::max(vsite_atom_range, iat[i+1]);
2181                         while (i < vs_ind_end)
2182                         {
2183                             vsite_atom_range = std::max(vsite_atom_range, iat[i+2]);
2184                             i               += 3;
2185                         }
2186                     }
2187                 }
2188             }
2189         }
2190         vsite_atom_range++;
2191         natperthread     = (vsite_atom_range + vsite->nthreads - 1)/vsite->nthreads;
2192     }
2193     else
2194     {
2195         /* Any local or not local atom could be involved in virtual sites.
2196          * But since we usually have very few non-local virtual sites
2197          * (only non-local vsites that depend on local vsites),
2198          * we distribute the local atom range equally over the threads.
2199          * When assigning vsites to threads, we should take care that the last
2200          * threads also covers the non-local range.
2201          */
2202         vsite_atom_range = mdatoms->nr;
2203         natperthread     = (mdatoms->homenr + vsite->nthreads - 1)/vsite->nthreads;
2204     }
2205
2206     if (debug)
2207     {
2208         fprintf(debug, "virtual site thread dist: natoms %d, range %d, natperthread %d\n", mdatoms->nr, vsite_atom_range, natperthread);
2209     }
2210
2211     /* To simplify the vsite assignment, we make an index which tells us
2212      * to which task particles, both non-vsites and vsites, are assigned.
2213      */
2214     vsite->taskIndex.resize(mdatoms->nr);
2215
2216     /* Initialize the task index array. Here we assign the non-vsite
2217      * particles to task=thread, so we easily figure out if vsites
2218      * depend on local and/or non-local particles in assignVsitesToThread.
2219      */
2220     gmx::ArrayRef<int> taskIndex = vsite->taskIndex;
2221     {
2222         int  thread = 0;
2223         for (int i = 0; i < mdatoms->nr; i++)
2224         {
2225             if (mdatoms->ptype[i] == eptVSite)
2226             {
2227                 /* vsites are not assigned to a task yet */
2228                 taskIndex[i] = -1;
2229             }
2230             else
2231             {
2232                 /* assign non-vsite particles to task thread */
2233                 taskIndex[i] = thread;
2234             }
2235             if (i == (thread + 1)*natperthread && thread < vsite->nthreads)
2236             {
2237                 thread++;
2238             }
2239         }
2240     }
2241
2242 #pragma omp parallel num_threads(vsite->nthreads)
2243     {
2244         try
2245         {
2246             int          thread = gmx_omp_get_thread_num();
2247             VsiteThread &tData  = *vsite->tData[thread];
2248
2249             /* Clear the buffer use flags that were set before */
2250             if (tData.useInterdependentTask)
2251             {
2252                 InterdependentTask &idTask = tData.idTask;
2253
2254                 /* To avoid an extra OpenMP barrier in spread_vsite_f,
2255                  * we clear the force buffer at the next step,
2256                  * so we need to do it here as well.
2257                  */
2258                 clearTaskForceBufferUsedElements(&idTask);
2259
2260                 idTask.vsite.resize(0);
2261                 for (int t = 0; t < vsite->nthreads; t++)
2262                 {
2263                     AtomIndex &atomIndex = idTask.atomIndex[t];
2264                     int        natom     = atomIndex.atom.size();
2265                     for (int i = 0; i < natom; i++)
2266                     {
2267                         idTask.use[atomIndex.atom[i]] = false;
2268                     }
2269                     atomIndex.atom.resize(0);
2270                 }
2271                 idTask.nuse = 0;
2272             }
2273
2274             /* To avoid large f_buf allocations of #threads*vsite_atom_range
2275              * we don't use task2 with more than 200000 atoms. This doesn't
2276              * affect performance, since with such a large range relatively few
2277              * vsites will end up in the separate task.
2278              * Note that useTask2 should be the same for all threads.
2279              */
2280             tData.useInterdependentTask = (vsite_atom_range <= 200000);
2281             if (tData.useInterdependentTask)
2282             {
2283                 size_t              natoms_use_in_vsites = vsite_atom_range;
2284                 InterdependentTask &idTask               = tData.idTask;
2285                 /* To avoid resizing and re-clearing every nstlist steps,
2286                  * we never down size the force buffer.
2287                  */
2288                 if (natoms_use_in_vsites > idTask.force.size() ||
2289                     natoms_use_in_vsites > idTask.use.size())
2290                 {
2291                     idTask.force.resize(natoms_use_in_vsites, { 0, 0, 0 });
2292                     idTask.use.resize(natoms_use_in_vsites, false);
2293                 }
2294             }
2295
2296             /* Assign all vsites that can execute independently on threads */
2297             tData.rangeStart     =  thread     *natperthread;
2298             if (thread < vsite->nthreads - 1)
2299             {
2300                 tData.rangeEnd   = (thread + 1)*natperthread;
2301             }
2302             else
2303             {
2304                 /* The last thread should cover up to the end of the range */
2305                 tData.rangeEnd   = mdatoms->nr;
2306             }
2307             assignVsitesToThread(&tData,
2308                                  thread, vsite->nthreads,
2309                                  natperthread,
2310                                  taskIndex,
2311                                  ilist, ip, mdatoms->ptype);
2312
2313             if (tData.useInterdependentTask)
2314             {
2315                 /* In the worst case, all tasks write to force ranges of
2316                  * all other tasks, leading to #tasks^2 scaling (this is only
2317                  * the overhead, the actual flops remain constant).
2318                  * But in most cases there is far less coupling. To improve
2319                  * scaling at high thread counts we therefore construct
2320                  * an index to only loop over the actually affected tasks.
2321                  */
2322                 InterdependentTask &idTask = tData.idTask;
2323
2324                 /* Ensure assignVsitesToThread finished on other threads */
2325 #pragma omp barrier
2326
2327                 idTask.spreadTask.resize(0);
2328                 idTask.reduceTask.resize(0);
2329                 for (int t = 0; t < vsite->nthreads; t++)
2330                 {
2331                     /* Do we write to the force buffer of task t? */
2332                     if (!idTask.atomIndex[t].atom.empty())
2333                     {
2334                         idTask.spreadTask.push_back(t);
2335                     }
2336                     /* Does task t write to our force buffer? */
2337                     if (!vsite->tData[t]->idTask.atomIndex[thread].atom.empty())
2338                     {
2339                         idTask.reduceTask.push_back(t);
2340                     }
2341                 }
2342             }
2343         }
2344         GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
2345     }
2346     /* Assign all remaining vsites, that will have taskIndex[]=2*vsite->nthreads,
2347      * to a single task that will not run in parallel with other tasks.
2348      */
2349     assignVsitesToSingleTask(vsite->tData[vsite->nthreads].get(),
2350                              2*vsite->nthreads,
2351                              taskIndex,
2352                              ilist, ip);
2353
2354     if (debug && vsite->nthreads > 1)
2355     {
2356         fprintf(debug, "virtual site useInterdependentTask %d, nuse:\n",
2357                 static_cast<int>(vsite->tData[0]->useInterdependentTask));
2358         for (int th = 0; th < vsite->nthreads + 1; th++)
2359         {
2360             fprintf(debug, " %4d", vsite->tData[th]->idTask.nuse);
2361         }
2362         fprintf(debug, "\n");
2363
2364         for (int ftype = c_ftypeVsiteStart; ftype < c_ftypeVsiteEnd; ftype++)
2365         {
2366             if (ilist[ftype].nr > 0)
2367             {
2368                 fprintf(debug, "%-20s thread dist:",
2369                         interaction_function[ftype].longname);
2370                 for (int th = 0; th < vsite->nthreads + 1; th++)
2371                 {
2372                     fprintf(debug, " %4d %4d ",
2373                             vsite->tData[th]->ilist[ftype].nr,
2374                             vsite->tData[th]->idTask.ilist[ftype].nr);
2375                 }
2376                 fprintf(debug, "\n");
2377             }
2378         }
2379     }
2380
2381 #ifndef NDEBUG
2382     int nrOrig     = vsiteIlistNrCount(ilist);
2383     int nrThreaded = 0;
2384     for (int th = 0; th < vsite->nthreads + 1; th++)
2385     {
2386         nrThreaded +=
2387             vsiteIlistNrCount(vsite->tData[th]->ilist) +
2388             vsiteIlistNrCount(vsite->tData[th]->idTask.ilist);
2389     }
2390     GMX_ASSERT(nrThreaded == nrOrig, "The number of virtual sites assigned to all thread task has to match the total number of virtual sites");
2391 #endif
2392 }
2393
2394 void set_vsite_top(gmx_vsite_t          *vsite,
2395                    const gmx_localtop_t *top,
2396                    const t_mdatoms      *md)
2397 {
2398     if (vsite->nthreads > 1)
2399     {
2400         split_vsites_over_threads(top->idef.il, top->idef.iparams,
2401                                   md, vsite);
2402     }
2403 }