src/gromacs/mdlib/vsite.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   5  * Copyright (c) 2001-2004, The GROMACS development team.
   6  * Copyright (c) 2013,2014,2015,2016,2017, by the GROMACS development team, led by
   7  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   8  * and including many others, as listed in the AUTHORS file in the
   9  * top-level source directory and at http://www.gromacs.org.
  10  *
  11  * GROMACS is free software; you can redistribute it and/or
  12  * modify it under the terms of the GNU Lesser General Public License
  13  * as published by the Free Software Foundation; either version 2.1
  14  * of the License, or (at your option) any later version.
  15  *
  16  * GROMACS is distributed in the hope that it will be useful,
  17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  19  * Lesser General Public License for more details.
  20  *
  21  * You should have received a copy of the GNU Lesser General Public
  22  * License along with GROMACS; if not, see
  23  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  24  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  25  *
  26  * If you want to redistribute modifications to GROMACS, please
  27  * consider that scientific software is very special. Version
  28  * control is crucial - bugs must be traceable. We will be happy to
  29  * consider code for inclusion in the official distribution, but
  30  * derived work must not be called official GROMACS. Details are found
  31  * in the README & COPYING files - if they are missing, get the
  32  * official version at http://www.gromacs.org.
  33  *
  34  * To help us fund GROMACS development, we humbly ask that you cite
  35  * the research papers on the package. Check out http://www.gromacs.org.
  36  */
  37 #include "gmxpre.h"
  38
  39 #include "vsite.h"
  40
  41 #include <stdio.h>
  42
  43 #include <algorithm>
  44 #include <vector>
  45
  46 #include "gromacs/domdec/domdec.h"
  47 #include "gromacs/domdec/domdec_struct.h"
  48 #include "gromacs/gmxlib/network.h"
  49 #include "gromacs/gmxlib/nrnb.h"
  50 #include "gromacs/math/functions.h"
  51 #include "gromacs/math/vec.h"
  52 #include "gromacs/mdlib/gmx_omp_nthreads.h"
  53 #include "gromacs/mdtypes/commrec.h"
  54 #include "gromacs/pbcutil/ishift.h"
  55 #include "gromacs/pbcutil/mshift.h"
  56 #include "gromacs/pbcutil/pbc.h"
  57 #include "gromacs/topology/ifunc.h"
  58 #include "gromacs/topology/mtop_util.h"
  59 #include "gromacs/utility/exceptions.h"
  60 #include "gromacs/utility/fatalerror.h"
  61 #include "gromacs/utility/gmxassert.h"
  62 #include "gromacs/utility/gmxomp.h"
  63 #include "gromacs/utility/smalloc.h"
  64
  65
  66 /* The strategy used here for assigning virtual sites to (thread-)tasks
  67  * is as follows:
  68  *
  69  * We divide the atom range that vsites operate on (natoms_local with DD,
  70  * 0 - last atom involved in vsites without DD) equally over all threads.
  71  *
  72  * Vsites in the local range constructed from atoms in the local range
  73  * and/or other vsites that are fully local are assigned to a simple,
  74  * independent task.
  75  *
  76  * Vsites that are not assigned after using the above criterion get assigned
  77  * to a so called "interdependent" thread task when none of the constructing
  78  * atoms is a vsite. These tasks are called interdependent, because one task
  79  * accesses atoms assigned to a different task/thread.
  80  * Note that this option is turned off with large (local) atom counts
  81  * to avoid high memory usage.
  82  *
  83  * Any remaining vsites are assigned to a separate master thread task.
  84  */
  85
  86 using gmx::RVec;
  87
  88 static void init_ilist(t_ilist *ilist)
  89 {
  90     for (int i = 0; i < F_NRE; i++)
  91     {
  92         ilist[i].nr     = 0;
  93         ilist[i].nalloc = 0;
  94         ilist[i].iatoms = nullptr;
  95     }
  96 }
  97
  98 /*! \brief List of atom indices belonging to a task */
  99 struct AtomIndex {
 100     //! List of atom indices
 101     std::vector<int> atom;
 102 };
 103
 104 /*! \brief Data structure for thread tasks that use constructing atoms outside their own atom range */
 105 struct InterdependentTask
 106 {
 107     //! The interaction lists, only vsite entries are used
 108     t_ilist                ilist[F_NRE];
 109     //! Thread/task-local force buffer
 110     std::vector<RVec>      force;
 111     //! The atom indices of the vsites of our task
 112     std::vector<int>       vsite;
 113     //! Flags if elements in force are spread to or not
 114     std::vector<bool>      use;
 115     //! The number of entries set to true in use
 116     int                    nuse;
 117     //! Array of atoms indices, size nthreads, covering all nuse set elements in use
 118     std::vector<AtomIndex> atomIndex;
 119     //! List of tasks (force blocks) this task spread forces to
 120     std::vector<int>       spreadTask;
 121     //! List of tasks that write to this tasks force block range
 122     std::vector<int>       reduceTask;
 123
 124     InterdependentTask()
 125     {
 126         init_ilist(ilist);
 127         nuse = 0;
 128     }
 129 };
 130
 131 /*! \brief Vsite thread task data structure */
 132 struct VsiteThread {
 133     //! Start of atom range of this task
 134     int                rangeStart;
 135     //! End of atom range of this task
 136     int                rangeEnd;
 137     //! The interaction lists, only vsite entries are used
 138     t_ilist            ilist[F_NRE];
 139     //! Local fshift accumulation buffer
 140     rvec               fshift[SHIFTS];
 141     //! Local virial dx*df accumulation buffer
 142     matrix             dxdf;
 143     //! Tells if interdependent task idTask should be used (in addition to the rest of this task), this bool has the same value on all threads
 144     bool               useInterdependentTask;
 145     //! Data for vsites that involve constructing atoms in the atom range of other threads/tasks
 146     InterdependentTask idTask;
 147
 148     /*! \brief Constructor */
 149     VsiteThread()
 150     {
 151         rangeStart            = -1;
 152         rangeEnd              = -1;
 153         init_ilist(ilist);
 154         clear_rvecs(SHIFTS, fshift);
 155         clear_mat(dxdf);
 156         useInterdependentTask = false;
 157     }
 158 };
 159
 160
 161 /* The start and end values of for the vsite indices in the ftype enum.
 162  * The validity of these values is checked in init_vsite.
 163  * This is used to avoid loops over all ftypes just to get the vsite entries.
 164  * (We should replace the fixed ilist array by only the used entries.)
 165  */
 166 static const int c_ftypeVsiteStart = F_VSITE2;
 167 static const int c_ftypeVsiteEnd   = F_VSITEN + 1;
 168
 169
 170 /*! \brief Returns the sum of the vsite ilist sizes over all vsite types
 171  *
 172  * \param[in] ilist  The interaction list
 173  */
 174 static int vsiteIlistNrCount(const t_ilist *ilist)
 175 {
 176     int nr = 0;
 177     for (int ftype = c_ftypeVsiteStart; ftype < c_ftypeVsiteEnd; ftype++)
 178     {
 179         nr += ilist[ftype].nr;
 180     }
 181
 182     return nr;
 183 }
 184
 185 static int pbc_rvec_sub(const t_pbc *pbc, const rvec xi, const rvec xj, rvec dx)
 186 {
 187     if (pbc)
 188     {
 189         return pbc_dx_aiuc(pbc, xi, xj, dx);
 190     }
 191     else
 192     {
 193         rvec_sub(xi, xj, dx);
 194         return CENTRAL;
 195     }
 196 }
 197
 198 /* Vsite construction routines */
 199
 200 static void constr_vsite2(const rvec xi, const rvec xj, rvec x, real a, const t_pbc *pbc)
 201 {
 202     real b = 1 - a;
 203     /* 1 flop */
 204
 205     if (pbc)
 206     {
 207         rvec dx;
 208         pbc_dx_aiuc(pbc, xj, xi, dx);
 209         x[XX] = xi[XX] + a*dx[XX];
 210         x[YY] = xi[YY] + a*dx[YY];
 211         x[ZZ] = xi[ZZ] + a*dx[ZZ];
 212     }
 213     else
 214     {
 215         x[XX] = b*xi[XX] + a*xj[XX];
 216         x[YY] = b*xi[YY] + a*xj[YY];
 217         x[ZZ] = b*xi[ZZ] + a*xj[ZZ];
 218         /* 9 Flops */
 219     }
 220
 221     /* TOTAL: 10 flops */
 222 }
 223
 224 static void constr_vsite3(const rvec xi, const rvec xj, const rvec xk, rvec x, real a, real b,
 225                           const t_pbc *pbc)
 226 {
 227     real c = 1 - a - b;
 228     /* 2 flops */
 229
 230     if (pbc)
 231     {
 232         rvec dxj, dxk;
 233
 234         pbc_dx_aiuc(pbc, xj, xi, dxj);
 235         pbc_dx_aiuc(pbc, xk, xi, dxk);
 236         x[XX] = xi[XX] + a*dxj[XX] + b*dxk[XX];
 237         x[YY] = xi[YY] + a*dxj[YY] + b*dxk[YY];
 238         x[ZZ] = xi[ZZ] + a*dxj[ZZ] + b*dxk[ZZ];
 239     }
 240     else
 241     {
 242         x[XX] = c*xi[XX] + a*xj[XX] + b*xk[XX];
 243         x[YY] = c*xi[YY] + a*xj[YY] + b*xk[YY];
 244         x[ZZ] = c*xi[ZZ] + a*xj[ZZ] + b*xk[ZZ];
 245         /* 15 Flops */
 246     }
 247
 248     /* TOTAL: 17 flops */
 249 }
 250
 251 static void constr_vsite3FD(const rvec xi, const rvec xj, const rvec xk, rvec x, real a, real b,
 252                             const t_pbc *pbc)
 253 {
 254     rvec xij, xjk, temp;
 255     real c;
 256
 257     pbc_rvec_sub(pbc, xj, xi, xij);
 258     pbc_rvec_sub(pbc, xk, xj, xjk);
 259     /* 6 flops */
 260
 261     /* temp goes from i to a point on the line jk */
 262     temp[XX] = xij[XX] + a*xjk[XX];
 263     temp[YY] = xij[YY] + a*xjk[YY];
 264     temp[ZZ] = xij[ZZ] + a*xjk[ZZ];
 265     /* 6 flops */
 266
 267     c = b*gmx::invsqrt(iprod(temp, temp));
 268     /* 6 + 10 flops */
 269
 270     x[XX] = xi[XX] + c*temp[XX];
 271     x[YY] = xi[YY] + c*temp[YY];
 272     x[ZZ] = xi[ZZ] + c*temp[ZZ];
 273     /* 6 Flops */
 274
 275     /* TOTAL: 34 flops */
 276 }
 277
 278 static void constr_vsite3FAD(const rvec xi, const rvec xj, const rvec xk, rvec x, real a, real b, const t_pbc *pbc)
 279 {
 280     rvec xij, xjk, xp;
 281     real a1, b1, c1, invdij;
 282
 283     pbc_rvec_sub(pbc, xj, xi, xij);
 284     pbc_rvec_sub(pbc, xk, xj, xjk);
 285     /* 6 flops */
 286
 287     invdij = gmx::invsqrt(iprod(xij, xij));
 288     c1     = invdij * invdij * iprod(xij, xjk);
 289     xp[XX] = xjk[XX] - c1*xij[XX];
 290     xp[YY] = xjk[YY] - c1*xij[YY];
 291     xp[ZZ] = xjk[ZZ] - c1*xij[ZZ];
 292     a1     = a*invdij;
 293     b1     = b*gmx::invsqrt(iprod(xp, xp));
 294     /* 45 */
 295
 296     x[XX] = xi[XX] + a1*xij[XX] + b1*xp[XX];
 297     x[YY] = xi[YY] + a1*xij[YY] + b1*xp[YY];
 298     x[ZZ] = xi[ZZ] + a1*xij[ZZ] + b1*xp[ZZ];
 299     /* 12 Flops */
 300
 301     /* TOTAL: 63 flops */
 302 }
 303
 304 static void constr_vsite3OUT(const rvec xi, const rvec xj, const rvec xk, rvec x,
 305                              real a, real b, real c, const t_pbc *pbc)
 306 {
 307     rvec xij, xik, temp;
 308
 309     pbc_rvec_sub(pbc, xj, xi, xij);
 310     pbc_rvec_sub(pbc, xk, xi, xik);
 311     cprod(xij, xik, temp);
 312     /* 15 Flops */
 313
 314     x[XX] = xi[XX] + a*xij[XX] + b*xik[XX] + c*temp[XX];
 315     x[YY] = xi[YY] + a*xij[YY] + b*xik[YY] + c*temp[YY];
 316     x[ZZ] = xi[ZZ] + a*xij[ZZ] + b*xik[ZZ] + c*temp[ZZ];
 317     /* 18 Flops */
 318
 319     /* TOTAL: 33 flops */
 320 }
 321
 322 static void constr_vsite4FD(const rvec xi, const rvec xj, const rvec xk, const rvec xl, rvec x,
 323                             real a, real b, real c, const t_pbc *pbc)
 324 {
 325     rvec xij, xjk, xjl, temp;
 326     real d;
 327
 328     pbc_rvec_sub(pbc, xj, xi, xij);
 329     pbc_rvec_sub(pbc, xk, xj, xjk);
 330     pbc_rvec_sub(pbc, xl, xj, xjl);
 331     /* 9 flops */
 332
 333     /* temp goes from i to a point on the plane jkl */
 334     temp[XX] = xij[XX] + a*xjk[XX] + b*xjl[XX];
 335     temp[YY] = xij[YY] + a*xjk[YY] + b*xjl[YY];
 336     temp[ZZ] = xij[ZZ] + a*xjk[ZZ] + b*xjl[ZZ];
 337     /* 12 flops */
 338
 339     d = c*gmx::invsqrt(iprod(temp, temp));
 340     /* 6 + 10 flops */
 341
 342     x[XX] = xi[XX] + d*temp[XX];
 343     x[YY] = xi[YY] + d*temp[YY];
 344     x[ZZ] = xi[ZZ] + d*temp[ZZ];
 345     /* 6 Flops */
 346
 347     /* TOTAL: 43 flops */
 348 }
 349
 350 static void constr_vsite4FDN(const rvec xi, const rvec xj, const rvec xk, const rvec xl, rvec x,
 351                              real a, real b, real c, const t_pbc *pbc)
 352 {
 353     rvec xij, xik, xil, ra, rb, rja, rjb, rm;
 354     real d;
 355
 356     pbc_rvec_sub(pbc, xj, xi, xij);
 357     pbc_rvec_sub(pbc, xk, xi, xik);
 358     pbc_rvec_sub(pbc, xl, xi, xil);
 359     /* 9 flops */
 360
 361     ra[XX] = a*xik[XX];
 362     ra[YY] = a*xik[YY];
 363     ra[ZZ] = a*xik[ZZ];
 364
 365     rb[XX] = b*xil[XX];
 366     rb[YY] = b*xil[YY];
 367     rb[ZZ] = b*xil[ZZ];
 368
 369     /* 6 flops */
 370
 371     rvec_sub(ra, xij, rja);
 372     rvec_sub(rb, xij, rjb);
 373     /* 6 flops */
 374
 375     cprod(rja, rjb, rm);
 376     /* 9 flops */
 377
 378     d = c*gmx::invsqrt(norm2(rm));
 379     /* 5+5+1 flops */
 380
 381     x[XX] = xi[XX] + d*rm[XX];
 382     x[YY] = xi[YY] + d*rm[YY];
 383     x[ZZ] = xi[ZZ] + d*rm[ZZ];
 384     /* 6 Flops */
 385
 386     /* TOTAL: 47 flops */
 387 }
 388
 389
 390 static int constr_vsiten(const t_iatom *ia, const t_iparams ip[],
 391                          rvec *x, const t_pbc *pbc)
 392 {
 393     rvec x1, dx;
 394     dvec dsum;
 395     int  n3, av, ai;
 396     real a;
 397
 398     n3 = 3*ip[ia[0]].vsiten.n;
 399     av = ia[1];
 400     ai = ia[2];
 401     copy_rvec(x[ai], x1);
 402     clear_dvec(dsum);
 403     for (int i = 3; i < n3; i += 3)
 404     {
 405         ai = ia[i+2];
 406         a  = ip[ia[i]].vsiten.a;
 407         if (pbc)
 408         {
 409             pbc_dx_aiuc(pbc, x[ai], x1, dx);
 410         }
 411         else
 412         {
 413             rvec_sub(x[ai], x1, dx);
 414         }
 415         dsum[XX] += a*dx[XX];
 416         dsum[YY] += a*dx[YY];
 417         dsum[ZZ] += a*dx[ZZ];
 418         /* 9 Flops */
 419     }
 420
 421     x[av][XX] = x1[XX] + dsum[XX];
 422     x[av][YY] = x1[YY] + dsum[YY];
 423     x[av][ZZ] = x1[ZZ] + dsum[ZZ];
 424
 425     return n3;
 426 }
 427
 428 /*! \brief PBC modes for vsite construction and spreading */
 429 enum class PbcMode
 430 {
 431     all,         // Apply normal, simple PBC for all vsites
 432     chargeGroup, // Keep vsite in the same periodic image as the rest of it's charge group
 433     none         // No PBC treatment needed
 434 };
 435
 436 /*! \brief Returns the PBC mode based on the system PBC and vsite properties
 437  *
 438  * \param[in] pbcPtr  A pointer to a PBC struct or nullptr when no PBC treatment is required
 439  * \param[in] vsite   A pointer to the vsite struct, can be nullptr
 440  */
 441 static PbcMode getPbcMode(const t_pbc       *pbcPtr,
 442                           const gmx_vsite_t *vsite)
 443 {
 444     if (pbcPtr == nullptr)
 445     {
 446         return PbcMode::none;
 447     }
 448     else if (vsite != nullptr && vsite->bHaveChargeGroups)
 449     {
 450         return PbcMode::chargeGroup;
 451     }
 452     else
 453     {
 454         return PbcMode::all;
 455     }
 456 }
 457
 458 static void construct_vsites_thread(const gmx_vsite_t *vsite,
 459                                     rvec x[],
 460                                     real dt, rvec *v,
 461                                     const t_iparams ip[], const t_ilist ilist[],
 462                                     const t_pbc *pbc_null)
 463 {
 464     real         inv_dt;
 465     if (v != nullptr)
 466     {
 467         inv_dt = 1.0/dt;
 468     }
 469     else
 470     {
 471         inv_dt = 1.0;
 472     }
 473
 474     const PbcMode  pbcMode   = getPbcMode(pbc_null, vsite);
 475     /* We need another pbc pointer, as with charge groups we switch per vsite */
 476     const t_pbc   *pbc_null2 = pbc_null;
 477     const int     *vsite_pbc = nullptr;
 478
 479     for (int ftype = c_ftypeVsiteStart; ftype < c_ftypeVsiteEnd; ftype++)
 480     {
 481         if (ilist[ftype].nr == 0)
 482         {
 483             continue;
 484         }
 485
 486         {   // TODO remove me
 487             int            nra = interaction_function[ftype].nratoms;
 488             int            inc = 1 + nra;
 489             int            nr  = ilist[ftype].nr;
 490
 491             const t_iatom *ia = ilist[ftype].iatoms;
 492
 493             if (pbcMode == PbcMode::chargeGroup)
 494             {
 495                 vsite_pbc = vsite->vsite_pbc_loc[ftype - c_ftypeVsiteStart];
 496             }
 497
 498             for (int i = 0; i < nr; )
 499             {
 500                 int  tp     = ia[0];
 501                 /* The vsite and constructing atoms */
 502                 int  avsite = ia[1];
 503                 int  ai     = ia[2];
 504                 /* Constants for constructing vsites */
 505                 real a1     = ip[tp].vsite.a;
 506                 /* Check what kind of pbc we need to use */
 507                 int  pbc_atom;
 508                 rvec xpbc;
 509                 if (pbcMode == PbcMode::all)
 510                 {
 511                     /* No charge groups, vsite follows its own pbc */
 512                     pbc_atom = avsite;
 513                     copy_rvec(x[avsite], xpbc);
 514                 }
 515                 else if (pbcMode == PbcMode::chargeGroup)
 516                 {
 517                     pbc_atom = vsite_pbc[i/(1 + nra)];
 518                     if (pbc_atom > -2)
 519                     {
 520                         if (pbc_atom >= 0)
 521                         {
 522                             /* We need to copy the coordinates here,
 523                              * single for single atom cg's pbc_atom
 524                              * is the vsite itself.
 525                              */
 526                             copy_rvec(x[pbc_atom], xpbc);
 527                         }
 528                         pbc_null2 = pbc_null;
 529                     }
 530                     else
 531                     {
 532                         pbc_null2 = nullptr;
 533                     }
 534                 }
 535                 else
 536                 {
 537                     pbc_atom = -2;
 538                 }
 539                 /* Copy the old position */
 540                 rvec xv;
 541                 copy_rvec(x[avsite], xv);
 542
 543                 /* Construct the vsite depending on type */
 544                 int  aj, ak, al;
 545                 real b1, c1;
 546                 switch (ftype)
 547                 {
 548                     case F_VSITE2:
 549                         aj = ia[3];
 550                         constr_vsite2(x[ai], x[aj], x[avsite], a1, pbc_null2);
 551                         break;
 552                     case F_VSITE3:
 553                         aj = ia[3];
 554                         ak = ia[4];
 555                         b1 = ip[tp].vsite.b;
 556                         constr_vsite3(x[ai], x[aj], x[ak], x[avsite], a1, b1, pbc_null2);
 557                         break;
 558                     case F_VSITE3FD:
 559                         aj = ia[3];
 560                         ak = ia[4];
 561                         b1 = ip[tp].vsite.b;
 562                         constr_vsite3FD(x[ai], x[aj], x[ak], x[avsite], a1, b1, pbc_null2);
 563                         break;
 564                     case F_VSITE3FAD:
 565                         aj = ia[3];
 566                         ak = ia[4];
 567                         b1 = ip[tp].vsite.b;
 568                         constr_vsite3FAD(x[ai], x[aj], x[ak], x[avsite], a1, b1, pbc_null2);
 569                         break;
 570                     case F_VSITE3OUT:
 571                         aj = ia[3];
 572                         ak = ia[4];
 573                         b1 = ip[tp].vsite.b;
 574                         c1 = ip[tp].vsite.c;
 575                         constr_vsite3OUT(x[ai], x[aj], x[ak], x[avsite], a1, b1, c1, pbc_null2);
 576                         break;
 577                     case F_VSITE4FD:
 578                         aj = ia[3];
 579                         ak = ia[4];
 580                         al = ia[5];
 581                         b1 = ip[tp].vsite.b;
 582                         c1 = ip[tp].vsite.c;
 583                         constr_vsite4FD(x[ai], x[aj], x[ak], x[al], x[avsite], a1, b1, c1,
 584                                         pbc_null2);
 585                         break;
 586                     case F_VSITE4FDN:
 587                         aj = ia[3];
 588                         ak = ia[4];
 589                         al = ia[5];
 590                         b1 = ip[tp].vsite.b;
 591                         c1 = ip[tp].vsite.c;
 592                         constr_vsite4FDN(x[ai], x[aj], x[ak], x[al], x[avsite], a1, b1, c1,
 593                                          pbc_null2);
 594                         break;
 595                     case F_VSITEN:
 596                         inc = constr_vsiten(ia, ip, x, pbc_null2);
 597                         break;
 598                     default:
 599                         gmx_fatal(FARGS, "No such vsite type %d in %s, line %d",
 600                                   ftype, __FILE__, __LINE__);
 601                 }
 602
 603                 if (pbc_atom >= 0)
 604                 {
 605                     /* Match the pbc of this vsite to the rest of its charge group */
 606                     rvec dx;
 607                     int  ishift = pbc_dx_aiuc(pbc_null, x[avsite], xpbc, dx);
 608                     if (ishift != CENTRAL)
 609                     {
 610                         rvec_add(xpbc, dx, x[avsite]);
 611                     }
 612                 }
 613                 if (v != nullptr)
 614                 {
 615                     /* Calculate velocity of vsite... */
 616                     rvec vv;
 617                     rvec_sub(x[avsite], xv, vv);
 618                     svmul(inv_dt, vv, v[avsite]);
 619                 }
 620
 621                 /* Increment loop variables */
 622                 i  += inc;
 623                 ia += inc;
 624             }
 625         }
 626     }
 627 }
 628
 629 void construct_vsites(const gmx_vsite_t *vsite,
 630                       rvec x[],
 631                       real dt, rvec *v,
 632                       const t_iparams ip[], const t_ilist ilist[],
 633                       int ePBC, gmx_bool bMolPBC,
 634                       t_commrec *cr,
 635                       const matrix box)
 636 {
 637     const bool useDomdec = (vsite != nullptr && vsite->useDomdec);
 638     GMX_ASSERT(!useDomdec || (cr != nullptr && DOMAINDECOMP(cr)), "When vsites are set up with domain decomposition, we need a valid commrec");
 639     // TODO: Remove this assertion when we remove charge groups
 640     GMX_ASSERT(vsite != nullptr || ePBC == epbcNONE, "Without a vsite struct we can not do PBC (in case we have charge groups)");
 641
 642     t_pbc     pbc, *pbc_null;
 643
 644     /* We only need to do pbc when we have inter-cg vsites.
 645      * Note that with domain decomposition we do not need to apply PBC here
 646      * when we have at least 3 domains along each dimension. Currently we
 647      * do not optimize this case.
 648      */
 649     if (ePBC != epbcNONE && (useDomdec || bMolPBC) &&
 650         !(vsite != nullptr && vsite->n_intercg_vsite == 0))
 651     {
 652         /* This is wasting some CPU time as we now do this multiple times
 653          * per MD step.
 654          */
 655         ivec null_ivec;
 656         clear_ivec(null_ivec);
 657         pbc_null = set_pbc_dd(&pbc, ePBC,
 658                               useDomdec ? cr->dd->nc : null_ivec,
 659                               FALSE, box);
 660     }
 661     else
 662     {
 663         pbc_null = nullptr;
 664     }
 665
 666     if (useDomdec)
 667     {
 668         dd_move_x_vsites(cr->dd, box, x);
 669     }
 670
 671     // cppcheck-suppress nullPointerRedundantCheck
 672     if (vsite == nullptr || vsite->nthreads == 1)
 673     {
 674         construct_vsites_thread(vsite,
 675                                 x, dt, v,
 676                                 ip, ilist,
 677                                 pbc_null);
 678     }
 679     else
 680     {
 681 #pragma omp parallel num_threads(vsite->nthreads)
 682         {
 683             try
 684             {
 685                 const int          th    = gmx_omp_get_thread_num();
 686                 const VsiteThread &tData = *vsite->tData[th];
 687                 GMX_ASSERT(tData.rangeStart >= 0, "The thread data should be initialized before calling construct_vsites");
 688
 689                 construct_vsites_thread(vsite,
 690                                         x, dt, v,
 691                                         ip, tData.ilist,
 692                                         pbc_null);
 693                 if (tData.useInterdependentTask)
 694                 {
 695                     /* Here we don't need a barrier (unlike the spreading),
 696                      * since both tasks only construct vsites from particles,
 697                      * or local vsites, not from non-local vsites.
 698                      */
 699                     construct_vsites_thread(vsite,
 700                                             x, dt, v,
 701                                             ip, tData.idTask.ilist,
 702                                             pbc_null);
 703                 }
 704             }
 705             GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
 706         }
 707         /* Now we can construct the vsites that might depend on other vsites */
 708         construct_vsites_thread(vsite,
 709                                 x, dt, v,
 710                                 ip, vsite->tData[vsite->nthreads]->ilist,
 711                                 pbc_null);
 712     }
 713 }
 714
 715 static void spread_vsite2(const t_iatom ia[], real a,
 716                           const rvec x[],
 717                           rvec f[], rvec fshift[],
 718                           const t_pbc *pbc, const t_graph *g)
 719 {
 720     rvec    fi, fj, dx;
 721     t_iatom av, ai, aj;
 722     ivec    di;
 723     int     siv, sij;
 724
 725     av = ia[1];
 726     ai = ia[2];
 727     aj = ia[3];
 728
 729     svmul(1 - a, f[av], fi);
 730     svmul(    a, f[av], fj);
 731     /* 7 flop */
 732
 733     rvec_inc(f[ai], fi);
 734     rvec_inc(f[aj], fj);
 735     /* 6 Flops */
 736
 737     if (g)
 738     {
 739         ivec_sub(SHIFT_IVEC(g, ai), SHIFT_IVEC(g, av), di);
 740         siv = IVEC2IS(di);
 741         ivec_sub(SHIFT_IVEC(g, ai), SHIFT_IVEC(g, aj), di);
 742         sij = IVEC2IS(di);
 743     }
 744     else if (pbc)
 745     {
 746         siv = pbc_dx_aiuc(pbc, x[ai], x[av], dx);
 747         sij = pbc_dx_aiuc(pbc, x[ai], x[aj], dx);
 748     }
 749     else
 750     {
 751         siv = CENTRAL;
 752         sij = CENTRAL;
 753     }
 754
 755     if (fshift && (siv != CENTRAL || sij != CENTRAL))
 756     {
 757         rvec_inc(fshift[siv], f[av]);
 758         rvec_dec(fshift[CENTRAL], fi);
 759         rvec_dec(fshift[sij], fj);
 760     }
 761
 762     /* TOTAL: 13 flops */
 763 }
 764
 765 void constructVsitesGlobal(const gmx_mtop_t         &mtop,
 766                            gmx::ArrayRef<gmx::RVec>  x)
 767 {
 768     GMX_ASSERT(x.size() >= static_cast<size_t>(mtop.natoms), "x should contain the whole system");
 769
 770     for (int mb = 0; mb < mtop.nmolblock; mb++)
 771     {
 772         const gmx_molblock_t &molb = mtop.molblock[mb];
 773         const gmx_moltype_t  &molt = mtop.moltype[molb.type];
 774         if (vsiteIlistNrCount(molt.ilist) > 0)
 775         {
 776             int atomOffset = molb.globalAtomStart;
 777             for (int mol = 0; mol < molb.nmol; mol++)
 778             {
 779                 construct_vsites(nullptr, as_rvec_array(x.data()) + atomOffset,
 780                                  0.0, nullptr,
 781                                  mtop.ffparams.iparams, molt.ilist,
 782                                  epbcNONE, TRUE, nullptr, nullptr);
 783                 atomOffset += molt.atoms.nr;
 784             }
 785         }
 786     }
 787 }
 788
 789 static void spread_vsite3(const t_iatom ia[], real a, real b,
 790                           const rvec x[],
 791                           rvec f[], rvec fshift[],
 792                           const t_pbc *pbc, const t_graph *g)
 793 {
 794     rvec    fi, fj, fk, dx;
 795     int     av, ai, aj, ak;
 796     ivec    di;
 797     int     siv, sij, sik;
 798
 799     av = ia[1];
 800     ai = ia[2];
 801     aj = ia[3];
 802     ak = ia[4];
 803
 804     svmul(1 - a - b, f[av], fi);
 805     svmul(        a, f[av], fj);
 806     svmul(        b, f[av], fk);
 807     /* 11 flops */
 808
 809     rvec_inc(f[ai], fi);
 810     rvec_inc(f[aj], fj);
 811     rvec_inc(f[ak], fk);
 812     /* 9 Flops */
 813
 814     if (g)
 815     {
 816         ivec_sub(SHIFT_IVEC(g, ai), SHIFT_IVEC(g, ia[1]), di);
 817         siv = IVEC2IS(di);
 818         ivec_sub(SHIFT_IVEC(g, ai), SHIFT_IVEC(g, aj), di);
 819         sij = IVEC2IS(di);
 820         ivec_sub(SHIFT_IVEC(g, ai), SHIFT_IVEC(g, ak), di);
 821         sik = IVEC2IS(di);
 822     }
 823     else if (pbc)
 824     {
 825         siv = pbc_dx_aiuc(pbc, x[ai], x[av], dx);
 826         sij = pbc_dx_aiuc(pbc, x[ai], x[aj], dx);
 827         sik = pbc_dx_aiuc(pbc, x[ai], x[ak], dx);
 828     }
 829     else
 830     {
 831         siv = CENTRAL;
 832         sij = CENTRAL;
 833         sik = CENTRAL;
 834     }
 835
 836     if (fshift && (siv != CENTRAL || sij != CENTRAL || sik != CENTRAL))
 837     {
 838         rvec_inc(fshift[siv], f[av]);
 839         rvec_dec(fshift[CENTRAL], fi);
 840         rvec_dec(fshift[sij], fj);
 841         rvec_dec(fshift[sik], fk);
 842     }
 843
 844     /* TOTAL: 20 flops */
 845 }
 846
 847 static void spread_vsite3FD(const t_iatom ia[], real a, real b,
 848                             const rvec x[],
 849                             rvec f[], rvec fshift[],
 850                             gmx_bool VirCorr, matrix dxdf,
 851                             const t_pbc *pbc, const t_graph *g)
 852 {
 853     real    c, invl, fproj, a1;
 854     rvec    xvi, xij, xjk, xix, fv, temp;
 855     t_iatom av, ai, aj, ak;
 856     int     svi, sji, skj;
 857     ivec    di;
 858
 859     av = ia[1];
 860     ai = ia[2];
 861     aj = ia[3];
 862     ak = ia[4];
 863     copy_rvec(f[av], fv);
 864
 865     sji = pbc_rvec_sub(pbc, x[aj], x[ai], xij);
 866     skj = pbc_rvec_sub(pbc, x[ak], x[aj], xjk);
 867     /* 6 flops */
 868
 869     /* xix goes from i to point x on the line jk */
 870     xix[XX] = xij[XX]+a*xjk[XX];
 871     xix[YY] = xij[YY]+a*xjk[YY];
 872     xix[ZZ] = xij[ZZ]+a*xjk[ZZ];
 873     /* 6 flops */
 874
 875     invl = gmx::invsqrt(iprod(xix, xix));
 876     c    = b*invl;
 877     /* 4 + ?10? flops */
 878
 879     fproj = iprod(xix, fv)*invl*invl; /* = (xix . f)/(xix . xix) */
 880
 881     temp[XX] = c*(fv[XX]-fproj*xix[XX]);
 882     temp[YY] = c*(fv[YY]-fproj*xix[YY]);
 883     temp[ZZ] = c*(fv[ZZ]-fproj*xix[ZZ]);
 884     /* 16 */
 885
 886     /* c is already calculated in constr_vsite3FD
 887        storing c somewhere will save 26 flops!     */
 888
 889     a1         = 1 - a;
 890     f[ai][XX] += fv[XX] - temp[XX];
 891     f[ai][YY] += fv[YY] - temp[YY];
 892     f[ai][ZZ] += fv[ZZ] - temp[ZZ];
 893     f[aj][XX] += a1*temp[XX];
 894     f[aj][YY] += a1*temp[YY];
 895     f[aj][ZZ] += a1*temp[ZZ];
 896     f[ak][XX] += a*temp[XX];
 897     f[ak][YY] += a*temp[YY];
 898     f[ak][ZZ] += a*temp[ZZ];
 899     /* 19 Flops */
 900
 901     if (g)
 902     {
 903         ivec_sub(SHIFT_IVEC(g, ia[1]), SHIFT_IVEC(g, ai), di);
 904         svi = IVEC2IS(di);
 905         ivec_sub(SHIFT_IVEC(g, aj), SHIFT_IVEC(g, ai), di);
 906         sji = IVEC2IS(di);
 907         ivec_sub(SHIFT_IVEC(g, ak), SHIFT_IVEC(g, aj), di);
 908         skj = IVEC2IS(di);
 909     }
 910     else if (pbc)
 911     {
 912         svi = pbc_rvec_sub(pbc, x[av], x[ai], xvi);
 913     }
 914     else
 915     {
 916         svi = CENTRAL;
 917     }
 918
 919     if (fshift && (svi != CENTRAL || sji != CENTRAL || skj != CENTRAL))
 920     {
 921         rvec_dec(fshift[svi], fv);
 922         fshift[CENTRAL][XX] += fv[XX] - (1 + a)*temp[XX];
 923         fshift[CENTRAL][YY] += fv[YY] - (1 + a)*temp[YY];
 924         fshift[CENTRAL][ZZ] += fv[ZZ] - (1 + a)*temp[ZZ];
 925         fshift[    sji][XX] += temp[XX];
 926         fshift[    sji][YY] += temp[YY];
 927         fshift[    sji][ZZ] += temp[ZZ];
 928         fshift[    skj][XX] += a*temp[XX];
 929         fshift[    skj][YY] += a*temp[YY];
 930         fshift[    skj][ZZ] += a*temp[ZZ];
 931     }
 932
 933     if (VirCorr)
 934     {
 935         /* When VirCorr=TRUE, the virial for the current forces is not
 936          * calculated from the redistributed forces. This means that
 937          * the effect of non-linear virtual site constructions on the virial
 938          * needs to be added separately. This contribution can be calculated
 939          * in many ways, but the simplest and cheapest way is to use
 940          * the first constructing atom ai as a reference position in space:
 941          * subtract (xv-xi)*fv and add (xj-xi)*fj + (xk-xi)*fk.
 942          */
 943         rvec xiv;
 944
 945         pbc_rvec_sub(pbc, x[av], x[ai], xiv);
 946
 947         for (int i = 0; i < DIM; i++)
 948         {
 949             for (int j = 0; j < DIM; j++)
 950             {
 951                 /* As xix is a linear combination of j and k, use that here */
 952                 dxdf[i][j] += -xiv[i]*fv[j] + xix[i]*temp[j];
 953             }
 954         }
 955     }
 956
 957     /* TOTAL: 61 flops */
 958 }
 959
 960 static void spread_vsite3FAD(const t_iatom ia[], real a, real b,
 961                              const rvec x[],
 962                              rvec f[], rvec fshift[],
 963                              gmx_bool VirCorr, matrix dxdf,
 964                              const t_pbc *pbc, const t_graph *g)
 965 {
 966     rvec    xvi, xij, xjk, xperp, Fpij, Fppp, fv, f1, f2, f3;
 967     real    a1, b1, c1, c2, invdij, invdij2, invdp, fproj;
 968     t_iatom av, ai, aj, ak;
 969     int     svi, sji, skj, d;
 970     ivec    di;
 971
 972     av = ia[1];
 973     ai = ia[2];
 974     aj = ia[3];
 975     ak = ia[4];
 976     copy_rvec(f[ia[1]], fv);
 977
 978     sji = pbc_rvec_sub(pbc, x[aj], x[ai], xij);
 979     skj = pbc_rvec_sub(pbc, x[ak], x[aj], xjk);
 980     /* 6 flops */
 981
 982     invdij    = gmx::invsqrt(iprod(xij, xij));
 983     invdij2   = invdij * invdij;
 984     c1        = iprod(xij, xjk) * invdij2;
 985     xperp[XX] = xjk[XX] - c1*xij[XX];
 986     xperp[YY] = xjk[YY] - c1*xij[YY];
 987     xperp[ZZ] = xjk[ZZ] - c1*xij[ZZ];
 988     /* xperp in plane ijk, perp. to ij */
 989     invdp = gmx::invsqrt(iprod(xperp, xperp));
 990     a1    = a*invdij;
 991     b1    = b*invdp;
 992     /* 45 flops */
 993
 994     /* a1, b1 and c1 are already calculated in constr_vsite3FAD
 995        storing them somewhere will save 45 flops!     */
 996
 997     fproj = iprod(xij, fv)*invdij2;
 998     svmul(fproj,                      xij,  Fpij);    /* proj. f on xij */
 999     svmul(iprod(xperp, fv)*invdp*invdp, xperp, Fppp); /* proj. f on xperp */
1000     svmul(b1*fproj,                   xperp, f3);
1001     /* 23 flops */
1002
1003     rvec_sub(fv, Fpij, f1); /* f1 = f - Fpij */
1004     rvec_sub(f1, Fppp, f2); /* f2 = f - Fpij - Fppp */
1005     for (d = 0; (d < DIM); d++)
1006     {
1007         f1[d] *= a1;
1008         f2[d] *= b1;
1009     }
1010     /* 12 flops */
1011
1012     c2         = 1 + c1;
1013     f[ai][XX] += fv[XX] - f1[XX] + c1*f2[XX] + f3[XX];
1014     f[ai][YY] += fv[YY] - f1[YY] + c1*f2[YY] + f3[YY];
1015     f[ai][ZZ] += fv[ZZ] - f1[ZZ] + c1*f2[ZZ] + f3[ZZ];
1016     f[aj][XX] +=          f1[XX] - c2*f2[XX] - f3[XX];
1017     f[aj][YY] +=          f1[YY] - c2*f2[YY] - f3[YY];
1018     f[aj][ZZ] +=          f1[ZZ] - c2*f2[ZZ] - f3[ZZ];
1019     f[ak][XX] +=                      f2[XX];
1020     f[ak][YY] +=                      f2[YY];
1021     f[ak][ZZ] +=                      f2[ZZ];
1022     /* 30 Flops */
1023
1024     if (g)
1025     {
1026         ivec_sub(SHIFT_IVEC(g, ia[1]), SHIFT_IVEC(g, ai), di);
1027         svi = IVEC2IS(di);
1028         ivec_sub(SHIFT_IVEC(g, aj), SHIFT_IVEC(g, ai), di);
1029         sji = IVEC2IS(di);
1030         ivec_sub(SHIFT_IVEC(g, ak), SHIFT_IVEC(g, aj), di);
1031         skj = IVEC2IS(di);
1032     }
1033     else if (pbc)
1034     {
1035         svi = pbc_rvec_sub(pbc, x[av], x[ai], xvi);
1036     }
1037     else
1038     {
1039         svi = CENTRAL;
1040     }
1041
1042     if (fshift && (svi != CENTRAL || sji != CENTRAL || skj != CENTRAL))
1043     {
1044         rvec_dec(fshift[svi], fv);
1045         fshift[CENTRAL][XX] += fv[XX] - f1[XX] - (1-c1)*f2[XX] + f3[XX];
1046         fshift[CENTRAL][YY] += fv[YY] - f1[YY] - (1-c1)*f2[YY] + f3[YY];
1047         fshift[CENTRAL][ZZ] += fv[ZZ] - f1[ZZ] - (1-c1)*f2[ZZ] + f3[ZZ];
1048         fshift[    sji][XX] +=          f1[XX] -    c1 *f2[XX] - f3[XX];
1049         fshift[    sji][YY] +=          f1[YY] -    c1 *f2[YY] - f3[YY];
1050         fshift[    sji][ZZ] +=          f1[ZZ] -    c1 *f2[ZZ] - f3[ZZ];
1051         fshift[    skj][XX] +=                          f2[XX];
1052         fshift[    skj][YY] +=                          f2[YY];
1053         fshift[    skj][ZZ] +=                          f2[ZZ];
1054     }
1055
1056     if (VirCorr)
1057     {
1058         rvec xiv;
1059         int  i, j;
1060
1061         pbc_rvec_sub(pbc, x[av], x[ai], xiv);
1062
1063         for (i = 0; i < DIM; i++)
1064         {
1065             for (j = 0; j < DIM; j++)
1066             {
1067                 /* Note that xik=xij+xjk, so we have to add xij*f2 */
1068                 dxdf[i][j] +=
1069                     -xiv[i]*fv[j]
1070                     + xij[i]*(f1[j] + (1 - c2)*f2[j] - f3[j])
1071                     + xjk[i]*f2[j];
1072             }
1073         }
1074     }
1075
1076     /* TOTAL: 113 flops */
1077 }
1078
1079 static void spread_vsite3OUT(const t_iatom ia[], real a, real b, real c,
1080                              const rvec x[],
1081                              rvec f[], rvec fshift[],
1082                              gmx_bool VirCorr, matrix dxdf,
1083                              const t_pbc *pbc, const t_graph *g)
1084 {
1085     rvec    xvi, xij, xik, fv, fj, fk;
1086     real    cfx, cfy, cfz;
1087     int     av, ai, aj, ak;
1088     ivec    di;
1089     int     svi, sji, ski;
1090
1091     av = ia[1];
1092     ai = ia[2];
1093     aj = ia[3];
1094     ak = ia[4];
1095
1096     sji = pbc_rvec_sub(pbc, x[aj], x[ai], xij);
1097     ski = pbc_rvec_sub(pbc, x[ak], x[ai], xik);
1098     /* 6 Flops */
1099
1100     copy_rvec(f[av], fv);
1101
1102     cfx = c*fv[XX];
1103     cfy = c*fv[YY];
1104     cfz = c*fv[ZZ];
1105     /* 3 Flops */
1106
1107     fj[XX] = a*fv[XX]     -  xik[ZZ]*cfy +  xik[YY]*cfz;
1108     fj[YY] =  xik[ZZ]*cfx + a*fv[YY]     -  xik[XX]*cfz;
1109     fj[ZZ] = -xik[YY]*cfx +  xik[XX]*cfy + a*fv[ZZ];
1110
1111     fk[XX] = b*fv[XX]     +  xij[ZZ]*cfy -  xij[YY]*cfz;
1112     fk[YY] = -xij[ZZ]*cfx + b*fv[YY]     +  xij[XX]*cfz;
1113     fk[ZZ] =  xij[YY]*cfx -  xij[XX]*cfy + b*fv[ZZ];
1114     /* 30 Flops */
1115
1116     f[ai][XX] += fv[XX] - fj[XX] - fk[XX];
1117     f[ai][YY] += fv[YY] - fj[YY] - fk[YY];
1118     f[ai][ZZ] += fv[ZZ] - fj[ZZ] - fk[ZZ];
1119     rvec_inc(f[aj], fj);
1120     rvec_inc(f[ak], fk);
1121     /* 15 Flops */
1122
1123     if (g)
1124     {
1125         ivec_sub(SHIFT_IVEC(g, ia[1]), SHIFT_IVEC(g, ai), di);
1126         svi = IVEC2IS(di);
1127         ivec_sub(SHIFT_IVEC(g, aj), SHIFT_IVEC(g, ai), di);
1128         sji = IVEC2IS(di);
1129         ivec_sub(SHIFT_IVEC(g, ak), SHIFT_IVEC(g, ai), di);
1130         ski = IVEC2IS(di);
1131     }
1132     else if (pbc)
1133     {
1134         svi = pbc_rvec_sub(pbc, x[av], x[ai], xvi);
1135     }
1136     else
1137     {
1138         svi = CENTRAL;
1139     }
1140
1141     if (fshift && (svi != CENTRAL || sji != CENTRAL || ski != CENTRAL))
1142     {
1143         rvec_dec(fshift[svi], fv);
1144         fshift[CENTRAL][XX] += fv[XX] - fj[XX] - fk[XX];
1145         fshift[CENTRAL][YY] += fv[YY] - fj[YY] - fk[YY];
1146         fshift[CENTRAL][ZZ] += fv[ZZ] - fj[ZZ] - fk[ZZ];
1147         rvec_inc(fshift[sji], fj);
1148         rvec_inc(fshift[ski], fk);
1149     }
1150
1151     if (VirCorr)
1152     {
1153         rvec xiv;
1154
1155         pbc_rvec_sub(pbc, x[av], x[ai], xiv);
1156
1157         for (int i = 0; i < DIM; i++)
1158         {
1159             for (int j = 0; j < DIM; j++)
1160             {
1161                 dxdf[i][j] += -xiv[i]*fv[j] + xij[i]*fj[j] + xik[i]*fk[j];
1162             }
1163         }
1164     }
1165
1166     /* TOTAL: 54 flops */
1167 }
1168
1169 static void spread_vsite4FD(const t_iatom ia[], real a, real b, real c,
1170                             const rvec x[],
1171                             rvec f[], rvec fshift[],
1172                             gmx_bool VirCorr, matrix dxdf,
1173                             const t_pbc *pbc, const t_graph *g)
1174 {
1175     real    d, invl, fproj, a1;
1176     rvec    xvi, xij, xjk, xjl, xix, fv, temp;
1177     int     av, ai, aj, ak, al;
1178     ivec    di;
1179     int     svi, sji, skj, slj, m;
1180
1181     av = ia[1];
1182     ai = ia[2];
1183     aj = ia[3];
1184     ak = ia[4];
1185     al = ia[5];
1186
1187     sji = pbc_rvec_sub(pbc, x[aj], x[ai], xij);
1188     skj = pbc_rvec_sub(pbc, x[ak], x[aj], xjk);
1189     slj = pbc_rvec_sub(pbc, x[al], x[aj], xjl);
1190     /* 9 flops */
1191
1192     /* xix goes from i to point x on the plane jkl */
1193     for (m = 0; m < DIM; m++)
1194     {
1195         xix[m] = xij[m] + a*xjk[m] + b*xjl[m];
1196     }
1197     /* 12 flops */
1198
1199     invl = gmx::invsqrt(iprod(xix, xix));
1200     d    = c*invl;
1201     /* 4 + ?10? flops */
1202
1203     copy_rvec(f[av], fv);
1204
1205     fproj = iprod(xix, fv)*invl*invl; /* = (xix . f)/(xix . xix) */
1206
1207     for (m = 0; m < DIM; m++)
1208     {
1209         temp[m] = d*(fv[m] - fproj*xix[m]);
1210     }
1211     /* 16 */
1212
1213     /* c is already calculated in constr_vsite3FD
1214        storing c somewhere will save 35 flops!     */
1215
1216     a1 = 1 - a - b;
1217     for (m = 0; m < DIM; m++)
1218     {
1219         f[ai][m] += fv[m] - temp[m];
1220         f[aj][m] += a1*temp[m];
1221         f[ak][m] += a*temp[m];
1222         f[al][m] += b*temp[m];
1223     }
1224     /* 26 Flops */
1225
1226     if (g)
1227     {
1228         ivec_sub(SHIFT_IVEC(g, ia[1]), SHIFT_IVEC(g, ai), di);
1229         svi = IVEC2IS(di);
1230         ivec_sub(SHIFT_IVEC(g, aj), SHIFT_IVEC(g, ai), di);
1231         sji = IVEC2IS(di);
1232         ivec_sub(SHIFT_IVEC(g, ak), SHIFT_IVEC(g, aj), di);
1233         skj = IVEC2IS(di);
1234         ivec_sub(SHIFT_IVEC(g, al), SHIFT_IVEC(g, aj), di);
1235         slj = IVEC2IS(di);
1236     }
1237     else if (pbc)
1238     {
1239         svi = pbc_rvec_sub(pbc, x[av], x[ai], xvi);
1240     }
1241     else
1242     {
1243         svi = CENTRAL;
1244     }
1245
1246     if (fshift &&
1247         (svi != CENTRAL || sji != CENTRAL || skj != CENTRAL || slj != CENTRAL))
1248     {
1249         rvec_dec(fshift[svi], fv);
1250         for (m = 0; m < DIM; m++)
1251         {
1252             fshift[CENTRAL][m] += fv[m] - (1 + a + b)*temp[m];
1253             fshift[    sji][m] += temp[m];
1254             fshift[    skj][m] += a*temp[m];
1255             fshift[    slj][m] += b*temp[m];
1256         }
1257     }
1258
1259     if (VirCorr)
1260     {
1261         rvec xiv;
1262         int  i, j;
1263
1264         pbc_rvec_sub(pbc, x[av], x[ai], xiv);
1265
1266         for (i = 0; i < DIM; i++)
1267         {
1268             for (j = 0; j < DIM; j++)
1269             {
1270                 dxdf[i][j] += -xiv[i]*fv[j] + xix[i]*temp[j];
1271             }
1272         }
1273     }
1274
1275     /* TOTAL: 77 flops */
1276 }
1277
1278
1279 static void spread_vsite4FDN(const t_iatom ia[], real a, real b, real c,
1280                              const rvec x[],
1281                              rvec f[], rvec fshift[],
1282                              gmx_bool VirCorr, matrix dxdf,
1283                              const t_pbc *pbc, const t_graph *g)
1284 {
1285     rvec xvi, xij, xik, xil, ra, rb, rja, rjb, rab, rm, rt;
1286     rvec fv, fj, fk, fl;
1287     real invrm, denom;
1288     real cfx, cfy, cfz;
1289     ivec di;
1290     int  av, ai, aj, ak, al;
1291     int  svi, sij, sik, sil;
1292
1293     /* DEBUG: check atom indices */
1294     av = ia[1];
1295     ai = ia[2];
1296     aj = ia[3];
1297     ak = ia[4];
1298     al = ia[5];
1299
1300     copy_rvec(f[av], fv);
1301
1302     sij = pbc_rvec_sub(pbc, x[aj], x[ai], xij);
1303     sik = pbc_rvec_sub(pbc, x[ak], x[ai], xik);
1304     sil = pbc_rvec_sub(pbc, x[al], x[ai], xil);
1305     /* 9 flops */
1306
1307     ra[XX] = a*xik[XX];
1308     ra[YY] = a*xik[YY];
1309     ra[ZZ] = a*xik[ZZ];
1310
1311     rb[XX] = b*xil[XX];
1312     rb[YY] = b*xil[YY];
1313     rb[ZZ] = b*xil[ZZ];
1314
1315     /* 6 flops */
1316
1317     rvec_sub(ra, xij, rja);
1318     rvec_sub(rb, xij, rjb);
1319     rvec_sub(rb, ra, rab);
1320     /* 9 flops */
1321
1322     cprod(rja, rjb, rm);
1323     /* 9 flops */
1324
1325     invrm = gmx::invsqrt(norm2(rm));
1326     denom = invrm*invrm;
1327     /* 5+5+2 flops */
1328
1329     cfx = c*invrm*fv[XX];
1330     cfy = c*invrm*fv[YY];
1331     cfz = c*invrm*fv[ZZ];
1332     /* 6 Flops */
1333
1334     cprod(rm, rab, rt);
1335     /* 9 flops */
1336
1337     rt[XX] *= denom;
1338     rt[YY] *= denom;
1339     rt[ZZ] *= denom;
1340     /* 3flops */
1341
1342     fj[XX] = (        -rm[XX]*rt[XX]) * cfx + ( rab[ZZ]-rm[YY]*rt[XX]) * cfy + (-rab[YY]-rm[ZZ]*rt[XX]) * cfz;
1343     fj[YY] = (-rab[ZZ]-rm[XX]*rt[YY]) * cfx + (        -rm[YY]*rt[YY]) * cfy + ( rab[XX]-rm[ZZ]*rt[YY]) * cfz;
1344     fj[ZZ] = ( rab[YY]-rm[XX]*rt[ZZ]) * cfx + (-rab[XX]-rm[YY]*rt[ZZ]) * cfy + (        -rm[ZZ]*rt[ZZ]) * cfz;
1345     /* 30 flops */
1346
1347     cprod(rjb, rm, rt);
1348     /* 9 flops */
1349
1350     rt[XX] *= denom*a;
1351     rt[YY] *= denom*a;
1352     rt[ZZ] *= denom*a;
1353     /* 3flops */
1354
1355     fk[XX] = (          -rm[XX]*rt[XX]) * cfx + (-a*rjb[ZZ]-rm[YY]*rt[XX]) * cfy + ( a*rjb[YY]-rm[ZZ]*rt[XX]) * cfz;
1356     fk[YY] = ( a*rjb[ZZ]-rm[XX]*rt[YY]) * cfx + (          -rm[YY]*rt[YY]) * cfy + (-a*rjb[XX]-rm[ZZ]*rt[YY]) * cfz;
1357     fk[ZZ] = (-a*rjb[YY]-rm[XX]*rt[ZZ]) * cfx + ( a*rjb[XX]-rm[YY]*rt[ZZ]) * cfy + (          -rm[ZZ]*rt[ZZ]) * cfz;
1358     /* 36 flops */
1359
1360     cprod(rm, rja, rt);
1361     /* 9 flops */
1362
1363     rt[XX] *= denom*b;
1364     rt[YY] *= denom*b;
1365     rt[ZZ] *= denom*b;
1366     /* 3flops */
1367
1368     fl[XX] = (          -rm[XX]*rt[XX]) * cfx + ( b*rja[ZZ]-rm[YY]*rt[XX]) * cfy + (-b*rja[YY]-rm[ZZ]*rt[XX]) * cfz;
1369     fl[YY] = (-b*rja[ZZ]-rm[XX]*rt[YY]) * cfx + (          -rm[YY]*rt[YY]) * cfy + ( b*rja[XX]-rm[ZZ]*rt[YY]) * cfz;
1370     fl[ZZ] = ( b*rja[YY]-rm[XX]*rt[ZZ]) * cfx + (-b*rja[XX]-rm[YY]*rt[ZZ]) * cfy + (          -rm[ZZ]*rt[ZZ]) * cfz;
1371     /* 36 flops */
1372
1373     f[ai][XX] += fv[XX] - fj[XX] - fk[XX] - fl[XX];
1374     f[ai][YY] += fv[YY] - fj[YY] - fk[YY] - fl[YY];
1375     f[ai][ZZ] += fv[ZZ] - fj[ZZ] - fk[ZZ] - fl[ZZ];
1376     rvec_inc(f[aj], fj);
1377     rvec_inc(f[ak], fk);
1378     rvec_inc(f[al], fl);
1379     /* 21 flops */
1380
1381     if (g)
1382     {
1383         ivec_sub(SHIFT_IVEC(g, av), SHIFT_IVEC(g, ai), di);
1384         svi = IVEC2IS(di);
1385         ivec_sub(SHIFT_IVEC(g, aj), SHIFT_IVEC(g, ai), di);
1386         sij = IVEC2IS(di);
1387         ivec_sub(SHIFT_IVEC(g, ak), SHIFT_IVEC(g, ai), di);
1388         sik = IVEC2IS(di);
1389         ivec_sub(SHIFT_IVEC(g, al), SHIFT_IVEC(g, ai), di);
1390         sil = IVEC2IS(di);
1391     }
1392     else if (pbc)
1393     {
1394         svi = pbc_rvec_sub(pbc, x[av], x[ai], xvi);
1395     }
1396     else
1397     {
1398         svi = CENTRAL;
1399     }
1400
1401     if (fshift && (svi != CENTRAL || sij != CENTRAL || sik != CENTRAL || sil != CENTRAL))
1402     {
1403         rvec_dec(fshift[svi], fv);
1404         fshift[CENTRAL][XX] += fv[XX] - fj[XX] - fk[XX] - fl[XX];
1405         fshift[CENTRAL][YY] += fv[YY] - fj[YY] - fk[YY] - fl[YY];
1406         fshift[CENTRAL][ZZ] += fv[ZZ] - fj[ZZ] - fk[ZZ] - fl[ZZ];
1407         rvec_inc(fshift[sij], fj);
1408         rvec_inc(fshift[sik], fk);
1409         rvec_inc(fshift[sil], fl);
1410     }
1411
1412     if (VirCorr)
1413     {
1414         rvec xiv;
1415         int  i, j;
1416
1417         pbc_rvec_sub(pbc, x[av], x[ai], xiv);
1418
1419         for (i = 0; i < DIM; i++)
1420         {
1421             for (j = 0; j < DIM; j++)
1422             {
1423                 dxdf[i][j] += -xiv[i]*fv[j] + xij[i]*fj[j] + xik[i]*fk[j] + xil[i]*fl[j];
1424             }
1425         }
1426     }
1427
1428     /* Total: 207 flops (Yuck!) */
1429 }
1430
1431
1432 static int spread_vsiten(const t_iatom ia[], const t_iparams ip[],
1433                          const rvec x[],
1434                          rvec f[], rvec fshift[],
1435                          const t_pbc *pbc, const t_graph *g)
1436 {
1437     rvec xv, dx, fi;
1438     int  n3, av, i, ai;
1439     real a;
1440     ivec di;
1441     int  siv;
1442
1443     n3 = 3*ip[ia[0]].vsiten.n;
1444     av = ia[1];
1445     copy_rvec(x[av], xv);
1446
1447     for (i = 0; i < n3; i += 3)
1448     {
1449         ai = ia[i+2];
1450         if (g)
1451         {
1452             ivec_sub(SHIFT_IVEC(g, ai), SHIFT_IVEC(g, av), di);
1453             siv = IVEC2IS(di);
1454         }
1455         else if (pbc)
1456         {
1457             siv = pbc_dx_aiuc(pbc, x[ai], xv, dx);
1458         }
1459         else
1460         {
1461             siv = CENTRAL;
1462         }
1463         a = ip[ia[i]].vsiten.a;
1464         svmul(a, f[av], fi);
1465         rvec_inc(f[ai], fi);
1466         if (fshift && siv != CENTRAL)
1467         {
1468             rvec_inc(fshift[siv], fi);
1469             rvec_dec(fshift[CENTRAL], fi);
1470         }
1471         /* 6 Flops */
1472     }
1473
1474     return n3;
1475 }
1476
1477
1478 static int vsite_count(const t_ilist *ilist, int ftype)
1479 {
1480     if (ftype == F_VSITEN)
1481     {
1482         return ilist[ftype].nr/3;
1483     }
1484     else
1485     {
1486         return ilist[ftype].nr/(1 + interaction_function[ftype].nratoms);
1487     }
1488 }
1489
1490 static void spread_vsite_f_thread(const gmx_vsite_t *vsite,
1491                                   const rvec x[],
1492                                   rvec f[], rvec *fshift,
1493                                   gmx_bool VirCorr, matrix dxdf,
1494                                   t_iparams ip[], const t_ilist ilist[],
1495                                   const t_graph *g, const t_pbc *pbc_null)
1496 {
1497     const PbcMode  pbcMode   = getPbcMode(pbc_null, vsite);
1498     /* We need another pbc pointer, as with charge groups we switch per vsite */
1499     const t_pbc   *pbc_null2 = pbc_null;
1500     const int     *vsite_pbc = nullptr;
1501
1502     /* this loop goes backwards to be able to build *
1503      * higher type vsites from lower types         */
1504     for (int ftype = c_ftypeVsiteEnd - 1; ftype >= c_ftypeVsiteStart; ftype--)
1505     {
1506         if (ilist[ftype].nr == 0)
1507         {
1508             continue;
1509         }
1510
1511         {   // TODO remove me
1512             int            nra = interaction_function[ftype].nratoms;
1513             int            inc = 1 + nra;
1514             int            nr  = ilist[ftype].nr;
1515
1516             const t_iatom *ia = ilist[ftype].iatoms;
1517
1518             if (pbcMode == PbcMode::all)
1519             {
1520                 pbc_null2 = pbc_null;
1521             }
1522             else if (pbcMode == PbcMode::chargeGroup)
1523             {
1524                 vsite_pbc = vsite->vsite_pbc_loc[ftype - c_ftypeVsiteStart];
1525             }
1526
1527             for (int i = 0; i < nr; )
1528             {
1529                 if (vsite_pbc != nullptr)
1530                 {
1531                     if (vsite_pbc[i/(1 + nra)] > -2)
1532                     {
1533                         pbc_null2 = pbc_null;
1534                     }
1535                     else
1536                     {
1537                         pbc_null2 = nullptr;
1538                     }
1539                 }
1540
1541                 int tp = ia[0];
1542
1543                 /* Constants for constructing */
1544                 real a1, b1, c1;
1545                 a1 = ip[tp].vsite.a;
1546                 /* Construct the vsite depending on type */
1547                 switch (ftype)
1548                 {
1549                     case F_VSITE2:
1550                         spread_vsite2(ia, a1, x, f, fshift, pbc_null2, g);
1551                         break;
1552                     case F_VSITE3:
1553                         b1 = ip[tp].vsite.b;
1554                         spread_vsite3(ia, a1, b1, x, f, fshift, pbc_null2, g);
1555                         break;
1556                     case F_VSITE3FD:
1557                         b1 = ip[tp].vsite.b;
1558                         spread_vsite3FD(ia, a1, b1, x, f, fshift, VirCorr, dxdf, pbc_null2, g);
1559                         break;
1560                     case F_VSITE3FAD:
1561                         b1 = ip[tp].vsite.b;
1562                         spread_vsite3FAD(ia, a1, b1, x, f, fshift, VirCorr, dxdf, pbc_null2, g);
1563                         break;
1564                     case F_VSITE3OUT:
1565                         b1 = ip[tp].vsite.b;
1566                         c1 = ip[tp].vsite.c;
1567                         spread_vsite3OUT(ia, a1, b1, c1, x, f, fshift, VirCorr, dxdf, pbc_null2, g);
1568                         break;
1569                     case F_VSITE4FD:
1570                         b1 = ip[tp].vsite.b;
1571                         c1 = ip[tp].vsite.c;
1572                         spread_vsite4FD(ia, a1, b1, c1, x, f, fshift, VirCorr, dxdf, pbc_null2, g);
1573                         break;
1574                     case F_VSITE4FDN:
1575                         b1 = ip[tp].vsite.b;
1576                         c1 = ip[tp].vsite.c;
1577                         spread_vsite4FDN(ia, a1, b1, c1, x, f, fshift, VirCorr, dxdf, pbc_null2, g);
1578                         break;
1579                     case F_VSITEN:
1580                         inc = spread_vsiten(ia, ip, x, f, fshift, pbc_null2, g);
1581                         break;
1582                     default:
1583                         gmx_fatal(FARGS, "No such vsite type %d in %s, line %d",
1584                                   ftype, __FILE__, __LINE__);
1585                 }
1586                 clear_rvec(f[ia[1]]);
1587
1588                 /* Increment loop variables */
1589                 i  += inc;
1590                 ia += inc;
1591             }
1592         }
1593     }
1594 }
1595
1596 /*! \brief Clears the task force buffer elements that are written by task idTask */
1597 static void clearTaskForceBufferUsedElements(InterdependentTask *idTask)
1598 {
1599     int ntask = idTask->spreadTask.size();
1600     for (int ti = 0; ti < ntask; ti++)
1601     {
1602         const AtomIndex *atomList = &idTask->atomIndex[idTask->spreadTask[ti]];
1603         int              natom    = atomList->atom.size();
1604         RVec            *force    = idTask->force.data();
1605         for (int i = 0; i < natom; i++)
1606         {
1607             clear_rvec(force[atomList->atom[i]]);
1608         }
1609     }
1610 }
1611
1612 void spread_vsite_f(const gmx_vsite_t *vsite,
1613                     const rvec * gmx_restrict x,
1614                     rvec * gmx_restrict f, rvec * gmx_restrict fshift,
1615                     gmx_bool VirCorr, matrix vir,
1616                     t_nrnb *nrnb, const t_idef *idef,
1617                     int ePBC, gmx_bool bMolPBC, const t_graph *g, const matrix box,
1618                     t_commrec *cr)
1619 {
1620     const bool useDomdec = vsite->useDomdec;
1621     GMX_ASSERT(!useDomdec || (cr != nullptr && DOMAINDECOMP(cr)), "When vsites are set up with domain decomposition, we need a valid commrec");
1622
1623     t_pbc pbc, *pbc_null;
1624
1625     /* We only need to do pbc when we have inter-cg vsites */
1626     if ((useDomdec || bMolPBC) && vsite->n_intercg_vsite)
1627     {
1628         /* This is wasting some CPU time as we now do this multiple times
1629          * per MD step.
1630          */
1631         pbc_null = set_pbc_dd(&pbc, ePBC, useDomdec ? cr->dd->nc : nullptr, FALSE, box);
1632     }
1633     else
1634     {
1635         pbc_null = nullptr;
1636     }
1637
1638     if (useDomdec)
1639     {
1640         dd_clear_f_vsites(cr->dd, f);
1641     }
1642
1643     if (vsite->nthreads == 1)
1644     {
1645         matrix dxdf;
1646         if (VirCorr)
1647         {
1648             clear_mat(dxdf);
1649         }
1650         spread_vsite_f_thread(vsite,
1651                               x, f, fshift,
1652                               VirCorr, dxdf,
1653                               idef->iparams, idef->il,
1654                               g, pbc_null);
1655
1656         if (VirCorr)
1657         {
1658             for (int i = 0; i < DIM; i++)
1659             {
1660                 for (int j = 0; j < DIM; j++)
1661                 {
1662                     vir[i][j] += -0.5*dxdf[i][j];
1663                 }
1664             }
1665         }
1666     }
1667     else
1668     {
1669         /* First spread the vsites that might depend on non-local vsites */
1670         if (VirCorr)
1671         {
1672             clear_mat(vsite->tData[vsite->nthreads]->dxdf);
1673         }
1674         spread_vsite_f_thread(vsite,
1675                               x, f, fshift,
1676                               VirCorr, vsite->tData[vsite->nthreads]->dxdf,
1677                               idef->iparams,
1678                               vsite->tData[vsite->nthreads]->ilist,
1679                               g, pbc_null);
1680
1681 #pragma omp parallel num_threads(vsite->nthreads)
1682         {
1683             try
1684             {
1685                 int          thread = gmx_omp_get_thread_num();
1686                 VsiteThread *tData  = vsite->tData[thread];
1687
1688                 rvec        *fshift_t;
1689                 if (thread == 0 || fshift == nullptr)
1690                 {
1691                     fshift_t = fshift;
1692                 }
1693                 else
1694                 {
1695                     fshift_t = tData->fshift;
1696
1697                     for (int i = 0; i < SHIFTS; i++)
1698                     {
1699                         clear_rvec(fshift_t[i]);
1700                     }
1701                 }
1702                 if (VirCorr)
1703                 {
1704                     clear_mat(tData->dxdf);
1705                 }
1706
1707                 if (tData->useInterdependentTask)
1708                 {
1709                     /* Spread the vsites that spread outside our local range.
1710                      * This is done using a thread-local force buffer force.
1711                      * First we need to copy the input vsite forces to force.
1712                      */
1713                     InterdependentTask *idTask = &tData->idTask;
1714
1715                     /* Clear the buffer elements set by our task during
1716                      * the last call to spread_vsite_f.
1717                      */
1718                     clearTaskForceBufferUsedElements(idTask);
1719
1720                     int nvsite = idTask->vsite.size();
1721                     for (int i = 0; i < nvsite; i++)
1722                     {
1723                         copy_rvec(f[idTask->vsite[i]],
1724                                   idTask->force[idTask->vsite[i]]);
1725                     }
1726                     spread_vsite_f_thread(vsite,
1727                                           x, as_rvec_array(idTask->force.data()), fshift_t,
1728                                           VirCorr, tData->dxdf,
1729                                           idef->iparams,
1730                                           tData->idTask.ilist,
1731                                           g, pbc_null);
1732
1733                     /* We need a barrier before reducing forces below
1734                      * that have been produced by a different thread above.
1735                      */
1736 #pragma omp barrier
1737
1738                     /* Loop over all thread task and reduce forces they
1739                      * produced on atoms that fall in our range.
1740                      * Note that atomic reduction would be a simpler solution,
1741                      * but that might not have good support on all platforms.
1742                      */
1743                     int ntask = idTask->reduceTask.size();
1744                     for (int ti = 0; ti < ntask; ti++)
1745                     {
1746                         const InterdependentTask *idt_foreign = &vsite->tData[idTask->reduceTask[ti]]->idTask;
1747                         const AtomIndex          *atomList    = &idt_foreign->atomIndex[thread];
1748                         const RVec               *f_foreign   = idt_foreign->force.data();
1749
1750                         int natom = atomList->atom.size();
1751                         for (int i = 0; i < natom; i++)
1752                         {
1753                             int ind = atomList->atom[i];
1754                             rvec_inc(f[ind], f_foreign[ind]);
1755                             /* Clearing of f_foreign is done at the next step */
1756                         }
1757                     }
1758                     /* Clear the vsite forces, both in f and force */
1759                     for (int i = 0; i < nvsite; i++)
1760                     {
1761                         int ind = tData->idTask.vsite[i];
1762                         clear_rvec(f[ind]);
1763                         clear_rvec(tData->idTask.force[ind]);
1764                     }
1765                 }
1766
1767                 /* Spread the vsites that spread locally only */
1768                 spread_vsite_f_thread(vsite,
1769                                       x, f, fshift_t,
1770                                       VirCorr, tData->dxdf,
1771                                       idef->iparams,
1772                                       tData->ilist,
1773                                       g, pbc_null);
1774             }
1775             GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
1776         }
1777
1778         if (fshift != nullptr)
1779         {
1780             for (int th = 1; th < vsite->nthreads; th++)
1781             {
1782                 for (int i = 0; i < SHIFTS; i++)
1783                 {
1784                     rvec_inc(fshift[i], vsite->tData[th]->fshift[i]);
1785                 }
1786             }
1787         }
1788
1789         if (VirCorr)
1790         {
1791             for (int th = 0; th < vsite->nthreads + 1; th++)
1792             {
1793                 /* MSVC doesn't like matrix references, so we use a pointer */
1794                 const matrix *dxdf = &vsite->tData[th]->dxdf;
1795
1796                 for (int i = 0; i < DIM; i++)
1797                 {
1798                     for (int j = 0; j < DIM; j++)
1799                     {
1800                         vir[i][j] += -0.5*(*dxdf)[i][j];
1801                     }
1802                 }
1803             }
1804         }
1805     }
1806
1807     if (useDomdec)
1808     {
1809         dd_move_f_vsites(cr->dd, f, fshift);
1810     }
1811
1812     inc_nrnb(nrnb, eNR_VSITE2,   vsite_count(idef->il, F_VSITE2));
1813     inc_nrnb(nrnb, eNR_VSITE3,   vsite_count(idef->il, F_VSITE3));
1814     inc_nrnb(nrnb, eNR_VSITE3FD, vsite_count(idef->il, F_VSITE3FD));
1815     inc_nrnb(nrnb, eNR_VSITE3FAD, vsite_count(idef->il, F_VSITE3FAD));
1816     inc_nrnb(nrnb, eNR_VSITE3OUT, vsite_count(idef->il, F_VSITE3OUT));
1817     inc_nrnb(nrnb, eNR_VSITE4FD, vsite_count(idef->il, F_VSITE4FD));
1818     inc_nrnb(nrnb, eNR_VSITE4FDN, vsite_count(idef->il, F_VSITE4FDN));
1819     inc_nrnb(nrnb, eNR_VSITEN,   vsite_count(idef->il, F_VSITEN));
1820 }
1821
1822 /*! \brief Returns the an array with charge-group indices for each atom
1823  *
1824  * \param[in] chargeGroups  The charge group block struct
1825  */
1826 static std::vector<int> atom2cg(const t_block &chargeGroups)
1827 {
1828     std::vector<int> a2cg(chargeGroups.index[chargeGroups.nr], 0);
1829
1830     for (int chargeGroup = 0; chargeGroup < chargeGroups.nr; chargeGroup++)
1831     {
1832         std::fill(a2cg.begin() + chargeGroups.index[chargeGroup],
1833                   a2cg.begin() + chargeGroups.index[chargeGroup + 1],
1834                   chargeGroup);
1835     }
1836
1837     return a2cg;
1838 }
1839
1840 int count_intercg_vsites(const gmx_mtop_t *mtop)
1841 {
1842     gmx_molblock_t *molb;
1843     gmx_moltype_t  *molt;
1844     int             n_intercg_vsite;
1845
1846     n_intercg_vsite = 0;
1847     for (int mb = 0; mb < mtop->nmolblock; mb++)
1848     {
1849         molb = &mtop->molblock[mb];
1850         molt = &mtop->moltype[molb->type];
1851
1852         std::vector<int> a2cg = atom2cg(molt->cgs);
1853         for (int ftype = c_ftypeVsiteStart; ftype < c_ftypeVsiteEnd; ftype++)
1854         {
1855             int            nral = NRAL(ftype);
1856             t_ilist       *il   = &molt->ilist[ftype];
1857             const t_iatom *ia   = il->iatoms;
1858             for (int i = 0; i < il->nr; i += 1 + nral)
1859             {
1860                 int cg = a2cg[ia[1+i]];
1861                 for (int a = 1; a < nral; a++)
1862                 {
1863                     if (a2cg[ia[1+a]] != cg)
1864                     {
1865                         n_intercg_vsite += molb->nmol;
1866                         break;
1867                     }
1868                 }
1869             }
1870         }
1871     }
1872
1873     return n_intercg_vsite;
1874 }
1875
1876 static int **get_vsite_pbc(const t_iparams *iparams, const t_ilist *ilist,
1877                            const t_atom *atom, const t_mdatoms *md,
1878                            const t_block &cgs)
1879 {
1880     /* Make an atom to charge group index */
1881     std::vector<int> a2cg = atom2cg(cgs);
1882
1883     /* Make an array that tells if the pbc of an atom is set */
1884     std::vector<bool> pbc_set(cgs.index[cgs.nr], false);
1885     /* PBC is set for all non vsites */
1886     for (int a = 0; a < cgs.index[cgs.nr]; a++)
1887     {
1888         if ((atom && atom[a].ptype != eptVSite) ||
1889             (md   && md->ptype[a]  != eptVSite))
1890         {
1891             pbc_set[a] = true;
1892         }
1893     }
1894
1895     int **vsite_pbc;
1896     snew(vsite_pbc, F_VSITEN-F_VSITE2+1);
1897
1898     for (int ftype = c_ftypeVsiteStart; ftype < c_ftypeVsiteEnd; ftype++)
1899     {
1900         {   // TODO remove me
1901             int            nral = NRAL(ftype);
1902             const t_ilist *il   = &ilist[ftype];
1903             const t_iatom *ia   = il->iatoms;
1904             int           *vsite_pbc_f;
1905
1906             snew(vsite_pbc[ftype-F_VSITE2], il->nr/(1 + nral));
1907             vsite_pbc_f = vsite_pbc[ftype-F_VSITE2];
1908
1909             int i = 0;
1910             while (i < il->nr)
1911             {
1912                 int     vsi   = i/(1 + nral);
1913                 t_iatom vsite = ia[i+1];
1914                 int     cg_v  = a2cg[vsite];
1915                 /* A value of -2 signals that this vsite and its contructing
1916                  * atoms are all within the same cg, so no pbc is required.
1917                  */
1918                 vsite_pbc_f[vsi] = -2;
1919                 /* Check if constructing atoms are outside the vsite's cg */
1920                 int nc3 = 0;
1921                 if (ftype == F_VSITEN)
1922                 {
1923                     nc3 = 3*iparams[ia[i]].vsiten.n;
1924                     for (int j = 0; j < nc3; j += 3)
1925                     {
1926                         if (a2cg[ia[i+j+2]] != cg_v)
1927                         {
1928                             vsite_pbc_f[vsi] = -1;
1929                         }
1930                     }
1931                 }
1932                 else
1933                 {
1934                     for (int a = 1; a < nral; a++)
1935                     {
1936                         if (a2cg[ia[i+1+a]] != cg_v)
1937                         {
1938                             vsite_pbc_f[vsi] = -1;
1939                         }
1940                     }
1941                 }
1942                 if (vsite_pbc_f[vsi] == -1)
1943                 {
1944                     /* Check if this is the first processed atom of a vsite only cg */
1945                     gmx_bool bViteOnlyCG_and_FirstAtom = TRUE;
1946                     for (int a = cgs.index[cg_v]; a < cgs.index[cg_v + 1]; a++)
1947                     {
1948                         /* Non-vsites already have pbc set, so simply check for pbc_set */
1949                         if (pbc_set[a])
1950                         {
1951                             bViteOnlyCG_and_FirstAtom = FALSE;
1952                             break;
1953                         }
1954                     }
1955                     if (bViteOnlyCG_and_FirstAtom)
1956                     {
1957                         /* First processed atom of a vsite only charge group.
1958                          * The pbc of the input coordinates to construct_vsites
1959                          * should be preserved.
1960                          */
1961                         vsite_pbc_f[vsi] = vsite;
1962                     }
1963                     else if (cg_v != a2cg[ia[1+i+1]])
1964                     {
1965                         /* This vsite has a different charge group index
1966                          * than it's first constructing atom
1967                          * and the charge group has more than one atom,
1968                          * search for the first normal particle
1969                          * or vsite that already had its pbc defined.
1970                          * If nothing is found, use full pbc for this vsite.
1971                          */
1972                         for (int a = cgs.index[cg_v]; a < cgs.index[cg_v + 1]; a++)
1973                         {
1974                             if (a != vsite && pbc_set[a])
1975                             {
1976                                 vsite_pbc_f[vsi] = a;
1977                                 if (gmx_debug_at)
1978                                 {
1979                                     fprintf(debug, "vsite %d match pbc with atom %d\n",
1980                                             vsite+1, a+1);
1981                                 }
1982                                 break;
1983                             }
1984                         }
1985                         if (gmx_debug_at)
1986                         {
1987                             fprintf(debug, "vsite atom %d  cg %d - %d pbc atom %d\n",
1988                                     vsite+1, cgs.index[cg_v] + 1, cgs.index[cg_v + 1],
1989                                     vsite_pbc_f[vsi] + 1);
1990                         }
1991                     }
1992                 }
1993                 if (ftype == F_VSITEN)
1994                 {
1995                     /* The other entries in vsite_pbc_f are not used for center vsites */
1996                     i += nc3;
1997                 }
1998                 else
1999                 {
2000                     i += 1 + nral;
2001                 }
2002
2003                 /* This vsite now has its pbc defined */
2004                 pbc_set[vsite] = true;
2005             }
2006         }
2007     }
2008
2009     return vsite_pbc;
2010 }
2011
2012
2013 gmx_vsite_t *initVsite(const gmx_mtop_t &mtop,
2014                        t_commrec        *cr)
2015 {
2016     GMX_RELEASE_ASSERT(cr != nullptr, "We need a valid commrec");
2017
2018     /* check if there are vsites */
2019     int nvsite = 0;
2020     for (int ftype = 0; ftype < F_NRE; ftype++)
2021     {
2022         if (interaction_function[ftype].flags & IF_VSITE)
2023         {
2024             GMX_ASSERT(ftype >= c_ftypeVsiteStart && ftype < c_ftypeVsiteEnd, "c_ftypeVsiteStart and/or c_ftypeVsiteEnd do not have correct values");
2025
2026             nvsite += gmx_mtop_ftype_count(&mtop, ftype);
2027         }
2028         else
2029         {
2030             GMX_ASSERT(ftype < c_ftypeVsiteStart || ftype >= c_ftypeVsiteEnd, "c_ftypeVsiteStart and/or c_ftypeVsiteEnd do not have correct values");
2031         }
2032     }
2033
2034     if (nvsite == 0)
2035     {
2036         return nullptr;
2037     }
2038
2039     gmx_vsite_t *vsite = new(gmx_vsite_t);
2040
2041     vsite->n_intercg_vsite   = count_intercg_vsites(&mtop);
2042
2043     vsite->bHaveChargeGroups = (ncg_mtop(&mtop) < mtop.natoms);
2044
2045     vsite->useDomdec         = (DOMAINDECOMP(cr) && cr->dd->nnodes > 1);
2046
2047     /* If we don't have charge groups, the vsite follows its own pbc.
2048      *
2049      * With charge groups, each vsite needs to follow the pbc of the charge
2050      * group. Thus we need to keep track of PBC. Here we assume that without
2051      * domain decomposition all molecules are whole (which will not be
2052      * the case with periodic molecules).
2053      */
2054     if (vsite->bHaveChargeGroups &&
2055         vsite->n_intercg_vsite > 0 &&
2056         DOMAINDECOMP(cr))
2057     {
2058         vsite->nvsite_pbc_molt = mtop.nmoltype;
2059         snew(vsite->vsite_pbc_molt, vsite->nvsite_pbc_molt);
2060         for (int mt = 0; mt < mtop.nmoltype; mt++)
2061         {
2062             const gmx_moltype_t &molt = mtop.moltype[mt];
2063             vsite->vsite_pbc_molt[mt] = get_vsite_pbc(mtop.ffparams.iparams,
2064                                                       molt.ilist,
2065                                                       molt.atoms.atom, nullptr,
2066                                                       molt.cgs);
2067         }
2068
2069         snew(vsite->vsite_pbc_loc_nalloc, c_ftypeVsiteEnd - c_ftypeVsiteStart);
2070         snew(vsite->vsite_pbc_loc,        c_ftypeVsiteEnd - c_ftypeVsiteStart);
2071     }
2072     else
2073     {
2074         vsite->vsite_pbc_molt = nullptr;
2075         vsite->vsite_pbc_loc  = nullptr;
2076     }
2077
2078     vsite->nthreads = gmx_omp_nthreads_get(emntVSITE);
2079
2080     if (vsite->nthreads > 1)
2081     {
2082         /* We need one extra thread data structure for the overlap vsites */
2083         snew(vsite->tData, vsite->nthreads + 1);
2084 #pragma omp parallel for num_threads(vsite->nthreads) schedule(static)
2085         for (int thread = 0; thread < vsite->nthreads; thread++)
2086         {
2087             try
2088             {
2089                 vsite->tData[thread] = new VsiteThread;
2090
2091                 InterdependentTask *idTask = &vsite->tData[thread]->idTask;
2092                 idTask->nuse               = 0;
2093                 idTask->atomIndex.resize(vsite->nthreads);
2094             }
2095             GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
2096         }
2097         if (vsite->nthreads > 1)
2098         {
2099             vsite->tData[vsite->nthreads] = new VsiteThread;
2100         }
2101     }
2102
2103     vsite->taskIndex       = nullptr;
2104     vsite->taskIndexNalloc = 0;
2105
2106     return vsite;
2107 }
2108
2109 static gmx_inline void flagAtom(InterdependentTask *idTask, int atom,
2110                                 int thread, int nthread, int natperthread)
2111 {
2112     if (!idTask->use[atom])
2113     {
2114         idTask->use[atom] = true;
2115         thread            = atom/natperthread;
2116         /* Assign all non-local atom force writes to thread 0 */
2117         if (thread >= nthread)
2118         {
2119             thread        = 0;
2120         }
2121         idTask->atomIndex[thread].atom.push_back(atom);
2122     }
2123 }
2124
2125 /*\brief Here we try to assign all vsites that are in our local range.
2126  *
2127  * Our task local atom range is tData->rangeStart - tData->rangeEnd.
2128  * Vsites that depend only on local atoms, as indicated by taskIndex[]==thread,
2129  * are assigned to task tData->ilist. Vsites that depend on non-local atoms
2130  * but not on other vsites are assigned to task tData->id_task.ilist.
2131  * taskIndex[] is set for all vsites in our range, either to our local tasks
2132  * or to the single last task as taskIndex[]=2*nthreads.
2133  */
2134 static void assignVsitesToThread(VsiteThread           *tData,
2135                                  int                    thread,
2136                                  int                    nthread,
2137                                  int                    natperthread,
2138                                  int                   *taskIndex,
2139                                  const t_ilist         *ilist,
2140                                  const t_iparams       *ip,
2141                                  const unsigned short  *ptype)
2142 {
2143     for (int ftype = c_ftypeVsiteStart; ftype < c_ftypeVsiteEnd; ftype++)
2144     {
2145         tData->ilist[ftype].nr        = 0;
2146         tData->idTask.ilist[ftype].nr = 0;
2147
2148         int      nral1 = 1 + NRAL(ftype);
2149         int      inc   = nral1;
2150         t_iatom *iat   = ilist[ftype].iatoms;
2151         for (int i = 0; i < ilist[ftype].nr; )
2152         {
2153             if (ftype == F_VSITEN)
2154             {
2155                 /* The 3 below is from 1+NRAL(ftype)=3 */
2156                 inc = ip[iat[i]].vsiten.n*3;
2157             }
2158
2159             if (iat[1 + i] <  tData->rangeStart ||
2160                 iat[1 + i] >= tData->rangeEnd)
2161             {
2162                 /* This vsite belongs to a different thread */
2163                 i += inc;
2164                 continue;
2165             }
2166
2167             /* We would like to assign this vsite to task thread,
2168              * but it might depend on atoms outside the atom range of thread
2169              * or on another vsite not assigned to task thread.
2170              */
2171             int task = thread;
2172             if (ftype != F_VSITEN)
2173             {
2174                 for (int j = i + 2; j < i + nral1; j++)
2175                 {
2176                     /* Do a range check to avoid a harmless race on taskIndex */
2177                     if (iat[j] <  tData->rangeStart ||
2178                         iat[j] >= tData->rangeEnd ||
2179                         taskIndex[iat[j]] != thread)
2180                     {
2181                         if (!tData->useInterdependentTask ||
2182                             ptype[iat[j]] == eptVSite)
2183                         {
2184                             /* At least one constructing atom is a vsite
2185                              * that is not assigned to the same thread.
2186                              * Put this vsite into a separate task.
2187                              */
2188                             task = 2*nthread;
2189                             break;
2190                         }
2191
2192                         /* There are constructing atoms outside our range,
2193                          * put this vsite into a second task to be executed
2194                          * on the same thread. During construction no barrier
2195                          * is needed between the two tasks on the same thread.
2196                          * During spreading we need to run this task with
2197                          * an additional thread-local intermediate force buffer
2198                          * (or atomic reduction) and a barrier between the two
2199                          * tasks.
2200                          */
2201                         task = nthread + thread;
2202                     }
2203                 }
2204             }
2205             else
2206             {
2207                 for (int j = i + 2; j < i + inc; j += 3)
2208                 {
2209                     /* Do a range check to avoid a harmless race on taskIndex */
2210                     if (iat[j] <  tData->rangeStart ||
2211                         iat[j] >= tData->rangeEnd ||
2212                         taskIndex[iat[j]] != thread)
2213                     {
2214                         GMX_ASSERT(ptype[iat[j]] != eptVSite, "A vsite to be assigned in assignVsitesToThread has a vsite as a constructing atom that does not belong to our task, such vsites should be assigned to the single 'master' task");
2215
2216                         task = nthread + thread;
2217                     }
2218                 }
2219             }
2220
2221             /* Update this vsite's thread index entry */
2222             taskIndex[iat[1+i]] = task;
2223
2224             if (task == thread || task == nthread + thread)
2225             {
2226                 /* Copy this vsite to the thread data struct of thread */
2227                 t_ilist *il_task;
2228                 if (task == thread)
2229                 {
2230                     il_task = &tData->ilist[ftype];
2231                 }
2232                 else
2233                 {
2234                     il_task = &tData->idTask.ilist[ftype];
2235                 }
2236                 /* Ensure we have sufficient memory allocated */
2237                 if (il_task->nr + inc > il_task->nalloc)
2238                 {
2239                     il_task->nalloc = over_alloc_large(il_task->nr + inc);
2240                     srenew(il_task->iatoms, il_task->nalloc);
2241                 }
2242                 /* Copy the vsite data to the thread-task local array */
2243                 for (int j = i; j < i + inc; j++)
2244                 {
2245                     il_task->iatoms[il_task->nr++] = iat[j];
2246                 }
2247                 if (task == nthread + thread)
2248                 {
2249                     /* This vsite write outside our own task force block.
2250                      * Put it into the interdependent task list and flag
2251                      * the atoms involved for reduction.
2252                      */
2253                     tData->idTask.vsite.push_back(iat[i + 1]);
2254                     if (ftype != F_VSITEN)
2255                     {
2256                         for (int j = i + 2; j < i + nral1; j++)
2257                         {
2258                             flagAtom(&tData->idTask, iat[j],
2259                                      thread, nthread, natperthread);
2260                         }
2261                     }
2262                     else
2263                     {
2264                         for (int j = i + 2; j < i + inc; j += 3)
2265                         {
2266                             flagAtom(&tData->idTask, iat[j],
2267                                      thread, nthread, natperthread);
2268                         }
2269                     }
2270                 }
2271             }
2272
2273             i += inc;
2274         }
2275     }
2276 }
2277
2278 /*! \brief Assign all vsites with taskIndex[]==task to task tData */
2279 static void assignVsitesToSingleTask(VsiteThread     *tData,
2280                                      int              task,
2281                                      const int       *taskIndex,
2282                                      const t_ilist   *ilist,
2283                                      const t_iparams *ip)
2284 {
2285     for (int ftype = c_ftypeVsiteStart; ftype < c_ftypeVsiteEnd; ftype++)
2286     {
2287         tData->ilist[ftype].nr        = 0;
2288         tData->idTask.ilist[ftype].nr = 0;
2289
2290         int      nral1   = 1 + NRAL(ftype);
2291         int      inc     = nral1;
2292         t_iatom *iat     = ilist[ftype].iatoms;
2293         t_ilist *il_task = &tData->ilist[ftype];
2294
2295         for (int i = 0; i < ilist[ftype].nr; )
2296         {
2297             if (ftype == F_VSITEN)
2298             {
2299                 /* The 3 below is from 1+NRAL(ftype)=3 */
2300                 inc = ip[iat[i]].vsiten.n*3;
2301             }
2302             /* Check if the vsite is assigned to our task */
2303             if (taskIndex[iat[1 + i]] == task)
2304             {
2305                 /* Ensure we have sufficient memory allocated */
2306                 if (il_task->nr + inc > il_task->nalloc)
2307                 {
2308                     il_task->nalloc = over_alloc_large(il_task->nr + inc);
2309                     srenew(il_task->iatoms, il_task->nalloc);
2310                 }
2311                 /* Copy the vsite data to the thread-task local array */
2312                 for (int j = i; j < i + inc; j++)
2313                 {
2314                     il_task->iatoms[il_task->nr++] = iat[j];
2315                 }
2316             }
2317
2318             i += inc;
2319         }
2320     }
2321 }
2322
2323 void split_vsites_over_threads(const t_ilist   *ilist,
2324                                const t_iparams *ip,
2325                                const t_mdatoms *mdatoms,
2326                                gmx_vsite_t     *vsite)
2327 {
2328     int      vsite_atom_range, natperthread;
2329
2330     if (vsite->nthreads == 1)
2331     {
2332         /* Nothing to do */
2333         return;
2334     }
2335
2336     /* The current way of distributing the vsites over threads in primitive.
2337      * We divide the atom range 0 - natoms_in_vsite uniformly over threads,
2338      * without taking into account how the vsites are distributed.
2339      * Without domain decomposition we at least tighten the upper bound
2340      * of the range (useful for common systems such as a vsite-protein
2341      * in 3-site water).
2342      * With domain decomposition, as long as the vsites are distributed
2343      * uniformly in each domain along the major dimension, usually x,
2344      * it will also perform well.
2345      */
2346     if (!vsite->useDomdec)
2347     {
2348         vsite_atom_range = -1;
2349         for (int ftype = c_ftypeVsiteStart; ftype < c_ftypeVsiteEnd; ftype++)
2350         {
2351             {   // TODO remove me
2352                 if (ftype != F_VSITEN)
2353                 {
2354                     int            nral1 = 1 + NRAL(ftype);
2355                     const t_iatom *iat   = ilist[ftype].iatoms;
2356                     for (int i = 0; i < ilist[ftype].nr; i += nral1)
2357                     {
2358                         for (int j = i + 1; j < i + nral1; j++)
2359                         {
2360                             vsite_atom_range = std::max(vsite_atom_range, iat[j]);
2361                         }
2362                     }
2363                 }
2364                 else
2365                 {
2366                     int            vs_ind_end;
2367
2368                     const t_iatom *iat = ilist[ftype].iatoms;
2369
2370                     int            i = 0;
2371                     while (i < ilist[ftype].nr)
2372                     {
2373                         /* The 3 below is from 1+NRAL(ftype)=3 */
2374                         vs_ind_end = i + ip[iat[i]].vsiten.n*3;
2375
2376                         vsite_atom_range = std::max(vsite_atom_range, iat[i+1]);
2377                         while (i < vs_ind_end)
2378                         {
2379                             vsite_atom_range = std::max(vsite_atom_range, iat[i+2]);
2380                             i               += 3;
2381                         }
2382                     }
2383                 }
2384             }
2385         }
2386         vsite_atom_range++;
2387         natperthread     = (vsite_atom_range + vsite->nthreads - 1)/vsite->nthreads;
2388     }
2389     else
2390     {
2391         /* Any local or not local atom could be involved in virtual sites.
2392          * But since we usually have very few non-local virtual sites
2393          * (only non-local vsites that depend on local vsites),
2394          * we distribute the local atom range equally over the threads.
2395          * When assigning vsites to threads, we should take care that the last
2396          * threads also covers the non-local range.
2397          */
2398         vsite_atom_range = mdatoms->nr;
2399         natperthread     = (mdatoms->homenr + vsite->nthreads - 1)/vsite->nthreads;
2400     }
2401
2402     if (debug)
2403     {
2404         fprintf(debug, "virtual site thread dist: natoms %d, range %d, natperthread %d\n", mdatoms->nr, vsite_atom_range, natperthread);
2405     }
2406
2407     /* To simplify the vsite assignment, we make an index which tells us
2408      * to which task particles, both non-vsites and vsites, are assigned.
2409      */
2410     if (mdatoms->nr > vsite->taskIndexNalloc)
2411     {
2412         vsite->taskIndexNalloc = over_alloc_large(mdatoms->nr);
2413         srenew(vsite->taskIndex, vsite->taskIndexNalloc);
2414     }
2415
2416     /* Initialize the task index array. Here we assign the non-vsite
2417      * particles to task=thread, so we easily figure out if vsites
2418      * depend on local and/or non-local particles in assignVsitesToThread.
2419      */
2420     int *taskIndex = vsite->taskIndex;
2421     {
2422         int  thread = 0;
2423         for (int i = 0; i < mdatoms->nr; i++)
2424         {
2425             if (mdatoms->ptype[i] == eptVSite)
2426             {
2427                 /* vsites are not assigned to a task yet */
2428                 taskIndex[i] = -1;
2429             }
2430             else
2431             {
2432                 /* assign non-vsite particles to task thread */
2433                 taskIndex[i] = thread;
2434             }
2435             if (i == (thread + 1)*natperthread && thread < vsite->nthreads)
2436             {
2437                 thread++;
2438             }
2439         }
2440     }
2441
2442 #pragma omp parallel num_threads(vsite->nthreads)
2443     {
2444         try
2445         {
2446             int          thread = gmx_omp_get_thread_num();
2447             VsiteThread *tData  = vsite->tData[thread];
2448
2449             /* Clear the buffer use flags that were set before */
2450             if (tData->useInterdependentTask)
2451             {
2452                 InterdependentTask *idTask = &tData->idTask;
2453
2454                 /* To avoid an extra OpenMP barrier in spread_vsite_f,
2455                  * we clear the force buffer at the next step,
2456                  * so we need to do it here as well.
2457                  */
2458                 clearTaskForceBufferUsedElements(idTask);
2459
2460                 idTask->vsite.resize(0);
2461                 for (int t = 0; t < vsite->nthreads; t++)
2462                 {
2463                     AtomIndex *atomIndex = &idTask->atomIndex[t];
2464                     int        natom     = atomIndex->atom.size();
2465                     for (int i = 0; i < natom; i++)
2466                     {
2467                         idTask->use[atomIndex->atom[i]] = false;
2468                     }
2469                     atomIndex->atom.resize(0);
2470                 }
2471                 idTask->nuse = 0;
2472             }
2473
2474             /* To avoid large f_buf allocations of #threads*vsite_atom_range
2475              * we don't use task2 with more than 200000 atoms. This doesn't
2476              * affect performance, since with such a large range relatively few
2477              * vsites will end up in the separate task.
2478              * Note that useTask2 should be the same for all threads.
2479              */
2480             tData->useInterdependentTask = (vsite_atom_range <= 200000);
2481             if (tData->useInterdependentTask)
2482             {
2483                 size_t              natoms_use_in_vsites = vsite_atom_range;
2484                 InterdependentTask *idTask               = &tData->idTask;
2485                 /* To avoid resizing and re-clearing every nstlist steps,
2486                  * we never down size the force buffer.
2487                  */
2488                 if (natoms_use_in_vsites > idTask->force.size() ||
2489                     natoms_use_in_vsites > idTask->use.size())
2490                 {
2491                     idTask->force.resize(natoms_use_in_vsites, { 0, 0, 0 });
2492                     idTask->use.resize(natoms_use_in_vsites, false);
2493                 }
2494             }
2495
2496             /* Assign all vsites that can execute independently on threads */
2497             tData->rangeStart     =  thread     *natperthread;
2498             if (thread < vsite->nthreads - 1)
2499             {
2500                 tData->rangeEnd   = (thread + 1)*natperthread;
2501             }
2502             else
2503             {
2504                 /* The last thread should cover up to the end of the range */
2505                 tData->rangeEnd   = mdatoms->nr;
2506             }
2507             assignVsitesToThread(tData,
2508                                  thread, vsite->nthreads,
2509                                  natperthread,
2510                                  taskIndex,
2511                                  ilist, ip, mdatoms->ptype);
2512
2513             if (tData->useInterdependentTask)
2514             {
2515                 /* In the worst case, all tasks write to force ranges of
2516                  * all other tasks, leading to #tasks^2 scaling (this is only
2517                  * the overhead, the actual flops remain constant).
2518                  * But in most cases there is far less coupling. To improve
2519                  * scaling at high thread counts we therefore construct
2520                  * an index to only loop over the actually affected tasks.
2521                  */
2522                 InterdependentTask *idTask = &tData->idTask;
2523
2524                 /* Ensure assignVsitesToThread finished on other threads */
2525 #pragma omp barrier
2526
2527                 idTask->spreadTask.resize(0);
2528                 idTask->reduceTask.resize(0);
2529                 for (int t = 0; t < vsite->nthreads; t++)
2530                 {
2531                     /* Do we write to the force buffer of task t? */
2532                     if (idTask->atomIndex[t].atom.size() > 0)
2533                     {
2534                         idTask->spreadTask.push_back(t);
2535                     }
2536                     /* Does task t write to our force buffer? */
2537                     if (vsite->tData[t]->idTask.atomIndex[thread].atom.size() > 0)
2538                     {
2539                         idTask->reduceTask.push_back(t);
2540                     }
2541                 }
2542             }
2543         }
2544         GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
2545     }
2546     /* Assign all remaining vsites, that will have taskIndex[]=2*vsite->nthreads,
2547      * to a single task that will not run in parallel with other tasks.
2548      */
2549     assignVsitesToSingleTask(vsite->tData[vsite->nthreads],
2550                              2*vsite->nthreads,
2551                              taskIndex,
2552                              ilist, ip);
2553
2554     if (debug && vsite->nthreads > 1)
2555     {
2556         fprintf(debug, "virtual site useInterdependentTask %d, nuse:\n",
2557                 vsite->tData[0]->useInterdependentTask);
2558         for (int th = 0; th < vsite->nthreads + 1; th++)
2559         {
2560             fprintf(debug, " %4d", vsite->tData[th]->idTask.nuse);
2561         }
2562         fprintf(debug, "\n");
2563
2564         for (int ftype = c_ftypeVsiteStart; ftype < c_ftypeVsiteEnd; ftype++)
2565         {
2566             if (ilist[ftype].nr > 0)
2567             {
2568                 fprintf(debug, "%-20s thread dist:",
2569                         interaction_function[ftype].longname);
2570                 for (int th = 0; th < vsite->nthreads + 1; th++)
2571                 {
2572                     fprintf(debug, " %4d %4d ",
2573                             vsite->tData[th]->ilist[ftype].nr,
2574                             vsite->tData[th]->idTask.ilist[ftype].nr);
2575                 }
2576                 fprintf(debug, "\n");
2577             }
2578         }
2579     }
2580
2581 #ifndef NDEBUG
2582     int nrOrig     = vsiteIlistNrCount(ilist);
2583     int nrThreaded = 0;
2584     for (int th = 0; th < vsite->nthreads + 1; th++)
2585     {
2586         nrThreaded +=
2587             vsiteIlistNrCount(vsite->tData[th]->ilist) +
2588             vsiteIlistNrCount(vsite->tData[th]->idTask.ilist);
2589     }
2590     GMX_ASSERT(nrThreaded == nrOrig, "The number of virtual sites assigned to all thread task has to match the total number of virtual sites");
2591 #endif
2592 }
2593
2594 void set_vsite_top(gmx_vsite_t          *vsite,
2595                    const gmx_localtop_t *top,
2596                    const t_mdatoms      *md)
2597 {
2598     if (vsite->n_intercg_vsite > 0 && vsite->bHaveChargeGroups)
2599     {
2600         vsite->vsite_pbc_loc = get_vsite_pbc(top->idef.iparams,
2601                                              top->idef.il, nullptr, md,
2602                                              top->cgs);
2603     }
2604
2605     if (vsite->nthreads > 1)
2606     {
2607         if (vsite->bHaveChargeGroups)
2608         {
2609             gmx_fatal(FARGS, "The combination of threading, virtual sites and charge groups is not implemented");
2610         }
2611
2612         split_vsites_over_threads(top->idef.il, top->idef.iparams,
2613                                   md, vsite);
2614     }
2615 }