src/gromacs/mdlib/vsite.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   5  * Copyright (c) 2001-2004, The GROMACS development team.
   6  * Copyright (c) 2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by
   7  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   8  * and including many others, as listed in the AUTHORS file in the
   9  * top-level source directory and at http://www.gromacs.org.
  10  *
  11  * GROMACS is free software; you can redistribute it and/or
  12  * modify it under the terms of the GNU Lesser General Public License
  13  * as published by the Free Software Foundation; either version 2.1
  14  * of the License, or (at your option) any later version.
  15  *
  16  * GROMACS is distributed in the hope that it will be useful,
  17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  19  * Lesser General Public License for more details.
  20  *
  21  * You should have received a copy of the GNU Lesser General Public
  22  * License along with GROMACS; if not, see
  23  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  24  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  25  *
  26  * If you want to redistribute modifications to GROMACS, please
  27  * consider that scientific software is very special. Version
  28  * control is crucial - bugs must be traceable. We will be happy to
  29  * consider code for inclusion in the official distribution, but
  30  * derived work must not be called official GROMACS. Details are found
  31  * in the README & COPYING files - if they are missing, get the
  32  * official version at http://www.gromacs.org.
  33  *
  34  * To help us fund GROMACS development, we humbly ask that you cite
  35  * the research papers on the package. Check out http://www.gromacs.org.
  36  */
  37 #include "gmxpre.h"
  38
  39 #include "vsite.h"
  40
  41 #include <cstdio>
  42
  43 #include <algorithm>
  44 #include <memory>
  45 #include <vector>
  46
  47 #include "gromacs/domdec/domdec.h"
  48 #include "gromacs/domdec/domdec_struct.h"
  49 #include "gromacs/gmxlib/network.h"
  50 #include "gromacs/gmxlib/nrnb.h"
  51 #include "gromacs/math/functions.h"
  52 #include "gromacs/math/vec.h"
  53 #include "gromacs/mdlib/gmx_omp_nthreads.h"
  54 #include "gromacs/mdtypes/commrec.h"
  55 #include "gromacs/mdtypes/mdatom.h"
  56 #include "gromacs/pbcutil/ishift.h"
  57 #include "gromacs/pbcutil/mshift.h"
  58 #include "gromacs/pbcutil/pbc.h"
  59 #include "gromacs/timing/wallcycle.h"
  60 #include "gromacs/topology/ifunc.h"
  61 #include "gromacs/topology/mtop_util.h"
  62 #include "gromacs/topology/topology.h"
  63 #include "gromacs/utility/exceptions.h"
  64 #include "gromacs/utility/fatalerror.h"
  65 #include "gromacs/utility/gmxassert.h"
  66 #include "gromacs/utility/gmxomp.h"
  67
  68 /* The strategy used here for assigning virtual sites to (thread-)tasks
  69  * is as follows:
  70  *
  71  * We divide the atom range that vsites operate on (natoms_local with DD,
  72  * 0 - last atom involved in vsites without DD) equally over all threads.
  73  *
  74  * Vsites in the local range constructed from atoms in the local range
  75  * and/or other vsites that are fully local are assigned to a simple,
  76  * independent task.
  77  *
  78  * Vsites that are not assigned after using the above criterion get assigned
  79  * to a so called "interdependent" thread task when none of the constructing
  80  * atoms is a vsite. These tasks are called interdependent, because one task
  81  * accesses atoms assigned to a different task/thread.
  82  * Note that this option is turned off with large (local) atom counts
  83  * to avoid high memory usage.
  84  *
  85  * Any remaining vsites are assigned to a separate master thread task.
  86  */
  87
  88 using gmx::RVec;
  89
  90 static void init_ilist(t_ilist* ilist)
  91 {
  92     for (int i = 0; i < F_NRE; i++)
  93     {
  94         ilist[i].nr     = 0;
  95         ilist[i].nalloc = 0;
  96         ilist[i].iatoms = nullptr;
  97     }
  98 }
  99
 100 /*! \brief List of atom indices belonging to a task */
 101 struct AtomIndex
 102 {
 103     //! List of atom indices
 104     std::vector<int> atom;
 105 };
 106
 107 /*! \brief Data structure for thread tasks that use constructing atoms outside their own atom range */
 108 struct InterdependentTask
 109 {
 110     //! The interaction lists, only vsite entries are used
 111     t_ilist ilist[F_NRE];
 112     //! Thread/task-local force buffer
 113     std::vector<RVec> force;
 114     //! The atom indices of the vsites of our task
 115     std::vector<int> vsite;
 116     //! Flags if elements in force are spread to or not
 117     std::vector<bool> use;
 118     //! The number of entries set to true in use
 119     int nuse;
 120     //! Array of atoms indices, size nthreads, covering all nuse set elements in use
 121     std::vector<AtomIndex> atomIndex;
 122     //! List of tasks (force blocks) this task spread forces to
 123     std::vector<int> spreadTask;
 124     //! List of tasks that write to this tasks force block range
 125     std::vector<int> reduceTask;
 126
 127     InterdependentTask()
 128     {
 129         init_ilist(ilist);
 130         nuse = 0;
 131     }
 132 };
 133
 134 /*! \brief Vsite thread task data structure */
 135 struct VsiteThread
 136 {
 137     //! Start of atom range of this task
 138     int rangeStart;
 139     //! End of atom range of this task
 140     int rangeEnd;
 141     //! The interaction lists, only vsite entries are used
 142     t_ilist ilist[F_NRE];
 143     //! Local fshift accumulation buffer
 144     rvec fshift[SHIFTS];
 145     //! Local virial dx*df accumulation buffer
 146     matrix dxdf;
 147     //! Tells if interdependent task idTask should be used (in addition to the rest of this task), this bool has the same value on all threads
 148     bool useInterdependentTask;
 149     //! Data for vsites that involve constructing atoms in the atom range of other threads/tasks
 150     InterdependentTask idTask;
 151
 152     /*! \brief Constructor */
 153     VsiteThread()
 154     {
 155         rangeStart = -1;
 156         rangeEnd   = -1;
 157         init_ilist(ilist);
 158         clear_rvecs(SHIFTS, fshift);
 159         clear_mat(dxdf);
 160         useInterdependentTask = false;
 161     }
 162 };
 163
 164 /*! \brief Returns the sum of the vsite ilist sizes over all vsite types
 165  *
 166  * \param[in] ilist  The interaction list
 167  */
 168 template<typename T>
 169 static int vsiteIlistNrCount(const T* ilist)
 170 {
 171     int nr = 0;
 172     for (int ftype = c_ftypeVsiteStart; ftype < c_ftypeVsiteEnd; ftype++)
 173     {
 174         nr += ilist[ftype].size();
 175     }
 176
 177     return nr;
 178 }
 179
 180 static int pbc_rvec_sub(const t_pbc* pbc, const rvec xi, const rvec xj, rvec dx)
 181 {
 182     if (pbc)
 183     {
 184         return pbc_dx_aiuc(pbc, xi, xj, dx);
 185     }
 186     else
 187     {
 188         rvec_sub(xi, xj, dx);
 189         return CENTRAL;
 190     }
 191 }
 192
 193 static inline real inverseNorm(const rvec x)
 194 {
 195     return gmx::invsqrt(iprod(x, x));
 196 }
 197
 198 /* Vsite construction routines */
 199
 200 static void constr_vsite2(const rvec xi, const rvec xj, rvec x, real a, const t_pbc* pbc)
 201 {
 202     real b = 1 - a;
 203     /* 1 flop */
 204
 205     if (pbc)
 206     {
 207         rvec dx;
 208         pbc_dx_aiuc(pbc, xj, xi, dx);
 209         x[XX] = xi[XX] + a * dx[XX];
 210         x[YY] = xi[YY] + a * dx[YY];
 211         x[ZZ] = xi[ZZ] + a * dx[ZZ];
 212     }
 213     else
 214     {
 215         x[XX] = b * xi[XX] + a * xj[XX];
 216         x[YY] = b * xi[YY] + a * xj[YY];
 217         x[ZZ] = b * xi[ZZ] + a * xj[ZZ];
 218         /* 9 Flops */
 219     }
 220
 221     /* TOTAL: 10 flops */
 222 }
 223
 224 static void constr_vsite2FD(const rvec xi, const rvec xj, rvec x, real a, const t_pbc* pbc)
 225 {
 226     rvec xij;
 227     pbc_rvec_sub(pbc, xj, xi, xij);
 228     /* 3 flops */
 229
 230     const real b = a * inverseNorm(xij);
 231     /* 6 + 10 flops */
 232
 233     x[XX] = xi[XX] + b * xij[XX];
 234     x[YY] = xi[YY] + b * xij[YY];
 235     x[ZZ] = xi[ZZ] + b * xij[ZZ];
 236     /* 6 Flops */
 237
 238     /* TOTAL: 25 flops */
 239 }
 240
 241 static void constr_vsite3(const rvec xi, const rvec xj, const rvec xk, rvec x, real a, real b, const t_pbc* pbc)
 242 {
 243     real c = 1 - a - b;
 244     /* 2 flops */
 245
 246     if (pbc)
 247     {
 248         rvec dxj, dxk;
 249
 250         pbc_dx_aiuc(pbc, xj, xi, dxj);
 251         pbc_dx_aiuc(pbc, xk, xi, dxk);
 252         x[XX] = xi[XX] + a * dxj[XX] + b * dxk[XX];
 253         x[YY] = xi[YY] + a * dxj[YY] + b * dxk[YY];
 254         x[ZZ] = xi[ZZ] + a * dxj[ZZ] + b * dxk[ZZ];
 255     }
 256     else
 257     {
 258         x[XX] = c * xi[XX] + a * xj[XX] + b * xk[XX];
 259         x[YY] = c * xi[YY] + a * xj[YY] + b * xk[YY];
 260         x[ZZ] = c * xi[ZZ] + a * xj[ZZ] + b * xk[ZZ];
 261         /* 15 Flops */
 262     }
 263
 264     /* TOTAL: 17 flops */
 265 }
 266
 267 static void constr_vsite3FD(const rvec xi, const rvec xj, const rvec xk, rvec x, real a, real b, const t_pbc* pbc)
 268 {
 269     rvec xij, xjk, temp;
 270     real c;
 271
 272     pbc_rvec_sub(pbc, xj, xi, xij);
 273     pbc_rvec_sub(pbc, xk, xj, xjk);
 274     /* 6 flops */
 275
 276     /* temp goes from i to a point on the line jk */
 277     temp[XX] = xij[XX] + a * xjk[XX];
 278     temp[YY] = xij[YY] + a * xjk[YY];
 279     temp[ZZ] = xij[ZZ] + a * xjk[ZZ];
 280     /* 6 flops */
 281
 282     c = b * inverseNorm(temp);
 283     /* 6 + 10 flops */
 284
 285     x[XX] = xi[XX] + c * temp[XX];
 286     x[YY] = xi[YY] + c * temp[YY];
 287     x[ZZ] = xi[ZZ] + c * temp[ZZ];
 288     /* 6 Flops */
 289
 290     /* TOTAL: 34 flops */
 291 }
 292
 293 static void constr_vsite3FAD(const rvec xi, const rvec xj, const rvec xk, rvec x, real a, real b, const t_pbc* pbc)
 294 {
 295     rvec xij, xjk, xp;
 296     real a1, b1, c1, invdij;
 297
 298     pbc_rvec_sub(pbc, xj, xi, xij);
 299     pbc_rvec_sub(pbc, xk, xj, xjk);
 300     /* 6 flops */
 301
 302     invdij = inverseNorm(xij);
 303     c1     = invdij * invdij * iprod(xij, xjk);
 304     xp[XX] = xjk[XX] - c1 * xij[XX];
 305     xp[YY] = xjk[YY] - c1 * xij[YY];
 306     xp[ZZ] = xjk[ZZ] - c1 * xij[ZZ];
 307     a1     = a * invdij;
 308     b1     = b * inverseNorm(xp);
 309     /* 45 */
 310
 311     x[XX] = xi[XX] + a1 * xij[XX] + b1 * xp[XX];
 312     x[YY] = xi[YY] + a1 * xij[YY] + b1 * xp[YY];
 313     x[ZZ] = xi[ZZ] + a1 * xij[ZZ] + b1 * xp[ZZ];
 314     /* 12 Flops */
 315
 316     /* TOTAL: 63 flops */
 317 }
 318
 319 static void
 320 constr_vsite3OUT(const rvec xi, const rvec xj, const rvec xk, rvec x, real a, real b, real c, const t_pbc* pbc)
 321 {
 322     rvec xij, xik, temp;
 323
 324     pbc_rvec_sub(pbc, xj, xi, xij);
 325     pbc_rvec_sub(pbc, xk, xi, xik);
 326     cprod(xij, xik, temp);
 327     /* 15 Flops */
 328
 329     x[XX] = xi[XX] + a * xij[XX] + b * xik[XX] + c * temp[XX];
 330     x[YY] = xi[YY] + a * xij[YY] + b * xik[YY] + c * temp[YY];
 331     x[ZZ] = xi[ZZ] + a * xij[ZZ] + b * xik[ZZ] + c * temp[ZZ];
 332     /* 18 Flops */
 333
 334     /* TOTAL: 33 flops */
 335 }
 336
 337 static void constr_vsite4FD(const rvec   xi,
 338                             const rvec   xj,
 339                             const rvec   xk,
 340                             const rvec   xl,
 341                             rvec         x,
 342                             real         a,
 343                             real         b,
 344                             real         c,
 345                             const t_pbc* pbc)
 346 {
 347     rvec xij, xjk, xjl, temp;
 348     real d;
 349
 350     pbc_rvec_sub(pbc, xj, xi, xij);
 351     pbc_rvec_sub(pbc, xk, xj, xjk);
 352     pbc_rvec_sub(pbc, xl, xj, xjl);
 353     /* 9 flops */
 354
 355     /* temp goes from i to a point on the plane jkl */
 356     temp[XX] = xij[XX] + a * xjk[XX] + b * xjl[XX];
 357     temp[YY] = xij[YY] + a * xjk[YY] + b * xjl[YY];
 358     temp[ZZ] = xij[ZZ] + a * xjk[ZZ] + b * xjl[ZZ];
 359     /* 12 flops */
 360
 361     d = c * inverseNorm(temp);
 362     /* 6 + 10 flops */
 363
 364     x[XX] = xi[XX] + d * temp[XX];
 365     x[YY] = xi[YY] + d * temp[YY];
 366     x[ZZ] = xi[ZZ] + d * temp[ZZ];
 367     /* 6 Flops */
 368
 369     /* TOTAL: 43 flops */
 370 }
 371
 372 static void constr_vsite4FDN(const rvec   xi,
 373                              const rvec   xj,
 374                              const rvec   xk,
 375                              const rvec   xl,
 376                              rvec         x,
 377                              real         a,
 378                              real         b,
 379                              real         c,
 380                              const t_pbc* pbc)
 381 {
 382     rvec xij, xik, xil, ra, rb, rja, rjb, rm;
 383     real d;
 384
 385     pbc_rvec_sub(pbc, xj, xi, xij);
 386     pbc_rvec_sub(pbc, xk, xi, xik);
 387     pbc_rvec_sub(pbc, xl, xi, xil);
 388     /* 9 flops */
 389
 390     ra[XX] = a * xik[XX];
 391     ra[YY] = a * xik[YY];
 392     ra[ZZ] = a * xik[ZZ];
 393
 394     rb[XX] = b * xil[XX];
 395     rb[YY] = b * xil[YY];
 396     rb[ZZ] = b * xil[ZZ];
 397
 398     /* 6 flops */
 399
 400     rvec_sub(ra, xij, rja);
 401     rvec_sub(rb, xij, rjb);
 402     /* 6 flops */
 403
 404     cprod(rja, rjb, rm);
 405     /* 9 flops */
 406
 407     d = c * inverseNorm(rm);
 408     /* 5+5+1 flops */
 409
 410     x[XX] = xi[XX] + d * rm[XX];
 411     x[YY] = xi[YY] + d * rm[YY];
 412     x[ZZ] = xi[ZZ] + d * rm[ZZ];
 413     /* 6 Flops */
 414
 415     /* TOTAL: 47 flops */
 416 }
 417
 418
 419 static int constr_vsiten(const t_iatom* ia, const t_iparams ip[], rvec* x, const t_pbc* pbc)
 420 {
 421     rvec x1, dx;
 422     dvec dsum;
 423     int  n3, av, ai;
 424     real a;
 425
 426     n3 = 3 * ip[ia[0]].vsiten.n;
 427     av = ia[1];
 428     ai = ia[2];
 429     copy_rvec(x[ai], x1);
 430     clear_dvec(dsum);
 431     for (int i = 3; i < n3; i += 3)
 432     {
 433         ai = ia[i + 2];
 434         a  = ip[ia[i]].vsiten.a;
 435         if (pbc)
 436         {
 437             pbc_dx_aiuc(pbc, x[ai], x1, dx);
 438         }
 439         else
 440         {
 441             rvec_sub(x[ai], x1, dx);
 442         }
 443         dsum[XX] += a * dx[XX];
 444         dsum[YY] += a * dx[YY];
 445         dsum[ZZ] += a * dx[ZZ];
 446         /* 9 Flops */
 447     }
 448
 449     x[av][XX] = x1[XX] + dsum[XX];
 450     x[av][YY] = x1[YY] + dsum[YY];
 451     x[av][ZZ] = x1[ZZ] + dsum[ZZ];
 452
 453     return n3;
 454 }
 455
 456 /*! \brief PBC modes for vsite construction and spreading */
 457 enum class PbcMode
 458 {
 459     all, // Apply normal, simple PBC for all vsites
 460     none // No PBC treatment needed
 461 };
 462
 463 /*! \brief Returns the PBC mode based on the system PBC and vsite properties
 464  *
 465  * \param[in] pbcPtr  A pointer to a PBC struct or nullptr when no PBC treatment is required
 466  */
 467 static PbcMode getPbcMode(const t_pbc* pbcPtr)
 468 {
 469     if (pbcPtr == nullptr)
 470     {
 471         return PbcMode::none;
 472     }
 473     else
 474     {
 475         return PbcMode::all;
 476     }
 477 }
 478
 479 static void construct_vsites_thread(rvec            x[],
 480                                     real            dt,
 481                                     rvec*           v,
 482                                     const t_iparams ip[],
 483                                     const t_ilist   ilist[],
 484                                     const t_pbc*    pbc_null)
 485 {
 486     real inv_dt;
 487     if (v != nullptr)
 488     {
 489         inv_dt = 1.0 / dt;
 490     }
 491     else
 492     {
 493         inv_dt = 1.0;
 494     }
 495
 496     const PbcMode pbcMode = getPbcMode(pbc_null);
 497     /* We need another pbc pointer, as with charge groups we switch per vsite */
 498     const t_pbc* pbc_null2 = pbc_null;
 499
 500     for (int ftype = c_ftypeVsiteStart; ftype < c_ftypeVsiteEnd; ftype++)
 501     {
 502         if (ilist[ftype].nr == 0)
 503         {
 504             continue;
 505         }
 506
 507         { // TODO remove me
 508             int nra = interaction_function[ftype].nratoms;
 509             int inc = 1 + nra;
 510             int nr  = ilist[ftype].nr;
 511
 512             const t_iatom* ia = ilist[ftype].iatoms;
 513
 514             for (int i = 0; i < nr;)
 515             {
 516                 int tp = ia[0];
 517                 /* The vsite and constructing atoms */
 518                 int avsite = ia[1];
 519                 int ai     = ia[2];
 520                 /* Constants for constructing vsites */
 521                 real a1 = ip[tp].vsite.a;
 522                 /* Copy the old position */
 523                 rvec xv;
 524                 copy_rvec(x[avsite], xv);
 525
 526                 /* Construct the vsite depending on type */
 527                 int  aj, ak, al;
 528                 real b1, c1;
 529                 switch (ftype)
 530                 {
 531                     case F_VSITE2:
 532                         aj = ia[3];
 533                         constr_vsite2(x[ai], x[aj], x[avsite], a1, pbc_null2);
 534                         break;
 535                     case F_VSITE2FD:
 536                         aj = ia[3];
 537                         constr_vsite2FD(x[ai], x[aj], x[avsite], a1, pbc_null2);
 538                         break;
 539                     case F_VSITE3:
 540                         aj = ia[3];
 541                         ak = ia[4];
 542                         b1 = ip[tp].vsite.b;
 543                         constr_vsite3(x[ai], x[aj], x[ak], x[avsite], a1, b1, pbc_null2);
 544                         break;
 545                     case F_VSITE3FD:
 546                         aj = ia[3];
 547                         ak = ia[4];
 548                         b1 = ip[tp].vsite.b;
 549                         constr_vsite3FD(x[ai], x[aj], x[ak], x[avsite], a1, b1, pbc_null2);
 550                         break;
 551                     case F_VSITE3FAD:
 552                         aj = ia[3];
 553                         ak = ia[4];
 554                         b1 = ip[tp].vsite.b;
 555                         constr_vsite3FAD(x[ai], x[aj], x[ak], x[avsite], a1, b1, pbc_null2);
 556                         break;
 557                     case F_VSITE3OUT:
 558                         aj = ia[3];
 559                         ak = ia[4];
 560                         b1 = ip[tp].vsite.b;
 561                         c1 = ip[tp].vsite.c;
 562                         constr_vsite3OUT(x[ai], x[aj], x[ak], x[avsite], a1, b1, c1, pbc_null2);
 563                         break;
 564                     case F_VSITE4FD:
 565                         aj = ia[3];
 566                         ak = ia[4];
 567                         al = ia[5];
 568                         b1 = ip[tp].vsite.b;
 569                         c1 = ip[tp].vsite.c;
 570                         constr_vsite4FD(x[ai], x[aj], x[ak], x[al], x[avsite], a1, b1, c1, pbc_null2);
 571                         break;
 572                     case F_VSITE4FDN:
 573                         aj = ia[3];
 574                         ak = ia[4];
 575                         al = ia[5];
 576                         b1 = ip[tp].vsite.b;
 577                         c1 = ip[tp].vsite.c;
 578                         constr_vsite4FDN(x[ai], x[aj], x[ak], x[al], x[avsite], a1, b1, c1, pbc_null2);
 579                         break;
 580                     case F_VSITEN: inc = constr_vsiten(ia, ip, x, pbc_null2); break;
 581                     default:
 582                         gmx_fatal(FARGS, "No such vsite type %d in %s, line %d", ftype, __FILE__, __LINE__);
 583                 }
 584
 585                 if (pbcMode == PbcMode::all)
 586                 {
 587                     /* Keep the vsite in the same periodic image as before */
 588                     rvec dx;
 589                     int  ishift = pbc_dx_aiuc(pbc_null, x[avsite], xv, dx);
 590                     if (ishift != CENTRAL)
 591                     {
 592                         rvec_add(xv, dx, x[avsite]);
 593                     }
 594                 }
 595                 if (v != nullptr)
 596                 {
 597                     /* Calculate velocity of vsite... */
 598                     rvec vv;
 599                     rvec_sub(x[avsite], xv, vv);
 600                     svmul(inv_dt, vv, v[avsite]);
 601                 }
 602
 603                 /* Increment loop variables */
 604                 i += inc;
 605                 ia += inc;
 606             }
 607         }
 608     }
 609 }
 610
 611 void construct_vsites(const gmx_vsite_t* vsite,
 612                       rvec               x[],
 613                       real               dt,
 614                       rvec*              v,
 615                       const t_iparams    ip[],
 616                       const t_ilist      ilist[],
 617                       int                ePBC,
 618                       gmx_bool           bMolPBC,
 619                       const t_commrec*   cr,
 620                       const matrix       box)
 621 {
 622     const bool useDomdec = (vsite != nullptr && vsite->useDomdec);
 623     GMX_ASSERT(!useDomdec || (cr != nullptr && DOMAINDECOMP(cr)),
 624                "When vsites are set up with domain decomposition, we need a valid commrec");
 625     // TODO: Remove this assertion when we remove charge groups
 626     GMX_ASSERT(vsite != nullptr || ePBC == epbcNONE,
 627                "Without a vsite struct we can not do PBC (in case we have charge groups)");
 628
 629     t_pbc pbc, *pbc_null;
 630
 631     /* We only need to do pbc when we have inter-cg vsites.
 632      * Note that with domain decomposition we do not need to apply PBC here
 633      * when we have at least 3 domains along each dimension. Currently we
 634      * do not optimize this case.
 635      */
 636     if (ePBC != epbcNONE && (useDomdec || bMolPBC)
 637         && !(vsite != nullptr && vsite->numInterUpdategroupVsites == 0))
 638     {
 639         /* This is wasting some CPU time as we now do this multiple times
 640          * per MD step.
 641          */
 642         ivec null_ivec;
 643         clear_ivec(null_ivec);
 644         pbc_null = set_pbc_dd(&pbc, ePBC, useDomdec ? cr->dd->numCells : null_ivec, FALSE, box);
 645     }
 646     else
 647     {
 648         pbc_null = nullptr;
 649     }
 650
 651     if (useDomdec)
 652     {
 653         dd_move_x_vsites(cr->dd, box, x);
 654     }
 655
 656     if (vsite == nullptr || vsite->nthreads == 1)
 657     {
 658         construct_vsites_thread(x, dt, v, ip, ilist, pbc_null);
 659     }
 660     else
 661     {
 662 #pragma omp parallel num_threads(vsite->nthreads)
 663         {
 664             try
 665             {
 666                 const int          th    = gmx_omp_get_thread_num();
 667                 const VsiteThread& tData = *vsite->tData[th];
 668                 GMX_ASSERT(tData.rangeStart >= 0,
 669                            "The thread data should be initialized before calling construct_vsites");
 670
 671                 construct_vsites_thread(x, dt, v, ip, tData.ilist, pbc_null);
 672                 if (tData.useInterdependentTask)
 673                 {
 674                     /* Here we don't need a barrier (unlike the spreading),
 675                      * since both tasks only construct vsites from particles,
 676                      * or local vsites, not from non-local vsites.
 677                      */
 678                     construct_vsites_thread(x, dt, v, ip, tData.idTask.ilist, pbc_null);
 679                 }
 680             }
 681             GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
 682         }
 683         /* Now we can construct the vsites that might depend on other vsites */
 684         construct_vsites_thread(x, dt, v, ip, vsite->tData[vsite->nthreads]->ilist, pbc_null);
 685     }
 686 }
 687
 688 static void spread_vsite2(const t_iatom  ia[],
 689                           real           a,
 690                           const rvec     x[],
 691                           rvec           f[],
 692                           rvec           fshift[],
 693                           const t_pbc*   pbc,
 694                           const t_graph* g)
 695 {
 696     rvec    fi, fj, dx;
 697     t_iatom av, ai, aj;
 698     ivec    di;
 699     int     siv, sij;
 700
 701     av = ia[1];
 702     ai = ia[2];
 703     aj = ia[3];
 704
 705     svmul(1 - a, f[av], fi);
 706     svmul(a, f[av], fj);
 707     /* 7 flop */
 708
 709     rvec_inc(f[ai], fi);
 710     rvec_inc(f[aj], fj);
 711     /* 6 Flops */
 712
 713     if (g)
 714     {
 715         ivec_sub(SHIFT_IVEC(g, ai), SHIFT_IVEC(g, av), di);
 716         siv = IVEC2IS(di);
 717         ivec_sub(SHIFT_IVEC(g, ai), SHIFT_IVEC(g, aj), di);
 718         sij = IVEC2IS(di);
 719     }
 720     else if (pbc)
 721     {
 722         siv = pbc_dx_aiuc(pbc, x[ai], x[av], dx);
 723         sij = pbc_dx_aiuc(pbc, x[ai], x[aj], dx);
 724     }
 725     else
 726     {
 727         siv = CENTRAL;
 728         sij = CENTRAL;
 729     }
 730
 731     if (fshift && (siv != CENTRAL || sij != CENTRAL))
 732     {
 733         rvec_inc(fshift[siv], f[av]);
 734         rvec_dec(fshift[CENTRAL], fi);
 735         rvec_dec(fshift[sij], fj);
 736     }
 737
 738     /* TOTAL: 13 flops */
 739 }
 740
 741 void constructVsitesGlobal(const gmx_mtop_t& mtop, gmx::ArrayRef<gmx::RVec> x)
 742 {
 743     GMX_ASSERT(x.ssize() >= mtop.natoms, "x should contain the whole system");
 744     GMX_ASSERT(!mtop.moleculeBlockIndices.empty(),
 745                "molblock indices are needed in constructVsitesGlobal");
 746
 747     for (size_t mb = 0; mb < mtop.molblock.size(); mb++)
 748     {
 749         const gmx_molblock_t& molb = mtop.molblock[mb];
 750         const gmx_moltype_t&  molt = mtop.moltype[molb.type];
 751         if (vsiteIlistNrCount(molt.ilist.data()) > 0)
 752         {
 753             int atomOffset = mtop.moleculeBlockIndices[mb].globalAtomStart;
 754             for (int mol = 0; mol < molb.nmol; mol++)
 755             {
 756                 t_ilist ilist[F_NRE];
 757                 for (int ftype = c_ftypeVsiteStart; ftype < c_ftypeVsiteEnd; ftype++)
 758                 {
 759                     ilist[ftype].nr     = molt.ilist[ftype].size();
 760                     ilist[ftype].iatoms = const_cast<t_iatom*>(molt.ilist[ftype].iatoms.data());
 761                 }
 762
 763                 construct_vsites(nullptr, as_rvec_array(x.data()) + atomOffset, 0.0, nullptr,
 764                                  mtop.ffparams.iparams.data(), ilist, epbcNONE, TRUE, nullptr, nullptr);
 765                 atomOffset += molt.atoms.nr;
 766             }
 767         }
 768     }
 769 }
 770
 771 static void spread_vsite2FD(const t_iatom  ia[],
 772                             real           a,
 773                             const rvec     x[],
 774                             rvec           f[],
 775                             rvec           fshift[],
 776                             gmx_bool       VirCorr,
 777                             matrix         dxdf,
 778                             const t_pbc*   pbc,
 779                             const t_graph* g)
 780 {
 781     const int av = ia[1];
 782     const int ai = ia[2];
 783     const int aj = ia[3];
 784     rvec      fv;
 785     copy_rvec(f[av], fv);
 786
 787     rvec xij;
 788     int  sji = pbc_rvec_sub(pbc, x[aj], x[ai], xij);
 789     /* 6 flops */
 790
 791     const real invDistance = inverseNorm(xij);
 792     const real b           = a * invDistance;
 793     /* 4 + ?10? flops */
 794
 795     const real fproj = iprod(xij, fv) * invDistance * invDistance;
 796
 797     rvec fj;
 798     fj[XX] = b * (fv[XX] - fproj * xij[XX]);
 799     fj[YY] = b * (fv[YY] - fproj * xij[YY]);
 800     fj[ZZ] = b * (fv[ZZ] - fproj * xij[ZZ]);
 801     /* 9 */
 802
 803     /* b is already calculated in constr_vsite2FD
 804        storing b somewhere will save flops.     */
 805
 806     f[ai][XX] += fv[XX] - fj[XX];
 807     f[ai][YY] += fv[YY] - fj[YY];
 808     f[ai][ZZ] += fv[ZZ] - fj[ZZ];
 809     f[aj][XX] += fj[XX];
 810     f[aj][YY] += fj[YY];
 811     f[aj][ZZ] += fj[ZZ];
 812     /* 9 Flops */
 813
 814     if (fshift)
 815     {
 816         int svi;
 817         if (g)
 818         {
 819             ivec di;
 820             ivec_sub(SHIFT_IVEC(g, ia[1]), SHIFT_IVEC(g, ai), di);
 821             svi = IVEC2IS(di);
 822             ivec_sub(SHIFT_IVEC(g, aj), SHIFT_IVEC(g, ai), di);
 823             sji = IVEC2IS(di);
 824         }
 825         else if (pbc)
 826         {
 827             rvec xvi;
 828             svi = pbc_rvec_sub(pbc, x[av], x[ai], xvi);
 829         }
 830         else
 831         {
 832             svi = CENTRAL;
 833         }
 834
 835         if (svi != CENTRAL || sji != CENTRAL)
 836         {
 837             rvec_dec(fshift[svi], fv);
 838             fshift[CENTRAL][XX] += fv[XX] - fj[XX];
 839             fshift[CENTRAL][YY] += fv[YY] - fj[YY];
 840             fshift[CENTRAL][ZZ] += fv[ZZ] - fj[ZZ];
 841             fshift[sji][XX] += fj[XX];
 842             fshift[sji][YY] += fj[YY];
 843             fshift[sji][ZZ] += fj[ZZ];
 844         }
 845     }
 846
 847     if (VirCorr)
 848     {
 849         /* When VirCorr=TRUE, the virial for the current forces is not
 850          * calculated from the redistributed forces. This means that
 851          * the effect of non-linear virtual site constructions on the virial
 852          * needs to be added separately. This contribution can be calculated
 853          * in many ways, but the simplest and cheapest way is to use
 854          * the first constructing atom ai as a reference position in space:
 855          * subtract (xv-xi)*fv and add (xj-xi)*fj.
 856          */
 857         rvec xiv;
 858
 859         pbc_rvec_sub(pbc, x[av], x[ai], xiv);
 860
 861         for (int i = 0; i < DIM; i++)
 862         {
 863             for (int j = 0; j < DIM; j++)
 864             {
 865                 /* As xix is a linear combination of j and k, use that here */
 866                 dxdf[i][j] += -xiv[i] * fv[j] + xij[i] * fj[j];
 867             }
 868         }
 869     }
 870
 871     /* TOTAL: 38 flops */
 872 }
 873
 874 static void spread_vsite3(const t_iatom  ia[],
 875                           real           a,
 876                           real           b,
 877                           const rvec     x[],
 878                           rvec           f[],
 879                           rvec           fshift[],
 880                           const t_pbc*   pbc,
 881                           const t_graph* g)
 882 {
 883     rvec fi, fj, fk, dx;
 884     int  av, ai, aj, ak;
 885     ivec di;
 886     int  siv, sij, sik;
 887
 888     av = ia[1];
 889     ai = ia[2];
 890     aj = ia[3];
 891     ak = ia[4];
 892
 893     svmul(1 - a - b, f[av], fi);
 894     svmul(a, f[av], fj);
 895     svmul(b, f[av], fk);
 896     /* 11 flops */
 897
 898     rvec_inc(f[ai], fi);
 899     rvec_inc(f[aj], fj);
 900     rvec_inc(f[ak], fk);
 901     /* 9 Flops */
 902
 903     if (g)
 904     {
 905         ivec_sub(SHIFT_IVEC(g, ai), SHIFT_IVEC(g, ia[1]), di);
 906         siv = IVEC2IS(di);
 907         ivec_sub(SHIFT_IVEC(g, ai), SHIFT_IVEC(g, aj), di);
 908         sij = IVEC2IS(di);
 909         ivec_sub(SHIFT_IVEC(g, ai), SHIFT_IVEC(g, ak), di);
 910         sik = IVEC2IS(di);
 911     }
 912     else if (pbc)
 913     {
 914         siv = pbc_dx_aiuc(pbc, x[ai], x[av], dx);
 915         sij = pbc_dx_aiuc(pbc, x[ai], x[aj], dx);
 916         sik = pbc_dx_aiuc(pbc, x[ai], x[ak], dx);
 917     }
 918     else
 919     {
 920         siv = CENTRAL;
 921         sij = CENTRAL;
 922         sik = CENTRAL;
 923     }
 924
 925     if (fshift && (siv != CENTRAL || sij != CENTRAL || sik != CENTRAL))
 926     {
 927         rvec_inc(fshift[siv], f[av]);
 928         rvec_dec(fshift[CENTRAL], fi);
 929         rvec_dec(fshift[sij], fj);
 930         rvec_dec(fshift[sik], fk);
 931     }
 932
 933     /* TOTAL: 20 flops */
 934 }
 935
 936 static void spread_vsite3FD(const t_iatom  ia[],
 937                             real           a,
 938                             real           b,
 939                             const rvec     x[],
 940                             rvec           f[],
 941                             rvec           fshift[],
 942                             gmx_bool       VirCorr,
 943                             matrix         dxdf,
 944                             const t_pbc*   pbc,
 945                             const t_graph* g)
 946 {
 947     real    fproj, a1;
 948     rvec    xvi, xij, xjk, xix, fv, temp;
 949     t_iatom av, ai, aj, ak;
 950     int     svi, sji, skj;
 951     ivec    di;
 952
 953     av = ia[1];
 954     ai = ia[2];
 955     aj = ia[3];
 956     ak = ia[4];
 957     copy_rvec(f[av], fv);
 958
 959     sji = pbc_rvec_sub(pbc, x[aj], x[ai], xij);
 960     skj = pbc_rvec_sub(pbc, x[ak], x[aj], xjk);
 961     /* 6 flops */
 962
 963     /* xix goes from i to point x on the line jk */
 964     xix[XX] = xij[XX] + a * xjk[XX];
 965     xix[YY] = xij[YY] + a * xjk[YY];
 966     xix[ZZ] = xij[ZZ] + a * xjk[ZZ];
 967     /* 6 flops */
 968
 969     const real invDistance = inverseNorm(xix);
 970     const real c           = b * invDistance;
 971     /* 4 + ?10? flops */
 972
 973     fproj = iprod(xix, fv) * invDistance * invDistance; /* = (xix . f)/(xix . xix) */
 974
 975     temp[XX] = c * (fv[XX] - fproj * xix[XX]);
 976     temp[YY] = c * (fv[YY] - fproj * xix[YY]);
 977     temp[ZZ] = c * (fv[ZZ] - fproj * xix[ZZ]);
 978     /* 16 */
 979
 980     /* c is already calculated in constr_vsite3FD
 981        storing c somewhere will save 26 flops!     */
 982
 983     a1 = 1 - a;
 984     f[ai][XX] += fv[XX] - temp[XX];
 985     f[ai][YY] += fv[YY] - temp[YY];
 986     f[ai][ZZ] += fv[ZZ] - temp[ZZ];
 987     f[aj][XX] += a1 * temp[XX];
 988     f[aj][YY] += a1 * temp[YY];
 989     f[aj][ZZ] += a1 * temp[ZZ];
 990     f[ak][XX] += a * temp[XX];
 991     f[ak][YY] += a * temp[YY];
 992     f[ak][ZZ] += a * temp[ZZ];
 993     /* 19 Flops */
 994
 995     if (g)
 996     {
 997         ivec_sub(SHIFT_IVEC(g, ia[1]), SHIFT_IVEC(g, ai), di);
 998         svi = IVEC2IS(di);
 999         ivec_sub(SHIFT_IVEC(g, aj), SHIFT_IVEC(g, ai), di);
1000         sji = IVEC2IS(di);
1001         ivec_sub(SHIFT_IVEC(g, ak), SHIFT_IVEC(g, aj), di);
1002         skj = IVEC2IS(di);
1003     }
1004     else if (pbc)
1005     {
1006         svi = pbc_rvec_sub(pbc, x[av], x[ai], xvi);
1007     }
1008     else
1009     {
1010         svi = CENTRAL;
1011     }
1012
1013     if (fshift && (svi != CENTRAL || sji != CENTRAL || skj != CENTRAL))
1014     {
1015         rvec_dec(fshift[svi], fv);
1016         fshift[CENTRAL][XX] += fv[XX] - (1 + a) * temp[XX];
1017         fshift[CENTRAL][YY] += fv[YY] - (1 + a) * temp[YY];
1018         fshift[CENTRAL][ZZ] += fv[ZZ] - (1 + a) * temp[ZZ];
1019         fshift[sji][XX] += temp[XX];
1020         fshift[sji][YY] += temp[YY];
1021         fshift[sji][ZZ] += temp[ZZ];
1022         fshift[skj][XX] += a * temp[XX];
1023         fshift[skj][YY] += a * temp[YY];
1024         fshift[skj][ZZ] += a * temp[ZZ];
1025     }
1026
1027     if (VirCorr)
1028     {
1029         /* When VirCorr=TRUE, the virial for the current forces is not
1030          * calculated from the redistributed forces. This means that
1031          * the effect of non-linear virtual site constructions on the virial
1032          * needs to be added separately. This contribution can be calculated
1033          * in many ways, but the simplest and cheapest way is to use
1034          * the first constructing atom ai as a reference position in space:
1035          * subtract (xv-xi)*fv and add (xj-xi)*fj + (xk-xi)*fk.
1036          */
1037         rvec xiv;
1038
1039         pbc_rvec_sub(pbc, x[av], x[ai], xiv);
1040
1041         for (int i = 0; i < DIM; i++)
1042         {
1043             for (int j = 0; j < DIM; j++)
1044             {
1045                 /* As xix is a linear combination of j and k, use that here */
1046                 dxdf[i][j] += -xiv[i] * fv[j] + xix[i] * temp[j];
1047             }
1048         }
1049     }
1050
1051     /* TOTAL: 61 flops */
1052 }
1053
1054 static void spread_vsite3FAD(const t_iatom  ia[],
1055                              real           a,
1056                              real           b,
1057                              const rvec     x[],
1058                              rvec           f[],
1059                              rvec           fshift[],
1060                              gmx_bool       VirCorr,
1061                              matrix         dxdf,
1062                              const t_pbc*   pbc,
1063                              const t_graph* g)
1064 {
1065     rvec    xvi, xij, xjk, xperp, Fpij, Fppp, fv, f1, f2, f3;
1066     real    a1, b1, c1, c2, invdij, invdij2, invdp, fproj;
1067     t_iatom av, ai, aj, ak;
1068     int     svi, sji, skj, d;
1069     ivec    di;
1070
1071     av = ia[1];
1072     ai = ia[2];
1073     aj = ia[3];
1074     ak = ia[4];
1075     copy_rvec(f[ia[1]], fv);
1076
1077     sji = pbc_rvec_sub(pbc, x[aj], x[ai], xij);
1078     skj = pbc_rvec_sub(pbc, x[ak], x[aj], xjk);
1079     /* 6 flops */
1080
1081     invdij    = inverseNorm(xij);
1082     invdij2   = invdij * invdij;
1083     c1        = iprod(xij, xjk) * invdij2;
1084     xperp[XX] = xjk[XX] - c1 * xij[XX];
1085     xperp[YY] = xjk[YY] - c1 * xij[YY];
1086     xperp[ZZ] = xjk[ZZ] - c1 * xij[ZZ];
1087     /* xperp in plane ijk, perp. to ij */
1088     invdp = inverseNorm(xperp);
1089     a1    = a * invdij;
1090     b1    = b * invdp;
1091     /* 45 flops */
1092
1093     /* a1, b1 and c1 are already calculated in constr_vsite3FAD
1094        storing them somewhere will save 45 flops!     */
1095
1096     fproj = iprod(xij, fv) * invdij2;
1097     svmul(fproj, xij, Fpij);                              /* proj. f on xij */
1098     svmul(iprod(xperp, fv) * invdp * invdp, xperp, Fppp); /* proj. f on xperp */
1099     svmul(b1 * fproj, xperp, f3);
1100     /* 23 flops */
1101
1102     rvec_sub(fv, Fpij, f1); /* f1 = f - Fpij */
1103     rvec_sub(f1, Fppp, f2); /* f2 = f - Fpij - Fppp */
1104     for (d = 0; (d < DIM); d++)
1105     {
1106         f1[d] *= a1;
1107         f2[d] *= b1;
1108     }
1109     /* 12 flops */
1110
1111     c2 = 1 + c1;
1112     f[ai][XX] += fv[XX] - f1[XX] + c1 * f2[XX] + f3[XX];
1113     f[ai][YY] += fv[YY] - f1[YY] + c1 * f2[YY] + f3[YY];
1114     f[ai][ZZ] += fv[ZZ] - f1[ZZ] + c1 * f2[ZZ] + f3[ZZ];
1115     f[aj][XX] += f1[XX] - c2 * f2[XX] - f3[XX];
1116     f[aj][YY] += f1[YY] - c2 * f2[YY] - f3[YY];
1117     f[aj][ZZ] += f1[ZZ] - c2 * f2[ZZ] - f3[ZZ];
1118     f[ak][XX] += f2[XX];
1119     f[ak][YY] += f2[YY];
1120     f[ak][ZZ] += f2[ZZ];
1121     /* 30 Flops */
1122
1123     if (g)
1124     {
1125         ivec_sub(SHIFT_IVEC(g, ia[1]), SHIFT_IVEC(g, ai), di);
1126         svi = IVEC2IS(di);
1127         ivec_sub(SHIFT_IVEC(g, aj), SHIFT_IVEC(g, ai), di);
1128         sji = IVEC2IS(di);
1129         ivec_sub(SHIFT_IVEC(g, ak), SHIFT_IVEC(g, aj), di);
1130         skj = IVEC2IS(di);
1131     }
1132     else if (pbc)
1133     {
1134         svi = pbc_rvec_sub(pbc, x[av], x[ai], xvi);
1135     }
1136     else
1137     {
1138         svi = CENTRAL;
1139     }
1140
1141     if (fshift && (svi != CENTRAL || sji != CENTRAL || skj != CENTRAL))
1142     {
1143         rvec_dec(fshift[svi], fv);
1144         fshift[CENTRAL][XX] += fv[XX] - f1[XX] - (1 - c1) * f2[XX] + f3[XX];
1145         fshift[CENTRAL][YY] += fv[YY] - f1[YY] - (1 - c1) * f2[YY] + f3[YY];
1146         fshift[CENTRAL][ZZ] += fv[ZZ] - f1[ZZ] - (1 - c1) * f2[ZZ] + f3[ZZ];
1147         fshift[sji][XX] += f1[XX] - c1 * f2[XX] - f3[XX];
1148         fshift[sji][YY] += f1[YY] - c1 * f2[YY] - f3[YY];
1149         fshift[sji][ZZ] += f1[ZZ] - c1 * f2[ZZ] - f3[ZZ];
1150         fshift[skj][XX] += f2[XX];
1151         fshift[skj][YY] += f2[YY];
1152         fshift[skj][ZZ] += f2[ZZ];
1153     }
1154
1155     if (VirCorr)
1156     {
1157         rvec xiv;
1158         int  i, j;
1159
1160         pbc_rvec_sub(pbc, x[av], x[ai], xiv);
1161
1162         for (i = 0; i < DIM; i++)
1163         {
1164             for (j = 0; j < DIM; j++)
1165             {
1166                 /* Note that xik=xij+xjk, so we have to add xij*f2 */
1167                 dxdf[i][j] += -xiv[i] * fv[j] + xij[i] * (f1[j] + (1 - c2) * f2[j] - f3[j])
1168                               + xjk[i] * f2[j];
1169             }
1170         }
1171     }
1172
1173     /* TOTAL: 113 flops */
1174 }
1175
1176 static void spread_vsite3OUT(const t_iatom  ia[],
1177                              real           a,
1178                              real           b,
1179                              real           c,
1180                              const rvec     x[],
1181                              rvec           f[],
1182                              rvec           fshift[],
1183                              gmx_bool       VirCorr,
1184                              matrix         dxdf,
1185                              const t_pbc*   pbc,
1186                              const t_graph* g)
1187 {
1188     rvec xvi, xij, xik, fv, fj, fk;
1189     real cfx, cfy, cfz;
1190     int  av, ai, aj, ak;
1191     ivec di;
1192     int  svi, sji, ski;
1193
1194     av = ia[1];
1195     ai = ia[2];
1196     aj = ia[3];
1197     ak = ia[4];
1198
1199     sji = pbc_rvec_sub(pbc, x[aj], x[ai], xij);
1200     ski = pbc_rvec_sub(pbc, x[ak], x[ai], xik);
1201     /* 6 Flops */
1202
1203     copy_rvec(f[av], fv);
1204
1205     cfx = c * fv[XX];
1206     cfy = c * fv[YY];
1207     cfz = c * fv[ZZ];
1208     /* 3 Flops */
1209
1210     fj[XX] = a * fv[XX] - xik[ZZ] * cfy + xik[YY] * cfz;
1211     fj[YY] = xik[ZZ] * cfx + a * fv[YY] - xik[XX] * cfz;
1212     fj[ZZ] = -xik[YY] * cfx + xik[XX] * cfy + a * fv[ZZ];
1213
1214     fk[XX] = b * fv[XX] + xij[ZZ] * cfy - xij[YY] * cfz;
1215     fk[YY] = -xij[ZZ] * cfx + b * fv[YY] + xij[XX] * cfz;
1216     fk[ZZ] = xij[YY] * cfx - xij[XX] * cfy + b * fv[ZZ];
1217     /* 30 Flops */
1218
1219     f[ai][XX] += fv[XX] - fj[XX] - fk[XX];
1220     f[ai][YY] += fv[YY] - fj[YY] - fk[YY];
1221     f[ai][ZZ] += fv[ZZ] - fj[ZZ] - fk[ZZ];
1222     rvec_inc(f[aj], fj);
1223     rvec_inc(f[ak], fk);
1224     /* 15 Flops */
1225
1226     if (g)
1227     {
1228         ivec_sub(SHIFT_IVEC(g, ia[1]), SHIFT_IVEC(g, ai), di);
1229         svi = IVEC2IS(di);
1230         ivec_sub(SHIFT_IVEC(g, aj), SHIFT_IVEC(g, ai), di);
1231         sji = IVEC2IS(di);
1232         ivec_sub(SHIFT_IVEC(g, ak), SHIFT_IVEC(g, ai), di);
1233         ski = IVEC2IS(di);
1234     }
1235     else if (pbc)
1236     {
1237         svi = pbc_rvec_sub(pbc, x[av], x[ai], xvi);
1238     }
1239     else
1240     {
1241         svi = CENTRAL;
1242     }
1243
1244     if (fshift && (svi != CENTRAL || sji != CENTRAL || ski != CENTRAL))
1245     {
1246         rvec_dec(fshift[svi], fv);
1247         fshift[CENTRAL][XX] += fv[XX] - fj[XX] - fk[XX];
1248         fshift[CENTRAL][YY] += fv[YY] - fj[YY] - fk[YY];
1249         fshift[CENTRAL][ZZ] += fv[ZZ] - fj[ZZ] - fk[ZZ];
1250         rvec_inc(fshift[sji], fj);
1251         rvec_inc(fshift[ski], fk);
1252     }
1253
1254     if (VirCorr)
1255     {
1256         rvec xiv;
1257
1258         pbc_rvec_sub(pbc, x[av], x[ai], xiv);
1259
1260         for (int i = 0; i < DIM; i++)
1261         {
1262             for (int j = 0; j < DIM; j++)
1263             {
1264                 dxdf[i][j] += -xiv[i] * fv[j] + xij[i] * fj[j] + xik[i] * fk[j];
1265             }
1266         }
1267     }
1268
1269     /* TOTAL: 54 flops */
1270 }
1271
1272 static void spread_vsite4FD(const t_iatom  ia[],
1273                             real           a,
1274                             real           b,
1275                             real           c,
1276                             const rvec     x[],
1277                             rvec           f[],
1278                             rvec           fshift[],
1279                             gmx_bool       VirCorr,
1280                             matrix         dxdf,
1281                             const t_pbc*   pbc,
1282                             const t_graph* g)
1283 {
1284     real fproj, a1;
1285     rvec xvi, xij, xjk, xjl, xix, fv, temp;
1286     int  av, ai, aj, ak, al;
1287     ivec di;
1288     int  svi, sji, skj, slj, m;
1289
1290     av = ia[1];
1291     ai = ia[2];
1292     aj = ia[3];
1293     ak = ia[4];
1294     al = ia[5];
1295
1296     sji = pbc_rvec_sub(pbc, x[aj], x[ai], xij);
1297     skj = pbc_rvec_sub(pbc, x[ak], x[aj], xjk);
1298     slj = pbc_rvec_sub(pbc, x[al], x[aj], xjl);
1299     /* 9 flops */
1300
1301     /* xix goes from i to point x on the plane jkl */
1302     for (m = 0; m < DIM; m++)
1303     {
1304         xix[m] = xij[m] + a * xjk[m] + b * xjl[m];
1305     }
1306     /* 12 flops */
1307
1308     const real invDistance = inverseNorm(xix);
1309     const real d           = c * invDistance;
1310     /* 4 + ?10? flops */
1311
1312     copy_rvec(f[av], fv);
1313
1314     fproj = iprod(xix, fv) * invDistance * invDistance; /* = (xix . f)/(xix . xix) */
1315
1316     for (m = 0; m < DIM; m++)
1317     {
1318         temp[m] = d * (fv[m] - fproj * xix[m]);
1319     }
1320     /* 16 */
1321
1322     /* c is already calculated in constr_vsite3FD
1323        storing c somewhere will save 35 flops!     */
1324
1325     a1 = 1 - a - b;
1326     for (m = 0; m < DIM; m++)
1327     {
1328         f[ai][m] += fv[m] - temp[m];
1329         f[aj][m] += a1 * temp[m];
1330         f[ak][m] += a * temp[m];
1331         f[al][m] += b * temp[m];
1332     }
1333     /* 26 Flops */
1334
1335     if (g)
1336     {
1337         ivec_sub(SHIFT_IVEC(g, ia[1]), SHIFT_IVEC(g, ai), di);
1338         svi = IVEC2IS(di);
1339         ivec_sub(SHIFT_IVEC(g, aj), SHIFT_IVEC(g, ai), di);
1340         sji = IVEC2IS(di);
1341         ivec_sub(SHIFT_IVEC(g, ak), SHIFT_IVEC(g, aj), di);
1342         skj = IVEC2IS(di);
1343         ivec_sub(SHIFT_IVEC(g, al), SHIFT_IVEC(g, aj), di);
1344         slj = IVEC2IS(di);
1345     }
1346     else if (pbc)
1347     {
1348         svi = pbc_rvec_sub(pbc, x[av], x[ai], xvi);
1349     }
1350     else
1351     {
1352         svi = CENTRAL;
1353     }
1354
1355     if (fshift && (svi != CENTRAL || sji != CENTRAL || skj != CENTRAL || slj != CENTRAL))
1356     {
1357         rvec_dec(fshift[svi], fv);
1358         for (m = 0; m < DIM; m++)
1359         {
1360             fshift[CENTRAL][m] += fv[m] - (1 + a + b) * temp[m];
1361             fshift[sji][m] += temp[m];
1362             fshift[skj][m] += a * temp[m];
1363             fshift[slj][m] += b * temp[m];
1364         }
1365     }
1366
1367     if (VirCorr)
1368     {
1369         rvec xiv;
1370         int  i, j;
1371
1372         pbc_rvec_sub(pbc, x[av], x[ai], xiv);
1373
1374         for (i = 0; i < DIM; i++)
1375         {
1376             for (j = 0; j < DIM; j++)
1377             {
1378                 dxdf[i][j] += -xiv[i] * fv[j] + xix[i] * temp[j];
1379             }
1380         }
1381     }
1382
1383     /* TOTAL: 77 flops */
1384 }
1385
1386
1387 static void spread_vsite4FDN(const t_iatom  ia[],
1388                              real           a,
1389                              real           b,
1390                              real           c,
1391                              const rvec     x[],
1392                              rvec           f[],
1393                              rvec           fshift[],
1394                              gmx_bool       VirCorr,
1395                              matrix         dxdf,
1396                              const t_pbc*   pbc,
1397                              const t_graph* g)
1398 {
1399     rvec xvi, xij, xik, xil, ra, rb, rja, rjb, rab, rm, rt;
1400     rvec fv, fj, fk, fl;
1401     real invrm, denom;
1402     real cfx, cfy, cfz;
1403     ivec di;
1404     int  av, ai, aj, ak, al;
1405     int  svi, sij, sik, sil;
1406
1407     /* DEBUG: check atom indices */
1408     av = ia[1];
1409     ai = ia[2];
1410     aj = ia[3];
1411     ak = ia[4];
1412     al = ia[5];
1413
1414     copy_rvec(f[av], fv);
1415
1416     sij = pbc_rvec_sub(pbc, x[aj], x[ai], xij);
1417     sik = pbc_rvec_sub(pbc, x[ak], x[ai], xik);
1418     sil = pbc_rvec_sub(pbc, x[al], x[ai], xil);
1419     /* 9 flops */
1420
1421     ra[XX] = a * xik[XX];
1422     ra[YY] = a * xik[YY];
1423     ra[ZZ] = a * xik[ZZ];
1424
1425     rb[XX] = b * xil[XX];
1426     rb[YY] = b * xil[YY];
1427     rb[ZZ] = b * xil[ZZ];
1428
1429     /* 6 flops */
1430
1431     rvec_sub(ra, xij, rja);
1432     rvec_sub(rb, xij, rjb);
1433     rvec_sub(rb, ra, rab);
1434     /* 9 flops */
1435
1436     cprod(rja, rjb, rm);
1437     /* 9 flops */
1438
1439     invrm = inverseNorm(rm);
1440     denom = invrm * invrm;
1441     /* 5+5+2 flops */
1442
1443     cfx = c * invrm * fv[XX];
1444     cfy = c * invrm * fv[YY];
1445     cfz = c * invrm * fv[ZZ];
1446     /* 6 Flops */
1447
1448     cprod(rm, rab, rt);
1449     /* 9 flops */
1450
1451     rt[XX] *= denom;
1452     rt[YY] *= denom;
1453     rt[ZZ] *= denom;
1454     /* 3flops */
1455
1456     fj[XX] = (-rm[XX] * rt[XX]) * cfx + (rab[ZZ] - rm[YY] * rt[XX]) * cfy
1457              + (-rab[YY] - rm[ZZ] * rt[XX]) * cfz;
1458     fj[YY] = (-rab[ZZ] - rm[XX] * rt[YY]) * cfx + (-rm[YY] * rt[YY]) * cfy
1459              + (rab[XX] - rm[ZZ] * rt[YY]) * cfz;
1460     fj[ZZ] = (rab[YY] - rm[XX] * rt[ZZ]) * cfx + (-rab[XX] - rm[YY] * rt[ZZ]) * cfy
1461              + (-rm[ZZ] * rt[ZZ]) * cfz;
1462     /* 30 flops */
1463
1464     cprod(rjb, rm, rt);
1465     /* 9 flops */
1466
1467     rt[XX] *= denom * a;
1468     rt[YY] *= denom * a;
1469     rt[ZZ] *= denom * a;
1470     /* 3flops */
1471
1472     fk[XX] = (-rm[XX] * rt[XX]) * cfx + (-a * rjb[ZZ] - rm[YY] * rt[XX]) * cfy
1473              + (a * rjb[YY] - rm[ZZ] * rt[XX]) * cfz;
1474     fk[YY] = (a * rjb[ZZ] - rm[XX] * rt[YY]) * cfx + (-rm[YY] * rt[YY]) * cfy
1475              + (-a * rjb[XX] - rm[ZZ] * rt[YY]) * cfz;
1476     fk[ZZ] = (-a * rjb[YY] - rm[XX] * rt[ZZ]) * cfx + (a * rjb[XX] - rm[YY] * rt[ZZ]) * cfy
1477              + (-rm[ZZ] * rt[ZZ]) * cfz;
1478     /* 36 flops */
1479
1480     cprod(rm, rja, rt);
1481     /* 9 flops */
1482
1483     rt[XX] *= denom * b;
1484     rt[YY] *= denom * b;
1485     rt[ZZ] *= denom * b;
1486     /* 3flops */
1487
1488     fl[XX] = (-rm[XX] * rt[XX]) * cfx + (b * rja[ZZ] - rm[YY] * rt[XX]) * cfy
1489              + (-b * rja[YY] - rm[ZZ] * rt[XX]) * cfz;
1490     fl[YY] = (-b * rja[ZZ] - rm[XX] * rt[YY]) * cfx + (-rm[YY] * rt[YY]) * cfy
1491              + (b * rja[XX] - rm[ZZ] * rt[YY]) * cfz;
1492     fl[ZZ] = (b * rja[YY] - rm[XX] * rt[ZZ]) * cfx + (-b * rja[XX] - rm[YY] * rt[ZZ]) * cfy
1493              + (-rm[ZZ] * rt[ZZ]) * cfz;
1494     /* 36 flops */
1495
1496     f[ai][XX] += fv[XX] - fj[XX] - fk[XX] - fl[XX];
1497     f[ai][YY] += fv[YY] - fj[YY] - fk[YY] - fl[YY];
1498     f[ai][ZZ] += fv[ZZ] - fj[ZZ] - fk[ZZ] - fl[ZZ];
1499     rvec_inc(f[aj], fj);
1500     rvec_inc(f[ak], fk);
1501     rvec_inc(f[al], fl);
1502     /* 21 flops */
1503
1504     if (g)
1505     {
1506         ivec_sub(SHIFT_IVEC(g, av), SHIFT_IVEC(g, ai), di);
1507         svi = IVEC2IS(di);
1508         ivec_sub(SHIFT_IVEC(g, aj), SHIFT_IVEC(g, ai), di);
1509         sij = IVEC2IS(di);
1510         ivec_sub(SHIFT_IVEC(g, ak), SHIFT_IVEC(g, ai), di);
1511         sik = IVEC2IS(di);
1512         ivec_sub(SHIFT_IVEC(g, al), SHIFT_IVEC(g, ai), di);
1513         sil = IVEC2IS(di);
1514     }
1515     else if (pbc)
1516     {
1517         svi = pbc_rvec_sub(pbc, x[av], x[ai], xvi);
1518     }
1519     else
1520     {
1521         svi = CENTRAL;
1522     }
1523
1524     if (fshift && (svi != CENTRAL || sij != CENTRAL || sik != CENTRAL || sil != CENTRAL))
1525     {
1526         rvec_dec(fshift[svi], fv);
1527         fshift[CENTRAL][XX] += fv[XX] - fj[XX] - fk[XX] - fl[XX];
1528         fshift[CENTRAL][YY] += fv[YY] - fj[YY] - fk[YY] - fl[YY];
1529         fshift[CENTRAL][ZZ] += fv[ZZ] - fj[ZZ] - fk[ZZ] - fl[ZZ];
1530         rvec_inc(fshift[sij], fj);
1531         rvec_inc(fshift[sik], fk);
1532         rvec_inc(fshift[sil], fl);
1533     }
1534
1535     if (VirCorr)
1536     {
1537         rvec xiv;
1538         int  i, j;
1539
1540         pbc_rvec_sub(pbc, x[av], x[ai], xiv);
1541
1542         for (i = 0; i < DIM; i++)
1543         {
1544             for (j = 0; j < DIM; j++)
1545             {
1546                 dxdf[i][j] += -xiv[i] * fv[j] + xij[i] * fj[j] + xik[i] * fk[j] + xil[i] * fl[j];
1547             }
1548         }
1549     }
1550
1551     /* Total: 207 flops (Yuck!) */
1552 }
1553
1554
1555 static int spread_vsiten(const t_iatom   ia[],
1556                          const t_iparams ip[],
1557                          const rvec      x[],
1558                          rvec            f[],
1559                          rvec            fshift[],
1560                          const t_pbc*    pbc,
1561                          const t_graph*  g)
1562 {
1563     rvec xv, dx, fi;
1564     int  n3, av, i, ai;
1565     real a;
1566     ivec di;
1567     int  siv;
1568
1569     n3 = 3 * ip[ia[0]].vsiten.n;
1570     av = ia[1];
1571     copy_rvec(x[av], xv);
1572
1573     for (i = 0; i < n3; i += 3)
1574     {
1575         ai = ia[i + 2];
1576         if (g)
1577         {
1578             ivec_sub(SHIFT_IVEC(g, ai), SHIFT_IVEC(g, av), di);
1579             siv = IVEC2IS(di);
1580         }
1581         else if (pbc)
1582         {
1583             siv = pbc_dx_aiuc(pbc, x[ai], xv, dx);
1584         }
1585         else
1586         {
1587             siv = CENTRAL;
1588         }
1589         a = ip[ia[i]].vsiten.a;
1590         svmul(a, f[av], fi);
1591         rvec_inc(f[ai], fi);
1592         if (fshift && siv != CENTRAL)
1593         {
1594             rvec_inc(fshift[siv], fi);
1595             rvec_dec(fshift[CENTRAL], fi);
1596         }
1597         /* 6 Flops */
1598     }
1599
1600     return n3;
1601 }
1602
1603
1604 static int vsite_count(const t_ilist* ilist, int ftype)
1605 {
1606     if (ftype == F_VSITEN)
1607     {
1608         return ilist[ftype].nr / 3;
1609     }
1610     else
1611     {
1612         return ilist[ftype].nr / (1 + interaction_function[ftype].nratoms);
1613     }
1614 }
1615
1616 static void spread_vsite_f_thread(const rvec     x[],
1617                                   rvec           f[],
1618                                   rvec*          fshift,
1619                                   gmx_bool       VirCorr,
1620                                   matrix         dxdf,
1621                                   t_iparams      ip[],
1622                                   const t_ilist  ilist[],
1623                                   const t_graph* g,
1624                                   const t_pbc*   pbc_null)
1625 {
1626     const PbcMode pbcMode = getPbcMode(pbc_null);
1627     /* We need another pbc pointer, as with charge groups we switch per vsite */
1628     const t_pbc*             pbc_null2 = pbc_null;
1629     gmx::ArrayRef<const int> vsite_pbc;
1630
1631     /* this loop goes backwards to be able to build *
1632      * higher type vsites from lower types         */
1633     for (int ftype = c_ftypeVsiteEnd - 1; ftype >= c_ftypeVsiteStart; ftype--)
1634     {
1635         if (ilist[ftype].nr == 0)
1636         {
1637             continue;
1638         }
1639
1640         { // TODO remove me
1641             int nra = interaction_function[ftype].nratoms;
1642             int inc = 1 + nra;
1643             int nr  = ilist[ftype].nr;
1644
1645             const t_iatom* ia = ilist[ftype].iatoms;
1646
1647             if (pbcMode == PbcMode::all)
1648             {
1649                 pbc_null2 = pbc_null;
1650             }
1651
1652             for (int i = 0; i < nr;)
1653             {
1654                 int tp = ia[0];
1655
1656                 /* Constants for constructing */
1657                 real a1, b1, c1;
1658                 a1 = ip[tp].vsite.a;
1659                 /* Construct the vsite depending on type */
1660                 switch (ftype)
1661                 {
1662                     case F_VSITE2: spread_vsite2(ia, a1, x, f, fshift, pbc_null2, g); break;
1663                     case F_VSITE2FD:
1664                         spread_vsite2FD(ia, a1, x, f, fshift, VirCorr, dxdf, pbc_null2, g);
1665                         break;
1666                     case F_VSITE3:
1667                         b1 = ip[tp].vsite.b;
1668                         spread_vsite3(ia, a1, b1, x, f, fshift, pbc_null2, g);
1669                         break;
1670                     case F_VSITE3FD:
1671                         b1 = ip[tp].vsite.b;
1672                         spread_vsite3FD(ia, a1, b1, x, f, fshift, VirCorr, dxdf, pbc_null2, g);
1673                         break;
1674                     case F_VSITE3FAD:
1675                         b1 = ip[tp].vsite.b;
1676                         spread_vsite3FAD(ia, a1, b1, x, f, fshift, VirCorr, dxdf, pbc_null2, g);
1677                         break;
1678                     case F_VSITE3OUT:
1679                         b1 = ip[tp].vsite.b;
1680                         c1 = ip[tp].vsite.c;
1681                         spread_vsite3OUT(ia, a1, b1, c1, x, f, fshift, VirCorr, dxdf, pbc_null2, g);
1682                         break;
1683                     case F_VSITE4FD:
1684                         b1 = ip[tp].vsite.b;
1685                         c1 = ip[tp].vsite.c;
1686                         spread_vsite4FD(ia, a1, b1, c1, x, f, fshift, VirCorr, dxdf, pbc_null2, g);
1687                         break;
1688                     case F_VSITE4FDN:
1689                         b1 = ip[tp].vsite.b;
1690                         c1 = ip[tp].vsite.c;
1691                         spread_vsite4FDN(ia, a1, b1, c1, x, f, fshift, VirCorr, dxdf, pbc_null2, g);
1692                         break;
1693                     case F_VSITEN: inc = spread_vsiten(ia, ip, x, f, fshift, pbc_null2, g); break;
1694                     default:
1695                         gmx_fatal(FARGS, "No such vsite type %d in %s, line %d", ftype, __FILE__, __LINE__);
1696                 }
1697                 clear_rvec(f[ia[1]]);
1698
1699                 /* Increment loop variables */
1700                 i += inc;
1701                 ia += inc;
1702             }
1703         }
1704     }
1705 }
1706
1707 /*! \brief Clears the task force buffer elements that are written by task idTask */
1708 static void clearTaskForceBufferUsedElements(InterdependentTask* idTask)
1709 {
1710     int ntask = idTask->spreadTask.size();
1711     for (int ti = 0; ti < ntask; ti++)
1712     {
1713         const AtomIndex* atomList = &idTask->atomIndex[idTask->spreadTask[ti]];
1714         int              natom    = atomList->atom.size();
1715         RVec*            force    = idTask->force.data();
1716         for (int i = 0; i < natom; i++)
1717         {
1718             clear_rvec(force[atomList->atom[i]]);
1719         }
1720     }
1721 }
1722
1723 void spread_vsite_f(const gmx_vsite_t* vsite,
1724                     const rvec* gmx_restrict x,
1725                     rvec* gmx_restrict f,
1726                     rvec* gmx_restrict fshift,
1727                     gmx_bool           VirCorr,
1728                     matrix             vir,
1729                     t_nrnb*            nrnb,
1730                     const t_idef*      idef,
1731                     int                ePBC,
1732                     gmx_bool           bMolPBC,
1733                     const t_graph*     g,
1734                     const matrix       box,
1735                     const t_commrec*   cr,
1736                     gmx_wallcycle*     wcycle)
1737 {
1738     wallcycle_start(wcycle, ewcVSITESPREAD);
1739     const bool useDomdec = vsite->useDomdec;
1740     GMX_ASSERT(!useDomdec || (cr != nullptr && DOMAINDECOMP(cr)),
1741                "When vsites are set up with domain decomposition, we need a valid commrec");
1742
1743     t_pbc pbc, *pbc_null;
1744
1745     /* We only need to do pbc when we have inter-cg vsites */
1746     if ((useDomdec || bMolPBC) && vsite->numInterUpdategroupVsites)
1747     {
1748         /* This is wasting some CPU time as we now do this multiple times
1749          * per MD step.
1750          */
1751         pbc_null = set_pbc_dd(&pbc, ePBC, useDomdec ? cr->dd->numCells : nullptr, FALSE, box);
1752     }
1753     else
1754     {
1755         pbc_null = nullptr;
1756     }
1757
1758     if (useDomdec)
1759     {
1760         dd_clear_f_vsites(cr->dd, f);
1761     }
1762
1763     if (vsite->nthreads == 1)
1764     {
1765         matrix dxdf;
1766         if (VirCorr)
1767         {
1768             clear_mat(dxdf);
1769         }
1770         spread_vsite_f_thread(x, f, fshift, VirCorr, dxdf, idef->iparams, idef->il, g, pbc_null);
1771
1772         if (VirCorr)
1773         {
1774             for (int i = 0; i < DIM; i++)
1775             {
1776                 for (int j = 0; j < DIM; j++)
1777                 {
1778                     vir[i][j] += -0.5 * dxdf[i][j];
1779                 }
1780             }
1781         }
1782     }
1783     else
1784     {
1785         /* First spread the vsites that might depend on non-local vsites */
1786         if (VirCorr)
1787         {
1788             clear_mat(vsite->tData[vsite->nthreads]->dxdf);
1789         }
1790         spread_vsite_f_thread(x, f, fshift, VirCorr, vsite->tData[vsite->nthreads]->dxdf,
1791                               idef->iparams, vsite->tData[vsite->nthreads]->ilist, g, pbc_null);
1792
1793 #pragma omp parallel num_threads(vsite->nthreads)
1794         {
1795             try
1796             {
1797                 int          thread = gmx_omp_get_thread_num();
1798                 VsiteThread& tData  = *vsite->tData[thread];
1799
1800                 rvec* fshift_t;
1801                 if (thread == 0 || fshift == nullptr)
1802                 {
1803                     fshift_t = fshift;
1804                 }
1805                 else
1806                 {
1807                     fshift_t = tData.fshift;
1808
1809                     for (int i = 0; i < SHIFTS; i++)
1810                     {
1811                         clear_rvec(fshift_t[i]);
1812                     }
1813                 }
1814                 if (VirCorr)
1815                 {
1816                     clear_mat(tData.dxdf);
1817                 }
1818
1819                 if (tData.useInterdependentTask)
1820                 {
1821                     /* Spread the vsites that spread outside our local range.
1822                      * This is done using a thread-local force buffer force.
1823                      * First we need to copy the input vsite forces to force.
1824                      */
1825                     InterdependentTask* idTask = &tData.idTask;
1826
1827                     /* Clear the buffer elements set by our task during
1828                      * the last call to spread_vsite_f.
1829                      */
1830                     clearTaskForceBufferUsedElements(idTask);
1831
1832                     int nvsite = idTask->vsite.size();
1833                     for (int i = 0; i < nvsite; i++)
1834                     {
1835                         copy_rvec(f[idTask->vsite[i]], idTask->force[idTask->vsite[i]]);
1836                     }
1837                     spread_vsite_f_thread(x, as_rvec_array(idTask->force.data()), fshift_t, VirCorr,
1838                                           tData.dxdf, idef->iparams, tData.idTask.ilist, g, pbc_null);
1839
1840                     /* We need a barrier before reducing forces below
1841                      * that have been produced by a different thread above.
1842                      */
1843 #pragma omp barrier
1844
1845                     /* Loop over all thread task and reduce forces they
1846                      * produced on atoms that fall in our range.
1847                      * Note that atomic reduction would be a simpler solution,
1848                      * but that might not have good support on all platforms.
1849                      */
1850                     int ntask = idTask->reduceTask.size();
1851                     for (int ti = 0; ti < ntask; ti++)
1852                     {
1853                         const InterdependentTask* idt_foreign =
1854                                 &vsite->tData[idTask->reduceTask[ti]]->idTask;
1855                         const AtomIndex* atomList  = &idt_foreign->atomIndex[thread];
1856                         const RVec*      f_foreign = idt_foreign->force.data();
1857
1858                         int natom = atomList->atom.size();
1859                         for (int i = 0; i < natom; i++)
1860                         {
1861                             int ind = atomList->atom[i];
1862                             rvec_inc(f[ind], f_foreign[ind]);
1863                             /* Clearing of f_foreign is done at the next step */
1864                         }
1865                     }
1866                     /* Clear the vsite forces, both in f and force */
1867                     for (int i = 0; i < nvsite; i++)
1868                     {
1869                         int ind = tData.idTask.vsite[i];
1870                         clear_rvec(f[ind]);
1871                         clear_rvec(tData.idTask.force[ind]);
1872                     }
1873                 }
1874
1875                 /* Spread the vsites that spread locally only */
1876                 spread_vsite_f_thread(x, f, fshift_t, VirCorr, tData.dxdf, idef->iparams,
1877                                       tData.ilist, g, pbc_null);
1878             }
1879             GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
1880         }
1881
1882         if (fshift != nullptr)
1883         {
1884             for (int th = 1; th < vsite->nthreads; th++)
1885             {
1886                 for (int i = 0; i < SHIFTS; i++)
1887                 {
1888                     rvec_inc(fshift[i], vsite->tData[th]->fshift[i]);
1889                 }
1890             }
1891         }
1892
1893         if (VirCorr)
1894         {
1895             for (int th = 0; th < vsite->nthreads + 1; th++)
1896             {
1897                 /* MSVC doesn't like matrix references, so we use a pointer */
1898                 const matrix* dxdf = &vsite->tData[th]->dxdf;
1899
1900                 for (int i = 0; i < DIM; i++)
1901                 {
1902                     for (int j = 0; j < DIM; j++)
1903                     {
1904                         vir[i][j] += -0.5 * (*dxdf)[i][j];
1905                     }
1906                 }
1907             }
1908         }
1909     }
1910
1911     if (useDomdec)
1912     {
1913         dd_move_f_vsites(cr->dd, f, fshift);
1914     }
1915
1916     inc_nrnb(nrnb, eNR_VSITE2, vsite_count(idef->il, F_VSITE2));
1917     inc_nrnb(nrnb, eNR_VSITE2FD, vsite_count(idef->il, F_VSITE2FD));
1918     inc_nrnb(nrnb, eNR_VSITE3, vsite_count(idef->il, F_VSITE3));
1919     inc_nrnb(nrnb, eNR_VSITE3FD, vsite_count(idef->il, F_VSITE3FD));
1920     inc_nrnb(nrnb, eNR_VSITE3FAD, vsite_count(idef->il, F_VSITE3FAD));
1921     inc_nrnb(nrnb, eNR_VSITE3OUT, vsite_count(idef->il, F_VSITE3OUT));
1922     inc_nrnb(nrnb, eNR_VSITE4FD, vsite_count(idef->il, F_VSITE4FD));
1923     inc_nrnb(nrnb, eNR_VSITE4FDN, vsite_count(idef->il, F_VSITE4FDN));
1924     inc_nrnb(nrnb, eNR_VSITEN, vsite_count(idef->il, F_VSITEN));
1925
1926     wallcycle_stop(wcycle, ewcVSITESPREAD);
1927 }
1928
1929 /*! \brief Returns the an array with group indices for each atom
1930  *
1931  * \param[in] grouping  The paritioning of the atom range into atom groups
1932  */
1933 static std::vector<int> makeAtomToGroupMapping(const gmx::RangePartitioning& grouping)
1934 {
1935     std::vector<int> atomToGroup(grouping.fullRange().end(), 0);
1936
1937     for (int group = 0; group < grouping.numBlocks(); group++)
1938     {
1939         auto block = grouping.block(group);
1940         std::fill(atomToGroup.begin() + block.begin(), atomToGroup.begin() + block.end(), group);
1941     }
1942
1943     return atomToGroup;
1944 }
1945
1946 int countNonlinearVsites(const gmx_mtop_t& mtop)
1947 {
1948     int numNonlinearVsites = 0;
1949     for (const gmx_molblock_t& molb : mtop.molblock)
1950     {
1951         const gmx_moltype_t& molt = mtop.moltype[molb.type];
1952
1953         for (const auto& ilist : extractILists(molt.ilist, IF_VSITE))
1954         {
1955             if (ilist.functionType != F_VSITE2 && ilist.functionType != F_VSITE3
1956                 && ilist.functionType != F_VSITEN)
1957             {
1958                 numNonlinearVsites += molb.nmol * ilist.iatoms.size() / (1 + NRAL(ilist.functionType));
1959             }
1960         }
1961     }
1962
1963     return numNonlinearVsites;
1964 }
1965
1966 int countInterUpdategroupVsites(const gmx_mtop_t&                           mtop,
1967                                 gmx::ArrayRef<const gmx::RangePartitioning> updateGroupingPerMoleculetype)
1968 {
1969     int n_intercg_vsite = 0;
1970     for (const gmx_molblock_t& molb : mtop.molblock)
1971     {
1972         const gmx_moltype_t& molt = mtop.moltype[molb.type];
1973
1974         std::vector<int> atomToGroup;
1975         if (!updateGroupingPerMoleculetype.empty())
1976         {
1977             atomToGroup = makeAtomToGroupMapping(updateGroupingPerMoleculetype[molb.type]);
1978         }
1979         for (int ftype = c_ftypeVsiteStart; ftype < c_ftypeVsiteEnd; ftype++)
1980         {
1981             const int              nral = NRAL(ftype);
1982             const InteractionList& il   = molt.ilist[ftype];
1983             for (int i = 0; i < il.size(); i += 1 + nral)
1984             {
1985                 bool isInterGroup = atomToGroup.empty();
1986                 if (!isInterGroup)
1987                 {
1988                     const int group = atomToGroup[il.iatoms[1 + i]];
1989                     for (int a = 1; a < nral; a++)
1990                     {
1991                         if (atomToGroup[il.iatoms[1 + a]] != group)
1992                         {
1993                             isInterGroup = true;
1994                             break;
1995                         }
1996                     }
1997                 }
1998                 if (isInterGroup)
1999                 {
2000                     n_intercg_vsite += molb.nmol;
2001                 }
2002             }
2003         }
2004     }
2005
2006     return n_intercg_vsite;
2007 }
2008
2009 std::unique_ptr<gmx_vsite_t> initVsite(const gmx_mtop_t& mtop, const t_commrec* cr)
2010 {
2011     GMX_RELEASE_ASSERT(cr != nullptr, "We need a valid commrec");
2012
2013     std::unique_ptr<gmx_vsite_t> vsite;
2014
2015     /* check if there are vsites */
2016     int nvsite = 0;
2017     for (int ftype = 0; ftype < F_NRE; ftype++)
2018     {
2019         if (interaction_function[ftype].flags & IF_VSITE)
2020         {
2021             GMX_ASSERT(ftype >= c_ftypeVsiteStart && ftype < c_ftypeVsiteEnd,
2022                        "c_ftypeVsiteStart and/or c_ftypeVsiteEnd do not have correct values");
2023
2024             nvsite += gmx_mtop_ftype_count(&mtop, ftype);
2025         }
2026         else
2027         {
2028             GMX_ASSERT(ftype < c_ftypeVsiteStart || ftype >= c_ftypeVsiteEnd,
2029                        "c_ftypeVsiteStart and/or c_ftypeVsiteEnd do not have correct values");
2030         }
2031     }
2032
2033     if (nvsite == 0)
2034     {
2035         return vsite;
2036     }
2037
2038     vsite = std::make_unique<gmx_vsite_t>();
2039
2040     gmx::ArrayRef<const gmx::RangePartitioning> updateGroupingPerMoleculetype;
2041     if (DOMAINDECOMP(cr))
2042     {
2043         updateGroupingPerMoleculetype = getUpdateGroupingPerMoleculetype(*cr->dd);
2044     }
2045     vsite->numInterUpdategroupVsites = countInterUpdategroupVsites(mtop, updateGroupingPerMoleculetype);
2046
2047     vsite->useDomdec = (DOMAINDECOMP(cr) && cr->dd->nnodes > 1);
2048
2049     vsite->nthreads = gmx_omp_nthreads_get(emntVSITE);
2050
2051     if (vsite->nthreads > 1)
2052     {
2053         /* We need one extra thread data structure for the overlap vsites */
2054         vsite->tData.resize(vsite->nthreads + 1);
2055 #pragma omp parallel for num_threads(vsite->nthreads) schedule(static)
2056         for (int thread = 0; thread < vsite->nthreads; thread++)
2057         {
2058             try
2059             {
2060                 vsite->tData[thread] = std::make_unique<VsiteThread>();
2061
2062                 InterdependentTask& idTask = vsite->tData[thread]->idTask;
2063                 idTask.nuse                = 0;
2064                 idTask.atomIndex.resize(vsite->nthreads);
2065             }
2066             GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
2067         }
2068         if (vsite->nthreads > 1)
2069         {
2070             vsite->tData[vsite->nthreads] = std::make_unique<VsiteThread>();
2071         }
2072     }
2073
2074     return vsite;
2075 }
2076
2077 gmx_vsite_t::gmx_vsite_t() {}
2078
2079 gmx_vsite_t::~gmx_vsite_t() {}
2080
2081 static inline void flagAtom(InterdependentTask* idTask, int atom, int thread, int nthread, int natperthread)
2082 {
2083     if (!idTask->use[atom])
2084     {
2085         idTask->use[atom] = true;
2086         thread            = atom / natperthread;
2087         /* Assign all non-local atom force writes to thread 0 */
2088         if (thread >= nthread)
2089         {
2090             thread = 0;
2091         }
2092         idTask->atomIndex[thread].atom.push_back(atom);
2093     }
2094 }
2095
2096 /*\brief Here we try to assign all vsites that are in our local range.
2097  *
2098  * Our task local atom range is tData->rangeStart - tData->rangeEnd.
2099  * Vsites that depend only on local atoms, as indicated by taskIndex[]==thread,
2100  * are assigned to task tData->ilist. Vsites that depend on non-local atoms
2101  * but not on other vsites are assigned to task tData->id_task.ilist.
2102  * taskIndex[] is set for all vsites in our range, either to our local tasks
2103  * or to the single last task as taskIndex[]=2*nthreads.
2104  */
2105 static void assignVsitesToThread(VsiteThread*          tData,
2106                                  int                   thread,
2107                                  int                   nthread,
2108                                  int                   natperthread,
2109                                  gmx::ArrayRef<int>    taskIndex,
2110                                  const t_ilist*        ilist,
2111                                  const t_iparams*      ip,
2112                                  const unsigned short* ptype)
2113 {
2114     for (int ftype = c_ftypeVsiteStart; ftype < c_ftypeVsiteEnd; ftype++)
2115     {
2116         tData->ilist[ftype].nr        = 0;
2117         tData->idTask.ilist[ftype].nr = 0;
2118
2119         int      nral1 = 1 + NRAL(ftype);
2120         int      inc   = nral1;
2121         t_iatom* iat   = ilist[ftype].iatoms;
2122         for (int i = 0; i < ilist[ftype].nr;)
2123         {
2124             if (ftype == F_VSITEN)
2125             {
2126                 /* The 3 below is from 1+NRAL(ftype)=3 */
2127                 inc = ip[iat[i]].vsiten.n * 3;
2128             }
2129
2130             if (iat[1 + i] < tData->rangeStart || iat[1 + i] >= tData->rangeEnd)
2131             {
2132                 /* This vsite belongs to a different thread */
2133                 i += inc;
2134                 continue;
2135             }
2136
2137             /* We would like to assign this vsite to task thread,
2138              * but it might depend on atoms outside the atom range of thread
2139              * or on another vsite not assigned to task thread.
2140              */
2141             int task = thread;
2142             if (ftype != F_VSITEN)
2143             {
2144                 for (int j = i + 2; j < i + nral1; j++)
2145                 {
2146                     /* Do a range check to avoid a harmless race on taskIndex */
2147                     if (iat[j] < tData->rangeStart || iat[j] >= tData->rangeEnd || taskIndex[iat[j]] != thread)
2148                     {
2149                         if (!tData->useInterdependentTask || ptype[iat[j]] == eptVSite)
2150                         {
2151                             /* At least one constructing atom is a vsite
2152                              * that is not assigned to the same thread.
2153                              * Put this vsite into a separate task.
2154                              */
2155                             task = 2 * nthread;
2156                             break;
2157                         }
2158
2159                         /* There are constructing atoms outside our range,
2160                          * put this vsite into a second task to be executed
2161                          * on the same thread. During construction no barrier
2162                          * is needed between the two tasks on the same thread.
2163                          * During spreading we need to run this task with
2164                          * an additional thread-local intermediate force buffer
2165                          * (or atomic reduction) and a barrier between the two
2166                          * tasks.
2167                          */
2168                         task = nthread + thread;
2169                     }
2170                 }
2171             }
2172             else
2173             {
2174                 for (int j = i + 2; j < i + inc; j += 3)
2175                 {
2176                     /* Do a range check to avoid a harmless race on taskIndex */
2177                     if (iat[j] < tData->rangeStart || iat[j] >= tData->rangeEnd || taskIndex[iat[j]] != thread)
2178                     {
2179                         GMX_ASSERT(ptype[iat[j]] != eptVSite,
2180                                    "A vsite to be assigned in assignVsitesToThread has a vsite as "
2181                                    "a constructing atom that does not belong to our task, such "
2182                                    "vsites should be assigned to the single 'master' task");
2183
2184                         task = nthread + thread;
2185                     }
2186                 }
2187             }
2188
2189             /* Update this vsite's thread index entry */
2190             taskIndex[iat[1 + i]] = task;
2191
2192             if (task == thread || task == nthread + thread)
2193             {
2194                 /* Copy this vsite to the thread data struct of thread */
2195                 t_ilist* il_task;
2196                 if (task == thread)
2197                 {
2198                     il_task = &tData->ilist[ftype];
2199                 }
2200                 else
2201                 {
2202                     il_task = &tData->idTask.ilist[ftype];
2203                 }
2204                 /* Ensure we have sufficient memory allocated */
2205                 if (il_task->nr + inc > il_task->nalloc)
2206                 {
2207                     il_task->nalloc = over_alloc_large(il_task->nr + inc);
2208                     srenew(il_task->iatoms, il_task->nalloc);
2209                 }
2210                 /* Copy the vsite data to the thread-task local array */
2211                 for (int j = i; j < i + inc; j++)
2212                 {
2213                     il_task->iatoms[il_task->nr++] = iat[j];
2214                 }
2215                 if (task == nthread + thread)
2216                 {
2217                     /* This vsite write outside our own task force block.
2218                      * Put it into the interdependent task list and flag
2219                      * the atoms involved for reduction.
2220                      */
2221                     tData->idTask.vsite.push_back(iat[i + 1]);
2222                     if (ftype != F_VSITEN)
2223                     {
2224                         for (int j = i + 2; j < i + nral1; j++)
2225                         {
2226                             flagAtom(&tData->idTask, iat[j], thread, nthread, natperthread);
2227                         }
2228                     }
2229                     else
2230                     {
2231                         for (int j = i + 2; j < i + inc; j += 3)
2232                         {
2233                             flagAtom(&tData->idTask, iat[j], thread, nthread, natperthread);
2234                         }
2235                     }
2236                 }
2237             }
2238
2239             i += inc;
2240         }
2241     }
2242 }
2243
2244 /*! \brief Assign all vsites with taskIndex[]==task to task tData */
2245 static void assignVsitesToSingleTask(VsiteThread*             tData,
2246                                      int                      task,
2247                                      gmx::ArrayRef<const int> taskIndex,
2248                                      const t_ilist*           ilist,
2249                                      const t_iparams*         ip)
2250 {
2251     for (int ftype = c_ftypeVsiteStart; ftype < c_ftypeVsiteEnd; ftype++)
2252     {
2253         tData->ilist[ftype].nr        = 0;
2254         tData->idTask.ilist[ftype].nr = 0;
2255
2256         int      nral1   = 1 + NRAL(ftype);
2257         int      inc     = nral1;
2258         t_iatom* iat     = ilist[ftype].iatoms;
2259         t_ilist* il_task = &tData->ilist[ftype];
2260
2261         for (int i = 0; i < ilist[ftype].nr;)
2262         {
2263             if (ftype == F_VSITEN)
2264             {
2265                 /* The 3 below is from 1+NRAL(ftype)=3 */
2266                 inc = ip[iat[i]].vsiten.n * 3;
2267             }
2268             /* Check if the vsite is assigned to our task */
2269             if (taskIndex[iat[1 + i]] == task)
2270             {
2271                 /* Ensure we have sufficient memory allocated */
2272                 if (il_task->nr + inc > il_task->nalloc)
2273                 {
2274                     il_task->nalloc = over_alloc_large(il_task->nr + inc);
2275                     srenew(il_task->iatoms, il_task->nalloc);
2276                 }
2277                 /* Copy the vsite data to the thread-task local array */
2278                 for (int j = i; j < i + inc; j++)
2279                 {
2280                     il_task->iatoms[il_task->nr++] = iat[j];
2281                 }
2282             }
2283
2284             i += inc;
2285         }
2286     }
2287 }
2288
2289 void split_vsites_over_threads(const t_ilist* ilist, const t_iparams* ip, const t_mdatoms* mdatoms, gmx_vsite_t* vsite)
2290 {
2291     int vsite_atom_range, natperthread;
2292
2293     if (vsite->nthreads == 1)
2294     {
2295         /* Nothing to do */
2296         return;
2297     }
2298
2299     /* The current way of distributing the vsites over threads in primitive.
2300      * We divide the atom range 0 - natoms_in_vsite uniformly over threads,
2301      * without taking into account how the vsites are distributed.
2302      * Without domain decomposition we at least tighten the upper bound
2303      * of the range (useful for common systems such as a vsite-protein
2304      * in 3-site water).
2305      * With domain decomposition, as long as the vsites are distributed
2306      * uniformly in each domain along the major dimension, usually x,
2307      * it will also perform well.
2308      */
2309     if (!vsite->useDomdec)
2310     {
2311         vsite_atom_range = -1;
2312         for (int ftype = c_ftypeVsiteStart; ftype < c_ftypeVsiteEnd; ftype++)
2313         {
2314             { // TODO remove me
2315                 if (ftype != F_VSITEN)
2316                 {
2317                     int            nral1 = 1 + NRAL(ftype);
2318                     const t_iatom* iat   = ilist[ftype].iatoms;
2319                     for (int i = 0; i < ilist[ftype].nr; i += nral1)
2320                     {
2321                         for (int j = i + 1; j < i + nral1; j++)
2322                         {
2323                             vsite_atom_range = std::max(vsite_atom_range, iat[j]);
2324                         }
2325                     }
2326                 }
2327                 else
2328                 {
2329                     int vs_ind_end;
2330
2331                     const t_iatom* iat = ilist[ftype].iatoms;
2332
2333                     int i = 0;
2334                     while (i < ilist[ftype].nr)
2335                     {
2336                         /* The 3 below is from 1+NRAL(ftype)=3 */
2337                         vs_ind_end = i + ip[iat[i]].vsiten.n * 3;
2338
2339                         vsite_atom_range = std::max(vsite_atom_range, iat[i + 1]);
2340                         while (i < vs_ind_end)
2341                         {
2342                             vsite_atom_range = std::max(vsite_atom_range, iat[i + 2]);
2343                             i += 3;
2344                         }
2345                     }
2346                 }
2347             }
2348         }
2349         vsite_atom_range++;
2350         natperthread = (vsite_atom_range + vsite->nthreads - 1) / vsite->nthreads;
2351     }
2352     else
2353     {
2354         /* Any local or not local atom could be involved in virtual sites.
2355          * But since we usually have very few non-local virtual sites
2356          * (only non-local vsites that depend on local vsites),
2357          * we distribute the local atom range equally over the threads.
2358          * When assigning vsites to threads, we should take care that the last
2359          * threads also covers the non-local range.
2360          */
2361         vsite_atom_range = mdatoms->nr;
2362         natperthread     = (mdatoms->homenr + vsite->nthreads - 1) / vsite->nthreads;
2363     }
2364
2365     if (debug)
2366     {
2367         fprintf(debug, "virtual site thread dist: natoms %d, range %d, natperthread %d\n",
2368                 mdatoms->nr, vsite_atom_range, natperthread);
2369     }
2370
2371     /* To simplify the vsite assignment, we make an index which tells us
2372      * to which task particles, both non-vsites and vsites, are assigned.
2373      */
2374     vsite->taskIndex.resize(mdatoms->nr);
2375
2376     /* Initialize the task index array. Here we assign the non-vsite
2377      * particles to task=thread, so we easily figure out if vsites
2378      * depend on local and/or non-local particles in assignVsitesToThread.
2379      */
2380     gmx::ArrayRef<int> taskIndex = vsite->taskIndex;
2381     {
2382         int thread = 0;
2383         for (int i = 0; i < mdatoms->nr; i++)
2384         {
2385             if (mdatoms->ptype[i] == eptVSite)
2386             {
2387                 /* vsites are not assigned to a task yet */
2388                 taskIndex[i] = -1;
2389             }
2390             else
2391             {
2392                 /* assign non-vsite particles to task thread */
2393                 taskIndex[i] = thread;
2394             }
2395             if (i == (thread + 1) * natperthread && thread < vsite->nthreads)
2396             {
2397                 thread++;
2398             }
2399         }
2400     }
2401
2402 #pragma omp parallel num_threads(vsite->nthreads)
2403     {
2404         try
2405         {
2406             int          thread = gmx_omp_get_thread_num();
2407             VsiteThread& tData  = *vsite->tData[thread];
2408
2409             /* Clear the buffer use flags that were set before */
2410             if (tData.useInterdependentTask)
2411             {
2412                 InterdependentTask& idTask = tData.idTask;
2413
2414                 /* To avoid an extra OpenMP barrier in spread_vsite_f,
2415                  * we clear the force buffer at the next step,
2416                  * so we need to do it here as well.
2417                  */
2418                 clearTaskForceBufferUsedElements(&idTask);
2419
2420                 idTask.vsite.resize(0);
2421                 for (int t = 0; t < vsite->nthreads; t++)
2422                 {
2423                     AtomIndex& atomIndex = idTask.atomIndex[t];
2424                     int        natom     = atomIndex.atom.size();
2425                     for (int i = 0; i < natom; i++)
2426                     {
2427                         idTask.use[atomIndex.atom[i]] = false;
2428                     }
2429                     atomIndex.atom.resize(0);
2430                 }
2431                 idTask.nuse = 0;
2432             }
2433
2434             /* To avoid large f_buf allocations of #threads*vsite_atom_range
2435              * we don't use task2 with more than 200000 atoms. This doesn't
2436              * affect performance, since with such a large range relatively few
2437              * vsites will end up in the separate task.
2438              * Note that useTask2 should be the same for all threads.
2439              */
2440             tData.useInterdependentTask = (vsite_atom_range <= 200000);
2441             if (tData.useInterdependentTask)
2442             {
2443                 size_t              natoms_use_in_vsites = vsite_atom_range;
2444                 InterdependentTask& idTask               = tData.idTask;
2445                 /* To avoid resizing and re-clearing every nstlist steps,
2446                  * we never down size the force buffer.
2447                  */
2448                 if (natoms_use_in_vsites > idTask.force.size() || natoms_use_in_vsites > idTask.use.size())
2449                 {
2450                     idTask.force.resize(natoms_use_in_vsites, { 0, 0, 0 });
2451                     idTask.use.resize(natoms_use_in_vsites, false);
2452                 }
2453             }
2454
2455             /* Assign all vsites that can execute independently on threads */
2456             tData.rangeStart = thread * natperthread;
2457             if (thread < vsite->nthreads - 1)
2458             {
2459                 tData.rangeEnd = (thread + 1) * natperthread;
2460             }
2461             else
2462             {
2463                 /* The last thread should cover up to the end of the range */
2464                 tData.rangeEnd = mdatoms->nr;
2465             }
2466             assignVsitesToThread(&tData, thread, vsite->nthreads, natperthread, taskIndex, ilist,
2467                                  ip, mdatoms->ptype);
2468
2469             if (tData.useInterdependentTask)
2470             {
2471                 /* In the worst case, all tasks write to force ranges of
2472                  * all other tasks, leading to #tasks^2 scaling (this is only
2473                  * the overhead, the actual flops remain constant).
2474                  * But in most cases there is far less coupling. To improve
2475                  * scaling at high thread counts we therefore construct
2476                  * an index to only loop over the actually affected tasks.
2477                  */
2478                 InterdependentTask& idTask = tData.idTask;
2479
2480                 /* Ensure assignVsitesToThread finished on other threads */
2481 #pragma omp barrier
2482
2483                 idTask.spreadTask.resize(0);
2484                 idTask.reduceTask.resize(0);
2485                 for (int t = 0; t < vsite->nthreads; t++)
2486                 {
2487                     /* Do we write to the force buffer of task t? */
2488                     if (!idTask.atomIndex[t].atom.empty())
2489                     {
2490                         idTask.spreadTask.push_back(t);
2491                     }
2492                     /* Does task t write to our force buffer? */
2493                     if (!vsite->tData[t]->idTask.atomIndex[thread].atom.empty())
2494                     {
2495                         idTask.reduceTask.push_back(t);
2496                     }
2497                 }
2498             }
2499         }
2500         GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
2501     }
2502     /* Assign all remaining vsites, that will have taskIndex[]=2*vsite->nthreads,
2503      * to a single task that will not run in parallel with other tasks.
2504      */
2505     assignVsitesToSingleTask(vsite->tData[vsite->nthreads].get(), 2 * vsite->nthreads, taskIndex,
2506                              ilist, ip);
2507
2508     if (debug && vsite->nthreads > 1)
2509     {
2510         fprintf(debug, "virtual site useInterdependentTask %d, nuse:\n",
2511                 static_cast<int>(vsite->tData[0]->useInterdependentTask));
2512         for (int th = 0; th < vsite->nthreads + 1; th++)
2513         {
2514             fprintf(debug, " %4d", vsite->tData[th]->idTask.nuse);
2515         }
2516         fprintf(debug, "\n");
2517
2518         for (int ftype = c_ftypeVsiteStart; ftype < c_ftypeVsiteEnd; ftype++)
2519         {
2520             if (ilist[ftype].nr > 0)
2521             {
2522                 fprintf(debug, "%-20s thread dist:", interaction_function[ftype].longname);
2523                 for (int th = 0; th < vsite->nthreads + 1; th++)
2524                 {
2525                     fprintf(debug, " %4d %4d ", vsite->tData[th]->ilist[ftype].nr,
2526                             vsite->tData[th]->idTask.ilist[ftype].nr);
2527                 }
2528                 fprintf(debug, "\n");
2529             }
2530         }
2531     }
2532
2533 #ifndef NDEBUG
2534     int nrOrig     = vsiteIlistNrCount(ilist);
2535     int nrThreaded = 0;
2536     for (int th = 0; th < vsite->nthreads + 1; th++)
2537     {
2538         nrThreaded += vsiteIlistNrCount(vsite->tData[th]->ilist)
2539                       + vsiteIlistNrCount(vsite->tData[th]->idTask.ilist);
2540     }
2541     GMX_ASSERT(nrThreaded == nrOrig,
2542                "The number of virtual sites assigned to all thread task has to match the total "
2543                "number of virtual sites");
2544 #endif
2545 }
2546
2547 void set_vsite_top(gmx_vsite_t* vsite, const gmx_localtop_t* top, const t_mdatoms* md)
2548 {
2549     if (vsite->nthreads > 1)
2550     {
2551         split_vsites_over_threads(top->idef.il, top->idef.iparams, md, vsite);
2552     }
2553 }