src/gromacs/mdlib/vsite.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   5  * Copyright (c) 2001-2004, The GROMACS development team.
   6  * Copyright (c) 2013,2014,2015,2016,2017,2018, by the GROMACS development team, led by
   7  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   8  * and including many others, as listed in the AUTHORS file in the
   9  * top-level source directory and at http://www.gromacs.org.
  10  *
  11  * GROMACS is free software; you can redistribute it and/or
  12  * modify it under the terms of the GNU Lesser General Public License
  13  * as published by the Free Software Foundation; either version 2.1
  14  * of the License, or (at your option) any later version.
  15  *
  16  * GROMACS is distributed in the hope that it will be useful,
  17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  19  * Lesser General Public License for more details.
  20  *
  21  * You should have received a copy of the GNU Lesser General Public
  22  * License along with GROMACS; if not, see
  23  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  24  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  25  *
  26  * If you want to redistribute modifications to GROMACS, please
  27  * consider that scientific software is very special. Version
  28  * control is crucial - bugs must be traceable. We will be happy to
  29  * consider code for inclusion in the official distribution, but
  30  * derived work must not be called official GROMACS. Details are found
  31  * in the README & COPYING files - if they are missing, get the
  32  * official version at http://www.gromacs.org.
  33  *
  34  * To help us fund GROMACS development, we humbly ask that you cite
  35  * the research papers on the package. Check out http://www.gromacs.org.
  36  */
  37 #include "gmxpre.h"
  38
  39 #include "vsite.h"
  40
  41 #include <stdio.h>
  42
  43 #include <algorithm>
  44 #include <vector>
  45
  46 #include "gromacs/domdec/domdec.h"
  47 #include "gromacs/domdec/domdec_struct.h"
  48 #include "gromacs/gmxlib/network.h"
  49 #include "gromacs/gmxlib/nrnb.h"
  50 #include "gromacs/math/functions.h"
  51 #include "gromacs/math/vec.h"
  52 #include "gromacs/mdlib/gmx_omp_nthreads.h"
  53 #include "gromacs/mdtypes/commrec.h"
  54 #include "gromacs/mdtypes/mdatom.h"
  55 #include "gromacs/pbcutil/ishift.h"
  56 #include "gromacs/pbcutil/mshift.h"
  57 #include "gromacs/pbcutil/pbc.h"
  58 #include "gromacs/timing/wallcycle.h"
  59 #include "gromacs/topology/ifunc.h"
  60 #include "gromacs/topology/mtop_util.h"
  61 #include "gromacs/topology/topology.h"
  62 #include "gromacs/utility/exceptions.h"
  63 #include "gromacs/utility/fatalerror.h"
  64 #include "gromacs/utility/gmxassert.h"
  65 #include "gromacs/utility/gmxomp.h"
  66 #include "gromacs/utility/smalloc.h"
  67
  68 /* The strategy used here for assigning virtual sites to (thread-)tasks
  69  * is as follows:
  70  *
  71  * We divide the atom range that vsites operate on (natoms_local with DD,
  72  * 0 - last atom involved in vsites without DD) equally over all threads.
  73  *
  74  * Vsites in the local range constructed from atoms in the local range
  75  * and/or other vsites that are fully local are assigned to a simple,
  76  * independent task.
  77  *
  78  * Vsites that are not assigned after using the above criterion get assigned
  79  * to a so called "interdependent" thread task when none of the constructing
  80  * atoms is a vsite. These tasks are called interdependent, because one task
  81  * accesses atoms assigned to a different task/thread.
  82  * Note that this option is turned off with large (local) atom counts
  83  * to avoid high memory usage.
  84  *
  85  * Any remaining vsites are assigned to a separate master thread task.
  86  */
  87
  88 using gmx::RVec;
  89
  90 static void init_ilist(t_ilist *ilist)
  91 {
  92     for (int i = 0; i < F_NRE; i++)
  93     {
  94         ilist[i].nr     = 0;
  95         ilist[i].nalloc = 0;
  96         ilist[i].iatoms = nullptr;
  97     }
  98 }
  99
 /*! \brief Holds the atom indices that belong to one task */
 struct AtomIndex
 {
     //! The atom indices
     std::vector<int> atom;
 };
 105
 106 /*! \brief Data structure for thread tasks that use constructing atoms outside their own atom range */
 107 struct InterdependentTask
 108 {
 109     //! The interaction lists, only vsite entries are used
 110     t_ilist                ilist[F_NRE];
 111     //! Thread/task-local force buffer
 112     std::vector<RVec>      force;
 113     //! The atom indices of the vsites of our task
 114     std::vector<int>       vsite;
 115     //! Flags if elements in force are spread to or not
 116     std::vector<bool>      use;
 117     //! The number of entries set to true in use
 118     int                    nuse;
 119     //! Array of atoms indices, size nthreads, covering all nuse set elements in use
 120     std::vector<AtomIndex> atomIndex;
 121     //! List of tasks (force blocks) this task spread forces to
 122     std::vector<int>       spreadTask;
 123     //! List of tasks that write to this tasks force block range
 124     std::vector<int>       reduceTask;
 125
 126     InterdependentTask()
 127     {
 128         init_ilist(ilist);
 129         nuse = 0;
 130     }
 131 };
 132
 133 /*! \brief Vsite thread task data structure */
 134 struct VsiteThread {
 135     //! Start of atom range of this task
 136     int                rangeStart;
 137     //! End of atom range of this task
 138     int                rangeEnd;
 139     //! The interaction lists, only vsite entries are used
 140     t_ilist            ilist[F_NRE];
 141     //! Local fshift accumulation buffer
 142     rvec               fshift[SHIFTS];
 143     //! Local virial dx*df accumulation buffer
 144     matrix             dxdf;
 145     //! Tells if interdependent task idTask should be used (in addition to the rest of this task), this bool has the same value on all threads
 146     bool               useInterdependentTask;
 147     //! Data for vsites that involve constructing atoms in the atom range of other threads/tasks
 148     InterdependentTask idTask;
 149
 150     /*! \brief Constructor */
 151     VsiteThread()
 152     {
 153         rangeStart            = -1;
 154         rangeEnd              = -1;
 155         init_ilist(ilist);
 156         clear_rvecs(SHIFTS, fshift);
 157         clear_mat(dxdf);
 158         useInterdependentTask = false;
 159     }
 160 };
 161
 162
 163 /* The start and end values of for the vsite indices in the ftype enum.
 164  * The validity of these values is checked in init_vsite.
 165  * This is used to avoid loops over all ftypes just to get the vsite entries.
 166  * (We should replace the fixed ilist array by only the used entries.)
 167  */
 168 static const int c_ftypeVsiteStart = F_VSITE2;
 169 static const int c_ftypeVsiteEnd   = F_VSITEN + 1;
 170
 171
 172 /*! \brief Returns the sum of the vsite ilist sizes over all vsite types
 173  *
 174  * \param[in] ilist  The interaction list
 175  */
 176 static int vsiteIlistNrCount(const t_ilist *ilist)
 177 {
 178     int nr = 0;
 179     for (int ftype = c_ftypeVsiteStart; ftype < c_ftypeVsiteEnd; ftype++)
 180     {
 181         nr += ilist[ftype].nr;
 182     }
 183
 184     return nr;
 185 }
 186
 187 static int pbc_rvec_sub(const t_pbc *pbc, const rvec xi, const rvec xj, rvec dx)
 188 {
 189     if (pbc)
 190     {
 191         return pbc_dx_aiuc(pbc, xi, xj, dx);
 192     }
 193     else
 194     {
 195         rvec_sub(xi, xj, dx);
 196         return CENTRAL;
 197     }
 198 }
 199
 200 /* Vsite construction routines */
 201
 202 static void constr_vsite2(const rvec xi, const rvec xj, rvec x, real a, const t_pbc *pbc)
 203 {
 204     real b = 1 - a;
 205     /* 1 flop */
 206
 207     if (pbc)
 208     {
 209         rvec dx;
 210         pbc_dx_aiuc(pbc, xj, xi, dx);
 211         x[XX] = xi[XX] + a*dx[XX];
 212         x[YY] = xi[YY] + a*dx[YY];
 213         x[ZZ] = xi[ZZ] + a*dx[ZZ];
 214     }
 215     else
 216     {
 217         x[XX] = b*xi[XX] + a*xj[XX];
 218         x[YY] = b*xi[YY] + a*xj[YY];
 219         x[ZZ] = b*xi[ZZ] + a*xj[ZZ];
 220         /* 9 Flops */
 221     }
 222
 223     /* TOTAL: 10 flops */
 224 }
 225
 226 static void constr_vsite3(const rvec xi, const rvec xj, const rvec xk, rvec x, real a, real b,
 227                           const t_pbc *pbc)
 228 {
 229     real c = 1 - a - b;
 230     /* 2 flops */
 231
 232     if (pbc)
 233     {
 234         rvec dxj, dxk;
 235
 236         pbc_dx_aiuc(pbc, xj, xi, dxj);
 237         pbc_dx_aiuc(pbc, xk, xi, dxk);
 238         x[XX] = xi[XX] + a*dxj[XX] + b*dxk[XX];
 239         x[YY] = xi[YY] + a*dxj[YY] + b*dxk[YY];
 240         x[ZZ] = xi[ZZ] + a*dxj[ZZ] + b*dxk[ZZ];
 241     }
 242     else
 243     {
 244         x[XX] = c*xi[XX] + a*xj[XX] + b*xk[XX];
 245         x[YY] = c*xi[YY] + a*xj[YY] + b*xk[YY];
 246         x[ZZ] = c*xi[ZZ] + a*xj[ZZ] + b*xk[ZZ];
 247         /* 15 Flops */
 248     }
 249
 250     /* TOTAL: 17 flops */
 251 }
 252
 253 static void constr_vsite3FD(const rvec xi, const rvec xj, const rvec xk, rvec x, real a, real b,
 254                             const t_pbc *pbc)
 255 {
 256     rvec xij, xjk, temp;
 257     real c;
 258
 259     pbc_rvec_sub(pbc, xj, xi, xij);
 260     pbc_rvec_sub(pbc, xk, xj, xjk);
 261     /* 6 flops */
 262
 263     /* temp goes from i to a point on the line jk */
 264     temp[XX] = xij[XX] + a*xjk[XX];
 265     temp[YY] = xij[YY] + a*xjk[YY];
 266     temp[ZZ] = xij[ZZ] + a*xjk[ZZ];
 267     /* 6 flops */
 268
 269     c = b*gmx::invsqrt(iprod(temp, temp));
 270     /* 6 + 10 flops */
 271
 272     x[XX] = xi[XX] + c*temp[XX];
 273     x[YY] = xi[YY] + c*temp[YY];
 274     x[ZZ] = xi[ZZ] + c*temp[ZZ];
 275     /* 6 Flops */
 276
 277     /* TOTAL: 34 flops */
 278 }
 279
 280 static void constr_vsite3FAD(const rvec xi, const rvec xj, const rvec xk, rvec x, real a, real b, const t_pbc *pbc)
 281 {
 282     rvec xij, xjk, xp;
 283     real a1, b1, c1, invdij;
 284
 285     pbc_rvec_sub(pbc, xj, xi, xij);
 286     pbc_rvec_sub(pbc, xk, xj, xjk);
 287     /* 6 flops */
 288
 289     invdij = gmx::invsqrt(iprod(xij, xij));
 290     c1     = invdij * invdij * iprod(xij, xjk);
 291     xp[XX] = xjk[XX] - c1*xij[XX];
 292     xp[YY] = xjk[YY] - c1*xij[YY];
 293     xp[ZZ] = xjk[ZZ] - c1*xij[ZZ];
 294     a1     = a*invdij;
 295     b1     = b*gmx::invsqrt(iprod(xp, xp));
 296     /* 45 */
 297
 298     x[XX] = xi[XX] + a1*xij[XX] + b1*xp[XX];
 299     x[YY] = xi[YY] + a1*xij[YY] + b1*xp[YY];
 300     x[ZZ] = xi[ZZ] + a1*xij[ZZ] + b1*xp[ZZ];
 301     /* 12 Flops */
 302
 303     /* TOTAL: 63 flops */
 304 }
 305
 306 static void constr_vsite3OUT(const rvec xi, const rvec xj, const rvec xk, rvec x,
 307                              real a, real b, real c, const t_pbc *pbc)
 308 {
 309     rvec xij, xik, temp;
 310
 311     pbc_rvec_sub(pbc, xj, xi, xij);
 312     pbc_rvec_sub(pbc, xk, xi, xik);
 313     cprod(xij, xik, temp);
 314     /* 15 Flops */
 315
 316     x[XX] = xi[XX] + a*xij[XX] + b*xik[XX] + c*temp[XX];
 317     x[YY] = xi[YY] + a*xij[YY] + b*xik[YY] + c*temp[YY];
 318     x[ZZ] = xi[ZZ] + a*xij[ZZ] + b*xik[ZZ] + c*temp[ZZ];
 319     /* 18 Flops */
 320
 321     /* TOTAL: 33 flops */
 322 }
 323
 324 static void constr_vsite4FD(const rvec xi, const rvec xj, const rvec xk, const rvec xl, rvec x,
 325                             real a, real b, real c, const t_pbc *pbc)
 326 {
 327     rvec xij, xjk, xjl, temp;
 328     real d;
 329
 330     pbc_rvec_sub(pbc, xj, xi, xij);
 331     pbc_rvec_sub(pbc, xk, xj, xjk);
 332     pbc_rvec_sub(pbc, xl, xj, xjl);
 333     /* 9 flops */
 334
 335     /* temp goes from i to a point on the plane jkl */
 336     temp[XX] = xij[XX] + a*xjk[XX] + b*xjl[XX];
 337     temp[YY] = xij[YY] + a*xjk[YY] + b*xjl[YY];
 338     temp[ZZ] = xij[ZZ] + a*xjk[ZZ] + b*xjl[ZZ];
 339     /* 12 flops */
 340
 341     d = c*gmx::invsqrt(iprod(temp, temp));
 342     /* 6 + 10 flops */
 343
 344     x[XX] = xi[XX] + d*temp[XX];
 345     x[YY] = xi[YY] + d*temp[YY];
 346     x[ZZ] = xi[ZZ] + d*temp[ZZ];
 347     /* 6 Flops */
 348
 349     /* TOTAL: 43 flops */
 350 }
 351
 352 static void constr_vsite4FDN(const rvec xi, const rvec xj, const rvec xk, const rvec xl, rvec x,
 353                              real a, real b, real c, const t_pbc *pbc)
 354 {
 355     rvec xij, xik, xil, ra, rb, rja, rjb, rm;
 356     real d;
 357
 358     pbc_rvec_sub(pbc, xj, xi, xij);
 359     pbc_rvec_sub(pbc, xk, xi, xik);
 360     pbc_rvec_sub(pbc, xl, xi, xil);
 361     /* 9 flops */
 362
 363     ra[XX] = a*xik[XX];
 364     ra[YY] = a*xik[YY];
 365     ra[ZZ] = a*xik[ZZ];
 366
 367     rb[XX] = b*xil[XX];
 368     rb[YY] = b*xil[YY];
 369     rb[ZZ] = b*xil[ZZ];
 370
 371     /* 6 flops */
 372
 373     rvec_sub(ra, xij, rja);
 374     rvec_sub(rb, xij, rjb);
 375     /* 6 flops */
 376
 377     cprod(rja, rjb, rm);
 378     /* 9 flops */
 379
 380     d = c*gmx::invsqrt(norm2(rm));
 381     /* 5+5+1 flops */
 382
 383     x[XX] = xi[XX] + d*rm[XX];
 384     x[YY] = xi[YY] + d*rm[YY];
 385     x[ZZ] = xi[ZZ] + d*rm[ZZ];
 386     /* 6 Flops */
 387
 388     /* TOTAL: 47 flops */
 389 }
 390
 391
 392 static int constr_vsiten(const t_iatom *ia, const t_iparams ip[],
 393                          rvec *x, const t_pbc *pbc)
 394 {
 395     rvec x1, dx;
 396     dvec dsum;
 397     int  n3, av, ai;
 398     real a;
 399
 400     n3 = 3*ip[ia[0]].vsiten.n;
 401     av = ia[1];
 402     ai = ia[2];
 403     copy_rvec(x[ai], x1);
 404     clear_dvec(dsum);
 405     for (int i = 3; i < n3; i += 3)
 406     {
 407         ai = ia[i+2];
 408         a  = ip[ia[i]].vsiten.a;
 409         if (pbc)
 410         {
 411             pbc_dx_aiuc(pbc, x[ai], x1, dx);
 412         }
 413         else
 414         {
 415             rvec_sub(x[ai], x1, dx);
 416         }
 417         dsum[XX] += a*dx[XX];
 418         dsum[YY] += a*dx[YY];
 419         dsum[ZZ] += a*dx[ZZ];
 420         /* 9 Flops */
 421     }
 422
 423     x[av][XX] = x1[XX] + dsum[XX];
 424     x[av][YY] = x1[YY] + dsum[YY];
 425     x[av][ZZ] = x1[ZZ] + dsum[ZZ];
 426
 427     return n3;
 428 }
 429
 /*! \brief The ways PBC can be treated during vsite construction and spreading */
 enum class PbcMode
 {
     //! Apply normal, simple PBC for all vsites
     all,
     //! Keep vsite in the same periodic image as the rest of its charge group
     chargeGroup,
     //! No PBC treatment needed
     none
 };
 437
 438 /*! \brief Returns the PBC mode based on the system PBC and vsite properties
 439  *
 440  * \param[in] pbcPtr  A pointer to a PBC struct or nullptr when no PBC treatment is required
 441  * \param[in] vsite   A pointer to the vsite struct, can be nullptr
 442  */
 443 static PbcMode getPbcMode(const t_pbc       *pbcPtr,
 444                           const gmx_vsite_t *vsite)
 445 {
 446     if (pbcPtr == nullptr)
 447     {
 448         return PbcMode::none;
 449     }
 450     else if (vsite != nullptr && vsite->bHaveChargeGroups)
 451     {
 452         return PbcMode::chargeGroup;
 453     }
 454     else
 455     {
 456         return PbcMode::all;
 457     }
 458 }
 459
 460 static void construct_vsites_thread(const gmx_vsite_t *vsite,
 461                                     rvec x[],
 462                                     real dt, rvec *v,
 463                                     const t_iparams ip[], const t_ilist ilist[],
 464                                     const t_pbc *pbc_null)
 465 {
 466     real         inv_dt;
 467     if (v != nullptr)
 468     {
 469         inv_dt = 1.0/dt;
 470     }
 471     else
 472     {
 473         inv_dt = 1.0;
 474     }
 475
 476     const PbcMode  pbcMode   = getPbcMode(pbc_null, vsite);
 477     /* We need another pbc pointer, as with charge groups we switch per vsite */
 478     const t_pbc   *pbc_null2 = pbc_null;
 479     const int     *vsite_pbc = nullptr;
 480
 481     for (int ftype = c_ftypeVsiteStart; ftype < c_ftypeVsiteEnd; ftype++)
 482     {
 483         if (ilist[ftype].nr == 0)
 484         {
 485             continue;
 486         }
 487
 488         {   // TODO remove me
 489             int            nra = interaction_function[ftype].nratoms;
 490             int            inc = 1 + nra;
 491             int            nr  = ilist[ftype].nr;
 492
 493             const t_iatom *ia = ilist[ftype].iatoms;
 494
 495             if (pbcMode == PbcMode::chargeGroup)
 496             {
 497                 vsite_pbc = vsite->vsite_pbc_loc[ftype - c_ftypeVsiteStart];
 498             }
 499
 500             for (int i = 0; i < nr; )
 501             {
 502                 int  tp     = ia[0];
 503                 /* The vsite and constructing atoms */
 504                 int  avsite = ia[1];
 505                 int  ai     = ia[2];
 506                 /* Constants for constructing vsites */
 507                 real a1     = ip[tp].vsite.a;
 508                 /* Check what kind of pbc we need to use */
 509                 int  pbc_atom;
 510                 rvec xpbc;
 511                 if (pbcMode == PbcMode::all)
 512                 {
 513                     /* No charge groups, vsite follows its own pbc */
 514                     pbc_atom = avsite;
 515                     copy_rvec(x[avsite], xpbc);
 516                 }
 517                 else if (pbcMode == PbcMode::chargeGroup)
 518                 {
 519                     pbc_atom = vsite_pbc[i/(1 + nra)];
 520                     if (pbc_atom > -2)
 521                     {
 522                         if (pbc_atom >= 0)
 523                         {
 524                             /* We need to copy the coordinates here,
 525                              * single for single atom cg's pbc_atom
 526                              * is the vsite itself.
 527                              */
 528                             copy_rvec(x[pbc_atom], xpbc);
 529                         }
 530                         pbc_null2 = pbc_null;
 531                     }
 532                     else
 533                     {
 534                         pbc_null2 = nullptr;
 535                     }
 536                 }
 537                 else
 538                 {
 539                     pbc_atom = -2;
 540                 }
 541                 /* Copy the old position */
 542                 rvec xv;
 543                 copy_rvec(x[avsite], xv);
 544
 545                 /* Construct the vsite depending on type */
 546                 int  aj, ak, al;
 547                 real b1, c1;
 548                 switch (ftype)
 549                 {
 550                     case F_VSITE2:
 551                         aj = ia[3];
 552                         constr_vsite2(x[ai], x[aj], x[avsite], a1, pbc_null2);
 553                         break;
 554                     case F_VSITE3:
 555                         aj = ia[3];
 556                         ak = ia[4];
 557                         b1 = ip[tp].vsite.b;
 558                         constr_vsite3(x[ai], x[aj], x[ak], x[avsite], a1, b1, pbc_null2);
 559                         break;
 560                     case F_VSITE3FD:
 561                         aj = ia[3];
 562                         ak = ia[4];
 563                         b1 = ip[tp].vsite.b;
 564                         constr_vsite3FD(x[ai], x[aj], x[ak], x[avsite], a1, b1, pbc_null2);
 565                         break;
 566                     case F_VSITE3FAD:
 567                         aj = ia[3];
 568                         ak = ia[4];
 569                         b1 = ip[tp].vsite.b;
 570                         constr_vsite3FAD(x[ai], x[aj], x[ak], x[avsite], a1, b1, pbc_null2);
 571                         break;
 572                     case F_VSITE3OUT:
 573                         aj = ia[3];
 574                         ak = ia[4];
 575                         b1 = ip[tp].vsite.b;
 576                         c1 = ip[tp].vsite.c;
 577                         constr_vsite3OUT(x[ai], x[aj], x[ak], x[avsite], a1, b1, c1, pbc_null2);
 578                         break;
 579                     case F_VSITE4FD:
 580                         aj = ia[3];
 581                         ak = ia[4];
 582                         al = ia[5];
 583                         b1 = ip[tp].vsite.b;
 584                         c1 = ip[tp].vsite.c;
 585                         constr_vsite4FD(x[ai], x[aj], x[ak], x[al], x[avsite], a1, b1, c1,
 586                                         pbc_null2);
 587                         break;
 588                     case F_VSITE4FDN:
 589                         aj = ia[3];
 590                         ak = ia[4];
 591                         al = ia[5];
 592                         b1 = ip[tp].vsite.b;
 593                         c1 = ip[tp].vsite.c;
 594                         constr_vsite4FDN(x[ai], x[aj], x[ak], x[al], x[avsite], a1, b1, c1,
 595                                          pbc_null2);
 596                         break;
 597                     case F_VSITEN:
 598                         inc = constr_vsiten(ia, ip, x, pbc_null2);
 599                         break;
 600                     default:
 601                         gmx_fatal(FARGS, "No such vsite type %d in %s, line %d",
 602                                   ftype, __FILE__, __LINE__);
 603                 }
 604
 605                 if (pbc_atom >= 0)
 606                 {
 607                     /* Match the pbc of this vsite to the rest of its charge group */
 608                     rvec dx;
 609                     int  ishift = pbc_dx_aiuc(pbc_null, x[avsite], xpbc, dx);
 610                     if (ishift != CENTRAL)
 611                     {
 612                         rvec_add(xpbc, dx, x[avsite]);
 613                     }
 614                 }
 615                 if (v != nullptr)
 616                 {
 617                     /* Calculate velocity of vsite... */
 618                     rvec vv;
 619                     rvec_sub(x[avsite], xv, vv);
 620                     svmul(inv_dt, vv, v[avsite]);
 621                 }
 622
 623                 /* Increment loop variables */
 624                 i  += inc;
 625                 ia += inc;
 626             }
 627         }
 628     }
 629 }
 630
 631 void construct_vsites(const gmx_vsite_t *vsite,
 632                       rvec x[],
 633                       real dt, rvec *v,
 634                       const t_iparams ip[], const t_ilist ilist[],
 635                       int ePBC, gmx_bool bMolPBC,
 636                       const t_commrec *cr,
 637                       const matrix box)
 638 {
 639     const bool useDomdec = (vsite != nullptr && vsite->useDomdec);
 640     GMX_ASSERT(!useDomdec || (cr != nullptr && DOMAINDECOMP(cr)), "When vsites are set up with domain decomposition, we need a valid commrec");
 641     // TODO: Remove this assertion when we remove charge groups
 642     GMX_ASSERT(vsite != nullptr || ePBC == epbcNONE, "Without a vsite struct we can not do PBC (in case we have charge groups)");
 643
 644     t_pbc     pbc, *pbc_null;
 645
 646     /* We only need to do pbc when we have inter-cg vsites.
 647      * Note that with domain decomposition we do not need to apply PBC here
 648      * when we have at least 3 domains along each dimension. Currently we
 649      * do not optimize this case.
 650      */
 651     if (ePBC != epbcNONE && (useDomdec || bMolPBC) &&
 652         !(vsite != nullptr && vsite->n_intercg_vsite == 0))
 653     {
 654         /* This is wasting some CPU time as we now do this multiple times
 655          * per MD step.
 656          */
 657         ivec null_ivec;
 658         clear_ivec(null_ivec);
 659         pbc_null = set_pbc_dd(&pbc, ePBC,
 660                               useDomdec ? cr->dd->nc : null_ivec,
 661                               FALSE, box);
 662     }
 663     else
 664     {
 665         pbc_null = nullptr;
 666     }
 667
 668     if (useDomdec)
 669     {
 670         dd_move_x_vsites(cr->dd, box, x);
 671     }
 672
 673     // cppcheck-suppress nullPointerRedundantCheck
 674     if (vsite == nullptr || vsite->nthreads == 1)
 675     {
 676         construct_vsites_thread(vsite,
 677                                 x, dt, v,
 678                                 ip, ilist,
 679                                 pbc_null);
 680     }
 681     else
 682     {
 683 #pragma omp parallel num_threads(vsite->nthreads)
 684         {
 685             try
 686             {
 687                 const int          th    = gmx_omp_get_thread_num();
 688                 const VsiteThread &tData = *vsite->tData[th];
 689                 GMX_ASSERT(tData.rangeStart >= 0, "The thread data should be initialized before calling construct_vsites");
 690
 691                 construct_vsites_thread(vsite,
 692                                         x, dt, v,
 693                                         ip, tData.ilist,
 694                                         pbc_null);
 695                 if (tData.useInterdependentTask)
 696                 {
 697                     /* Here we don't need a barrier (unlike the spreading),
 698                      * since both tasks only construct vsites from particles,
 699                      * or local vsites, not from non-local vsites.
 700                      */
 701                     construct_vsites_thread(vsite,
 702                                             x, dt, v,
 703                                             ip, tData.idTask.ilist,
 704                                             pbc_null);
 705                 }
 706             }
 707             GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
 708         }
 709         /* Now we can construct the vsites that might depend on other vsites */
 710         construct_vsites_thread(vsite,
 711                                 x, dt, v,
 712                                 ip, vsite->tData[vsite->nthreads]->ilist,
 713                                 pbc_null);
 714     }
 715 }
 716
 717 static void spread_vsite2(const t_iatom ia[], real a,
 718                           const rvec x[],
 719                           rvec f[], rvec fshift[],
 720                           const t_pbc *pbc, const t_graph *g)
 721 {
 722     rvec    fi, fj, dx;
 723     t_iatom av, ai, aj;
 724     ivec    di;
 725     int     siv, sij;
 726
 727     av = ia[1];
 728     ai = ia[2];
 729     aj = ia[3];
 730
 731     svmul(1 - a, f[av], fi);
 732     svmul(    a, f[av], fj);
 733     /* 7 flop */
 734
 735     rvec_inc(f[ai], fi);
 736     rvec_inc(f[aj], fj);
 737     /* 6 Flops */
 738
 739     if (g)
 740     {
 741         ivec_sub(SHIFT_IVEC(g, ai), SHIFT_IVEC(g, av), di);
 742         siv = IVEC2IS(di);
 743         ivec_sub(SHIFT_IVEC(g, ai), SHIFT_IVEC(g, aj), di);
 744         sij = IVEC2IS(di);
 745     }
 746     else if (pbc)
 747     {
 748         siv = pbc_dx_aiuc(pbc, x[ai], x[av], dx);
 749         sij = pbc_dx_aiuc(pbc, x[ai], x[aj], dx);
 750     }
 751     else
 752     {
 753         siv = CENTRAL;
 754         sij = CENTRAL;
 755     }
 756
 757     if (fshift && (siv != CENTRAL || sij != CENTRAL))
 758     {
 759         rvec_inc(fshift[siv], f[av]);
 760         rvec_dec(fshift[CENTRAL], fi);
 761         rvec_dec(fshift[sij], fj);
 762     }
 763
 764     /* TOTAL: 13 flops */
 765 }
 766
 767 void constructVsitesGlobal(const gmx_mtop_t         &mtop,
 768                            gmx::ArrayRef<gmx::RVec>  x)
 769 {
 770     GMX_ASSERT(x.size() >= static_cast<size_t>(mtop.natoms), "x should contain the whole system");
 771
 772     for (const gmx_molblock_t &molb : mtop.molblock)
 773     {
 774         const gmx_moltype_t  &molt = mtop.moltype[molb.type];
 775         if (vsiteIlistNrCount(molt.ilist) > 0)
 776         {
 777             int atomOffset = molb.globalAtomStart;
 778             for (int mol = 0; mol < molb.nmol; mol++)
 779             {
 780                 construct_vsites(nullptr, as_rvec_array(x.data()) + atomOffset,
 781                                  0.0, nullptr,
 782                                  mtop.ffparams.iparams, molt.ilist,
 783                                  epbcNONE, TRUE, nullptr, nullptr);
 784                 atomOffset += molt.atoms.nr;
 785             }
 786         }
 787     }
 788 }
 789
 790 static void spread_vsite3(const t_iatom ia[], real a, real b,
 791                           const rvec x[],
 792                           rvec f[], rvec fshift[],
 793                           const t_pbc *pbc, const t_graph *g)
 794 {
 795     rvec    fi, fj, fk, dx;
 796     int     av, ai, aj, ak;
 797     ivec    di;
 798     int     siv, sij, sik;
 799
 800     av = ia[1];
 801     ai = ia[2];
 802     aj = ia[3];
 803     ak = ia[4];
 804
 805     svmul(1 - a - b, f[av], fi);
 806     svmul(        a, f[av], fj);
 807     svmul(        b, f[av], fk);
 808     /* 11 flops */
 809
 810     rvec_inc(f[ai], fi);
 811     rvec_inc(f[aj], fj);
 812     rvec_inc(f[ak], fk);
 813     /* 9 Flops */
 814
 815     if (g)
 816     {
 817         ivec_sub(SHIFT_IVEC(g, ai), SHIFT_IVEC(g, ia[1]), di);
 818         siv = IVEC2IS(di);
 819         ivec_sub(SHIFT_IVEC(g, ai), SHIFT_IVEC(g, aj), di);
 820         sij = IVEC2IS(di);
 821         ivec_sub(SHIFT_IVEC(g, ai), SHIFT_IVEC(g, ak), di);
 822         sik = IVEC2IS(di);
 823     }
 824     else if (pbc)
 825     {
 826         siv = pbc_dx_aiuc(pbc, x[ai], x[av], dx);
 827         sij = pbc_dx_aiuc(pbc, x[ai], x[aj], dx);
 828         sik = pbc_dx_aiuc(pbc, x[ai], x[ak], dx);
 829     }
 830     else
 831     {
 832         siv = CENTRAL;
 833         sij = CENTRAL;
 834         sik = CENTRAL;
 835     }
 836
 837     if (fshift && (siv != CENTRAL || sij != CENTRAL || sik != CENTRAL))
 838     {
 839         rvec_inc(fshift[siv], f[av]);
 840         rvec_dec(fshift[CENTRAL], fi);
 841         rvec_dec(fshift[sij], fj);
 842         rvec_dec(fshift[sik], fk);
 843     }
 844
 845     /* TOTAL: 20 flops */
 846 }
 847
 848 static void spread_vsite3FD(const t_iatom ia[], real a, real b,
 849                             const rvec x[],
 850                             rvec f[], rvec fshift[],
 851                             gmx_bool VirCorr, matrix dxdf,
 852                             const t_pbc *pbc, const t_graph *g)
 853 {
 854     real    c, invl, fproj, a1;
 855     rvec    xvi, xij, xjk, xix, fv, temp;
 856     t_iatom av, ai, aj, ak;
 857     int     svi, sji, skj;
 858     ivec    di;
 859
 860     av = ia[1];
 861     ai = ia[2];
 862     aj = ia[3];
 863     ak = ia[4];
 864     copy_rvec(f[av], fv);
 865
 866     sji = pbc_rvec_sub(pbc, x[aj], x[ai], xij);
 867     skj = pbc_rvec_sub(pbc, x[ak], x[aj], xjk);
 868     /* 6 flops */
 869
 870     /* xix goes from i to point x on the line jk */
 871     xix[XX] = xij[XX]+a*xjk[XX];
 872     xix[YY] = xij[YY]+a*xjk[YY];
 873     xix[ZZ] = xij[ZZ]+a*xjk[ZZ];
 874     /* 6 flops */
 875
 876     invl = gmx::invsqrt(iprod(xix, xix));
 877     c    = b*invl;
 878     /* 4 + ?10? flops */
 879
 880     fproj = iprod(xix, fv)*invl*invl; /* = (xix . f)/(xix . xix) */
 881
 882     temp[XX] = c*(fv[XX]-fproj*xix[XX]);
 883     temp[YY] = c*(fv[YY]-fproj*xix[YY]);
 884     temp[ZZ] = c*(fv[ZZ]-fproj*xix[ZZ]);
 885     /* 16 */
 886
 887     /* c is already calculated in constr_vsite3FD
 888        storing c somewhere will save 26 flops!     */
 889
 890     a1         = 1 - a;
 891     f[ai][XX] += fv[XX] - temp[XX];
 892     f[ai][YY] += fv[YY] - temp[YY];
 893     f[ai][ZZ] += fv[ZZ] - temp[ZZ];
 894     f[aj][XX] += a1*temp[XX];
 895     f[aj][YY] += a1*temp[YY];
 896     f[aj][ZZ] += a1*temp[ZZ];
 897     f[ak][XX] += a*temp[XX];
 898     f[ak][YY] += a*temp[YY];
 899     f[ak][ZZ] += a*temp[ZZ];
 900     /* 19 Flops */
 901
 902     if (g)
 903     {
 904         ivec_sub(SHIFT_IVEC(g, ia[1]), SHIFT_IVEC(g, ai), di);
 905         svi = IVEC2IS(di);
 906         ivec_sub(SHIFT_IVEC(g, aj), SHIFT_IVEC(g, ai), di);
 907         sji = IVEC2IS(di);
 908         ivec_sub(SHIFT_IVEC(g, ak), SHIFT_IVEC(g, aj), di);
 909         skj = IVEC2IS(di);
 910     }
 911     else if (pbc)
 912     {
 913         svi = pbc_rvec_sub(pbc, x[av], x[ai], xvi);
 914     }
 915     else
 916     {
 917         svi = CENTRAL;
 918     }
 919
 920     if (fshift && (svi != CENTRAL || sji != CENTRAL || skj != CENTRAL))
 921     {
 922         rvec_dec(fshift[svi], fv);
 923         fshift[CENTRAL][XX] += fv[XX] - (1 + a)*temp[XX];
 924         fshift[CENTRAL][YY] += fv[YY] - (1 + a)*temp[YY];
 925         fshift[CENTRAL][ZZ] += fv[ZZ] - (1 + a)*temp[ZZ];
 926         fshift[    sji][XX] += temp[XX];
 927         fshift[    sji][YY] += temp[YY];
 928         fshift[    sji][ZZ] += temp[ZZ];
 929         fshift[    skj][XX] += a*temp[XX];
 930         fshift[    skj][YY] += a*temp[YY];
 931         fshift[    skj][ZZ] += a*temp[ZZ];
 932     }
 933
 934     if (VirCorr)
 935     {
 936         /* When VirCorr=TRUE, the virial for the current forces is not
 937          * calculated from the redistributed forces. This means that
 938          * the effect of non-linear virtual site constructions on the virial
 939          * needs to be added separately. This contribution can be calculated
 940          * in many ways, but the simplest and cheapest way is to use
 941          * the first constructing atom ai as a reference position in space:
 942          * subtract (xv-xi)*fv and add (xj-xi)*fj + (xk-xi)*fk.
 943          */
 944         rvec xiv;
 945
 946         pbc_rvec_sub(pbc, x[av], x[ai], xiv);
 947
 948         for (int i = 0; i < DIM; i++)
 949         {
 950             for (int j = 0; j < DIM; j++)
 951             {
 952                 /* As xix is a linear combination of j and k, use that here */
 953                 dxdf[i][j] += -xiv[i]*fv[j] + xix[i]*temp[j];
 954             }
 955         }
 956     }
 957
 958     /* TOTAL: 61 flops */
 959 }
 960
 961 static void spread_vsite3FAD(const t_iatom ia[], real a, real b,
 962                              const rvec x[],
 963                              rvec f[], rvec fshift[],
 964                              gmx_bool VirCorr, matrix dxdf,
 965                              const t_pbc *pbc, const t_graph *g)
 966 {
 967     rvec    xvi, xij, xjk, xperp, Fpij, Fppp, fv, f1, f2, f3;
 968     real    a1, b1, c1, c2, invdij, invdij2, invdp, fproj;
 969     t_iatom av, ai, aj, ak;
 970     int     svi, sji, skj, d;
 971     ivec    di;
 972
 973     av = ia[1];
 974     ai = ia[2];
 975     aj = ia[3];
 976     ak = ia[4];
 977     copy_rvec(f[ia[1]], fv);
 978
 979     sji = pbc_rvec_sub(pbc, x[aj], x[ai], xij);
 980     skj = pbc_rvec_sub(pbc, x[ak], x[aj], xjk);
 981     /* 6 flops */
 982
 983     invdij    = gmx::invsqrt(iprod(xij, xij));
 984     invdij2   = invdij * invdij;
 985     c1        = iprod(xij, xjk) * invdij2;
 986     xperp[XX] = xjk[XX] - c1*xij[XX];
 987     xperp[YY] = xjk[YY] - c1*xij[YY];
 988     xperp[ZZ] = xjk[ZZ] - c1*xij[ZZ];
 989     /* xperp in plane ijk, perp. to ij */
 990     invdp = gmx::invsqrt(iprod(xperp, xperp));
 991     a1    = a*invdij;
 992     b1    = b*invdp;
 993     /* 45 flops */
 994
 995     /* a1, b1 and c1 are already calculated in constr_vsite3FAD
 996        storing them somewhere will save 45 flops!     */
 997
 998     fproj = iprod(xij, fv)*invdij2;
 999     svmul(fproj,                      xij,  Fpij);    /* proj. f on xij */
1000     svmul(iprod(xperp, fv)*invdp*invdp, xperp, Fppp); /* proj. f on xperp */
1001     svmul(b1*fproj,                   xperp, f3);
1002     /* 23 flops */
1003
1004     rvec_sub(fv, Fpij, f1); /* f1 = f - Fpij */
1005     rvec_sub(f1, Fppp, f2); /* f2 = f - Fpij - Fppp */
1006     for (d = 0; (d < DIM); d++)
1007     {
1008         f1[d] *= a1;
1009         f2[d] *= b1;
1010     }
1011     /* 12 flops */
1012
1013     c2         = 1 + c1;
1014     f[ai][XX] += fv[XX] - f1[XX] + c1*f2[XX] + f3[XX];
1015     f[ai][YY] += fv[YY] - f1[YY] + c1*f2[YY] + f3[YY];
1016     f[ai][ZZ] += fv[ZZ] - f1[ZZ] + c1*f2[ZZ] + f3[ZZ];
1017     f[aj][XX] +=          f1[XX] - c2*f2[XX] - f3[XX];
1018     f[aj][YY] +=          f1[YY] - c2*f2[YY] - f3[YY];
1019     f[aj][ZZ] +=          f1[ZZ] - c2*f2[ZZ] - f3[ZZ];
1020     f[ak][XX] +=                      f2[XX];
1021     f[ak][YY] +=                      f2[YY];
1022     f[ak][ZZ] +=                      f2[ZZ];
1023     /* 30 Flops */
1024
1025     if (g)
1026     {
1027         ivec_sub(SHIFT_IVEC(g, ia[1]), SHIFT_IVEC(g, ai), di);
1028         svi = IVEC2IS(di);
1029         ivec_sub(SHIFT_IVEC(g, aj), SHIFT_IVEC(g, ai), di);
1030         sji = IVEC2IS(di);
1031         ivec_sub(SHIFT_IVEC(g, ak), SHIFT_IVEC(g, aj), di);
1032         skj = IVEC2IS(di);
1033     }
1034     else if (pbc)
1035     {
1036         svi = pbc_rvec_sub(pbc, x[av], x[ai], xvi);
1037     }
1038     else
1039     {
1040         svi = CENTRAL;
1041     }
1042
1043     if (fshift && (svi != CENTRAL || sji != CENTRAL || skj != CENTRAL))
1044     {
1045         rvec_dec(fshift[svi], fv);
1046         fshift[CENTRAL][XX] += fv[XX] - f1[XX] - (1-c1)*f2[XX] + f3[XX];
1047         fshift[CENTRAL][YY] += fv[YY] - f1[YY] - (1-c1)*f2[YY] + f3[YY];
1048         fshift[CENTRAL][ZZ] += fv[ZZ] - f1[ZZ] - (1-c1)*f2[ZZ] + f3[ZZ];
1049         fshift[    sji][XX] +=          f1[XX] -    c1 *f2[XX] - f3[XX];
1050         fshift[    sji][YY] +=          f1[YY] -    c1 *f2[YY] - f3[YY];
1051         fshift[    sji][ZZ] +=          f1[ZZ] -    c1 *f2[ZZ] - f3[ZZ];
1052         fshift[    skj][XX] +=                          f2[XX];
1053         fshift[    skj][YY] +=                          f2[YY];
1054         fshift[    skj][ZZ] +=                          f2[ZZ];
1055     }
1056
1057     if (VirCorr)
1058     {
1059         rvec xiv;
1060         int  i, j;
1061
1062         pbc_rvec_sub(pbc, x[av], x[ai], xiv);
1063
1064         for (i = 0; i < DIM; i++)
1065         {
1066             for (j = 0; j < DIM; j++)
1067             {
1068                 /* Note that xik=xij+xjk, so we have to add xij*f2 */
1069                 dxdf[i][j] +=
1070                     -xiv[i]*fv[j]
1071                     + xij[i]*(f1[j] + (1 - c2)*f2[j] - f3[j])
1072                     + xjk[i]*f2[j];
1073             }
1074         }
1075     }
1076
1077     /* TOTAL: 113 flops */
1078 }
1079
1080 static void spread_vsite3OUT(const t_iatom ia[], real a, real b, real c,
1081                              const rvec x[],
1082                              rvec f[], rvec fshift[],
1083                              gmx_bool VirCorr, matrix dxdf,
1084                              const t_pbc *pbc, const t_graph *g)
1085 {
1086     rvec    xvi, xij, xik, fv, fj, fk;
1087     real    cfx, cfy, cfz;
1088     int     av, ai, aj, ak;
1089     ivec    di;
1090     int     svi, sji, ski;
1091
1092     av = ia[1];
1093     ai = ia[2];
1094     aj = ia[3];
1095     ak = ia[4];
1096
1097     sji = pbc_rvec_sub(pbc, x[aj], x[ai], xij);
1098     ski = pbc_rvec_sub(pbc, x[ak], x[ai], xik);
1099     /* 6 Flops */
1100
1101     copy_rvec(f[av], fv);
1102
1103     cfx = c*fv[XX];
1104     cfy = c*fv[YY];
1105     cfz = c*fv[ZZ];
1106     /* 3 Flops */
1107
1108     fj[XX] = a*fv[XX]     -  xik[ZZ]*cfy +  xik[YY]*cfz;
1109     fj[YY] =  xik[ZZ]*cfx + a*fv[YY]     -  xik[XX]*cfz;
1110     fj[ZZ] = -xik[YY]*cfx +  xik[XX]*cfy + a*fv[ZZ];
1111
1112     fk[XX] = b*fv[XX]     +  xij[ZZ]*cfy -  xij[YY]*cfz;
1113     fk[YY] = -xij[ZZ]*cfx + b*fv[YY]     +  xij[XX]*cfz;
1114     fk[ZZ] =  xij[YY]*cfx -  xij[XX]*cfy + b*fv[ZZ];
1115     /* 30 Flops */
1116
1117     f[ai][XX] += fv[XX] - fj[XX] - fk[XX];
1118     f[ai][YY] += fv[YY] - fj[YY] - fk[YY];
1119     f[ai][ZZ] += fv[ZZ] - fj[ZZ] - fk[ZZ];
1120     rvec_inc(f[aj], fj);
1121     rvec_inc(f[ak], fk);
1122     /* 15 Flops */
1123
1124     if (g)
1125     {
1126         ivec_sub(SHIFT_IVEC(g, ia[1]), SHIFT_IVEC(g, ai), di);
1127         svi = IVEC2IS(di);
1128         ivec_sub(SHIFT_IVEC(g, aj), SHIFT_IVEC(g, ai), di);
1129         sji = IVEC2IS(di);
1130         ivec_sub(SHIFT_IVEC(g, ak), SHIFT_IVEC(g, ai), di);
1131         ski = IVEC2IS(di);
1132     }
1133     else if (pbc)
1134     {
1135         svi = pbc_rvec_sub(pbc, x[av], x[ai], xvi);
1136     }
1137     else
1138     {
1139         svi = CENTRAL;
1140     }
1141
1142     if (fshift && (svi != CENTRAL || sji != CENTRAL || ski != CENTRAL))
1143     {
1144         rvec_dec(fshift[svi], fv);
1145         fshift[CENTRAL][XX] += fv[XX] - fj[XX] - fk[XX];
1146         fshift[CENTRAL][YY] += fv[YY] - fj[YY] - fk[YY];
1147         fshift[CENTRAL][ZZ] += fv[ZZ] - fj[ZZ] - fk[ZZ];
1148         rvec_inc(fshift[sji], fj);
1149         rvec_inc(fshift[ski], fk);
1150     }
1151
1152     if (VirCorr)
1153     {
1154         rvec xiv;
1155
1156         pbc_rvec_sub(pbc, x[av], x[ai], xiv);
1157
1158         for (int i = 0; i < DIM; i++)
1159         {
1160             for (int j = 0; j < DIM; j++)
1161             {
1162                 dxdf[i][j] += -xiv[i]*fv[j] + xij[i]*fj[j] + xik[i]*fk[j];
1163             }
1164         }
1165     }
1166
1167     /* TOTAL: 54 flops */
1168 }
1169
1170 static void spread_vsite4FD(const t_iatom ia[], real a, real b, real c,
1171                             const rvec x[],
1172                             rvec f[], rvec fshift[],
1173                             gmx_bool VirCorr, matrix dxdf,
1174                             const t_pbc *pbc, const t_graph *g)
1175 {
1176     real    d, invl, fproj, a1;
1177     rvec    xvi, xij, xjk, xjl, xix, fv, temp;
1178     int     av, ai, aj, ak, al;
1179     ivec    di;
1180     int     svi, sji, skj, slj, m;
1181
1182     av = ia[1];
1183     ai = ia[2];
1184     aj = ia[3];
1185     ak = ia[4];
1186     al = ia[5];
1187
1188     sji = pbc_rvec_sub(pbc, x[aj], x[ai], xij);
1189     skj = pbc_rvec_sub(pbc, x[ak], x[aj], xjk);
1190     slj = pbc_rvec_sub(pbc, x[al], x[aj], xjl);
1191     /* 9 flops */
1192
1193     /* xix goes from i to point x on the plane jkl */
1194     for (m = 0; m < DIM; m++)
1195     {
1196         xix[m] = xij[m] + a*xjk[m] + b*xjl[m];
1197     }
1198     /* 12 flops */
1199
1200     invl = gmx::invsqrt(iprod(xix, xix));
1201     d    = c*invl;
1202     /* 4 + ?10? flops */
1203
1204     copy_rvec(f[av], fv);
1205
1206     fproj = iprod(xix, fv)*invl*invl; /* = (xix . f)/(xix . xix) */
1207
1208     for (m = 0; m < DIM; m++)
1209     {
1210         temp[m] = d*(fv[m] - fproj*xix[m]);
1211     }
1212     /* 16 */
1213
1214     /* c is already calculated in constr_vsite3FD
1215        storing c somewhere will save 35 flops!     */
1216
1217     a1 = 1 - a - b;
1218     for (m = 0; m < DIM; m++)
1219     {
1220         f[ai][m] += fv[m] - temp[m];
1221         f[aj][m] += a1*temp[m];
1222         f[ak][m] += a*temp[m];
1223         f[al][m] += b*temp[m];
1224     }
1225     /* 26 Flops */
1226
1227     if (g)
1228     {
1229         ivec_sub(SHIFT_IVEC(g, ia[1]), SHIFT_IVEC(g, ai), di);
1230         svi = IVEC2IS(di);
1231         ivec_sub(SHIFT_IVEC(g, aj), SHIFT_IVEC(g, ai), di);
1232         sji = IVEC2IS(di);
1233         ivec_sub(SHIFT_IVEC(g, ak), SHIFT_IVEC(g, aj), di);
1234         skj = IVEC2IS(di);
1235         ivec_sub(SHIFT_IVEC(g, al), SHIFT_IVEC(g, aj), di);
1236         slj = IVEC2IS(di);
1237     }
1238     else if (pbc)
1239     {
1240         svi = pbc_rvec_sub(pbc, x[av], x[ai], xvi);
1241     }
1242     else
1243     {
1244         svi = CENTRAL;
1245     }
1246
1247     if (fshift &&
1248         (svi != CENTRAL || sji != CENTRAL || skj != CENTRAL || slj != CENTRAL))
1249     {
1250         rvec_dec(fshift[svi], fv);
1251         for (m = 0; m < DIM; m++)
1252         {
1253             fshift[CENTRAL][m] += fv[m] - (1 + a + b)*temp[m];
1254             fshift[    sji][m] += temp[m];
1255             fshift[    skj][m] += a*temp[m];
1256             fshift[    slj][m] += b*temp[m];
1257         }
1258     }
1259
1260     if (VirCorr)
1261     {
1262         rvec xiv;
1263         int  i, j;
1264
1265         pbc_rvec_sub(pbc, x[av], x[ai], xiv);
1266
1267         for (i = 0; i < DIM; i++)
1268         {
1269             for (j = 0; j < DIM; j++)
1270             {
1271                 dxdf[i][j] += -xiv[i]*fv[j] + xix[i]*temp[j];
1272             }
1273         }
1274     }
1275
1276     /* TOTAL: 77 flops */
1277 }
1278
1279
/*! \brief Spreads the force on a 4FDN virtual site onto its four constructing atoms
 *
 * The site is ia[1], the constructing atoms i, j, k and l are ia[2] to
 * ia[5]. The force on the site is read from f; the forces fj, fk and fl
 * are added to f of atoms j, k and l, and atom i receives fv - fj - fk - fl.
 * The entry of the site itself in f is not modified by this function.
 *
 * \param[in]     ia       Interaction entry holding the vsite and atom indices
 * \param[in]     a        Scaling applied to xik to obtain ra
 * \param[in]     b        Scaling applied to xil to obtain rb
 * \param[in]     c        Construction parameter, gets multiplied with 1/|rm|
 * \param[in]     x        Coordinates
 * \param[in,out] f        Forces
 * \param[in,out] fshift   Shift forces, only updated when non-null and at least one shift is not CENTRAL
 * \param[in]     VirCorr  When set, the virial correction is accumulated in \p dxdf
 * \param[in,out] dxdf     Accumulator for the virial correction
 * \param[in]     pbc      PBC data handed to pbc_rvec_sub
 * \param[in]     g        Graph; when non-null all shift indices are derived from it
 */
static void spread_vsite4FDN(const t_iatom ia[], real a, real b, real c,
                             const rvec x[],
                             rvec f[], rvec fshift[],
                             gmx_bool VirCorr, matrix dxdf,
                             const t_pbc *pbc, const t_graph *g)
{
    rvec xvi, xij, xik, xil, ra, rb, rja, rjb, rab, rm, rt;
    rvec fv, fj, fk, fl;
    real invrm, denom;
    real cfx, cfy, cfz;
    ivec di;
    int  av, ai, aj, ak, al;
    int  svi, sij, sik, sil;

    /* DEBUG: check atom indices */
    av = ia[1];
    ai = ia[2];
    aj = ia[3];
    ak = ia[4];
    al = ia[5];

    /* All spreading below works on a private copy of the vsite force */
    copy_rvec(f[av], fv);

    /* Unlike the 3FD/4FD variants, all three vectors here start at atom i */
    sij = pbc_rvec_sub(pbc, x[aj], x[ai], xij);
    sik = pbc_rvec_sub(pbc, x[ak], x[ai], xik);
    sil = pbc_rvec_sub(pbc, x[al], x[ai], xil);
    /* 9 flops */

    /* ra = a*xik */
    ra[XX] = a*xik[XX];
    ra[YY] = a*xik[YY];
    ra[ZZ] = a*xik[ZZ];

    /* rb = b*xil */
    rb[XX] = b*xil[XX];
    rb[YY] = b*xil[YY];
    rb[ZZ] = b*xil[ZZ];

    /* 6 flops */

    /* rja = ra - xij, rjb = rb - xij, rab = rb - ra */
    rvec_sub(ra, xij, rja);
    rvec_sub(rb, xij, rjb);
    rvec_sub(rb, ra, rab);
    /* 9 flops */

    /* rm = rja x rjb */
    cprod(rja, rjb, rm);
    /* 9 flops */

    /* invrm = 1/|rm|, denom = 1/|rm|^2 */
    invrm = gmx::invsqrt(norm2(rm));
    denom = invrm*invrm;
    /* 5+5+2 flops */

    /* The vsite force scaled by c/|rm|, used in all three force expressions */
    cfx = c*invrm*fv[XX];
    cfy = c*invrm*fv[YY];
    cfz = c*invrm*fv[ZZ];
    /* 6 Flops */

    /* rt is a scratch vector, recomputed for each of fj, fk and fl */
    cprod(rm, rab, rt);
    /* 9 flops */

    rt[XX] *= denom;
    rt[YY] *= denom;
    rt[ZZ] *= denom;
    /* 3flops */

    /* Force on atom j */
    fj[XX] = (        -rm[XX]*rt[XX]) * cfx + ( rab[ZZ]-rm[YY]*rt[XX]) * cfy + (-rab[YY]-rm[ZZ]*rt[XX]) * cfz;
    fj[YY] = (-rab[ZZ]-rm[XX]*rt[YY]) * cfx + (        -rm[YY]*rt[YY]) * cfy + ( rab[XX]-rm[ZZ]*rt[YY]) * cfz;
    fj[ZZ] = ( rab[YY]-rm[XX]*rt[ZZ]) * cfx + (-rab[XX]-rm[YY]*rt[ZZ]) * cfy + (        -rm[ZZ]*rt[ZZ]) * cfz;
    /* 30 flops */

    cprod(rjb, rm, rt);
    /* 9 flops */

    rt[XX] *= denom*a;
    rt[YY] *= denom*a;
    rt[ZZ] *= denom*a;
    /* 3flops */

    /* Force on atom k */
    fk[XX] = (          -rm[XX]*rt[XX]) * cfx + (-a*rjb[ZZ]-rm[YY]*rt[XX]) * cfy + ( a*rjb[YY]-rm[ZZ]*rt[XX]) * cfz;
    fk[YY] = ( a*rjb[ZZ]-rm[XX]*rt[YY]) * cfx + (          -rm[YY]*rt[YY]) * cfy + (-a*rjb[XX]-rm[ZZ]*rt[YY]) * cfz;
    fk[ZZ] = (-a*rjb[YY]-rm[XX]*rt[ZZ]) * cfx + ( a*rjb[XX]-rm[YY]*rt[ZZ]) * cfy + (          -rm[ZZ]*rt[ZZ]) * cfz;
    /* 36 flops */

    cprod(rm, rja, rt);
    /* 9 flops */

    rt[XX] *= denom*b;
    rt[YY] *= denom*b;
    rt[ZZ] *= denom*b;
    /* 3flops */

    /* Force on atom l */
    fl[XX] = (          -rm[XX]*rt[XX]) * cfx + ( b*rja[ZZ]-rm[YY]*rt[XX]) * cfy + (-b*rja[YY]-rm[ZZ]*rt[XX]) * cfz;
    fl[YY] = (-b*rja[ZZ]-rm[XX]*rt[YY]) * cfx + (          -rm[YY]*rt[YY]) * cfy + ( b*rja[XX]-rm[ZZ]*rt[YY]) * cfz;
    fl[ZZ] = ( b*rja[YY]-rm[XX]*rt[ZZ]) * cfx + (-b*rja[XX]-rm[YY]*rt[ZZ]) * cfy + (          -rm[ZZ]*rt[ZZ]) * cfz;
    /* 36 flops */

    /* Atom i receives what remains of fv after fj, fk and fl are taken out */
    f[ai][XX] += fv[XX] - fj[XX] - fk[XX] - fl[XX];
    f[ai][YY] += fv[YY] - fj[YY] - fk[YY] - fl[YY];
    f[ai][ZZ] += fv[ZZ] - fj[ZZ] - fk[ZZ] - fl[ZZ];
    rvec_inc(f[aj], fj);
    rvec_inc(f[ak], fk);
    rvec_inc(f[al], fl);
    /* 21 flops */

    /* With a graph all shift indices are taken from it, replacing the ones
     * returned by pbc_rvec_sub above; otherwise only svi remains to be set.
     */
    if (g)
    {
        ivec_sub(SHIFT_IVEC(g, av), SHIFT_IVEC(g, ai), di);
        svi = IVEC2IS(di);
        ivec_sub(SHIFT_IVEC(g, aj), SHIFT_IVEC(g, ai), di);
        sij = IVEC2IS(di);
        ivec_sub(SHIFT_IVEC(g, ak), SHIFT_IVEC(g, ai), di);
        sik = IVEC2IS(di);
        ivec_sub(SHIFT_IVEC(g, al), SHIFT_IVEC(g, ai), di);
        sil = IVEC2IS(di);
    }
    else if (pbc)
    {
        svi = pbc_rvec_sub(pbc, x[av], x[ai], xvi);
    }
    else
    {
        svi = CENTRAL;
    }

    /* The shift forces only need updating when some shift is not CENTRAL */
    if (fshift && (svi != CENTRAL || sij != CENTRAL || sik != CENTRAL || sil != CENTRAL))
    {
        rvec_dec(fshift[svi], fv);
        fshift[CENTRAL][XX] += fv[XX] - fj[XX] - fk[XX] - fl[XX];
        fshift[CENTRAL][YY] += fv[YY] - fj[YY] - fk[YY] - fl[YY];
        fshift[CENTRAL][ZZ] += fv[ZZ] - fj[ZZ] - fk[ZZ] - fl[ZZ];
        rvec_inc(fshift[sij], fj);
        rvec_inc(fshift[sik], fk);
        rvec_inc(fshift[sil], fl);
    }

    if (VirCorr)
    {
        /* Virial correction with atom i as the reference position */
        rvec xiv;
        int  i, j;

        pbc_rvec_sub(pbc, x[av], x[ai], xiv);

        for (i = 0; i < DIM; i++)
        {
            for (j = 0; j < DIM; j++)
            {
                dxdf[i][j] += -xiv[i]*fv[j] + xij[i]*fj[j] + xik[i]*fk[j] + xil[i]*fl[j];
            }
        }
    }

    /* Total: 207 flops (Yuck!) */
}
1431
1432
1433 static int spread_vsiten(const t_iatom ia[], const t_iparams ip[],
1434                          const rvec x[],
1435                          rvec f[], rvec fshift[],
1436                          const t_pbc *pbc, const t_graph *g)
1437 {
1438     rvec xv, dx, fi;
1439     int  n3, av, i, ai;
1440     real a;
1441     ivec di;
1442     int  siv;
1443
1444     n3 = 3*ip[ia[0]].vsiten.n;
1445     av = ia[1];
1446     copy_rvec(x[av], xv);
1447
1448     for (i = 0; i < n3; i += 3)
1449     {
1450         ai = ia[i+2];
1451         if (g)
1452         {
1453             ivec_sub(SHIFT_IVEC(g, ai), SHIFT_IVEC(g, av), di);
1454             siv = IVEC2IS(di);
1455         }
1456         else if (pbc)
1457         {
1458             siv = pbc_dx_aiuc(pbc, x[ai], xv, dx);
1459         }
1460         else
1461         {
1462             siv = CENTRAL;
1463         }
1464         a = ip[ia[i]].vsiten.a;
1465         svmul(a, f[av], fi);
1466         rvec_inc(f[ai], fi);
1467         if (fshift && siv != CENTRAL)
1468         {
1469             rvec_inc(fshift[siv], fi);
1470             rvec_dec(fshift[CENTRAL], fi);
1471         }
1472         /* 6 Flops */
1473     }
1474
1475     return n3;
1476 }
1477
1478
1479 static int vsite_count(const t_ilist *ilist, int ftype)
1480 {
1481     if (ftype == F_VSITEN)
1482     {
1483         return ilist[ftype].nr/3;
1484     }
1485     else
1486     {
1487         return ilist[ftype].nr/(1 + interaction_function[ftype].nratoms);
1488     }
1489 }
1490
1491 static void spread_vsite_f_thread(const gmx_vsite_t *vsite,
1492                                   const rvec x[],
1493                                   rvec f[], rvec *fshift,
1494                                   gmx_bool VirCorr, matrix dxdf,
1495                                   t_iparams ip[], const t_ilist ilist[],
1496                                   const t_graph *g, const t_pbc *pbc_null)
1497 {
1498     const PbcMode  pbcMode   = getPbcMode(pbc_null, vsite);
1499     /* We need another pbc pointer, as with charge groups we switch per vsite */
1500     const t_pbc   *pbc_null2 = pbc_null;
1501     const int     *vsite_pbc = nullptr;
1502
1503     /* this loop goes backwards to be able to build *
1504      * higher type vsites from lower types         */
1505     for (int ftype = c_ftypeVsiteEnd - 1; ftype >= c_ftypeVsiteStart; ftype--)
1506     {
1507         if (ilist[ftype].nr == 0)
1508         {
1509             continue;
1510         }
1511
1512         {   // TODO remove me
1513             int            nra = interaction_function[ftype].nratoms;
1514             int            inc = 1 + nra;
1515             int            nr  = ilist[ftype].nr;
1516
1517             const t_iatom *ia = ilist[ftype].iatoms;
1518
1519             if (pbcMode == PbcMode::all)
1520             {
1521                 pbc_null2 = pbc_null;
1522             }
1523             else if (pbcMode == PbcMode::chargeGroup)
1524             {
1525                 vsite_pbc = vsite->vsite_pbc_loc[ftype - c_ftypeVsiteStart];
1526             }
1527
1528             for (int i = 0; i < nr; )
1529             {
1530                 if (vsite_pbc != nullptr)
1531                 {
1532                     if (vsite_pbc[i/(1 + nra)] > -2)
1533                     {
1534                         pbc_null2 = pbc_null;
1535                     }
1536                     else
1537                     {
1538                         pbc_null2 = nullptr;
1539                     }
1540                 }
1541
1542                 int tp = ia[0];
1543
1544                 /* Constants for constructing */
1545                 real a1, b1, c1;
1546                 a1 = ip[tp].vsite.a;
1547                 /* Construct the vsite depending on type */
1548                 switch (ftype)
1549                 {
1550                     case F_VSITE2:
1551                         spread_vsite2(ia, a1, x, f, fshift, pbc_null2, g);
1552                         break;
1553                     case F_VSITE3:
1554                         b1 = ip[tp].vsite.b;
1555                         spread_vsite3(ia, a1, b1, x, f, fshift, pbc_null2, g);
1556                         break;
1557                     case F_VSITE3FD:
1558                         b1 = ip[tp].vsite.b;
1559                         spread_vsite3FD(ia, a1, b1, x, f, fshift, VirCorr, dxdf, pbc_null2, g);
1560                         break;
1561                     case F_VSITE3FAD:
1562                         b1 = ip[tp].vsite.b;
1563                         spread_vsite3FAD(ia, a1, b1, x, f, fshift, VirCorr, dxdf, pbc_null2, g);
1564                         break;
1565                     case F_VSITE3OUT:
1566                         b1 = ip[tp].vsite.b;
1567                         c1 = ip[tp].vsite.c;
1568                         spread_vsite3OUT(ia, a1, b1, c1, x, f, fshift, VirCorr, dxdf, pbc_null2, g);
1569                         break;
1570                     case F_VSITE4FD:
1571                         b1 = ip[tp].vsite.b;
1572                         c1 = ip[tp].vsite.c;
1573                         spread_vsite4FD(ia, a1, b1, c1, x, f, fshift, VirCorr, dxdf, pbc_null2, g);
1574                         break;
1575                     case F_VSITE4FDN:
1576                         b1 = ip[tp].vsite.b;
1577                         c1 = ip[tp].vsite.c;
1578                         spread_vsite4FDN(ia, a1, b1, c1, x, f, fshift, VirCorr, dxdf, pbc_null2, g);
1579                         break;
1580                     case F_VSITEN:
1581                         inc = spread_vsiten(ia, ip, x, f, fshift, pbc_null2, g);
1582                         break;
1583                     default:
1584                         gmx_fatal(FARGS, "No such vsite type %d in %s, line %d",
1585                                   ftype, __FILE__, __LINE__);
1586                 }
1587                 clear_rvec(f[ia[1]]);
1588
1589                 /* Increment loop variables */
1590                 i  += inc;
1591                 ia += inc;
1592             }
1593         }
1594     }
1595 }
1596
1597 /*! \brief Clears the task force buffer elements that are written by task idTask */
1598 static void clearTaskForceBufferUsedElements(InterdependentTask *idTask)
1599 {
1600     int ntask = idTask->spreadTask.size();
1601     for (int ti = 0; ti < ntask; ti++)
1602     {
1603         const AtomIndex *atomList = &idTask->atomIndex[idTask->spreadTask[ti]];
1604         int              natom    = atomList->atom.size();
1605         RVec            *force    = idTask->force.data();
1606         for (int i = 0; i < natom; i++)
1607         {
1608             clear_rvec(force[atomList->atom[i]]);
1609         }
1610     }
1611 }
1612
/* Spreads the forces on all virtual sites onto their constructing atoms.
 *
 * With vsite->nthreads == 1 everything is spread by a single call to
 * spread_vsite_f_thread. Otherwise the interaction list stored in
 * vsite->tData[vsite->nthreads] is spread first by the calling thread,
 * after which an OpenMP parallel region spreads the per-thread lists;
 * the per-thread shift forces and virial corrections are summed afterwards.
 * When vsite->useDomdec is set, dd_clear_f_vsites is called before and
 * dd_move_f_vsites after the spreading.
 *
 * When VirCorr is set, -0.5 times the accumulated dxdf is added to vir.
 * fshift may be nullptr.
 */
void spread_vsite_f(const gmx_vsite_t *vsite,
                    const rvec * gmx_restrict x,
                    rvec * gmx_restrict f, rvec * gmx_restrict fshift,
                    gmx_bool VirCorr, matrix vir,
                    t_nrnb *nrnb, const t_idef *idef,
                    int ePBC, gmx_bool bMolPBC, const t_graph *g, const matrix box,
                    const t_commrec *cr, gmx_wallcycle *wcycle)
{
    wallcycle_start(wcycle, ewcVSITESPREAD);
    const bool useDomdec = vsite->useDomdec;
    GMX_ASSERT(!useDomdec || (cr != nullptr && DOMAINDECOMP(cr)), "When vsites are set up with domain decomposition, we need a valid commrec");

    t_pbc pbc, *pbc_null;

    /* We only need to do pbc when we have inter-cg vsites */
    if ((useDomdec || bMolPBC) && vsite->n_intercg_vsite)
    {
        /* This is wasting some CPU time as we now do this multiple times
         * per MD step.
         */
        pbc_null = set_pbc_dd(&pbc, ePBC, useDomdec ? cr->dd->nc : nullptr, FALSE, box);
    }
    else
    {
        pbc_null = nullptr;
    }

    if (useDomdec)
    {
        dd_clear_f_vsites(cr->dd, f);
    }

    if (vsite->nthreads == 1)
    {
        /* Single-threaded path: one pass over the complete interaction lists */
        matrix dxdf;
        if (VirCorr)
        {
            clear_mat(dxdf);
        }
        spread_vsite_f_thread(vsite,
                              x, f, fshift,
                              VirCorr, dxdf,
                              idef->iparams, idef->il,
                              g, pbc_null);

        if (VirCorr)
        {
            for (int i = 0; i < DIM; i++)
            {
                for (int j = 0; j < DIM; j++)
                {
                    vir[i][j] += -0.5*dxdf[i][j];
                }
            }
        }
    }
    else
    {
        /* First spread the vsites that might depend on non-local vsites */
        if (VirCorr)
        {
            clear_mat(vsite->tData[vsite->nthreads]->dxdf);
        }
        /* The extra entry tData[nthreads] is handled here, outside the parallel region */
        spread_vsite_f_thread(vsite,
                              x, f, fshift,
                              VirCorr, vsite->tData[vsite->nthreads]->dxdf,
                              idef->iparams,
                              vsite->tData[vsite->nthreads]->ilist,
                              g, pbc_null);

#pragma omp parallel num_threads(vsite->nthreads)
        {
            try
            {
                int          thread = gmx_omp_get_thread_num();
                VsiteThread *tData  = vsite->tData[thread];

                /* Thread 0 accumulates directly into fshift, all other
                 * threads use their own, freshly cleared, buffer that is
                 * reduced into fshift after the parallel region.
                 */
                rvec        *fshift_t;
                if (thread == 0 || fshift == nullptr)
                {
                    fshift_t = fshift;
                }
                else
                {
                    fshift_t = tData->fshift;

                    for (int i = 0; i < SHIFTS; i++)
                    {
                        clear_rvec(fshift_t[i]);
                    }
                }
                if (VirCorr)
                {
                    clear_mat(tData->dxdf);
                }

                if (tData->useInterdependentTask)
                {
                    /* Spread the vsites that spread outside our local range.
                     * This is done using a thread-local force buffer force.
                     * First we need to copy the input vsite forces to force.
                     */
                    InterdependentTask *idTask = &tData->idTask;

                    /* Clear the buffer elements set by our task during
                     * the last call to spread_vsite_f.
                     */
                    clearTaskForceBufferUsedElements(idTask);

                    int nvsite = idTask->vsite.size();
                    for (int i = 0; i < nvsite; i++)
                    {
                        copy_rvec(f[idTask->vsite[i]],
                                  idTask->force[idTask->vsite[i]]);
                    }
                    spread_vsite_f_thread(vsite,
                                          x, as_rvec_array(idTask->force.data()), fshift_t,
                                          VirCorr, tData->dxdf,
                                          idef->iparams,
                                          tData->idTask.ilist,
                                          g, pbc_null);

                    /* We need a barrier before reducing forces below
                     * that have been produced by a different thread above.
                     */
                    /* NOTE(review): this barrier sits inside the if on
                     * tData->useInterdependentTask, so it presumably relies
                     * on that flag being equal for all threads - confirm.
                     */
#pragma omp barrier

                    /* Loop over all thread task and reduce forces they
                     * produced on atoms that fall in our range.
                     * Note that atomic reduction would be a simpler solution,
                     * but that might not have good support on all platforms.
                     */
                    int ntask = idTask->reduceTask.size();
                    for (int ti = 0; ti < ntask; ti++)
                    {
                        const InterdependentTask *idt_foreign = &vsite->tData[idTask->reduceTask[ti]]->idTask;
                        const AtomIndex          *atomList    = &idt_foreign->atomIndex[thread];
                        const RVec               *f_foreign   = idt_foreign->force.data();

                        int natom = atomList->atom.size();
                        for (int i = 0; i < natom; i++)
                        {
                            int ind = atomList->atom[i];
                            rvec_inc(f[ind], f_foreign[ind]);
                            /* Clearing of f_foreign is done at the next step */
                        }
                    }
                    /* Clear the vsite forces, both in f and force */
                    for (int i = 0; i < nvsite; i++)
                    {
                        int ind = tData->idTask.vsite[i];
                        clear_rvec(f[ind]);
                        clear_rvec(tData->idTask.force[ind]);
                    }
                }

                /* Spread the vsites that spread locally only */
                spread_vsite_f_thread(vsite,
                                      x, f, fshift_t,
                                      VirCorr, tData->dxdf,
                                      idef->iparams,
                                      tData->ilist,
                                      g, pbc_null);
            }
            GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
        }

        if (fshift != nullptr)
        {
            /* Sum the thread-local shift forces; thread 0 wrote into fshift directly */
            for (int th = 1; th < vsite->nthreads; th++)
            {
                for (int i = 0; i < SHIFTS; i++)
                {
                    rvec_inc(fshift[i], vsite->tData[th]->fshift[i]);
                }
            }
        }

        if (VirCorr)
        {
            /* The upper bound nthreads + 1 includes the extra entry used before the parallel region */
            for (int th = 0; th < vsite->nthreads + 1; th++)
            {
                /* MSVC doesn't like matrix references, so we use a pointer */
                const matrix *dxdf = &vsite->tData[th]->dxdf;

                for (int i = 0; i < DIM; i++)
                {
                    for (int j = 0; j < DIM; j++)
                    {
                        vir[i][j] += -0.5*(*dxdf)[i][j];
                    }
                }
            }
        }
    }

    if (useDomdec)
    {
        dd_move_f_vsites(cr->dd, f, fshift);
    }

    /* Flop accounting: one count per vsite of each type in the complete interaction lists */
    inc_nrnb(nrnb, eNR_VSITE2,   vsite_count(idef->il, F_VSITE2));
    inc_nrnb(nrnb, eNR_VSITE3,   vsite_count(idef->il, F_VSITE3));
    inc_nrnb(nrnb, eNR_VSITE3FD, vsite_count(idef->il, F_VSITE3FD));
    inc_nrnb(nrnb, eNR_VSITE3FAD, vsite_count(idef->il, F_VSITE3FAD));
    inc_nrnb(nrnb, eNR_VSITE3OUT, vsite_count(idef->il, F_VSITE3OUT));
    inc_nrnb(nrnb, eNR_VSITE4FD, vsite_count(idef->il, F_VSITE4FD));
    inc_nrnb(nrnb, eNR_VSITE4FDN, vsite_count(idef->il, F_VSITE4FDN));
    inc_nrnb(nrnb, eNR_VSITEN,   vsite_count(idef->il, F_VSITEN));

    wallcycle_stop(wcycle, ewcVSITESPREAD);
}
1825
1826 /*! \brief Returns the an array with charge-group indices for each atom
1827  *
1828  * \param[in] chargeGroups  The charge group block struct
1829  */
1830 static std::vector<int> atom2cg(const t_block &chargeGroups)
1831 {
1832     std::vector<int> a2cg(chargeGroups.index[chargeGroups.nr], 0);
1833
1834     for (int chargeGroup = 0; chargeGroup < chargeGroups.nr; chargeGroup++)
1835     {
1836         std::fill(a2cg.begin() + chargeGroups.index[chargeGroup],
1837                   a2cg.begin() + chargeGroups.index[chargeGroup + 1],
1838                   chargeGroup);
1839     }
1840
1841     return a2cg;
1842 }
1843
1844 int count_intercg_vsites(const gmx_mtop_t *mtop)
1845 {
1846     int n_intercg_vsite = 0;
1847     for (const gmx_molblock_t &molb : mtop->molblock)
1848     {
1849         const gmx_moltype_t &molt = mtop->moltype[molb.type];
1850
1851         std::vector<int>     a2cg = atom2cg(molt.cgs);
1852         for (int ftype = c_ftypeVsiteStart; ftype < c_ftypeVsiteEnd; ftype++)
1853         {
1854             int            nral = NRAL(ftype);
1855             const t_ilist &il   = molt.ilist[ftype];
1856             const t_iatom *ia   = il.iatoms;
1857             for (int i = 0; i < il.nr; i += 1 + nral)
1858             {
1859                 int cg = a2cg[ia[1+i]];
1860                 for (int a = 1; a < nral; a++)
1861                 {
1862                     if (a2cg[ia[1+a]] != cg)
1863                     {
1864                         n_intercg_vsite += molb.nmol;
1865                         break;
1866                     }
1867                 }
1868             }
1869         }
1870     }
1871
1872     return n_intercg_vsite;
1873 }
1874
1875 static int **get_vsite_pbc(const t_iparams *iparams, const t_ilist *ilist,
1876                            const t_atom *atom, const t_mdatoms *md,
1877                            const t_block &cgs)
1878 {
1879     /* Make an atom to charge group index */
1880     std::vector<int> a2cg = atom2cg(cgs);
1881
1882     /* Make an array that tells if the pbc of an atom is set */
1883     std::vector<bool> pbc_set(cgs.index[cgs.nr], false);
1884     /* PBC is set for all non vsites */
1885     for (int a = 0; a < cgs.index[cgs.nr]; a++)
1886     {
1887         if ((atom && atom[a].ptype != eptVSite) ||
1888             (md   && md->ptype[a]  != eptVSite))
1889         {
1890             pbc_set[a] = true;
1891         }
1892     }
1893
1894     int **vsite_pbc;
1895     snew(vsite_pbc, F_VSITEN-F_VSITE2+1);
1896
1897     for (int ftype = c_ftypeVsiteStart; ftype < c_ftypeVsiteEnd; ftype++)
1898     {
1899         {   // TODO remove me
1900             int            nral = NRAL(ftype);
1901             const t_ilist *il   = &ilist[ftype];
1902             const t_iatom *ia   = il->iatoms;
1903             int           *vsite_pbc_f;
1904
1905             snew(vsite_pbc[ftype-F_VSITE2], il->nr/(1 + nral));
1906             vsite_pbc_f = vsite_pbc[ftype-F_VSITE2];
1907
1908             int i = 0;
1909             while (i < il->nr)
1910             {
1911                 int     vsi   = i/(1 + nral);
1912                 t_iatom vsite = ia[i+1];
1913                 int     cg_v  = a2cg[vsite];
1914                 /* A value of -2 signals that this vsite and its contructing
1915                  * atoms are all within the same cg, so no pbc is required.
1916                  */
1917                 vsite_pbc_f[vsi] = -2;
1918                 /* Check if constructing atoms are outside the vsite's cg */
1919                 int nc3 = 0;
1920                 if (ftype == F_VSITEN)
1921                 {
1922                     nc3 = 3*iparams[ia[i]].vsiten.n;
1923                     for (int j = 0; j < nc3; j += 3)
1924                     {
1925                         if (a2cg[ia[i+j+2]] != cg_v)
1926                         {
1927                             vsite_pbc_f[vsi] = -1;
1928                         }
1929                     }
1930                 }
1931                 else
1932                 {
1933                     for (int a = 1; a < nral; a++)
1934                     {
1935                         if (a2cg[ia[i+1+a]] != cg_v)
1936                         {
1937                             vsite_pbc_f[vsi] = -1;
1938                         }
1939                     }
1940                 }
1941                 if (vsite_pbc_f[vsi] == -1)
1942                 {
1943                     /* Check if this is the first processed atom of a vsite only cg */
1944                     gmx_bool bViteOnlyCG_and_FirstAtom = TRUE;
1945                     for (int a = cgs.index[cg_v]; a < cgs.index[cg_v + 1]; a++)
1946                     {
1947                         /* Non-vsites already have pbc set, so simply check for pbc_set */
1948                         if (pbc_set[a])
1949                         {
1950                             bViteOnlyCG_and_FirstAtom = FALSE;
1951                             break;
1952                         }
1953                     }
1954                     if (bViteOnlyCG_and_FirstAtom)
1955                     {
1956                         /* First processed atom of a vsite only charge group.
1957                          * The pbc of the input coordinates to construct_vsites
1958                          * should be preserved.
1959                          */
1960                         vsite_pbc_f[vsi] = vsite;
1961                     }
1962                     else if (cg_v != a2cg[ia[1+i+1]])
1963                     {
1964                         /* This vsite has a different charge group index
1965                          * than it's first constructing atom
1966                          * and the charge group has more than one atom,
1967                          * search for the first normal particle
1968                          * or vsite that already had its pbc defined.
1969                          * If nothing is found, use full pbc for this vsite.
1970                          */
1971                         for (int a = cgs.index[cg_v]; a < cgs.index[cg_v + 1]; a++)
1972                         {
1973                             if (a != vsite && pbc_set[a])
1974                             {
1975                                 vsite_pbc_f[vsi] = a;
1976                                 if (gmx_debug_at)
1977                                 {
1978                                     fprintf(debug, "vsite %d match pbc with atom %d\n",
1979                                             vsite+1, a+1);
1980                                 }
1981                                 break;
1982                             }
1983                         }
1984                         if (gmx_debug_at)
1985                         {
1986                             fprintf(debug, "vsite atom %d  cg %d - %d pbc atom %d\n",
1987                                     vsite+1, cgs.index[cg_v] + 1, cgs.index[cg_v + 1],
1988                                     vsite_pbc_f[vsi] + 1);
1989                         }
1990                     }
1991                 }
1992                 if (ftype == F_VSITEN)
1993                 {
1994                     /* The other entries in vsite_pbc_f are not used for center vsites */
1995                     i += nc3;
1996                 }
1997                 else
1998                 {
1999                     i += 1 + nral;
2000                 }
2001
2002                 /* This vsite now has its pbc defined */
2003                 pbc_set[vsite] = true;
2004             }
2005         }
2006     }
2007
2008     return vsite_pbc;
2009 }
2010
2011
2012 gmx_vsite_t *initVsite(const gmx_mtop_t &mtop,
2013                        const t_commrec  *cr)
2014 {
2015     GMX_RELEASE_ASSERT(cr != nullptr, "We need a valid commrec");
2016
2017     /* check if there are vsites */
2018     int nvsite = 0;
2019     for (int ftype = 0; ftype < F_NRE; ftype++)
2020     {
2021         if (interaction_function[ftype].flags & IF_VSITE)
2022         {
2023             GMX_ASSERT(ftype >= c_ftypeVsiteStart && ftype < c_ftypeVsiteEnd, "c_ftypeVsiteStart and/or c_ftypeVsiteEnd do not have correct values");
2024
2025             nvsite += gmx_mtop_ftype_count(&mtop, ftype);
2026         }
2027         else
2028         {
2029             GMX_ASSERT(ftype < c_ftypeVsiteStart || ftype >= c_ftypeVsiteEnd, "c_ftypeVsiteStart and/or c_ftypeVsiteEnd do not have correct values");
2030         }
2031     }
2032
2033     if (nvsite == 0)
2034     {
2035         return nullptr;
2036     }
2037
2038     gmx_vsite_t *vsite = new(gmx_vsite_t);
2039
2040     vsite->n_intercg_vsite   = count_intercg_vsites(&mtop);
2041
2042     vsite->bHaveChargeGroups = (ncg_mtop(&mtop) < mtop.natoms);
2043
2044     vsite->useDomdec         = (DOMAINDECOMP(cr) && cr->dd->nnodes > 1);
2045
2046     /* If we don't have charge groups, the vsite follows its own pbc.
2047      *
2048      * With charge groups, each vsite needs to follow the pbc of the charge
2049      * group. Thus we need to keep track of PBC. Here we assume that without
2050      * domain decomposition all molecules are whole (which will not be
2051      * the case with periodic molecules).
2052      */
2053     if (vsite->bHaveChargeGroups &&
2054         vsite->n_intercg_vsite > 0 &&
2055         DOMAINDECOMP(cr))
2056     {
2057         vsite->nvsite_pbc_molt = mtop.moltype.size();
2058         snew(vsite->vsite_pbc_molt, vsite->nvsite_pbc_molt);
2059         for (size_t mt = 0; mt < mtop.moltype.size(); mt++)
2060         {
2061             const gmx_moltype_t &molt = mtop.moltype[mt];
2062             vsite->vsite_pbc_molt[mt] = get_vsite_pbc(mtop.ffparams.iparams,
2063                                                       molt.ilist,
2064                                                       molt.atoms.atom, nullptr,
2065                                                       molt.cgs);
2066         }
2067
2068         snew(vsite->vsite_pbc_loc_nalloc, c_ftypeVsiteEnd - c_ftypeVsiteStart);
2069         snew(vsite->vsite_pbc_loc,        c_ftypeVsiteEnd - c_ftypeVsiteStart);
2070     }
2071     else
2072     {
2073         vsite->vsite_pbc_molt = nullptr;
2074         vsite->vsite_pbc_loc  = nullptr;
2075     }
2076
2077     vsite->nthreads = gmx_omp_nthreads_get(emntVSITE);
2078
2079     if (vsite->nthreads > 1)
2080     {
2081         /* We need one extra thread data structure for the overlap vsites */
2082         snew(vsite->tData, vsite->nthreads + 1);
2083 #pragma omp parallel for num_threads(vsite->nthreads) schedule(static)
2084         for (int thread = 0; thread < vsite->nthreads; thread++)
2085         {
2086             try
2087             {
2088                 vsite->tData[thread] = new VsiteThread;
2089
2090                 InterdependentTask *idTask = &vsite->tData[thread]->idTask;
2091                 idTask->nuse               = 0;
2092                 idTask->atomIndex.resize(vsite->nthreads);
2093             }
2094             GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
2095         }
2096         if (vsite->nthreads > 1)
2097         {
2098             vsite->tData[vsite->nthreads] = new VsiteThread;
2099         }
2100     }
2101
2102     vsite->taskIndex       = nullptr;
2103     vsite->taskIndexNalloc = 0;
2104
2105     return vsite;
2106 }
2107
2108 static inline void flagAtom(InterdependentTask *idTask, int atom,
2109                             int thread, int nthread, int natperthread)
2110 {
2111     if (!idTask->use[atom])
2112     {
2113         idTask->use[atom] = true;
2114         thread            = atom/natperthread;
2115         /* Assign all non-local atom force writes to thread 0 */
2116         if (thread >= nthread)
2117         {
2118             thread        = 0;
2119         }
2120         idTask->atomIndex[thread].atom.push_back(atom);
2121     }
2122 }
2123
2124 /*\brief Here we try to assign all vsites that are in our local range.
2125  *
2126  * Our task local atom range is tData->rangeStart - tData->rangeEnd.
2127  * Vsites that depend only on local atoms, as indicated by taskIndex[]==thread,
2128  * are assigned to task tData->ilist. Vsites that depend on non-local atoms
2129  * but not on other vsites are assigned to task tData->id_task.ilist.
2130  * taskIndex[] is set for all vsites in our range, either to our local tasks
2131  * or to the single last task as taskIndex[]=2*nthreads.
2132  */
2133 static void assignVsitesToThread(VsiteThread           *tData,
2134                                  int                    thread,
2135                                  int                    nthread,
2136                                  int                    natperthread,
2137                                  int                   *taskIndex,
2138                                  const t_ilist         *ilist,
2139                                  const t_iparams       *ip,
2140                                  const unsigned short  *ptype)
2141 {
2142     for (int ftype = c_ftypeVsiteStart; ftype < c_ftypeVsiteEnd; ftype++)
2143     {
2144         tData->ilist[ftype].nr        = 0;
2145         tData->idTask.ilist[ftype].nr = 0;
2146
2147         int      nral1 = 1 + NRAL(ftype);
2148         int      inc   = nral1;
2149         t_iatom *iat   = ilist[ftype].iatoms;
2150         for (int i = 0; i < ilist[ftype].nr; )
2151         {
2152             if (ftype == F_VSITEN)
2153             {
2154                 /* The 3 below is from 1+NRAL(ftype)=3 */
2155                 inc = ip[iat[i]].vsiten.n*3;
2156             }
2157
2158             if (iat[1 + i] <  tData->rangeStart ||
2159                 iat[1 + i] >= tData->rangeEnd)
2160             {
2161                 /* This vsite belongs to a different thread */
2162                 i += inc;
2163                 continue;
2164             }
2165
2166             /* We would like to assign this vsite to task thread,
2167              * but it might depend on atoms outside the atom range of thread
2168              * or on another vsite not assigned to task thread.
2169              */
2170             int task = thread;
2171             if (ftype != F_VSITEN)
2172             {
2173                 for (int j = i + 2; j < i + nral1; j++)
2174                 {
2175                     /* Do a range check to avoid a harmless race on taskIndex */
2176                     if (iat[j] <  tData->rangeStart ||
2177                         iat[j] >= tData->rangeEnd ||
2178                         taskIndex[iat[j]] != thread)
2179                     {
2180                         if (!tData->useInterdependentTask ||
2181                             ptype[iat[j]] == eptVSite)
2182                         {
2183                             /* At least one constructing atom is a vsite
2184                              * that is not assigned to the same thread.
2185                              * Put this vsite into a separate task.
2186                              */
2187                             task = 2*nthread;
2188                             break;
2189                         }
2190
2191                         /* There are constructing atoms outside our range,
2192                          * put this vsite into a second task to be executed
2193                          * on the same thread. During construction no barrier
2194                          * is needed between the two tasks on the same thread.
2195                          * During spreading we need to run this task with
2196                          * an additional thread-local intermediate force buffer
2197                          * (or atomic reduction) and a barrier between the two
2198                          * tasks.
2199                          */
2200                         task = nthread + thread;
2201                     }
2202                 }
2203             }
2204             else
2205             {
2206                 for (int j = i + 2; j < i + inc; j += 3)
2207                 {
2208                     /* Do a range check to avoid a harmless race on taskIndex */
2209                     if (iat[j] <  tData->rangeStart ||
2210                         iat[j] >= tData->rangeEnd ||
2211                         taskIndex[iat[j]] != thread)
2212                     {
2213                         GMX_ASSERT(ptype[iat[j]] != eptVSite, "A vsite to be assigned in assignVsitesToThread has a vsite as a constructing atom that does not belong to our task, such vsites should be assigned to the single 'master' task");
2214
2215                         task = nthread + thread;
2216                     }
2217                 }
2218             }
2219
2220             /* Update this vsite's thread index entry */
2221             taskIndex[iat[1+i]] = task;
2222
2223             if (task == thread || task == nthread + thread)
2224             {
2225                 /* Copy this vsite to the thread data struct of thread */
2226                 t_ilist *il_task;
2227                 if (task == thread)
2228                 {
2229                     il_task = &tData->ilist[ftype];
2230                 }
2231                 else
2232                 {
2233                     il_task = &tData->idTask.ilist[ftype];
2234                 }
2235                 /* Ensure we have sufficient memory allocated */
2236                 if (il_task->nr + inc > il_task->nalloc)
2237                 {
2238                     il_task->nalloc = over_alloc_large(il_task->nr + inc);
2239                     srenew(il_task->iatoms, il_task->nalloc);
2240                 }
2241                 /* Copy the vsite data to the thread-task local array */
2242                 for (int j = i; j < i + inc; j++)
2243                 {
2244                     il_task->iatoms[il_task->nr++] = iat[j];
2245                 }
2246                 if (task == nthread + thread)
2247                 {
2248                     /* This vsite write outside our own task force block.
2249                      * Put it into the interdependent task list and flag
2250                      * the atoms involved for reduction.
2251                      */
2252                     tData->idTask.vsite.push_back(iat[i + 1]);
2253                     if (ftype != F_VSITEN)
2254                     {
2255                         for (int j = i + 2; j < i + nral1; j++)
2256                         {
2257                             flagAtom(&tData->idTask, iat[j],
2258                                      thread, nthread, natperthread);
2259                         }
2260                     }
2261                     else
2262                     {
2263                         for (int j = i + 2; j < i + inc; j += 3)
2264                         {
2265                             flagAtom(&tData->idTask, iat[j],
2266                                      thread, nthread, natperthread);
2267                         }
2268                     }
2269                 }
2270             }
2271
2272             i += inc;
2273         }
2274     }
2275 }
2276
2277 /*! \brief Assign all vsites with taskIndex[]==task to task tData */
2278 static void assignVsitesToSingleTask(VsiteThread     *tData,
2279                                      int              task,
2280                                      const int       *taskIndex,
2281                                      const t_ilist   *ilist,
2282                                      const t_iparams *ip)
2283 {
2284     for (int ftype = c_ftypeVsiteStart; ftype < c_ftypeVsiteEnd; ftype++)
2285     {
2286         tData->ilist[ftype].nr        = 0;
2287         tData->idTask.ilist[ftype].nr = 0;
2288
2289         int      nral1   = 1 + NRAL(ftype);
2290         int      inc     = nral1;
2291         t_iatom *iat     = ilist[ftype].iatoms;
2292         t_ilist *il_task = &tData->ilist[ftype];
2293
2294         for (int i = 0; i < ilist[ftype].nr; )
2295         {
2296             if (ftype == F_VSITEN)
2297             {
2298                 /* The 3 below is from 1+NRAL(ftype)=3 */
2299                 inc = ip[iat[i]].vsiten.n*3;
2300             }
2301             /* Check if the vsite is assigned to our task */
2302             if (taskIndex[iat[1 + i]] == task)
2303             {
2304                 /* Ensure we have sufficient memory allocated */
2305                 if (il_task->nr + inc > il_task->nalloc)
2306                 {
2307                     il_task->nalloc = over_alloc_large(il_task->nr + inc);
2308                     srenew(il_task->iatoms, il_task->nalloc);
2309                 }
2310                 /* Copy the vsite data to the thread-task local array */
2311                 for (int j = i; j < i + inc; j++)
2312                 {
2313                     il_task->iatoms[il_task->nr++] = iat[j];
2314                 }
2315             }
2316
2317             i += inc;
2318         }
2319     }
2320 }
2321
2322 void split_vsites_over_threads(const t_ilist   *ilist,
2323                                const t_iparams *ip,
2324                                const t_mdatoms *mdatoms,
2325                                gmx_vsite_t     *vsite)
2326 {
2327     int      vsite_atom_range, natperthread;
2328
2329     if (vsite->nthreads == 1)
2330     {
2331         /* Nothing to do */
2332         return;
2333     }
2334
2335     /* The current way of distributing the vsites over threads in primitive.
2336      * We divide the atom range 0 - natoms_in_vsite uniformly over threads,
2337      * without taking into account how the vsites are distributed.
2338      * Without domain decomposition we at least tighten the upper bound
2339      * of the range (useful for common systems such as a vsite-protein
2340      * in 3-site water).
2341      * With domain decomposition, as long as the vsites are distributed
2342      * uniformly in each domain along the major dimension, usually x,
2343      * it will also perform well.
2344      */
2345     if (!vsite->useDomdec)
2346     {
2347         vsite_atom_range = -1;
2348         for (int ftype = c_ftypeVsiteStart; ftype < c_ftypeVsiteEnd; ftype++)
2349         {
2350             {   // TODO remove me
2351                 if (ftype != F_VSITEN)
2352                 {
2353                     int            nral1 = 1 + NRAL(ftype);
2354                     const t_iatom *iat   = ilist[ftype].iatoms;
2355                     for (int i = 0; i < ilist[ftype].nr; i += nral1)
2356                     {
2357                         for (int j = i + 1; j < i + nral1; j++)
2358                         {
2359                             vsite_atom_range = std::max(vsite_atom_range, iat[j]);
2360                         }
2361                     }
2362                 }
2363                 else
2364                 {
2365                     int            vs_ind_end;
2366
2367                     const t_iatom *iat = ilist[ftype].iatoms;
2368
2369                     int            i = 0;
2370                     while (i < ilist[ftype].nr)
2371                     {
2372                         /* The 3 below is from 1+NRAL(ftype)=3 */
2373                         vs_ind_end = i + ip[iat[i]].vsiten.n*3;
2374
2375                         vsite_atom_range = std::max(vsite_atom_range, iat[i+1]);
2376                         while (i < vs_ind_end)
2377                         {
2378                             vsite_atom_range = std::max(vsite_atom_range, iat[i+2]);
2379                             i               += 3;
2380                         }
2381                     }
2382                 }
2383             }
2384         }
2385         vsite_atom_range++;
2386         natperthread     = (vsite_atom_range + vsite->nthreads - 1)/vsite->nthreads;
2387     }
2388     else
2389     {
2390         /* Any local or not local atom could be involved in virtual sites.
2391          * But since we usually have very few non-local virtual sites
2392          * (only non-local vsites that depend on local vsites),
2393          * we distribute the local atom range equally over the threads.
2394          * When assigning vsites to threads, we should take care that the last
2395          * threads also covers the non-local range.
2396          */
2397         vsite_atom_range = mdatoms->nr;
2398         natperthread     = (mdatoms->homenr + vsite->nthreads - 1)/vsite->nthreads;
2399     }
2400
2401     if (debug)
2402     {
2403         fprintf(debug, "virtual site thread dist: natoms %d, range %d, natperthread %d\n", mdatoms->nr, vsite_atom_range, natperthread);
2404     }
2405
2406     /* To simplify the vsite assignment, we make an index which tells us
2407      * to which task particles, both non-vsites and vsites, are assigned.
2408      */
2409     if (mdatoms->nr > vsite->taskIndexNalloc)
2410     {
2411         vsite->taskIndexNalloc = over_alloc_large(mdatoms->nr);
2412         srenew(vsite->taskIndex, vsite->taskIndexNalloc);
2413     }
2414
2415     /* Initialize the task index array. Here we assign the non-vsite
2416      * particles to task=thread, so we easily figure out if vsites
2417      * depend on local and/or non-local particles in assignVsitesToThread.
2418      */
2419     int *taskIndex = vsite->taskIndex;
2420     {
2421         int  thread = 0;
2422         for (int i = 0; i < mdatoms->nr; i++)
2423         {
2424             if (mdatoms->ptype[i] == eptVSite)
2425             {
2426                 /* vsites are not assigned to a task yet */
2427                 taskIndex[i] = -1;
2428             }
2429             else
2430             {
2431                 /* assign non-vsite particles to task thread */
2432                 taskIndex[i] = thread;
2433             }
2434             if (i == (thread + 1)*natperthread && thread < vsite->nthreads)
2435             {
2436                 thread++;
2437             }
2438         }
2439     }
2440
2441 #pragma omp parallel num_threads(vsite->nthreads)
2442     {
2443         try
2444         {
2445             int          thread = gmx_omp_get_thread_num();
2446             VsiteThread *tData  = vsite->tData[thread];
2447
2448             /* Clear the buffer use flags that were set before */
2449             if (tData->useInterdependentTask)
2450             {
2451                 InterdependentTask *idTask = &tData->idTask;
2452
2453                 /* To avoid an extra OpenMP barrier in spread_vsite_f,
2454                  * we clear the force buffer at the next step,
2455                  * so we need to do it here as well.
2456                  */
2457                 clearTaskForceBufferUsedElements(idTask);
2458
2459                 idTask->vsite.resize(0);
2460                 for (int t = 0; t < vsite->nthreads; t++)
2461                 {
2462                     AtomIndex *atomIndex = &idTask->atomIndex[t];
2463                     int        natom     = atomIndex->atom.size();
2464                     for (int i = 0; i < natom; i++)
2465                     {
2466                         idTask->use[atomIndex->atom[i]] = false;
2467                     }
2468                     atomIndex->atom.resize(0);
2469                 }
2470                 idTask->nuse = 0;
2471             }
2472
2473             /* To avoid large f_buf allocations of #threads*vsite_atom_range
2474              * we don't use task2 with more than 200000 atoms. This doesn't
2475              * affect performance, since with such a large range relatively few
2476              * vsites will end up in the separate task.
2477              * Note that useTask2 should be the same for all threads.
2478              */
2479             tData->useInterdependentTask = (vsite_atom_range <= 200000);
2480             if (tData->useInterdependentTask)
2481             {
2482                 size_t              natoms_use_in_vsites = vsite_atom_range;
2483                 InterdependentTask *idTask               = &tData->idTask;
2484                 /* To avoid resizing and re-clearing every nstlist steps,
2485                  * we never down size the force buffer.
2486                  */
2487                 if (natoms_use_in_vsites > idTask->force.size() ||
2488                     natoms_use_in_vsites > idTask->use.size())
2489                 {
2490                     idTask->force.resize(natoms_use_in_vsites, { 0, 0, 0 });
2491                     idTask->use.resize(natoms_use_in_vsites, false);
2492                 }
2493             }
2494
2495             /* Assign all vsites that can execute independently on threads */
2496             tData->rangeStart     =  thread     *natperthread;
2497             if (thread < vsite->nthreads - 1)
2498             {
2499                 tData->rangeEnd   = (thread + 1)*natperthread;
2500             }
2501             else
2502             {
2503                 /* The last thread should cover up to the end of the range */
2504                 tData->rangeEnd   = mdatoms->nr;
2505             }
2506             assignVsitesToThread(tData,
2507                                  thread, vsite->nthreads,
2508                                  natperthread,
2509                                  taskIndex,
2510                                  ilist, ip, mdatoms->ptype);
2511
2512             if (tData->useInterdependentTask)
2513             {
2514                 /* In the worst case, all tasks write to force ranges of
2515                  * all other tasks, leading to #tasks^2 scaling (this is only
2516                  * the overhead, the actual flops remain constant).
2517                  * But in most cases there is far less coupling. To improve
2518                  * scaling at high thread counts we therefore construct
2519                  * an index to only loop over the actually affected tasks.
2520                  */
2521                 InterdependentTask *idTask = &tData->idTask;
2522
2523                 /* Ensure assignVsitesToThread finished on other threads */
2524 #pragma omp barrier
2525
2526                 idTask->spreadTask.resize(0);
2527                 idTask->reduceTask.resize(0);
2528                 for (int t = 0; t < vsite->nthreads; t++)
2529                 {
2530                     /* Do we write to the force buffer of task t? */
2531                     if (idTask->atomIndex[t].atom.size() > 0)
2532                     {
2533                         idTask->spreadTask.push_back(t);
2534                     }
2535                     /* Does task t write to our force buffer? */
2536                     if (vsite->tData[t]->idTask.atomIndex[thread].atom.size() > 0)
2537                     {
2538                         idTask->reduceTask.push_back(t);
2539                     }
2540                 }
2541             }
2542         }
2543         GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
2544     }
2545     /* Assign all remaining vsites, that will have taskIndex[]=2*vsite->nthreads,
2546      * to a single task that will not run in parallel with other tasks.
2547      */
2548     assignVsitesToSingleTask(vsite->tData[vsite->nthreads],
2549                              2*vsite->nthreads,
2550                              taskIndex,
2551                              ilist, ip);
2552
2553     if (debug && vsite->nthreads > 1)
2554     {
2555         fprintf(debug, "virtual site useInterdependentTask %d, nuse:\n",
2556                 vsite->tData[0]->useInterdependentTask);
2557         for (int th = 0; th < vsite->nthreads + 1; th++)
2558         {
2559             fprintf(debug, " %4d", vsite->tData[th]->idTask.nuse);
2560         }
2561         fprintf(debug, "\n");
2562
2563         for (int ftype = c_ftypeVsiteStart; ftype < c_ftypeVsiteEnd; ftype++)
2564         {
2565             if (ilist[ftype].nr > 0)
2566             {
2567                 fprintf(debug, "%-20s thread dist:",
2568                         interaction_function[ftype].longname);
2569                 for (int th = 0; th < vsite->nthreads + 1; th++)
2570                 {
2571                     fprintf(debug, " %4d %4d ",
2572                             vsite->tData[th]->ilist[ftype].nr,
2573                             vsite->tData[th]->idTask.ilist[ftype].nr);
2574                 }
2575                 fprintf(debug, "\n");
2576             }
2577         }
2578     }
2579
2580 #ifndef NDEBUG
2581     int nrOrig     = vsiteIlistNrCount(ilist);
2582     int nrThreaded = 0;
2583     for (int th = 0; th < vsite->nthreads + 1; th++)
2584     {
2585         nrThreaded +=
2586             vsiteIlistNrCount(vsite->tData[th]->ilist) +
2587             vsiteIlistNrCount(vsite->tData[th]->idTask.ilist);
2588     }
2589     GMX_ASSERT(nrThreaded == nrOrig, "The number of virtual sites assigned to all thread task has to match the total number of virtual sites");
2590 #endif
2591 }
2592
2593 void set_vsite_top(gmx_vsite_t          *vsite,
2594                    const gmx_localtop_t *top,
2595                    const t_mdatoms      *md)
2596 {
2597     if (vsite->n_intercg_vsite > 0 && vsite->bHaveChargeGroups)
2598     {
2599         vsite->vsite_pbc_loc = get_vsite_pbc(top->idef.iparams,
2600                                              top->idef.il, nullptr, md,
2601                                              top->cgs);
2602     }
2603
2604     if (vsite->nthreads > 1)
2605     {
2606         if (vsite->bHaveChargeGroups)
2607         {
2608             gmx_fatal(FARGS, "The combination of threading, virtual sites and charge groups is not implemented");
2609         }
2610
2611         split_vsites_over_threads(top->idef.il, top->idef.iparams,
2612                                   md, vsite);
2613     }
2614 }