2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5 * Copyright (c) 2001-2004, The GROMACS development team,
6 * check out http://www.gromacs.org for more information.
7 * Copyright (c) 2012,2013, by the GROMACS development team, led by
8 * David van der Spoel, Berk Hess, Erik Lindahl, and including many
9 * others, as listed in the AUTHORS file in the top-level source
10 * directory and at http://www.gromacs.org.
12 * GROMACS is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU Lesser General Public License
14 * as published by the Free Software Foundation; either version 2.1
15 * of the License, or (at your option) any later version.
17 * GROMACS is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 * Lesser General Public License for more details.
22 * You should have received a copy of the GNU Lesser General Public
23 * License along with GROMACS; if not, see
24 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
25 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
27 * If you want to redistribute modifications to GROMACS, please
28 * consider that scientific software is very special. Version
29 * control is crucial - bugs must be traceable. We will be happy to
30 * consider code for inclusion in the official distribution, but
31 * derived work must not be called official GROMACS. Details are found
32 * in the README & COPYING files - if they are missing, get the
33 * official version at http://www.gromacs.org.
35 * To help us fund GROMACS development, we humbly ask that you cite
36 * the research papers on the package. Check out http://www.gromacs.org.
55 #include "mtop_util.h"
56 #include "gmx_omp_nthreads.h"
59 /* Routines to send/receive coordinates and force
60 * of constructing atoms.
63 static void move_construct_x(t_comm_vsites *vsitecomm, rvec x[], t_commrec *cr)
69 sendbuf = vsitecomm->send_buf;
70 recvbuf = vsitecomm->recv_buf;
73 /* Prepare pulse left by copying to send buffer */
74 for (i = 0; i < vsitecomm->left_export_nconstruct; i++)
76 ia = vsitecomm->left_export_construct[i];
77 copy_rvec(x[ia], sendbuf[i]);
80 /* Pulse coordinates left */
81 gmx_tx_rx_real(cr, GMX_LEFT, (real *)sendbuf, 3*vsitecomm->left_export_nconstruct, GMX_RIGHT, (real *)recvbuf, 3*vsitecomm->right_import_nconstruct);
83 /* Copy from receive buffer to coordinate array */
84 for (i = 0; i < vsitecomm->right_import_nconstruct; i++)
86 ia = vsitecomm->right_import_construct[i];
87 copy_rvec(recvbuf[i], x[ia]);
90 /* Prepare pulse right by copying to send buffer */
91 for (i = 0; i < vsitecomm->right_export_nconstruct; i++)
93 ia = vsitecomm->right_export_construct[i];
94 copy_rvec(x[ia], sendbuf[i]);
97 /* Pulse coordinates right */
98 gmx_tx_rx_real(cr, GMX_RIGHT, (real *)sendbuf, 3*vsitecomm->right_export_nconstruct, GMX_LEFT, (real *)recvbuf, 3*vsitecomm->left_import_nconstruct);
100 /* Copy from receive buffer to coordinate array */
101 for (i = 0; i < vsitecomm->left_import_nconstruct; i++)
103 ia = vsitecomm->left_import_construct[i];
104 copy_rvec(recvbuf[i], x[ia]);
109 static void move_construct_f(t_comm_vsites *vsitecomm, rvec f[], t_commrec *cr)
115 sendbuf = vsitecomm->send_buf;
116 recvbuf = vsitecomm->recv_buf;
118 /* Prepare pulse right by copying to send buffer */
119 for (i = 0; i < vsitecomm->right_import_nconstruct; i++)
121 ia = vsitecomm->right_import_construct[i];
122 copy_rvec(f[ia], sendbuf[i]);
123 clear_rvec(f[ia]); /* Zero it here after moving, just to simplify debug book-keeping... */
126 /* Pulse forces right */
127 gmx_tx_rx_real(cr, GMX_RIGHT, (real *)sendbuf, 3*vsitecomm->right_import_nconstruct, GMX_LEFT, (real *)recvbuf, 3*vsitecomm->left_export_nconstruct);
129 /* Copy from receive buffer to coordinate array */
130 for (i = 0; i < vsitecomm->left_export_nconstruct; i++)
132 ia = vsitecomm->left_export_construct[i];
133 rvec_inc(f[ia], recvbuf[i]);
136 /* Prepare pulse left by copying to send buffer */
137 for (i = 0; i < vsitecomm->left_import_nconstruct; i++)
139 ia = vsitecomm->left_import_construct[i];
140 copy_rvec(f[ia], sendbuf[i]);
141 clear_rvec(f[ia]); /* Zero it here after moving, just to simplify debug book-keeping... */
144 /* Pulse coordinates left */
145 gmx_tx_rx_real(cr, GMX_LEFT, (real *)sendbuf, 3*vsitecomm->left_import_nconstruct, GMX_RIGHT, (real *)recvbuf, 3*vsitecomm->right_export_nconstruct);
147 /* Copy from receive buffer to coordinate array */
148 for (i = 0; i < vsitecomm->right_export_nconstruct; i++)
150 ia = vsitecomm->right_export_construct[i];
151 rvec_inc(f[ia], recvbuf[i]);
154 /* All forces are now on the home processors */
159 pd_clear_nonlocal_constructs(t_comm_vsites *vsitecomm, rvec f[])
163 for (i = 0; i < vsitecomm->left_import_nconstruct; i++)
165 ia = vsitecomm->left_import_construct[i];
168 for (i = 0; i < vsitecomm->right_import_nconstruct; i++)
170 ia = vsitecomm->right_import_construct[i];
177 static int pbc_rvec_sub(const t_pbc *pbc, const rvec xi, const rvec xj, rvec dx)
181 return pbc_dx_aiuc(pbc, xi, xj, dx);
185 rvec_sub(xi, xj, dx);
190 /* Vsite construction routines */
192 static void constr_vsite2(rvec xi, rvec xj, rvec x, real a, t_pbc *pbc)
202 pbc_dx_aiuc(pbc, xj, xi, dx);
203 x[XX] = xi[XX] + a*dx[XX];
204 x[YY] = xi[YY] + a*dx[YY];
205 x[ZZ] = xi[ZZ] + a*dx[ZZ];
209 x[XX] = b*xi[XX] + a*xj[XX];
210 x[YY] = b*xi[YY] + a*xj[YY];
211 x[ZZ] = b*xi[ZZ] + a*xj[ZZ];
215 /* TOTAL: 10 flops */
218 static void constr_vsite3(rvec xi, rvec xj, rvec xk, rvec x, real a, real b,
229 pbc_dx_aiuc(pbc, xj, xi, dxj);
230 pbc_dx_aiuc(pbc, xk, xi, dxk);
231 x[XX] = xi[XX] + a*dxj[XX] + b*dxk[XX];
232 x[YY] = xi[YY] + a*dxj[YY] + b*dxk[YY];
233 x[ZZ] = xi[ZZ] + a*dxj[ZZ] + b*dxk[ZZ];
237 x[XX] = c*xi[XX] + a*xj[XX] + b*xk[XX];
238 x[YY] = c*xi[YY] + a*xj[YY] + b*xk[YY];
239 x[ZZ] = c*xi[ZZ] + a*xj[ZZ] + b*xk[ZZ];
243 /* TOTAL: 17 flops */
246 static void constr_vsite3FD(rvec xi, rvec xj, rvec xk, rvec x, real a, real b,
252 pbc_rvec_sub(pbc, xj, xi, xij);
253 pbc_rvec_sub(pbc, xk, xj, xjk);
256 /* temp goes from i to a point on the line jk */
257 temp[XX] = xij[XX] + a*xjk[XX];
258 temp[YY] = xij[YY] + a*xjk[YY];
259 temp[ZZ] = xij[ZZ] + a*xjk[ZZ];
262 c = b*gmx_invsqrt(iprod(temp, temp));
265 x[XX] = xi[XX] + c*temp[XX];
266 x[YY] = xi[YY] + c*temp[YY];
267 x[ZZ] = xi[ZZ] + c*temp[ZZ];
270 /* TOTAL: 34 flops */
273 static void constr_vsite3FAD(rvec xi, rvec xj, rvec xk, rvec x, real a, real b, t_pbc *pbc)
276 real a1, b1, c1, invdij;
278 pbc_rvec_sub(pbc, xj, xi, xij);
279 pbc_rvec_sub(pbc, xk, xj, xjk);
282 invdij = gmx_invsqrt(iprod(xij, xij));
283 c1 = invdij * invdij * iprod(xij, xjk);
284 xp[XX] = xjk[XX] - c1*xij[XX];
285 xp[YY] = xjk[YY] - c1*xij[YY];
286 xp[ZZ] = xjk[ZZ] - c1*xij[ZZ];
288 b1 = b*gmx_invsqrt(iprod(xp, xp));
291 x[XX] = xi[XX] + a1*xij[XX] + b1*xp[XX];
292 x[YY] = xi[YY] + a1*xij[YY] + b1*xp[YY];
293 x[ZZ] = xi[ZZ] + a1*xij[ZZ] + b1*xp[ZZ];
296 /* TOTAL: 63 flops */
299 static void constr_vsite3OUT(rvec xi, rvec xj, rvec xk, rvec x,
300 real a, real b, real c, t_pbc *pbc)
304 pbc_rvec_sub(pbc, xj, xi, xij);
305 pbc_rvec_sub(pbc, xk, xi, xik);
306 cprod(xij, xik, temp);
309 x[XX] = xi[XX] + a*xij[XX] + b*xik[XX] + c*temp[XX];
310 x[YY] = xi[YY] + a*xij[YY] + b*xik[YY] + c*temp[YY];
311 x[ZZ] = xi[ZZ] + a*xij[ZZ] + b*xik[ZZ] + c*temp[ZZ];
314 /* TOTAL: 33 flops */
317 static void constr_vsite4FD(rvec xi, rvec xj, rvec xk, rvec xl, rvec x,
318 real a, real b, real c, t_pbc *pbc)
320 rvec xij, xjk, xjl, temp;
323 pbc_rvec_sub(pbc, xj, xi, xij);
324 pbc_rvec_sub(pbc, xk, xj, xjk);
325 pbc_rvec_sub(pbc, xl, xj, xjl);
328 /* temp goes from i to a point on the plane jkl */
329 temp[XX] = xij[XX] + a*xjk[XX] + b*xjl[XX];
330 temp[YY] = xij[YY] + a*xjk[YY] + b*xjl[YY];
331 temp[ZZ] = xij[ZZ] + a*xjk[ZZ] + b*xjl[ZZ];
334 d = c*gmx_invsqrt(iprod(temp, temp));
337 x[XX] = xi[XX] + d*temp[XX];
338 x[YY] = xi[YY] + d*temp[YY];
339 x[ZZ] = xi[ZZ] + d*temp[ZZ];
342 /* TOTAL: 43 flops */
346 static void constr_vsite4FDN(rvec xi, rvec xj, rvec xk, rvec xl, rvec x,
347 real a, real b, real c, t_pbc *pbc)
349 rvec xij, xik, xil, ra, rb, rja, rjb, rm;
352 pbc_rvec_sub(pbc, xj, xi, xij);
353 pbc_rvec_sub(pbc, xk, xi, xik);
354 pbc_rvec_sub(pbc, xl, xi, xil);
367 rvec_sub(ra, xij, rja);
368 rvec_sub(rb, xij, rjb);
374 d = c*gmx_invsqrt(norm2(rm));
377 x[XX] = xi[XX] + d*rm[XX];
378 x[YY] = xi[YY] + d*rm[YY];
379 x[ZZ] = xi[ZZ] + d*rm[ZZ];
382 /* TOTAL: 47 flops */
386 static int constr_vsiten(t_iatom *ia, t_iparams ip[],
394 n3 = 3*ip[ia[0]].vsiten.n;
397 copy_rvec(x[ai], x1);
399 for (i = 3; i < n3; i += 3)
402 a = ip[ia[i]].vsiten.a;
405 pbc_dx_aiuc(pbc, x[ai], x1, dx);
409 rvec_sub(x[ai], x1, dx);
411 dsum[XX] += a*dx[XX];
412 dsum[YY] += a*dx[YY];
413 dsum[ZZ] += a*dx[ZZ];
417 x[av][XX] = x1[XX] + dsum[XX];
418 x[av][YY] = x1[YY] + dsum[YY];
419 x[av][ZZ] = x1[ZZ] + dsum[ZZ];
425 void construct_vsites_thread(gmx_vsite_t *vsite,
426 rvec x[], t_nrnb *nrnb,
428 t_iparams ip[], t_ilist ilist[],
432 rvec xpbc, xv, vv, dx;
433 real a1, b1, c1, inv_dt;
434 int i, inc, ii, nra, nr, tp, ftype;
435 t_iatom avsite, ai, aj, ak, al, pbc_atom;
438 int *vsite_pbc, ishift;
439 rvec reftmp, vtmp, rtmp;
450 bPBCAll = (pbc_null != NULL && !vsite->bHaveChargeGroups);
454 for (ftype = 0; (ftype < F_NRE); ftype++)
456 if ((interaction_function[ftype].flags & IF_VSITE) &&
459 nra = interaction_function[ftype].nratoms;
461 nr = ilist[ftype].nr;
462 ia = ilist[ftype].iatoms;
466 pbc_null2 = pbc_null;
468 else if (pbc_null != NULL)
470 vsite_pbc = vsite->vsite_pbc_loc[ftype-F_VSITE2];
473 for (i = 0; i < nr; )
477 /* The vsite and constructing atoms */
481 /* Constants for constructing vsites */
483 /* Check what kind of pbc we need to use */
486 /* No charge groups, vsite follows its own pbc */
488 copy_rvec(x[avsite], xpbc);
490 else if (vsite_pbc != NULL)
492 pbc_atom = vsite_pbc[i/(1+nra)];
497 /* We need to copy the coordinates here,
498 * single for single atom cg's pbc_atom
499 * is the vsite itself.
501 copy_rvec(x[pbc_atom], xpbc);
503 pbc_null2 = pbc_null;
514 /* Copy the old position */
515 copy_rvec(x[avsite], xv);
517 /* Construct the vsite depending on type */
522 constr_vsite2(x[ai], x[aj], x[avsite], a1, pbc_null2);
528 constr_vsite3(x[ai], x[aj], x[ak], x[avsite], a1, b1, pbc_null2);
534 constr_vsite3FD(x[ai], x[aj], x[ak], x[avsite], a1, b1, pbc_null2);
540 constr_vsite3FAD(x[ai], x[aj], x[ak], x[avsite], a1, b1, pbc_null2);
547 constr_vsite3OUT(x[ai], x[aj], x[ak], x[avsite], a1, b1, c1, pbc_null2);
555 constr_vsite4FD(x[ai], x[aj], x[ak], x[al], x[avsite], a1, b1, c1,
564 constr_vsite4FDN(x[ai], x[aj], x[ak], x[al], x[avsite], a1, b1, c1,
568 inc = constr_vsiten(ia, ip, x, pbc_null2);
571 gmx_fatal(FARGS, "No such vsite type %d in %s, line %d",
572 ftype, __FILE__, __LINE__);
577 /* Match the pbc of this vsite to the rest of its charge group */
578 ishift = pbc_dx_aiuc(pbc_null, x[avsite], xpbc, dx);
579 if (ishift != CENTRAL)
581 rvec_add(xpbc, dx, x[avsite]);
586 /* Calculate velocity of vsite... */
587 rvec_sub(x[avsite], xv, vv);
588 svmul(inv_dt, vv, v[avsite]);
591 /* Increment loop variables */
599 void construct_vsites(FILE *log, gmx_vsite_t *vsite,
600 rvec x[], t_nrnb *nrnb,
602 t_iparams ip[], t_ilist ilist[],
603 int ePBC, gmx_bool bMolPBC, t_graph *graph,
604 t_commrec *cr, matrix box)
606 t_pbc pbc, *pbc_null;
610 bDomDec = cr && DOMAINDECOMP(cr);
612 /* We only need to do pbc when we have inter-cg vsites */
613 if (ePBC != epbcNONE && (bDomDec || bMolPBC) && vsite->n_intercg_vsite)
615 /* This is wasting some CPU time as we now do this multiple times
616 * per MD step. But how often do we have vsites with full pbc?
618 pbc_null = set_pbc_dd(&pbc, ePBC, cr != NULL ? cr->dd : NULL, FALSE, box);
629 dd_move_x_vsites(cr->dd, box, x);
631 else if (vsite->bPDvsitecomm)
633 /* I'm not sure whether the periodicity and shift are guaranteed
634 * to be consistent between different nodes when running e.g. polymers
635 * in parallel. In this special case we thus unshift/shift,
636 * but only when necessary. This is to make sure the coordinates
637 * we move don't end up a box away...
641 unshift_self(graph, box, x);
644 move_construct_x(vsite->vsitecomm, x, cr);
648 shift_self(graph, box, x);
653 if (vsite->nthreads == 1)
655 construct_vsites_thread(vsite,
662 #pragma omp parallel num_threads(vsite->nthreads)
664 construct_vsites_thread(vsite,
666 ip, vsite->tdata[gmx_omp_get_thread_num()].ilist,
669 /* Now we can construct the vsites that might depend on other vsites */
670 construct_vsites_thread(vsite,
672 ip, vsite->tdata[vsite->nthreads].ilist,
677 static void spread_vsite2(t_iatom ia[], real a,
678 rvec x[], rvec f[], rvec fshift[],
679 t_pbc *pbc, t_graph *g)
691 svmul(1-a, f[av], fi);
692 svmul( a, f[av], fj);
701 ivec_sub(SHIFT_IVEC(g, ai), SHIFT_IVEC(g, av), di);
703 ivec_sub(SHIFT_IVEC(g, ai), SHIFT_IVEC(g, aj), di);
708 siv = pbc_dx_aiuc(pbc, x[ai], x[av], dx);
709 sij = pbc_dx_aiuc(pbc, x[ai], x[aj], dx);
717 if (fshift && (siv != CENTRAL || sij != CENTRAL))
719 rvec_inc(fshift[siv], f[av]);
720 rvec_dec(fshift[CENTRAL], fi);
721 rvec_dec(fshift[sij], fj);
724 /* TOTAL: 13 flops */
727 void construct_vsites_mtop(FILE *log, gmx_vsite_t *vsite,
728 gmx_mtop_t *mtop, rvec x[])
731 gmx_molblock_t *molb;
735 for (mb = 0; mb < mtop->nmolblock; mb++)
737 molb = &mtop->molblock[mb];
738 molt = &mtop->moltype[molb->type];
739 for (mol = 0; mol < molb->nmol; mol++)
741 construct_vsites(log, vsite, x+as, NULL, 0.0, NULL,
742 mtop->ffparams.iparams, molt->ilist,
743 epbcNONE, TRUE, NULL, NULL, NULL);
744 as += molt->atoms.nr;
749 static void spread_vsite3(t_iatom ia[], real a, real b,
750 rvec x[], rvec f[], rvec fshift[],
751 t_pbc *pbc, t_graph *g)
754 atom_id av, ai, aj, ak;
763 svmul(1-a-b, f[av], fi);
764 svmul( a, f[av], fj);
765 svmul( b, f[av], fk);
775 ivec_sub(SHIFT_IVEC(g, ai), SHIFT_IVEC(g, ia[1]), di);
777 ivec_sub(SHIFT_IVEC(g, ai), SHIFT_IVEC(g, aj), di);
779 ivec_sub(SHIFT_IVEC(g, ai), SHIFT_IVEC(g, ak), di);
784 siv = pbc_dx_aiuc(pbc, x[ai], x[av], dx);
785 sij = pbc_dx_aiuc(pbc, x[ai], x[aj], dx);
786 sik = pbc_dx_aiuc(pbc, x[ai], x[ak], dx);
795 if (fshift && (siv != CENTRAL || sij != CENTRAL || sik != CENTRAL))
797 rvec_inc(fshift[siv], f[av]);
798 rvec_dec(fshift[CENTRAL], fi);
799 rvec_dec(fshift[sij], fj);
800 rvec_dec(fshift[sik], fk);
803 /* TOTAL: 20 flops */
806 static void spread_vsite3FD(t_iatom ia[], real a, real b,
807 rvec x[], rvec f[], rvec fshift[],
808 gmx_bool VirCorr, matrix dxdf,
809 t_pbc *pbc, t_graph *g)
811 real fx, fy, fz, c, invl, fproj, a1;
812 rvec xvi, xij, xjk, xix, fv, temp;
813 t_iatom av, ai, aj, ak;
814 int svi, sji, skj, d;
821 copy_rvec(f[av], fv);
823 sji = pbc_rvec_sub(pbc, x[aj], x[ai], xij);
824 skj = pbc_rvec_sub(pbc, x[ak], x[aj], xjk);
827 /* xix goes from i to point x on the line jk */
828 xix[XX] = xij[XX]+a*xjk[XX];
829 xix[YY] = xij[YY]+a*xjk[YY];
830 xix[ZZ] = xij[ZZ]+a*xjk[ZZ];
833 invl = gmx_invsqrt(iprod(xix, xix));
837 fproj = iprod(xix, fv)*invl*invl; /* = (xix . f)/(xix . xix) */
839 temp[XX] = c*(fv[XX]-fproj*xix[XX]);
840 temp[YY] = c*(fv[YY]-fproj*xix[YY]);
841 temp[ZZ] = c*(fv[ZZ]-fproj*xix[ZZ]);
844 /* c is already calculated in constr_vsite3FD
845 storing c somewhere will save 26 flops! */
848 f[ai][XX] += fv[XX] - temp[XX];
849 f[ai][YY] += fv[YY] - temp[YY];
850 f[ai][ZZ] += fv[ZZ] - temp[ZZ];
851 f[aj][XX] += a1*temp[XX];
852 f[aj][YY] += a1*temp[YY];
853 f[aj][ZZ] += a1*temp[ZZ];
854 f[ak][XX] += a*temp[XX];
855 f[ak][YY] += a*temp[YY];
856 f[ak][ZZ] += a*temp[ZZ];
861 ivec_sub(SHIFT_IVEC(g, ia[1]), SHIFT_IVEC(g, ai), di);
863 ivec_sub(SHIFT_IVEC(g, aj), SHIFT_IVEC(g, ai), di);
865 ivec_sub(SHIFT_IVEC(g, ak), SHIFT_IVEC(g, aj), di);
870 svi = pbc_rvec_sub(pbc, x[av], x[ai], xvi);
877 if (fshift && (svi != CENTRAL || sji != CENTRAL || skj != CENTRAL))
879 rvec_dec(fshift[svi], fv);
880 fshift[CENTRAL][XX] += fv[XX] - (1 + a)*temp[XX];
881 fshift[CENTRAL][YY] += fv[YY] - (1 + a)*temp[YY];
882 fshift[CENTRAL][ZZ] += fv[ZZ] - (1 + a)*temp[ZZ];
883 fshift[ sji][XX] += temp[XX];
884 fshift[ sji][YY] += temp[YY];
885 fshift[ sji][ZZ] += temp[ZZ];
886 fshift[ skj][XX] += a*temp[XX];
887 fshift[ skj][YY] += a*temp[YY];
888 fshift[ skj][ZZ] += a*temp[ZZ];
893 /* When VirCorr=TRUE, the virial for the current forces is not
894 * calculated from the redistributed forces. This means that
895 * the effect of non-linear virtual site constructions on the virial
896 * needs to be added separately. This contribution can be calculated
897 * in many ways, but the simplest and cheapest way is to use
898 * the first constructing atom ai as a reference position in space:
899 * subtract (xv-xi)*fv and add (xj-xi)*fj + (xk-xi)*fk.
904 pbc_rvec_sub(pbc, x[av], x[ai], xiv);
906 for (i = 0; i < DIM; i++)
908 for (j = 0; j < DIM; j++)
910 /* As xix is a linear combination of j and k, use that here */
911 dxdf[i][j] += -xiv[i]*fv[j] + xix[i]*temp[j];
916 /* TOTAL: 61 flops */
919 static void spread_vsite3FAD(t_iatom ia[], real a, real b,
920 rvec x[], rvec f[], rvec fshift[],
921 gmx_bool VirCorr, matrix dxdf,
922 t_pbc *pbc, t_graph *g)
924 rvec xvi, xij, xjk, xperp, Fpij, Fppp, fv, f1, f2, f3;
925 real a1, b1, c1, c2, invdij, invdij2, invdp, fproj;
926 t_iatom av, ai, aj, ak;
927 int svi, sji, skj, d;
934 copy_rvec(f[ia[1]], fv);
936 sji = pbc_rvec_sub(pbc, x[aj], x[ai], xij);
937 skj = pbc_rvec_sub(pbc, x[ak], x[aj], xjk);
940 invdij = gmx_invsqrt(iprod(xij, xij));
941 invdij2 = invdij * invdij;
942 c1 = iprod(xij, xjk) * invdij2;
943 xperp[XX] = xjk[XX] - c1*xij[XX];
944 xperp[YY] = xjk[YY] - c1*xij[YY];
945 xperp[ZZ] = xjk[ZZ] - c1*xij[ZZ];
946 /* xperp in plane ijk, perp. to ij */
947 invdp = gmx_invsqrt(iprod(xperp, xperp));
952 /* a1, b1 and c1 are already calculated in constr_vsite3FAD
953 storing them somewhere will save 45 flops! */
955 fproj = iprod(xij, fv)*invdij2;
956 svmul(fproj, xij, Fpij); /* proj. f on xij */
957 svmul(iprod(xperp, fv)*invdp*invdp, xperp, Fppp); /* proj. f on xperp */
958 svmul(b1*fproj, xperp, f3);
961 rvec_sub(fv, Fpij, f1); /* f1 = f - Fpij */
962 rvec_sub(f1, Fppp, f2); /* f2 = f - Fpij - Fppp */
963 for (d = 0; (d < DIM); d++)
971 f[ai][XX] += fv[XX] - f1[XX] + c1*f2[XX] + f3[XX];
972 f[ai][YY] += fv[YY] - f1[YY] + c1*f2[YY] + f3[YY];
973 f[ai][ZZ] += fv[ZZ] - f1[ZZ] + c1*f2[ZZ] + f3[ZZ];
974 f[aj][XX] += f1[XX] - c2*f2[XX] - f3[XX];
975 f[aj][YY] += f1[YY] - c2*f2[YY] - f3[YY];
976 f[aj][ZZ] += f1[ZZ] - c2*f2[ZZ] - f3[ZZ];
984 ivec_sub(SHIFT_IVEC(g, ia[1]), SHIFT_IVEC(g, ai), di);
986 ivec_sub(SHIFT_IVEC(g, aj), SHIFT_IVEC(g, ai), di);
988 ivec_sub(SHIFT_IVEC(g, ak), SHIFT_IVEC(g, aj), di);
993 svi = pbc_rvec_sub(pbc, x[av], x[ai], xvi);
1000 if (fshift && (svi != CENTRAL || sji != CENTRAL || skj != CENTRAL))
1002 rvec_dec(fshift[svi], fv);
1003 fshift[CENTRAL][XX] += fv[XX] - f1[XX] - (1-c1)*f2[XX] + f3[XX];
1004 fshift[CENTRAL][YY] += fv[YY] - f1[YY] - (1-c1)*f2[YY] + f3[YY];
1005 fshift[CENTRAL][ZZ] += fv[ZZ] - f1[ZZ] - (1-c1)*f2[ZZ] + f3[ZZ];
1006 fshift[ sji][XX] += f1[XX] - c1 *f2[XX] - f3[XX];
1007 fshift[ sji][YY] += f1[YY] - c1 *f2[YY] - f3[YY];
1008 fshift[ sji][ZZ] += f1[ZZ] - c1 *f2[ZZ] - f3[ZZ];
1009 fshift[ skj][XX] += f2[XX];
1010 fshift[ skj][YY] += f2[YY];
1011 fshift[ skj][ZZ] += f2[ZZ];
1019 pbc_rvec_sub(pbc, x[av], x[ai], xiv);
1021 for (i = 0; i < DIM; i++)
1023 for (j = 0; j < DIM; j++)
1025 /* Note that xik=xij+xjk, so we have to add xij*f2 */
1028 + xij[i]*(f1[j] + (1 - c2)*f2[j] - f3[j])
1034 /* TOTAL: 113 flops */
1037 static void spread_vsite3OUT(t_iatom ia[], real a, real b, real c,
1038 rvec x[], rvec f[], rvec fshift[],
1039 gmx_bool VirCorr, matrix dxdf,
1040 t_pbc *pbc, t_graph *g)
1042 rvec xvi, xij, xik, fv, fj, fk;
1044 atom_id av, ai, aj, ak;
1053 sji = pbc_rvec_sub(pbc, x[aj], x[ai], xij);
1054 ski = pbc_rvec_sub(pbc, x[ak], x[ai], xik);
1057 copy_rvec(f[av], fv);
1064 fj[XX] = a*fv[XX] - xik[ZZ]*cfy + xik[YY]*cfz;
1065 fj[YY] = xik[ZZ]*cfx + a*fv[YY] - xik[XX]*cfz;
1066 fj[ZZ] = -xik[YY]*cfx + xik[XX]*cfy + a*fv[ZZ];
1068 fk[XX] = b*fv[XX] + xij[ZZ]*cfy - xij[YY]*cfz;
1069 fk[YY] = -xij[ZZ]*cfx + b*fv[YY] + xij[XX]*cfz;
1070 fk[ZZ] = xij[YY]*cfx - xij[XX]*cfy + b*fv[ZZ];
1073 f[ai][XX] += fv[XX] - fj[XX] - fk[XX];
1074 f[ai][YY] += fv[YY] - fj[YY] - fk[YY];
1075 f[ai][ZZ] += fv[ZZ] - fj[ZZ] - fk[ZZ];
1076 rvec_inc(f[aj], fj);
1077 rvec_inc(f[ak], fk);
1082 ivec_sub(SHIFT_IVEC(g, ia[1]), SHIFT_IVEC(g, ai), di);
1084 ivec_sub(SHIFT_IVEC(g, aj), SHIFT_IVEC(g, ai), di);
1086 ivec_sub(SHIFT_IVEC(g, ak), SHIFT_IVEC(g, ai), di);
1091 svi = pbc_rvec_sub(pbc, x[av], x[ai], xvi);
1098 if (fshift && (svi != CENTRAL || sji != CENTRAL || ski != CENTRAL))
1100 rvec_dec(fshift[svi], fv);
1101 fshift[CENTRAL][XX] += fv[XX] - fj[XX] - fk[XX];
1102 fshift[CENTRAL][YY] += fv[YY] - fj[YY] - fk[YY];
1103 fshift[CENTRAL][ZZ] += fv[ZZ] - fj[ZZ] - fk[ZZ];
1104 rvec_inc(fshift[sji], fj);
1105 rvec_inc(fshift[ski], fk);
1113 pbc_rvec_sub(pbc, x[av], x[ai], xiv);
1115 for (i = 0; i < DIM; i++)
1117 for (j = 0; j < DIM; j++)
1119 dxdf[i][j] += -xiv[i]*fv[j] + xij[i]*fj[j] + xik[i]*fk[j];
1124 /* TOTAL: 54 flops */
1127 static void spread_vsite4FD(t_iatom ia[], real a, real b, real c,
1128 rvec x[], rvec f[], rvec fshift[],
1129 gmx_bool VirCorr, matrix dxdf,
1130 t_pbc *pbc, t_graph *g)
1132 real d, invl, fproj, a1;
1133 rvec xvi, xij, xjk, xjl, xix, fv, temp;
1134 atom_id av, ai, aj, ak, al;
1136 int svi, sji, skj, slj, m;
1144 sji = pbc_rvec_sub(pbc, x[aj], x[ai], xij);
1145 skj = pbc_rvec_sub(pbc, x[ak], x[aj], xjk);
1146 slj = pbc_rvec_sub(pbc, x[al], x[aj], xjl);
1149 /* xix goes from i to point x on the plane jkl */
1150 for (m = 0; m < DIM; m++)
1152 xix[m] = xij[m] + a*xjk[m] + b*xjl[m];
1156 invl = gmx_invsqrt(iprod(xix, xix));
1158 /* 4 + ?10? flops */
1160 copy_rvec(f[av], fv);
1162 fproj = iprod(xix, fv)*invl*invl; /* = (xix . f)/(xix . xix) */
1164 for (m = 0; m < DIM; m++)
1166 temp[m] = d*(fv[m] - fproj*xix[m]);
1170 /* c is already calculated in constr_vsite3FD
1171 storing c somewhere will save 35 flops! */
1174 for (m = 0; m < DIM; m++)
1176 f[ai][m] += fv[m] - temp[m];
1177 f[aj][m] += a1*temp[m];
1178 f[ak][m] += a*temp[m];
1179 f[al][m] += b*temp[m];
1185 ivec_sub(SHIFT_IVEC(g, ia[1]), SHIFT_IVEC(g, ai), di);
1187 ivec_sub(SHIFT_IVEC(g, aj), SHIFT_IVEC(g, ai), di);
1189 ivec_sub(SHIFT_IVEC(g, ak), SHIFT_IVEC(g, aj), di);
1191 ivec_sub(SHIFT_IVEC(g, al), SHIFT_IVEC(g, aj), di);
1196 svi = pbc_rvec_sub(pbc, x[av], x[ai], xvi);
1204 (svi != CENTRAL || sji != CENTRAL || skj != CENTRAL || slj != CENTRAL))
1206 rvec_dec(fshift[svi], fv);
1207 for (m = 0; m < DIM; m++)
1209 fshift[CENTRAL][m] += fv[m] - (1 + a + b)*temp[m];
1210 fshift[ sji][m] += temp[m];
1211 fshift[ skj][m] += a*temp[m];
1212 fshift[ slj][m] += b*temp[m];
1221 pbc_rvec_sub(pbc, x[av], x[ai], xiv);
1223 for (i = 0; i < DIM; i++)
1225 for (j = 0; j < DIM; j++)
1227 dxdf[i][j] += -xiv[i]*fv[j] + xix[i]*temp[j];
1232 /* TOTAL: 77 flops */
1236 static void spread_vsite4FDN(t_iatom ia[], real a, real b, real c,
1237 rvec x[], rvec f[], rvec fshift[],
1238 gmx_bool VirCorr, matrix dxdf,
1239 t_pbc *pbc, t_graph *g)
1241 rvec xvi, xij, xik, xil, ra, rb, rja, rjb, rab, rm, rt;
1242 rvec fv, fj, fk, fl;
1246 int av, ai, aj, ak, al;
1247 int svi, sij, sik, sil;
1249 /* DEBUG: check atom indices */
1256 copy_rvec(f[av], fv);
1258 sij = pbc_rvec_sub(pbc, x[aj], x[ai], xij);
1259 sik = pbc_rvec_sub(pbc, x[ak], x[ai], xik);
1260 sil = pbc_rvec_sub(pbc, x[al], x[ai], xil);
1273 rvec_sub(ra, xij, rja);
1274 rvec_sub(rb, xij, rjb);
1275 rvec_sub(rb, ra, rab);
1278 cprod(rja, rjb, rm);
1281 invrm = gmx_invsqrt(norm2(rm));
1282 denom = invrm*invrm;
1285 cfx = c*invrm*fv[XX];
1286 cfy = c*invrm*fv[YY];
1287 cfz = c*invrm*fv[ZZ];
1298 fj[XX] = ( -rm[XX]*rt[XX]) * cfx + ( rab[ZZ]-rm[YY]*rt[XX]) * cfy + (-rab[YY]-rm[ZZ]*rt[XX]) * cfz;
1299 fj[YY] = (-rab[ZZ]-rm[XX]*rt[YY]) * cfx + ( -rm[YY]*rt[YY]) * cfy + ( rab[XX]-rm[ZZ]*rt[YY]) * cfz;
1300 fj[ZZ] = ( rab[YY]-rm[XX]*rt[ZZ]) * cfx + (-rab[XX]-rm[YY]*rt[ZZ]) * cfy + ( -rm[ZZ]*rt[ZZ]) * cfz;
1311 fk[XX] = ( -rm[XX]*rt[XX]) * cfx + (-a*rjb[ZZ]-rm[YY]*rt[XX]) * cfy + ( a*rjb[YY]-rm[ZZ]*rt[XX]) * cfz;
1312 fk[YY] = ( a*rjb[ZZ]-rm[XX]*rt[YY]) * cfx + ( -rm[YY]*rt[YY]) * cfy + (-a*rjb[XX]-rm[ZZ]*rt[YY]) * cfz;
1313 fk[ZZ] = (-a*rjb[YY]-rm[XX]*rt[ZZ]) * cfx + ( a*rjb[XX]-rm[YY]*rt[ZZ]) * cfy + ( -rm[ZZ]*rt[ZZ]) * cfz;
1324 fl[XX] = ( -rm[XX]*rt[XX]) * cfx + ( b*rja[ZZ]-rm[YY]*rt[XX]) * cfy + (-b*rja[YY]-rm[ZZ]*rt[XX]) * cfz;
1325 fl[YY] = (-b*rja[ZZ]-rm[XX]*rt[YY]) * cfx + ( -rm[YY]*rt[YY]) * cfy + ( b*rja[XX]-rm[ZZ]*rt[YY]) * cfz;
1326 fl[ZZ] = ( b*rja[YY]-rm[XX]*rt[ZZ]) * cfx + (-b*rja[XX]-rm[YY]*rt[ZZ]) * cfy + ( -rm[ZZ]*rt[ZZ]) * cfz;
1329 f[ai][XX] += fv[XX] - fj[XX] - fk[XX] - fl[XX];
1330 f[ai][YY] += fv[YY] - fj[YY] - fk[YY] - fl[YY];
1331 f[ai][ZZ] += fv[ZZ] - fj[ZZ] - fk[ZZ] - fl[ZZ];
1332 rvec_inc(f[aj], fj);
1333 rvec_inc(f[ak], fk);
1334 rvec_inc(f[al], fl);
1339 ivec_sub(SHIFT_IVEC(g, av), SHIFT_IVEC(g, ai), di);
1341 ivec_sub(SHIFT_IVEC(g, aj), SHIFT_IVEC(g, ai), di);
1343 ivec_sub(SHIFT_IVEC(g, ak), SHIFT_IVEC(g, ai), di);
1345 ivec_sub(SHIFT_IVEC(g, al), SHIFT_IVEC(g, ai), di);
1350 svi = pbc_rvec_sub(pbc, x[av], x[ai], xvi);
1357 if (fshift && (svi != CENTRAL || sij != CENTRAL || sik != CENTRAL || sil != CENTRAL))
1359 rvec_dec(fshift[svi], fv);
1360 fshift[CENTRAL][XX] += fv[XX] - fj[XX] - fk[XX] - fl[XX];
1361 fshift[CENTRAL][YY] += fv[YY] - fj[YY] - fk[YY] - fl[YY];
1362 fshift[CENTRAL][ZZ] += fv[ZZ] - fj[ZZ] - fk[ZZ] - fl[ZZ];
1363 rvec_inc(fshift[sij], fj);
1364 rvec_inc(fshift[sik], fk);
1365 rvec_inc(fshift[sil], fl);
1373 pbc_rvec_sub(pbc, x[av], x[ai], xiv);
1375 for (i = 0; i < DIM; i++)
1377 for (j = 0; j < DIM; j++)
1379 dxdf[i][j] += -xiv[i]*fv[j] + xij[i]*fj[j] + xik[i]*fk[j] + xil[i]*fl[j];
1384 /* Total: 207 flops (Yuck!) */
1388 static int spread_vsiten(t_iatom ia[], t_iparams ip[],
1389 rvec x[], rvec f[], rvec fshift[],
1390 t_pbc *pbc, t_graph *g)
1398 n3 = 3*ip[ia[0]].vsiten.n;
1400 copy_rvec(x[av], xv);
1402 for (i = 0; i < n3; i += 3)
1407 ivec_sub(SHIFT_IVEC(g, ai), SHIFT_IVEC(g, av), di);
1412 siv = pbc_dx_aiuc(pbc, x[ai], xv, dx);
1418 a = ip[ia[i]].vsiten.a;
1419 svmul(a, f[av], fi);
1420 rvec_inc(f[ai], fi);
1421 if (fshift && siv != CENTRAL)
1423 rvec_inc(fshift[siv], fi);
1424 rvec_dec(fshift[CENTRAL], fi);
1433 static int vsite_count(const t_ilist *ilist, int ftype)
1435 if (ftype == F_VSITEN)
1437 return ilist[ftype].nr/3;
1441 return ilist[ftype].nr/(1 + interaction_function[ftype].nratoms);
1445 static void spread_vsite_f_thread(gmx_vsite_t *vsite,
1446 rvec x[], rvec f[], rvec *fshift,
1447 gmx_bool VirCorr, matrix dxdf,
1448 t_iparams ip[], t_ilist ilist[],
1449 t_graph *g, t_pbc *pbc_null)
1453 int i, inc, m, nra, nr, tp, ftype;
1463 bPBCAll = (pbc_null != NULL && !vsite->bHaveChargeGroups);
1465 /* this loop goes backwards to be able to build *
1466 * higher type vsites from lower types */
1469 for (ftype = F_NRE-1; (ftype >= 0); ftype--)
1471 if ((interaction_function[ftype].flags & IF_VSITE) &&
1472 ilist[ftype].nr > 0)
1474 nra = interaction_function[ftype].nratoms;
1476 nr = ilist[ftype].nr;
1477 ia = ilist[ftype].iatoms;
1481 pbc_null2 = pbc_null;
1483 else if (pbc_null != NULL)
1485 vsite_pbc = vsite->vsite_pbc_loc[ftype-F_VSITE2];
1488 for (i = 0; i < nr; )
1490 if (vsite_pbc != NULL)
1492 if (vsite_pbc[i/(1+nra)] > -2)
1494 pbc_null2 = pbc_null;
1504 /* Constants for constructing */
1505 a1 = ip[tp].vsite.a;
1506 /* Construct the vsite depending on type */
1510 spread_vsite2(ia, a1, x, f, fshift, pbc_null2, g);
1513 b1 = ip[tp].vsite.b;
1514 spread_vsite3(ia, a1, b1, x, f, fshift, pbc_null2, g);
1517 b1 = ip[tp].vsite.b;
1518 spread_vsite3FD(ia, a1, b1, x, f, fshift, VirCorr, dxdf, pbc_null2, g);
1521 b1 = ip[tp].vsite.b;
1522 spread_vsite3FAD(ia, a1, b1, x, f, fshift, VirCorr, dxdf, pbc_null2, g);
1525 b1 = ip[tp].vsite.b;
1526 c1 = ip[tp].vsite.c;
1527 spread_vsite3OUT(ia, a1, b1, c1, x, f, fshift, VirCorr, dxdf, pbc_null2, g);
1530 b1 = ip[tp].vsite.b;
1531 c1 = ip[tp].vsite.c;
1532 spread_vsite4FD(ia, a1, b1, c1, x, f, fshift, VirCorr, dxdf, pbc_null2, g);
1535 b1 = ip[tp].vsite.b;
1536 c1 = ip[tp].vsite.c;
1537 spread_vsite4FDN(ia, a1, b1, c1, x, f, fshift, VirCorr, dxdf, pbc_null2, g);
1540 inc = spread_vsiten(ia, ip, x, f, fshift, pbc_null2, g);
1543 gmx_fatal(FARGS, "No such vsite type %d in %s, line %d",
1544 ftype, __FILE__, __LINE__);
1546 clear_rvec(f[ia[1]]);
1548 /* Increment loop variables */
1556 void spread_vsite_f(FILE *log, gmx_vsite_t *vsite,
1557 rvec x[], rvec f[], rvec *fshift,
1558 gmx_bool VirCorr, matrix vir,
1559 t_nrnb *nrnb, t_idef *idef,
1560 int ePBC, gmx_bool bMolPBC, t_graph *g, matrix box,
1563 t_pbc pbc, *pbc_null;
1566 /* We only need to do pbc when we have inter-cg vsites */
1567 if ((DOMAINDECOMP(cr) || bMolPBC) && vsite->n_intercg_vsite)
1569 /* This is wasting some CPU time as we now do this multiple times
1570 * per MD step. But how often do we have vsites with full pbc?
1572 pbc_null = set_pbc_dd(&pbc, ePBC, cr->dd, FALSE, box);
1579 if (DOMAINDECOMP(cr))
1581 dd_clear_f_vsites(cr->dd, f);
1583 else if (PARTDECOMP(cr) && vsite->vsitecomm != NULL)
1585 pd_clear_nonlocal_constructs(vsite->vsitecomm, f);
1588 if (vsite->nthreads == 1)
1590 spread_vsite_f_thread(vsite,
1592 VirCorr, vsite->tdata[0].dxdf,
1593 idef->iparams, idef->il,
1598 /* First spread the vsites that might depend on other vsites */
1599 spread_vsite_f_thread(vsite,
1601 VirCorr, vsite->tdata[vsite->nthreads].dxdf,
1603 vsite->tdata[vsite->nthreads].ilist,
1606 #pragma omp parallel num_threads(vsite->nthreads)
1611 thread = gmx_omp_get_thread_num();
1613 if (thread == 0 || fshift == NULL)
1621 fshift_t = vsite->tdata[thread].fshift;
1623 for (i = 0; i < SHIFTS; i++)
1625 clear_rvec(fshift_t[i]);
1629 spread_vsite_f_thread(vsite,
1631 VirCorr, vsite->tdata[thread].dxdf,
1633 vsite->tdata[thread].ilist,
1641 for (th = 1; th < vsite->nthreads; th++)
1643 for (i = 0; i < SHIFTS; i++)
1645 rvec_inc(fshift[i], vsite->tdata[th].fshift[i]);
1655 for (th = 0; th < (vsite->nthreads == 1 ? 1 : vsite->nthreads+1); th++)
1657 for (i = 0; i < DIM; i++)
1659 for (j = 0; j < DIM; j++)
1661 vir[i][j] += -0.5*vsite->tdata[th].dxdf[i][j];
1667 if (DOMAINDECOMP(cr))
1669 dd_move_f_vsites(cr->dd, f, fshift);
1671 else if (vsite->bPDvsitecomm)
1673 /* We only move forces here, and they are independent of shifts */
1674 move_construct_f(vsite->vsitecomm, f, cr);
1677 inc_nrnb(nrnb, eNR_VSITE2, vsite_count(idef->il, F_VSITE2));
1678 inc_nrnb(nrnb, eNR_VSITE3, vsite_count(idef->il, F_VSITE3));
1679 inc_nrnb(nrnb, eNR_VSITE3FD, vsite_count(idef->il, F_VSITE3FD));
1680 inc_nrnb(nrnb, eNR_VSITE3FAD, vsite_count(idef->il, F_VSITE3FAD));
1681 inc_nrnb(nrnb, eNR_VSITE3OUT, vsite_count(idef->il, F_VSITE3OUT));
1682 inc_nrnb(nrnb, eNR_VSITE4FD, vsite_count(idef->il, F_VSITE4FD));
1683 inc_nrnb(nrnb, eNR_VSITE4FDN, vsite_count(idef->il, F_VSITE4FDN));
1684 inc_nrnb(nrnb, eNR_VSITEN, vsite_count(idef->il, F_VSITEN));
1687 static int *atom2cg(t_block *cgs)
1691 snew(a2cg, cgs->index[cgs->nr]);
1692 for (cg = 0; cg < cgs->nr; cg++)
1694 for (i = cgs->index[cg]; i < cgs->index[cg+1]; i++)
/*
 * count_intercg_vsite: count, over the whole topology, the vsites whose
 * constructing atoms are not all inside the vsite's own charge group.
 * Each such vsite in a molecule type is counted once per molecule
 * instance (molb->nmol).  Also reports through *bHaveChargeGroups
 * whether any molecule type actually uses charge groups (i.e. has
 * fewer charge groups than atoms).
 * NOTE(review): this listing elides lines (nral/ia/cg assignments, the
 * loop-break after a match, and the per-moltype sfree(a2cg)) — confirm
 * against the full source.
 */
1703 static int count_intercg_vsite(gmx_mtop_t *mtop,
1704 gmx_bool *bHaveChargeGroups)
1706 int mb, mt, ftype, nral, i, cg, a;
1707 gmx_molblock_t *molb;
1708 gmx_moltype_t *molt;
1712 int n_intercg_vsite;
1714 *bHaveChargeGroups = FALSE;
1716 n_intercg_vsite = 0;
/* loop over all molecule blocks in the system topology */
1717 for (mb = 0; mb < mtop->nmolblock; mb++)
1719 molb = &mtop->molblock[mb];
1720 molt = &mtop->moltype[molb->type];
/* fewer charge groups than atoms => real (multi-atom) charge groups */
1722 if (molt->cgs.nr < molt->atoms.nr)
1724 *bHaveChargeGroups = TRUE;
/* atom -> charge group lookup for this molecule type */
1727 a2cg = atom2cg(&molt->cgs);
1728 for (ftype = 0; ftype < F_NRE; ftype++)
1730 if (interaction_function[ftype].flags & IF_VSITE)
1733 il = &molt->ilist[ftype];
/* iatoms layout per vsite: 1 type entry + nral atom entries */
1735 for (i = 0; i < il->nr; i += 1+nral)
/* compare each constructing atom's cg with the reference cg
 * (cg and ia are set in elided lines — presumably ia points at
 * the current entry and cg is the first atom's cg; verify) */
1738 for (a = 1; a < nral; a++)
1740 if (a2cg[ia[1+a]] != cg)
/* inter-cg vsite: count one per molecule instance */
1742 n_intercg_vsite += molb->nmol;
1752 return n_intercg_vsite;
/*
 * get_vsite_pbc: for every vsite interaction type (F_VSITE2 .. F_VSITEN)
 * build an array with one entry per vsite encoding how PBC must be
 * handled when constructing that vsite:
 *   -2  : vsite and all constructing atoms share one charge group,
 *         no pbc treatment required;
 *   -1  : use full pbc for this vsite;
 *   >=0 : atom index whose pbc status this vsite should follow (this
 *         can be the vsite's own index for the first processed atom of
 *         a vsite-only charge group).
 * Exactly one of atom/md is expected non-NULL to identify which
 * particles are vsites.  Returns a freshly allocated array of arrays,
 * indexed by [ftype - F_VSITE2].
 * NOTE(review): this listing elides lines (braces, the setup of
 * vsi/vsite/cg_v/ia, the pbc_set[] stores, i-increment and the final
 * return) — confirm details against the full source.
 */
1755 static int **get_vsite_pbc(t_iparams *iparams, t_ilist *ilist,
1756 t_atom *atom, t_mdatoms *md,
1757 t_block *cgs, int *a2cg)
1759 int ftype, nral, i, j, vsi, vsite, cg_v, cg_c, a, nc3 = 0;
1762 int **vsite_pbc, *vsite_pbc_f;
1764 gmx_bool bViteOnlyCG_and_FirstAtom;
1766 /* Make an array that tells if the pbc of an atom is set */
1767 snew(pbc_set, cgs->index[cgs->nr]);
1768 /* PBC is set for all non vsites */
1769 for (a = 0; a < cgs->index[cgs->nr]; a++)
/* particle type comes from whichever of atom/md was passed non-NULL */
1771 if ((atom && atom[a].ptype != eptVSite) ||
1772 (md && md->ptype[a] != eptVSite))
/* one per-vsite pbc array slot for each vsite function type */
1778 snew(vsite_pbc, F_VSITEN-F_VSITE2+1);
1780 for (ftype = 0; ftype < F_NRE; ftype++)
1782 if (interaction_function[ftype].flags & IF_VSITE)
/* il->nr/(1+nral) = number of vsites of this function type */
1788 snew(vsite_pbc[ftype-F_VSITE2], il->nr/(1+nral));
1789 vsite_pbc_f = vsite_pbc[ftype-F_VSITE2];
1797 /* A value of -2 signals that this vsite and its constructing
1798 * atoms are all within the same cg, so no pbc is required.
1800 vsite_pbc_f[vsi] = -2;
1801 /* Check if constructing atoms are outside the vsite's cg */
1803 if (ftype == F_VSITEN)
/* F_VSITEN: the number of constructing atoms is stored in the
 * interaction parameters; entries come in groups of 3 iatoms */
1805 nc3 = 3*iparams[ia[i]].vsiten.n;
1806 for (j = 0; j < nc3; j += 3)
1808 if (a2cg[ia[i+j+2]] != cg_v)
1810 vsite_pbc_f[vsi] = -1;
/* all other vsite types: nral constructing atoms follow the vsite */
1816 for (a = 1; a < nral; a++)
1818 if (a2cg[ia[i+1+a]] != cg_v)
1820 vsite_pbc_f[vsi] = -1;
/* a constructing atom left the vsite's cg: decide whether we can
 * follow another atom's pbc instead of using full pbc */
1824 if (vsite_pbc_f[vsi] == -1)
1826 /* Check if this is the first processed atom of a vsite only cg */
1827 bViteOnlyCG_and_FirstAtom = TRUE;
1828 for (a = cgs->index[cg_v]; a < cgs->index[cg_v+1]; a++)
1830 /* Non-vsites already have pbc set, so simply check for pbc_set */
1833 bViteOnlyCG_and_FirstAtom = FALSE;
1837 if (bViteOnlyCG_and_FirstAtom)
1839 /* First processed atom of a vsite only charge group.
1840 * The pbc of the input coordinates to construct_vsites
1841 * should be preserved.
1843 vsite_pbc_f[vsi] = vsite;
1845 else if (cg_v != a2cg[ia[1+i+1]])
1847 /* This vsite has a different charge group index
1848 * than its first constructing atom
1849 * and the charge group has more than one atom,
1850 * search for the first normal particle
1851 * or vsite that already had its pbc defined.
1852 * If nothing is found, use full pbc for this vsite.
1854 for (a = cgs->index[cg_v]; a < cgs->index[cg_v+1]; a++)
1856 if (a != vsite && pbc_set[a])
1858 vsite_pbc_f[vsi] = a;
1861 fprintf(debug, "vsite %d match pbc with atom %d\n",
1869 fprintf(debug, "vsite atom %d cg %d - %d pbc atom %d\n",
1870 vsite+1, cgs->index[cg_v]+1, cgs->index[cg_v+1],
1871 vsite_pbc_f[vsi]+1);
1875 if (ftype == F_VSITEN)
1877 /* The other entries in vsite_pbc_f are not used for center vsites */
1885 /* This vsite now has its pbc defined */
/*
 * init_vsite: allocate and initialize the global vsite bookkeeping
 * struct.  Counts the vsites in the topology, counts inter-charge-group
 * vsites, builds per-moleculetype pbc tables when charge groups and
 * domain decomposition are in use, and sets up the per-thread work
 * structures for OpenMP vsite construction/spreading.
 * NOTE(review): the allocation of the returned struct, the early return
 * for nvsite==0, the sfree(a2cg) and the final return are in elided
 * lines of this listing — confirm against the full source.
 */
1897 gmx_vsite_t *init_vsite(gmx_mtop_t *mtop, t_commrec *cr,
1898 gmx_bool bSerial_NoPBC)
1904 gmx_moltype_t *molt;
1907 /* check if there are vsites */
1909 for (i = 0; i < F_NRE; i++)
1911 if (interaction_function[i].flags & IF_VSITE)
1913 nvsite += gmx_mtop_ftype_count(mtop, i);
/* how many vsites have constructing atoms outside their own charge
 * group, and whether the topology uses charge groups at all */
1924 vsite->n_intercg_vsite = count_intercg_vsite(mtop,
1925 &vsite->bHaveChargeGroups);
1927 /* If we don't have charge groups, the vsite follows its own pbc */
1928 if (!bSerial_NoPBC &&
1929 vsite->bHaveChargeGroups &&
1930 vsite->n_intercg_vsite > 0 && DOMAINDECOMP(cr))
/* charge groups + DD: precompute a pbc table per molecule type */
1932 vsite->nvsite_pbc_molt = mtop->nmoltype;
1933 snew(vsite->vsite_pbc_molt, vsite->nvsite_pbc_molt);
1934 for (mt = 0; mt < mtop->nmoltype; mt++)
1936 molt = &mtop->moltype[mt];
1937 /* Make an atom to charge group index */
1938 a2cg = atom2cg(&molt->cgs);
/* t_atom array supplied, mdatoms NULL: global topology variant */
1939 vsite->vsite_pbc_molt[mt] = get_vsite_pbc(mtop->ffparams.iparams,
1941 molt->atoms.atom, NULL,
/* local (per-domain) pbc arrays; filled later (see set_vsite_top) */
1946 snew(vsite->vsite_pbc_loc_nalloc, F_VSITEN-F_VSITE2+1);
1947 snew(vsite->vsite_pbc_loc, F_VSITEN-F_VSITE2+1);
/* default to serial; otherwise ask the OpenMP thread module */
1952 vsite->nthreads = 1;
1956 vsite->nthreads = gmx_omp_nthreads_get(emntVSITE);
1960 /* We need one extra thread data structure for the overlap vsites */
1961 snew(vsite->tdata, vsite->nthreads+1);
/* thread-index array is allocated on demand in split_vsites_over_threads */
1964 vsite->th_ind = NULL;
1965 vsite->th_ind_nalloc = 0;
/*
 * prepare_vsite_thread: ensure the per-thread ilist copies in vsite_th
 * can hold all vsite interactions present in ilist, growing the iatoms
 * buffers when needed, and reset their entry counts to zero so the
 * distribution pass can append entries from scratch.
 */
1970 static void prepare_vsite_thread(const t_ilist *ilist,
1971 gmx_vsite_thread_t *vsite_th)
1975 for (ftype = 0; ftype < F_NRE; ftype++)
1977 if (interaction_function[ftype].flags & IF_VSITE)
/* grow with over-allocation to limit the number of reallocations */
1979 if (ilist[ftype].nr > vsite_th->ilist[ftype].nalloc)
1981 vsite_th->ilist[ftype].nalloc = over_alloc_large(ilist[ftype].nr);
1982 srenew(vsite_th->ilist[ftype].iatoms, vsite_th->ilist[ftype].nalloc);
/* start empty; entries are copied in by split_vsites_over_threads */
1985 vsite_th->ilist[ftype].nr = 0;
/*
 * split_vsites_over_threads: distribute the vsite interactions in ilist
 * over the OpenMP threads.  Atoms are partitioned into uniform index
 * ranges of natperthread; a vsite whose constructing atoms (or
 * constructing vsites) all fall in one thread's range is assigned to
 * that thread, otherwise it goes into the extra "overlap" batch
 * (index vsite->nthreads) that the master thread handles serially.
 * NOTE(review): this listing elides lines (braces, the th_ind stores,
 * the per-type 'inc' setup for F_VSITEN, the i advance, and the early
 * return for one thread) — confirm against the full source.
 */
1990 void split_vsites_over_threads(const t_ilist *ilist,
1991 const t_mdatoms *mdatoms,
1992 gmx_bool bLimitRange,
1996 int vsite_atom_range, natperthread;
2001 int nral1, inc, i, j;
/* nothing to distribute with a single thread */
2003 if (vsite->nthreads == 1)
/* size/reset each thread's private ilist copies in parallel */
2009 #pragma omp parallel for num_threads(vsite->nthreads) schedule(static)
2010 for (th = 0; th < vsite->nthreads; th++)
2012 prepare_vsite_thread(ilist, &vsite->tdata[th]);
2014 /* Master thread does the (potential) overlap vsites */
2015 prepare_vsite_thread(ilist, &vsite->tdata[vsite->nthreads]);
2017 /* The current way of distributing the vsites over threads is primitive.
2018 * We divide the atom range 0 - natoms_in_vsite uniformly over threads,
2019 * without taking into account how the vsites are distributed.
2020 * Without domain decomposition we have bLimitRange=TRUE and we at least
2021 * tighten the upper bound of the range (useful for common systems
2022 * such as a vsite-protein in 3-site water).
/* find the highest atom index referenced by any vsite interaction */
2026 vsite_atom_range = -1;
2027 for (ftype = 0; ftype < F_NRE; ftype++)
2029 if ((interaction_function[ftype].flags & IF_VSITE) &&
2032 nral1 = 1 + NRAL(ftype);
2033 iat = ilist[ftype].iatoms;
2034 for (i = 0; i < ilist[ftype].nr; i += nral1)
2036 for (j = i+1; j < i+nral1; j++)
2038 vsite_atom_range = max(vsite_atom_range, iat[j]);
/* without range limiting, simply cover all home atoms */
2047 vsite_atom_range = mdatoms->homenr;
/* ceiling division: uniform atom count per thread */
2049 natperthread = (vsite_atom_range + vsite->nthreads - 1)/vsite->nthreads;
2053 fprintf(debug, "virtual site thread dist: natoms %d, range %d, natperthread %d\n", mdatoms->nr, vsite_atom_range, natperthread);
2056 /* To simplify the vsite assignment, we make an index which tells us
2057 * to which thread particles, both non-vsites and vsites, are assigned.
2059 if (mdatoms->nr > vsite->th_ind_nalloc)
2061 vsite->th_ind_nalloc = over_alloc_large(mdatoms->nr);
2062 srenew(vsite->th_ind, vsite->th_ind_nalloc);
2064 th_ind = vsite->th_ind;
2066 for (i = 0; i < mdatoms->nr; i++)
2068 if (mdatoms->ptype[i] == eptVSite)
2070 /* vsites are not assigned to a thread yet */
2075 /* assign non-vsite particles to thread th */
/* advance to the next thread at each range boundary */
2078 if (i == (th + 1)*natperthread && th < vsite->nthreads)
2084 for (ftype = 0; ftype < F_NRE; ftype++)
2086 if ((interaction_function[ftype].flags & IF_VSITE) &&
2089 nral1 = 1 + NRAL(ftype);
2091 iat = ilist[ftype].iatoms;
2092 for (i = 0; i < ilist[ftype].nr; )
/* tentative thread: range containing the vsite's own atom index */
2094 th = iat[1+i]/natperthread;
2095 /* We would like to assign this vsite to thread th,
2096 * but it might depend on atoms outside the atom range of th
2097 * or on another vsite not assigned to thread th.
2099 if (ftype != F_VSITEN)
2101 for (j = i+2; j < i+nral1; j++)
2103 if (th_ind[iat[j]] != th)
2105 /* Some constructing atoms are not assigned to
2106 * thread th, move this vsite to a separate batch.
2108 th = vsite->nthreads;
/* F_VSITEN: entries come in groups of 3; 'inc' is set in an
 * elided line — presumably from the vsiten parameters; verify */
2115 for (j = i+2; j < i+inc; j += 3)
2117 if (th_ind[iat[j]] != th)
2119 th = vsite->nthreads;
2123 /* Copy this vsite to the thread data struct of thread th */
2124 il_th = &vsite->tdata[th].ilist[ftype];
2125 for (j = i; j < i+inc; j++)
2127 il_th->iatoms[il_th->nr++] = iat[j];
2129 /* Update this vsite's thread index entry */
2130 th_ind[iat[1+i]] = th;
/* debug report: per-type distribution over threads (+ overlap batch) */
2139 for (ftype = 0; ftype < F_NRE; ftype++)
2141 if ((interaction_function[ftype].flags & IF_VSITE) &&
2142 ilist[ftype].nr > 0)
2144 fprintf(debug, "%-20s thread dist:",
2145 interaction_function[ftype].longname);
2146 for (th = 0; th < vsite->nthreads+1; th++)
2148 fprintf(debug, " %4d", vsite->tdata[th].ilist[ftype].nr);
2150 fprintf(debug, "\n");
2156 void set_vsite_top(gmx_vsite_t *vsite, gmx_localtop_t *top, t_mdatoms *md,
2161 if (vsite->n_intercg_vsite > 0)
2163 if (vsite->bHaveChargeGroups)
2165 /* Make an atom to charge group index */
2166 a2cg = atom2cg(&top->cgs);
2167 vsite->vsite_pbc_loc = get_vsite_pbc(top->idef.iparams,
2168 top->idef.il, NULL, md,
2175 snew(vsite->vsitecomm, 1);
2176 vsite->bPDvsitecomm =
2177 setup_parallel_vsites(&(top->idef), cr, vsite->vsitecomm);
2181 if (vsite->nthreads > 1)
2183 if (vsite->bHaveChargeGroups || PARTDECOMP(cr))
2185 gmx_incons("Can not use threads virtual sites combined with charge groups or particle decomposition");
2188 split_vsites_over_threads(top->idef.il, md, !DOMAINDECOMP(cr), vsite);