src/gromacs/mdlib/vsite.c

   1 /* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
   2  *
   3  *
   4  *                This source code is part of
   5  *
   6  *                 G   R   O   M   A   C   S
   7  *
   8  *          GROningen MAchine for Chemical Simulations
   9  *
  10  *                        VERSION 3.2.0
  11  * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
  12  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
  13  * Copyright (c) 2001-2004, The GROMACS development team,
  14  * check out http://www.gromacs.org for more information.
  15
  16  * This program is free software; you can redistribute it and/or
  17  * modify it under the terms of the GNU General Public License
  18  * as published by the Free Software Foundation; either version 2
  19  * of the License, or (at your option) any later version.
  20  *
  21  * If you want to redistribute modifications, please consider that
  22  * scientific software is very special. Version control is crucial -
  23  * bugs must be traceable. We will be happy to consider code for
  24  * inclusion in the official distribution, but derived work must not
  25  * be called official GROMACS. Details are found in the README & COPYING
  26  * files - if they are missing, get the official version at www.gromacs.org.
  27  *
  28  * To help us fund GROMACS development, we humbly ask that you cite
  29  * the papers on the package - you can find them in the top README file.
  30  *
  31  * For more info, check our website at http://www.gromacs.org
  32  *
  33  * And Hey:
  34  * GROwing Monsters And Cloning Shrimps
  35  */
  36 #ifdef HAVE_CONFIG_H
  37 #include <config.h>
  38 #endif
  39
  40 #include <stdio.h>
  41 #include "typedefs.h"
  42 #include "vsite.h"
  43 #include "macros.h"
  44 #include "smalloc.h"
  45 #include "nrnb.h"
  46 #include "vec.h"
  47 #include "mvdata.h"
  48 #include "network.h"
  49 #include "mshift.h"
  50 #include "pbc.h"
  51 #include "domdec.h"
  52 #include "partdec.h"
  53 #include "mtop_util.h"
  54 #include "gmx_omp_nthreads.h"
  55
  56 #include "gromacs/utility/gmxomp.h"
  57
  58 /* Routines to send/recieve coordinates and force
  59  * of constructing atoms.
  60  */
  61
  62 static void move_construct_x(t_comm_vsites *vsitecomm, rvec x[], t_commrec *cr)
  63 {
  64     rvec *sendbuf;
  65     rvec *recvbuf;
  66     int   i, ia;
  67
  68     sendbuf = vsitecomm->send_buf;
  69     recvbuf = vsitecomm->recv_buf;
  70
  71
  72     /* Prepare pulse left by copying to send buffer */
  73     for (i = 0; i < vsitecomm->left_export_nconstruct; i++)
  74     {
  75         ia = vsitecomm->left_export_construct[i];
  76         copy_rvec(x[ia], sendbuf[i]);
  77     }
  78
  79     /* Pulse coordinates left */
  80     gmx_tx_rx_real(cr, GMX_LEFT, (real *)sendbuf, 3*vsitecomm->left_export_nconstruct, GMX_RIGHT, (real *)recvbuf, 3*vsitecomm->right_import_nconstruct);
  81
  82     /* Copy from receive buffer to coordinate array */
  83     for (i = 0; i < vsitecomm->right_import_nconstruct; i++)
  84     {
  85         ia = vsitecomm->right_import_construct[i];
  86         copy_rvec(recvbuf[i], x[ia]);
  87     }
  88
  89     /* Prepare pulse right by copying to send buffer */
  90     for (i = 0; i < vsitecomm->right_export_nconstruct; i++)
  91     {
  92         ia = vsitecomm->right_export_construct[i];
  93         copy_rvec(x[ia], sendbuf[i]);
  94     }
  95
  96     /* Pulse coordinates right */
  97     gmx_tx_rx_real(cr, GMX_RIGHT, (real *)sendbuf, 3*vsitecomm->right_export_nconstruct, GMX_LEFT, (real *)recvbuf, 3*vsitecomm->left_import_nconstruct);
  98
  99     /* Copy from receive buffer to coordinate array */
 100     for (i = 0; i < vsitecomm->left_import_nconstruct; i++)
 101     {
 102         ia = vsitecomm->left_import_construct[i];
 103         copy_rvec(recvbuf[i], x[ia]);
 104     }
 105 }
 106
 107
 108 static void move_construct_f(t_comm_vsites *vsitecomm, rvec f[], t_commrec *cr)
 109 {
 110     rvec *sendbuf;
 111     rvec *recvbuf;
 112     int   i, ia;
 113
 114     sendbuf = vsitecomm->send_buf;
 115     recvbuf = vsitecomm->recv_buf;
 116
 117     /* Prepare pulse right by copying to send buffer */
 118     for (i = 0; i < vsitecomm->right_import_nconstruct; i++)
 119     {
 120         ia = vsitecomm->right_import_construct[i];
 121         copy_rvec(f[ia], sendbuf[i]);
 122         clear_rvec(f[ia]);     /* Zero it here after moving, just to simplify debug book-keeping... */
 123     }
 124
 125     /* Pulse forces right */
 126     gmx_tx_rx_real(cr, GMX_RIGHT, (real *)sendbuf, 3*vsitecomm->right_import_nconstruct, GMX_LEFT, (real *)recvbuf, 3*vsitecomm->left_export_nconstruct);
 127
 128     /* Copy from receive buffer to coordinate array */
 129     for (i = 0; i < vsitecomm->left_export_nconstruct; i++)
 130     {
 131         ia = vsitecomm->left_export_construct[i];
 132         rvec_inc(f[ia], recvbuf[i]);
 133     }
 134
 135     /* Prepare pulse left by copying to send buffer */
 136     for (i = 0; i < vsitecomm->left_import_nconstruct; i++)
 137     {
 138         ia = vsitecomm->left_import_construct[i];
 139         copy_rvec(f[ia], sendbuf[i]);
 140         clear_rvec(f[ia]);     /* Zero it here after moving, just to simplify debug book-keeping... */
 141     }
 142
 143     /* Pulse coordinates left */
 144     gmx_tx_rx_real(cr, GMX_LEFT, (real *)sendbuf, 3*vsitecomm->left_import_nconstruct, GMX_RIGHT, (real *)recvbuf, 3*vsitecomm->right_export_nconstruct);
 145
 146     /* Copy from receive buffer to coordinate array */
 147     for (i = 0; i < vsitecomm->right_export_nconstruct; i++)
 148     {
 149         ia = vsitecomm->right_export_construct[i];
 150         rvec_inc(f[ia], recvbuf[i]);
 151     }
 152
 153     /* All forces are now on the home processors */
 154 }
 155
 156
 157 static void
 158 pd_clear_nonlocal_constructs(t_comm_vsites *vsitecomm, rvec f[])
 159 {
 160     int i, ia;
 161
 162     for (i = 0; i < vsitecomm->left_import_nconstruct; i++)
 163     {
 164         ia = vsitecomm->left_import_construct[i];
 165         clear_rvec(f[ia]);
 166     }
 167     for (i = 0; i < vsitecomm->right_import_nconstruct; i++)
 168     {
 169         ia = vsitecomm->right_import_construct[i];
 170         clear_rvec(f[ia]);
 171     }
 172 }
 173
 174
 175
 176 static int pbc_rvec_sub(const t_pbc *pbc, const rvec xi, const rvec xj, rvec dx)
 177 {
 178     if (pbc)
 179     {
 180         return pbc_dx_aiuc(pbc, xi, xj, dx);
 181     }
 182     else
 183     {
 184         rvec_sub(xi, xj, dx);
 185         return CENTRAL;
 186     }
 187 }
 188
 189 /* Vsite construction routines */
 190
 191 static void constr_vsite2(rvec xi, rvec xj, rvec x, real a, t_pbc *pbc)
 192 {
 193     real b;
 194     rvec dx;
 195
 196     b = 1.0-a;
 197     /* 1 flop */
 198
 199     if (pbc)
 200     {
 201         pbc_dx_aiuc(pbc, xj, xi, dx);
 202         x[XX] = xi[XX] + a*dx[XX];
 203         x[YY] = xi[YY] + a*dx[YY];
 204         x[ZZ] = xi[ZZ] + a*dx[ZZ];
 205     }
 206     else
 207     {
 208         x[XX] = b*xi[XX] + a*xj[XX];
 209         x[YY] = b*xi[YY] + a*xj[YY];
 210         x[ZZ] = b*xi[ZZ] + a*xj[ZZ];
 211         /* 9 Flops */
 212     }
 213
 214     /* TOTAL: 10 flops */
 215 }
 216
 217 static void constr_vsite3(rvec xi, rvec xj, rvec xk, rvec x, real a, real b,
 218                           t_pbc *pbc)
 219 {
 220     real c;
 221     rvec dxj, dxk;
 222
 223     c = 1.0-a-b;
 224     /* 2 flops */
 225
 226     if (pbc)
 227     {
 228         pbc_dx_aiuc(pbc, xj, xi, dxj);
 229         pbc_dx_aiuc(pbc, xk, xi, dxk);
 230         x[XX] = xi[XX] + a*dxj[XX] + b*dxk[XX];
 231         x[YY] = xi[YY] + a*dxj[YY] + b*dxk[YY];
 232         x[ZZ] = xi[ZZ] + a*dxj[ZZ] + b*dxk[ZZ];
 233     }
 234     else
 235     {
 236         x[XX] = c*xi[XX] + a*xj[XX] + b*xk[XX];
 237         x[YY] = c*xi[YY] + a*xj[YY] + b*xk[YY];
 238         x[ZZ] = c*xi[ZZ] + a*xj[ZZ] + b*xk[ZZ];
 239         /* 15 Flops */
 240     }
 241
 242     /* TOTAL: 17 flops */
 243 }
 244
 245 static void constr_vsite3FD(rvec xi, rvec xj, rvec xk, rvec x, real a, real b,
 246                             t_pbc *pbc)
 247 {
 248     rvec xij, xjk, temp;
 249     real c;
 250
 251     pbc_rvec_sub(pbc, xj, xi, xij);
 252     pbc_rvec_sub(pbc, xk, xj, xjk);
 253     /* 6 flops */
 254
 255     /* temp goes from i to a point on the line jk */
 256     temp[XX] = xij[XX] + a*xjk[XX];
 257     temp[YY] = xij[YY] + a*xjk[YY];
 258     temp[ZZ] = xij[ZZ] + a*xjk[ZZ];
 259     /* 6 flops */
 260
 261     c = b*gmx_invsqrt(iprod(temp, temp));
 262     /* 6 + 10 flops */
 263
 264     x[XX] = xi[XX] + c*temp[XX];
 265     x[YY] = xi[YY] + c*temp[YY];
 266     x[ZZ] = xi[ZZ] + c*temp[ZZ];
 267     /* 6 Flops */
 268
 269     /* TOTAL: 34 flops */
 270 }
 271
 272 static void constr_vsite3FAD(rvec xi, rvec xj, rvec xk, rvec x, real a, real b, t_pbc *pbc)
 273 {
 274     rvec xij, xjk, xp;
 275     real a1, b1, c1, invdij;
 276
 277     pbc_rvec_sub(pbc, xj, xi, xij);
 278     pbc_rvec_sub(pbc, xk, xj, xjk);
 279     /* 6 flops */
 280
 281     invdij = gmx_invsqrt(iprod(xij, xij));
 282     c1     = invdij * invdij * iprod(xij, xjk);
 283     xp[XX] = xjk[XX] - c1*xij[XX];
 284     xp[YY] = xjk[YY] - c1*xij[YY];
 285     xp[ZZ] = xjk[ZZ] - c1*xij[ZZ];
 286     a1     = a*invdij;
 287     b1     = b*gmx_invsqrt(iprod(xp, xp));
 288     /* 45 */
 289
 290     x[XX] = xi[XX] + a1*xij[XX] + b1*xp[XX];
 291     x[YY] = xi[YY] + a1*xij[YY] + b1*xp[YY];
 292     x[ZZ] = xi[ZZ] + a1*xij[ZZ] + b1*xp[ZZ];
 293     /* 12 Flops */
 294
 295     /* TOTAL: 63 flops */
 296 }
 297
 298 static void constr_vsite3OUT(rvec xi, rvec xj, rvec xk, rvec x,
 299                              real a, real b, real c, t_pbc *pbc)
 300 {
 301     rvec xij, xik, temp;
 302
 303     pbc_rvec_sub(pbc, xj, xi, xij);
 304     pbc_rvec_sub(pbc, xk, xi, xik);
 305     cprod(xij, xik, temp);
 306     /* 15 Flops */
 307
 308     x[XX] = xi[XX] + a*xij[XX] + b*xik[XX] + c*temp[XX];
 309     x[YY] = xi[YY] + a*xij[YY] + b*xik[YY] + c*temp[YY];
 310     x[ZZ] = xi[ZZ] + a*xij[ZZ] + b*xik[ZZ] + c*temp[ZZ];
 311     /* 18 Flops */
 312
 313     /* TOTAL: 33 flops */
 314 }
 315
 316 static void constr_vsite4FD(rvec xi, rvec xj, rvec xk, rvec xl, rvec x,
 317                             real a, real b, real c, t_pbc *pbc)
 318 {
 319     rvec xij, xjk, xjl, temp;
 320     real d;
 321
 322     pbc_rvec_sub(pbc, xj, xi, xij);
 323     pbc_rvec_sub(pbc, xk, xj, xjk);
 324     pbc_rvec_sub(pbc, xl, xj, xjl);
 325     /* 9 flops */
 326
 327     /* temp goes from i to a point on the plane jkl */
 328     temp[XX] = xij[XX] + a*xjk[XX] + b*xjl[XX];
 329     temp[YY] = xij[YY] + a*xjk[YY] + b*xjl[YY];
 330     temp[ZZ] = xij[ZZ] + a*xjk[ZZ] + b*xjl[ZZ];
 331     /* 12 flops */
 332
 333     d = c*gmx_invsqrt(iprod(temp, temp));
 334     /* 6 + 10 flops */
 335
 336     x[XX] = xi[XX] + d*temp[XX];
 337     x[YY] = xi[YY] + d*temp[YY];
 338     x[ZZ] = xi[ZZ] + d*temp[ZZ];
 339     /* 6 Flops */
 340
 341     /* TOTAL: 43 flops */
 342 }
 343
 344
 345 static void constr_vsite4FDN(rvec xi, rvec xj, rvec xk, rvec xl, rvec x,
 346                              real a, real b, real c, t_pbc *pbc)
 347 {
 348     rvec xij, xik, xil, ra, rb, rja, rjb, rm;
 349     real d;
 350
 351     pbc_rvec_sub(pbc, xj, xi, xij);
 352     pbc_rvec_sub(pbc, xk, xi, xik);
 353     pbc_rvec_sub(pbc, xl, xi, xil);
 354     /* 9 flops */
 355
 356     ra[XX] = a*xik[XX];
 357     ra[YY] = a*xik[YY];
 358     ra[ZZ] = a*xik[ZZ];
 359
 360     rb[XX] = b*xil[XX];
 361     rb[YY] = b*xil[YY];
 362     rb[ZZ] = b*xil[ZZ];
 363
 364     /* 6 flops */
 365
 366     rvec_sub(ra, xij, rja);
 367     rvec_sub(rb, xij, rjb);
 368     /* 6 flops */
 369
 370     cprod(rja, rjb, rm);
 371     /* 9 flops */
 372
 373     d = c*gmx_invsqrt(norm2(rm));
 374     /* 5+5+1 flops */
 375
 376     x[XX] = xi[XX] + d*rm[XX];
 377     x[YY] = xi[YY] + d*rm[YY];
 378     x[ZZ] = xi[ZZ] + d*rm[ZZ];
 379     /* 6 Flops */
 380
 381     /* TOTAL: 47 flops */
 382 }
 383
 384
 385 static int constr_vsiten(t_iatom *ia, t_iparams ip[],
 386                          rvec *x, t_pbc *pbc)
 387 {
 388     rvec xs, x1, dx;
 389     dvec dsum;
 390     int  n3, av, ai, i;
 391     real a;
 392
 393     n3 = 3*ip[ia[0]].vsiten.n;
 394     av = ia[1];
 395     ai = ia[2];
 396     copy_rvec(x[ai], x1);
 397     clear_dvec(dsum);
 398     for (i = 3; i < n3; i += 3)
 399     {
 400         ai = ia[i+2];
 401         a  = ip[ia[i]].vsiten.a;
 402         if (pbc)
 403         {
 404             pbc_dx_aiuc(pbc, x[ai], x1, dx);
 405         }
 406         else
 407         {
 408             rvec_sub(x[ai], x1, dx);
 409         }
 410         dsum[XX] += a*dx[XX];
 411         dsum[YY] += a*dx[YY];
 412         dsum[ZZ] += a*dx[ZZ];
 413         /* 9 Flops */
 414     }
 415
 416     x[av][XX] = x1[XX] + dsum[XX];
 417     x[av][YY] = x1[YY] + dsum[YY];
 418     x[av][ZZ] = x1[ZZ] + dsum[ZZ];
 419
 420     return n3;
 421 }
 422
 423
 424 void construct_vsites_thread(gmx_vsite_t *vsite,
 425                              rvec x[],
 426                              real dt, rvec *v,
 427                              t_iparams ip[], t_ilist ilist[],
 428                              t_pbc *pbc_null)
 429 {
 430     gmx_bool   bPBCAll;
 431     rvec       xpbc, xv, vv, dx;
 432     real       a1, b1, c1, inv_dt;
 433     int        i, inc, ii, nra, nr, tp, ftype;
 434     t_iatom    avsite, ai, aj, ak, al, pbc_atom;
 435     t_iatom   *ia;
 436     t_pbc     *pbc_null2;
 437     int       *vsite_pbc, ishift;
 438     rvec       reftmp, vtmp, rtmp;
 439
 440     if (v != NULL)
 441     {
 442         inv_dt = 1.0/dt;
 443     }
 444     else
 445     {
 446         inv_dt = 1.0;
 447     }
 448
 449     bPBCAll = (pbc_null != NULL && !vsite->bHaveChargeGroups);
 450
 451     pbc_null2 = NULL;
 452     vsite_pbc = NULL;
 453     for (ftype = 0; (ftype < F_NRE); ftype++)
 454     {
 455         if ((interaction_function[ftype].flags & IF_VSITE) &&
 456             ilist[ftype].nr > 0)
 457         {
 458             nra    = interaction_function[ftype].nratoms;
 459             inc    = 1 + nra;
 460             nr     = ilist[ftype].nr;
 461             ia     = ilist[ftype].iatoms;
 462
 463             if (bPBCAll)
 464             {
 465                 pbc_null2 = pbc_null;
 466             }
 467             else if (pbc_null != NULL)
 468             {
 469                 vsite_pbc = vsite->vsite_pbc_loc[ftype-F_VSITE2];
 470             }
 471
 472             for (i = 0; i < nr; )
 473             {
 474                 tp   = ia[0];
 475
 476                 /* The vsite and constructing atoms */
 477                 avsite = ia[1];
 478                 ai     = ia[2];
 479
 480                 /* Constants for constructing vsites */
 481                 a1   = ip[tp].vsite.a;
 482                 /* Check what kind of pbc we need to use */
 483                 if (bPBCAll)
 484                 {
 485                     /* No charge groups, vsite follows its own pbc */
 486                     pbc_atom = avsite;
 487                     copy_rvec(x[avsite], xpbc);
 488                 }
 489                 else if (vsite_pbc != NULL)
 490                 {
 491                     pbc_atom = vsite_pbc[i/(1+nra)];
 492                     if (pbc_atom > -2)
 493                     {
 494                         if (pbc_atom >= 0)
 495                         {
 496                             /* We need to copy the coordinates here,
 497                              * single for single atom cg's pbc_atom
 498                              * is the vsite itself.
 499                              */
 500                             copy_rvec(x[pbc_atom], xpbc);
 501                         }
 502                         pbc_null2 = pbc_null;
 503                     }
 504                     else
 505                     {
 506                         pbc_null2 = NULL;
 507                     }
 508                 }
 509                 else
 510                 {
 511                     pbc_atom = -2;
 512                 }
 513                 /* Copy the old position */
 514                 copy_rvec(x[avsite], xv);
 515
 516                 /* Construct the vsite depending on type */
 517                 switch (ftype)
 518                 {
 519                     case F_VSITE2:
 520                         aj = ia[3];
 521                         constr_vsite2(x[ai], x[aj], x[avsite], a1, pbc_null2);
 522                         break;
 523                     case F_VSITE3:
 524                         aj = ia[3];
 525                         ak = ia[4];
 526                         b1 = ip[tp].vsite.b;
 527                         constr_vsite3(x[ai], x[aj], x[ak], x[avsite], a1, b1, pbc_null2);
 528                         break;
 529                     case F_VSITE3FD:
 530                         aj = ia[3];
 531                         ak = ia[4];
 532                         b1 = ip[tp].vsite.b;
 533                         constr_vsite3FD(x[ai], x[aj], x[ak], x[avsite], a1, b1, pbc_null2);
 534                         break;
 535                     case F_VSITE3FAD:
 536                         aj = ia[3];
 537                         ak = ia[4];
 538                         b1 = ip[tp].vsite.b;
 539                         constr_vsite3FAD(x[ai], x[aj], x[ak], x[avsite], a1, b1, pbc_null2);
 540                         break;
 541                     case F_VSITE3OUT:
 542                         aj = ia[3];
 543                         ak = ia[4];
 544                         b1 = ip[tp].vsite.b;
 545                         c1 = ip[tp].vsite.c;
 546                         constr_vsite3OUT(x[ai], x[aj], x[ak], x[avsite], a1, b1, c1, pbc_null2);
 547                         break;
 548                     case F_VSITE4FD:
 549                         aj = ia[3];
 550                         ak = ia[4];
 551                         al = ia[5];
 552                         b1 = ip[tp].vsite.b;
 553                         c1 = ip[tp].vsite.c;
 554                         constr_vsite4FD(x[ai], x[aj], x[ak], x[al], x[avsite], a1, b1, c1,
 555                                         pbc_null2);
 556                         break;
 557                     case F_VSITE4FDN:
 558                         aj = ia[3];
 559                         ak = ia[4];
 560                         al = ia[5];
 561                         b1 = ip[tp].vsite.b;
 562                         c1 = ip[tp].vsite.c;
 563                         constr_vsite4FDN(x[ai], x[aj], x[ak], x[al], x[avsite], a1, b1, c1,
 564                                          pbc_null2);
 565                         break;
 566                     case F_VSITEN:
 567                         inc = constr_vsiten(ia, ip, x, pbc_null2);
 568                         break;
 569                     default:
 570                         gmx_fatal(FARGS, "No such vsite type %d in %s, line %d",
 571                                   ftype, __FILE__, __LINE__);
 572                 }
 573
 574                 if (pbc_atom >= 0)
 575                 {
 576                     /* Match the pbc of this vsite to the rest of its charge group */
 577                     ishift = pbc_dx_aiuc(pbc_null, x[avsite], xpbc, dx);
 578                     if (ishift != CENTRAL)
 579                     {
 580                         rvec_add(xpbc, dx, x[avsite]);
 581                     }
 582                 }
 583                 if (v != NULL)
 584                 {
 585                     /* Calculate velocity of vsite... */
 586                     rvec_sub(x[avsite], xv, vv);
 587                     svmul(inv_dt, vv, v[avsite]);
 588                 }
 589
 590                 /* Increment loop variables */
 591                 i  += inc;
 592                 ia += inc;
 593             }
 594         }
 595     }
 596 }
 597
 598 void construct_vsites(gmx_vsite_t *vsite,
 599                       rvec x[],
 600                       real dt, rvec *v,
 601                       t_iparams ip[], t_ilist ilist[],
 602                       int ePBC, gmx_bool bMolPBC, t_graph *graph,
 603                       t_commrec *cr, matrix box)
 604 {
 605     t_pbc     pbc, *pbc_null;
 606     gmx_bool  bDomDec;
 607     int       nthreads;
 608
 609     bDomDec = cr && DOMAINDECOMP(cr);
 610
 611     /* We only need to do pbc when we have inter-cg vsites */
 612     if (ePBC != epbcNONE && (bDomDec || bMolPBC) && vsite->n_intercg_vsite)
 613     {
 614         /* This is wasting some CPU time as we now do this multiple times
 615          * per MD step. But how often do we have vsites with full pbc?
 616          */
 617         pbc_null = set_pbc_dd(&pbc, ePBC, cr != NULL ? cr->dd : NULL, FALSE, box);
 618     }
 619     else
 620     {
 621         pbc_null = NULL;
 622     }
 623
 624     if (cr)
 625     {
 626         if (bDomDec)
 627         {
 628             dd_move_x_vsites(cr->dd, box, x);
 629         }
 630         else if (vsite->bPDvsitecomm)
 631         {
 632             /* I'm not sure whether the periodicity and shift are guaranteed
 633              * to be consistent between different nodes when running e.g. polymers
 634              * in parallel. In this special case we thus unshift/shift,
 635              * but only when necessary. This is to make sure the coordinates
 636              * we move don't end up a box away...
 637              */
 638             if (graph != NULL)
 639             {
 640                 unshift_self(graph, box, x);
 641             }
 642
 643             move_construct_x(vsite->vsitecomm, x, cr);
 644
 645             if (graph != NULL)
 646             {
 647                 shift_self(graph, box, x);
 648             }
 649         }
 650     }
 651
 652     if (vsite->nthreads == 1)
 653     {
 654         construct_vsites_thread(vsite,
 655                                 x, dt, v,
 656                                 ip, ilist,
 657                                 pbc_null);
 658     }
 659     else
 660     {
 661 #pragma omp parallel num_threads(vsite->nthreads)
 662         {
 663             construct_vsites_thread(vsite,
 664                                     x, dt, v,
 665                                     ip, vsite->tdata[gmx_omp_get_thread_num()].ilist,
 666                                     pbc_null);
 667         }
 668         /* Now we can construct the vsites that might depend on other vsites */
 669         construct_vsites_thread(vsite,
 670                                 x, dt, v,
 671                                 ip, vsite->tdata[vsite->nthreads].ilist,
 672                                 pbc_null);
 673     }
 674 }
 675
 676 static void spread_vsite2(t_iatom ia[], real a,
 677                           rvec x[], rvec f[], rvec fshift[],
 678                           t_pbc *pbc, t_graph *g)
 679 {
 680     rvec    fi, fj, dx;
 681     t_iatom av, ai, aj;
 682     ivec    di;
 683     real    b;
 684     int     siv, sij;
 685
 686     av = ia[1];
 687     ai = ia[2];
 688     aj = ia[3];
 689
 690     svmul(1-a, f[av], fi);
 691     svmul(  a, f[av], fj);
 692     /* 7 flop */
 693
 694     rvec_inc(f[ai], fi);
 695     rvec_inc(f[aj], fj);
 696     /* 6 Flops */
 697
 698     if (g)
 699     {
 700         ivec_sub(SHIFT_IVEC(g, ai), SHIFT_IVEC(g, av), di);
 701         siv = IVEC2IS(di);
 702         ivec_sub(SHIFT_IVEC(g, ai), SHIFT_IVEC(g, aj), di);
 703         sij = IVEC2IS(di);
 704     }
 705     else if (pbc)
 706     {
 707         siv = pbc_dx_aiuc(pbc, x[ai], x[av], dx);
 708         sij = pbc_dx_aiuc(pbc, x[ai], x[aj], dx);
 709     }
 710     else
 711     {
 712         siv = CENTRAL;
 713         sij = CENTRAL;
 714     }
 715
 716     if (fshift && (siv != CENTRAL || sij != CENTRAL))
 717     {
 718         rvec_inc(fshift[siv], f[av]);
 719         rvec_dec(fshift[CENTRAL], fi);
 720         rvec_dec(fshift[sij], fj);
 721     }
 722
 723     /* TOTAL: 13 flops */
 724 }
 725
 726 void construct_vsites_mtop(gmx_vsite_t *vsite,
 727                            gmx_mtop_t *mtop, rvec x[])
 728 {
 729     int             as, mb, mol;
 730     gmx_molblock_t *molb;
 731     gmx_moltype_t  *molt;
 732
 733     as = 0;
 734     for (mb = 0; mb < mtop->nmolblock; mb++)
 735     {
 736         molb = &mtop->molblock[mb];
 737         molt = &mtop->moltype[molb->type];
 738         for (mol = 0; mol < molb->nmol; mol++)
 739         {
 740             construct_vsites(vsite, x+as, 0.0, NULL,
 741                              mtop->ffparams.iparams, molt->ilist,
 742                              epbcNONE, TRUE, NULL, NULL, NULL);
 743             as += molt->atoms.nr;
 744         }
 745     }
 746 }
 747
 748 static void spread_vsite3(t_iatom ia[], real a, real b,
 749                           rvec x[], rvec f[], rvec fshift[],
 750                           t_pbc *pbc, t_graph *g)
 751 {
 752     rvec    fi, fj, fk, dx;
 753     atom_id av, ai, aj, ak;
 754     ivec    di;
 755     int     siv, sij, sik;
 756
 757     av = ia[1];
 758     ai = ia[2];
 759     aj = ia[3];
 760     ak = ia[4];
 761
 762     svmul(1-a-b, f[av], fi);
 763     svmul(    a, f[av], fj);
 764     svmul(    b, f[av], fk);
 765     /* 11 flops */
 766
 767     rvec_inc(f[ai], fi);
 768     rvec_inc(f[aj], fj);
 769     rvec_inc(f[ak], fk);
 770     /* 9 Flops */
 771
 772     if (g)
 773     {
 774         ivec_sub(SHIFT_IVEC(g, ai), SHIFT_IVEC(g, ia[1]), di);
 775         siv = IVEC2IS(di);
 776         ivec_sub(SHIFT_IVEC(g, ai), SHIFT_IVEC(g, aj), di);
 777         sij = IVEC2IS(di);
 778         ivec_sub(SHIFT_IVEC(g, ai), SHIFT_IVEC(g, ak), di);
 779         sik = IVEC2IS(di);
 780     }
 781     else if (pbc)
 782     {
 783         siv = pbc_dx_aiuc(pbc, x[ai], x[av], dx);
 784         sij = pbc_dx_aiuc(pbc, x[ai], x[aj], dx);
 785         sik = pbc_dx_aiuc(pbc, x[ai], x[ak], dx);
 786     }
 787     else
 788     {
 789         siv = CENTRAL;
 790         sij = CENTRAL;
 791         sik = CENTRAL;
 792     }
 793
 794     if (fshift && (siv != CENTRAL || sij != CENTRAL || sik != CENTRAL))
 795     {
 796         rvec_inc(fshift[siv], f[av]);
 797         rvec_dec(fshift[CENTRAL], fi);
 798         rvec_dec(fshift[sij], fj);
 799         rvec_dec(fshift[sik], fk);
 800     }
 801
 802     /* TOTAL: 20 flops */
 803 }
 804
 805 static void spread_vsite3FD(t_iatom ia[], real a, real b,
 806                             rvec x[], rvec f[], rvec fshift[],
 807                             gmx_bool VirCorr, matrix dxdf,
 808                             t_pbc *pbc, t_graph *g)
 809 {
 810     real    fx, fy, fz, c, invl, fproj, a1;
 811     rvec    xvi, xij, xjk, xix, fv, temp;
 812     t_iatom av, ai, aj, ak;
 813     int     svi, sji, skj, d;
 814     ivec    di;
 815
 816     av = ia[1];
 817     ai = ia[2];
 818     aj = ia[3];
 819     ak = ia[4];
 820     copy_rvec(f[av], fv);
 821
 822     sji = pbc_rvec_sub(pbc, x[aj], x[ai], xij);
 823     skj = pbc_rvec_sub(pbc, x[ak], x[aj], xjk);
 824     /* 6 flops */
 825
 826     /* xix goes from i to point x on the line jk */
 827     xix[XX] = xij[XX]+a*xjk[XX];
 828     xix[YY] = xij[YY]+a*xjk[YY];
 829     xix[ZZ] = xij[ZZ]+a*xjk[ZZ];
 830     /* 6 flops */
 831
 832     invl = gmx_invsqrt(iprod(xix, xix));
 833     c    = b*invl;
 834     /* 4 + ?10? flops */
 835
 836     fproj = iprod(xix, fv)*invl*invl; /* = (xix . f)/(xix . xix) */
 837
 838     temp[XX] = c*(fv[XX]-fproj*xix[XX]);
 839     temp[YY] = c*(fv[YY]-fproj*xix[YY]);
 840     temp[ZZ] = c*(fv[ZZ]-fproj*xix[ZZ]);
 841     /* 16 */
 842
 843     /* c is already calculated in constr_vsite3FD
 844        storing c somewhere will save 26 flops!     */
 845
 846     a1         = 1-a;
 847     f[ai][XX] += fv[XX] - temp[XX];
 848     f[ai][YY] += fv[YY] - temp[YY];
 849     f[ai][ZZ] += fv[ZZ] - temp[ZZ];
 850     f[aj][XX] += a1*temp[XX];
 851     f[aj][YY] += a1*temp[YY];
 852     f[aj][ZZ] += a1*temp[ZZ];
 853     f[ak][XX] += a*temp[XX];
 854     f[ak][YY] += a*temp[YY];
 855     f[ak][ZZ] += a*temp[ZZ];
 856     /* 19 Flops */
 857
 858     if (g)
 859     {
 860         ivec_sub(SHIFT_IVEC(g, ia[1]), SHIFT_IVEC(g, ai), di);
 861         svi = IVEC2IS(di);
 862         ivec_sub(SHIFT_IVEC(g, aj), SHIFT_IVEC(g, ai), di);
 863         sji = IVEC2IS(di);
 864         ivec_sub(SHIFT_IVEC(g, ak), SHIFT_IVEC(g, aj), di);
 865         skj = IVEC2IS(di);
 866     }
 867     else if (pbc)
 868     {
 869         svi = pbc_rvec_sub(pbc, x[av], x[ai], xvi);
 870     }
 871     else
 872     {
 873         svi = CENTRAL;
 874     }
 875
 876     if (fshift && (svi != CENTRAL || sji != CENTRAL || skj != CENTRAL))
 877     {
 878         rvec_dec(fshift[svi], fv);
 879         fshift[CENTRAL][XX] += fv[XX] - (1 + a)*temp[XX];
 880         fshift[CENTRAL][YY] += fv[YY] - (1 + a)*temp[YY];
 881         fshift[CENTRAL][ZZ] += fv[ZZ] - (1 + a)*temp[ZZ];
 882         fshift[    sji][XX] += temp[XX];
 883         fshift[    sji][YY] += temp[YY];
 884         fshift[    sji][ZZ] += temp[ZZ];
 885         fshift[    skj][XX] += a*temp[XX];
 886         fshift[    skj][YY] += a*temp[YY];
 887         fshift[    skj][ZZ] += a*temp[ZZ];
 888     }
 889
 890     if (VirCorr)
 891     {
 892         /* When VirCorr=TRUE, the virial for the current forces is not
 893          * calculated from the redistributed forces. This means that
 894          * the effect of non-linear virtual site constructions on the virial
 895          * needs to be added separately. This contribution can be calculated
 896          * in many ways, but the simplest and cheapest way is to use
 897          * the first constructing atom ai as a reference position in space:
 898          * subtract (xv-xi)*fv and add (xj-xi)*fj + (xk-xi)*fk.
 899          */
 900         rvec xiv;
 901         int  i, j;
 902
 903         pbc_rvec_sub(pbc, x[av], x[ai], xiv);
 904
 905         for (i = 0; i < DIM; i++)
 906         {
 907             for (j = 0; j < DIM; j++)
 908             {
 909                 /* As xix is a linear combination of j and k, use that here */
 910                 dxdf[i][j] += -xiv[i]*fv[j] + xix[i]*temp[j];
 911             }
 912         }
 913     }
 914
 915     /* TOTAL: 61 flops */
 916 }
 917
 918 static void spread_vsite3FAD(t_iatom ia[], real a, real b,
 919                              rvec x[], rvec f[], rvec fshift[],
 920                              gmx_bool VirCorr, matrix dxdf,
 921                              t_pbc *pbc, t_graph *g)
 922 {
 923     rvec    xvi, xij, xjk, xperp, Fpij, Fppp, fv, f1, f2, f3;
 924     real    a1, b1, c1, c2, invdij, invdij2, invdp, fproj;
 925     t_iatom av, ai, aj, ak;
 926     int     svi, sji, skj, d;
 927     ivec    di;
 928
 929     av = ia[1];
 930     ai = ia[2];
 931     aj = ia[3];
 932     ak = ia[4];
 933     copy_rvec(f[ia[1]], fv);
 934
 935     sji = pbc_rvec_sub(pbc, x[aj], x[ai], xij);
 936     skj = pbc_rvec_sub(pbc, x[ak], x[aj], xjk);
 937     /* 6 flops */
 938
 939     invdij    = gmx_invsqrt(iprod(xij, xij));
 940     invdij2   = invdij * invdij;
 941     c1        = iprod(xij, xjk) * invdij2;
 942     xperp[XX] = xjk[XX] - c1*xij[XX];
 943     xperp[YY] = xjk[YY] - c1*xij[YY];
 944     xperp[ZZ] = xjk[ZZ] - c1*xij[ZZ];
 945     /* xperp in plane ijk, perp. to ij */
 946     invdp = gmx_invsqrt(iprod(xperp, xperp));
 947     a1    = a*invdij;
 948     b1    = b*invdp;
 949     /* 45 flops */
 950
 951     /* a1, b1 and c1 are already calculated in constr_vsite3FAD
 952        storing them somewhere will save 45 flops!     */
 953
 954     fproj = iprod(xij, fv)*invdij2;
 955     svmul(fproj,                      xij,  Fpij);    /* proj. f on xij */
 956     svmul(iprod(xperp, fv)*invdp*invdp, xperp, Fppp); /* proj. f on xperp */
 957     svmul(b1*fproj,                   xperp, f3);
 958     /* 23 flops */
 959
 960     rvec_sub(fv, Fpij, f1); /* f1 = f - Fpij */
 961     rvec_sub(f1, Fppp, f2); /* f2 = f - Fpij - Fppp */
 962     for (d = 0; (d < DIM); d++)
 963     {
 964         f1[d] *= a1;
 965         f2[d] *= b1;
 966     }
 967     /* 12 flops */
 968
 969     c2         = 1+c1;
 970     f[ai][XX] += fv[XX] - f1[XX] + c1*f2[XX] + f3[XX];
 971     f[ai][YY] += fv[YY] - f1[YY] + c1*f2[YY] + f3[YY];
 972     f[ai][ZZ] += fv[ZZ] - f1[ZZ] + c1*f2[ZZ] + f3[ZZ];
 973     f[aj][XX] +=          f1[XX] - c2*f2[XX] - f3[XX];
 974     f[aj][YY] +=          f1[YY] - c2*f2[YY] - f3[YY];
 975     f[aj][ZZ] +=          f1[ZZ] - c2*f2[ZZ] - f3[ZZ];
 976     f[ak][XX] +=                      f2[XX];
 977     f[ak][YY] +=                      f2[YY];
 978     f[ak][ZZ] +=                      f2[ZZ];
 979     /* 30 Flops */
 980
 981     if (g)
 982     {
 983         ivec_sub(SHIFT_IVEC(g, ia[1]), SHIFT_IVEC(g, ai), di);
 984         svi = IVEC2IS(di);
 985         ivec_sub(SHIFT_IVEC(g, aj), SHIFT_IVEC(g, ai), di);
 986         sji = IVEC2IS(di);
 987         ivec_sub(SHIFT_IVEC(g, ak), SHIFT_IVEC(g, aj), di);
 988         skj = IVEC2IS(di);
 989     }
 990     else if (pbc)
 991     {
 992         svi = pbc_rvec_sub(pbc, x[av], x[ai], xvi);
 993     }
 994     else
 995     {
 996         svi = CENTRAL;
 997     }
 998
 999     if (fshift && (svi != CENTRAL || sji != CENTRAL || skj != CENTRAL))
1000     {
1001         rvec_dec(fshift[svi], fv);
1002         fshift[CENTRAL][XX] += fv[XX] - f1[XX] - (1-c1)*f2[XX] + f3[XX];
1003         fshift[CENTRAL][YY] += fv[YY] - f1[YY] - (1-c1)*f2[YY] + f3[YY];
1004         fshift[CENTRAL][ZZ] += fv[ZZ] - f1[ZZ] - (1-c1)*f2[ZZ] + f3[ZZ];
1005         fshift[    sji][XX] +=          f1[XX] -    c1 *f2[XX] - f3[XX];
1006         fshift[    sji][YY] +=          f1[YY] -    c1 *f2[YY] - f3[YY];
1007         fshift[    sji][ZZ] +=          f1[ZZ] -    c1 *f2[ZZ] - f3[ZZ];
1008         fshift[    skj][XX] +=                          f2[XX];
1009         fshift[    skj][YY] +=                          f2[YY];
1010         fshift[    skj][ZZ] +=                          f2[ZZ];
1011     }
1012
1013     if (VirCorr)
1014     {
1015         rvec xiv;
1016         int  i, j;
1017
1018         pbc_rvec_sub(pbc, x[av], x[ai], xiv);
1019
1020         for (i = 0; i < DIM; i++)
1021         {
1022             for (j = 0; j < DIM; j++)
1023             {
1024                 /* Note that xik=xij+xjk, so we have to add xij*f2 */
1025                 dxdf[i][j] +=
1026                     -xiv[i]*fv[j]
1027                     + xij[i]*(f1[j] + (1 - c2)*f2[j] - f3[j])
1028                     + xjk[i]*f2[j];
1029             }
1030         }
1031     }
1032
1033     /* TOTAL: 113 flops */
1034 }
1035
1036 static void spread_vsite3OUT(t_iatom ia[], real a, real b, real c,
1037                              rvec x[], rvec f[], rvec fshift[],
1038                              gmx_bool VirCorr, matrix dxdf,
1039                              t_pbc *pbc, t_graph *g)
1040 {
1041     rvec    xvi, xij, xik, fv, fj, fk;
1042     real    cfx, cfy, cfz;
1043     atom_id av, ai, aj, ak;
1044     ivec    di;
1045     int     svi, sji, ski;
1046
1047     av = ia[1];
1048     ai = ia[2];
1049     aj = ia[3];
1050     ak = ia[4];
1051
1052     sji = pbc_rvec_sub(pbc, x[aj], x[ai], xij);
1053     ski = pbc_rvec_sub(pbc, x[ak], x[ai], xik);
1054     /* 6 Flops */
1055
1056     copy_rvec(f[av], fv);
1057
1058     cfx = c*fv[XX];
1059     cfy = c*fv[YY];
1060     cfz = c*fv[ZZ];
1061     /* 3 Flops */
1062
1063     fj[XX] = a*fv[XX]     -  xik[ZZ]*cfy +  xik[YY]*cfz;
1064     fj[YY] =  xik[ZZ]*cfx + a*fv[YY]     -  xik[XX]*cfz;
1065     fj[ZZ] = -xik[YY]*cfx +  xik[XX]*cfy + a*fv[ZZ];
1066
1067     fk[XX] = b*fv[XX]     +  xij[ZZ]*cfy -  xij[YY]*cfz;
1068     fk[YY] = -xij[ZZ]*cfx + b*fv[YY]     +  xij[XX]*cfz;
1069     fk[ZZ] =  xij[YY]*cfx -  xij[XX]*cfy + b*fv[ZZ];
1070     /* 30 Flops */
1071
1072     f[ai][XX] += fv[XX] - fj[XX] - fk[XX];
1073     f[ai][YY] += fv[YY] - fj[YY] - fk[YY];
1074     f[ai][ZZ] += fv[ZZ] - fj[ZZ] - fk[ZZ];
1075     rvec_inc(f[aj], fj);
1076     rvec_inc(f[ak], fk);
1077     /* 15 Flops */
1078
1079     if (g)
1080     {
1081         ivec_sub(SHIFT_IVEC(g, ia[1]), SHIFT_IVEC(g, ai), di);
1082         svi = IVEC2IS(di);
1083         ivec_sub(SHIFT_IVEC(g, aj), SHIFT_IVEC(g, ai), di);
1084         sji = IVEC2IS(di);
1085         ivec_sub(SHIFT_IVEC(g, ak), SHIFT_IVEC(g, ai), di);
1086         ski = IVEC2IS(di);
1087     }
1088     else if (pbc)
1089     {
1090         svi = pbc_rvec_sub(pbc, x[av], x[ai], xvi);
1091     }
1092     else
1093     {
1094         svi = CENTRAL;
1095     }
1096
1097     if (fshift && (svi != CENTRAL || sji != CENTRAL || ski != CENTRAL))
1098     {
1099         rvec_dec(fshift[svi], fv);
1100         fshift[CENTRAL][XX] += fv[XX] - fj[XX] - fk[XX];
1101         fshift[CENTRAL][YY] += fv[YY] - fj[YY] - fk[YY];
1102         fshift[CENTRAL][ZZ] += fv[ZZ] - fj[ZZ] - fk[ZZ];
1103         rvec_inc(fshift[sji], fj);
1104         rvec_inc(fshift[ski], fk);
1105     }
1106
1107     if (VirCorr)
1108     {
1109         rvec xiv;
1110         int  i, j;
1111
1112         pbc_rvec_sub(pbc, x[av], x[ai], xiv);
1113
1114         for (i = 0; i < DIM; i++)
1115         {
1116             for (j = 0; j < DIM; j++)
1117             {
1118                 dxdf[i][j] += -xiv[i]*fv[j] + xij[i]*fj[j] + xik[i]*fk[j];
1119             }
1120         }
1121     }
1122
1123     /* TOTAL: 54 flops */
1124 }
1125
1126 static void spread_vsite4FD(t_iatom ia[], real a, real b, real c,
1127                             rvec x[], rvec f[], rvec fshift[],
1128                             gmx_bool VirCorr, matrix dxdf,
1129                             t_pbc *pbc, t_graph *g)
1130 {
1131     real    d, invl, fproj, a1;
1132     rvec    xvi, xij, xjk, xjl, xix, fv, temp;
1133     atom_id av, ai, aj, ak, al;
1134     ivec    di;
1135     int     svi, sji, skj, slj, m;
1136
1137     av = ia[1];
1138     ai = ia[2];
1139     aj = ia[3];
1140     ak = ia[4];
1141     al = ia[5];
1142
1143     sji = pbc_rvec_sub(pbc, x[aj], x[ai], xij);
1144     skj = pbc_rvec_sub(pbc, x[ak], x[aj], xjk);
1145     slj = pbc_rvec_sub(pbc, x[al], x[aj], xjl);
1146     /* 9 flops */
1147
1148     /* xix goes from i to point x on the plane jkl */
1149     for (m = 0; m < DIM; m++)
1150     {
1151         xix[m] = xij[m] + a*xjk[m] + b*xjl[m];
1152     }
1153     /* 12 flops */
1154
1155     invl = gmx_invsqrt(iprod(xix, xix));
1156     d    = c*invl;
1157     /* 4 + ?10? flops */
1158
1159     copy_rvec(f[av], fv);
1160
1161     fproj = iprod(xix, fv)*invl*invl; /* = (xix . f)/(xix . xix) */
1162
1163     for (m = 0; m < DIM; m++)
1164     {
1165         temp[m] = d*(fv[m] - fproj*xix[m]);
1166     }
1167     /* 16 */
1168
1169     /* c is already calculated in constr_vsite3FD
1170        storing c somewhere will save 35 flops!     */
1171
1172     a1 = 1 - a - b;
1173     for (m = 0; m < DIM; m++)
1174     {
1175         f[ai][m] += fv[m] - temp[m];
1176         f[aj][m] += a1*temp[m];
1177         f[ak][m] += a*temp[m];
1178         f[al][m] += b*temp[m];
1179     }
1180     /* 26 Flops */
1181
1182     if (g)
1183     {
1184         ivec_sub(SHIFT_IVEC(g, ia[1]), SHIFT_IVEC(g, ai), di);
1185         svi = IVEC2IS(di);
1186         ivec_sub(SHIFT_IVEC(g, aj), SHIFT_IVEC(g, ai), di);
1187         sji = IVEC2IS(di);
1188         ivec_sub(SHIFT_IVEC(g, ak), SHIFT_IVEC(g, aj), di);
1189         skj = IVEC2IS(di);
1190         ivec_sub(SHIFT_IVEC(g, al), SHIFT_IVEC(g, aj), di);
1191         slj = IVEC2IS(di);
1192     }
1193     else if (pbc)
1194     {
1195         svi = pbc_rvec_sub(pbc, x[av], x[ai], xvi);
1196     }
1197     else
1198     {
1199         svi = CENTRAL;
1200     }
1201
1202     if (fshift &&
1203         (svi != CENTRAL || sji != CENTRAL || skj != CENTRAL || slj != CENTRAL))
1204     {
1205         rvec_dec(fshift[svi], fv);
1206         for (m = 0; m < DIM; m++)
1207         {
1208             fshift[CENTRAL][m] += fv[m] - (1 + a + b)*temp[m];
1209             fshift[    sji][m] += temp[m];
1210             fshift[    skj][m] += a*temp[m];
1211             fshift[    slj][m] += b*temp[m];
1212         }
1213     }
1214
1215     if (VirCorr)
1216     {
1217         rvec xiv;
1218         int  i, j;
1219
1220         pbc_rvec_sub(pbc, x[av], x[ai], xiv);
1221
1222         for (i = 0; i < DIM; i++)
1223         {
1224             for (j = 0; j < DIM; j++)
1225             {
1226                 dxdf[i][j] += -xiv[i]*fv[j] + xix[i]*temp[j];
1227             }
1228         }
1229     }
1230
1231     /* TOTAL: 77 flops */
1232 }
1233
1234
1235 static void spread_vsite4FDN(t_iatom ia[], real a, real b, real c,
1236                              rvec x[], rvec f[], rvec fshift[],
1237                              gmx_bool VirCorr, matrix dxdf,
1238                              t_pbc *pbc, t_graph *g)
1239 {
1240     rvec xvi, xij, xik, xil, ra, rb, rja, rjb, rab, rm, rt;
1241     rvec fv, fj, fk, fl;
1242     real invrm, denom;
1243     real cfx, cfy, cfz;
1244     ivec di;
1245     int  av, ai, aj, ak, al;
1246     int  svi, sij, sik, sil;
1247
1248     /* DEBUG: check atom indices */
1249     av = ia[1];
1250     ai = ia[2];
1251     aj = ia[3];
1252     ak = ia[4];
1253     al = ia[5];
1254
1255     copy_rvec(f[av], fv);
1256
1257     sij = pbc_rvec_sub(pbc, x[aj], x[ai], xij);
1258     sik = pbc_rvec_sub(pbc, x[ak], x[ai], xik);
1259     sil = pbc_rvec_sub(pbc, x[al], x[ai], xil);
1260     /* 9 flops */
1261
1262     ra[XX] = a*xik[XX];
1263     ra[YY] = a*xik[YY];
1264     ra[ZZ] = a*xik[ZZ];
1265
1266     rb[XX] = b*xil[XX];
1267     rb[YY] = b*xil[YY];
1268     rb[ZZ] = b*xil[ZZ];
1269
1270     /* 6 flops */
1271
1272     rvec_sub(ra, xij, rja);
1273     rvec_sub(rb, xij, rjb);
1274     rvec_sub(rb, ra, rab);
1275     /* 9 flops */
1276
1277     cprod(rja, rjb, rm);
1278     /* 9 flops */
1279
1280     invrm = gmx_invsqrt(norm2(rm));
1281     denom = invrm*invrm;
1282     /* 5+5+2 flops */
1283
1284     cfx = c*invrm*fv[XX];
1285     cfy = c*invrm*fv[YY];
1286     cfz = c*invrm*fv[ZZ];
1287     /* 6 Flops */
1288
1289     cprod(rm, rab, rt);
1290     /* 9 flops */
1291
1292     rt[XX] *= denom;
1293     rt[YY] *= denom;
1294     rt[ZZ] *= denom;
1295     /* 3flops */
1296
1297     fj[XX] = (        -rm[XX]*rt[XX]) * cfx + ( rab[ZZ]-rm[YY]*rt[XX]) * cfy + (-rab[YY]-rm[ZZ]*rt[XX]) * cfz;
1298     fj[YY] = (-rab[ZZ]-rm[XX]*rt[YY]) * cfx + (        -rm[YY]*rt[YY]) * cfy + ( rab[XX]-rm[ZZ]*rt[YY]) * cfz;
1299     fj[ZZ] = ( rab[YY]-rm[XX]*rt[ZZ]) * cfx + (-rab[XX]-rm[YY]*rt[ZZ]) * cfy + (        -rm[ZZ]*rt[ZZ]) * cfz;
1300     /* 30 flops */
1301
1302     cprod(rjb, rm, rt);
1303     /* 9 flops */
1304
1305     rt[XX] *= denom*a;
1306     rt[YY] *= denom*a;
1307     rt[ZZ] *= denom*a;
1308     /* 3flops */
1309
1310     fk[XX] = (          -rm[XX]*rt[XX]) * cfx + (-a*rjb[ZZ]-rm[YY]*rt[XX]) * cfy + ( a*rjb[YY]-rm[ZZ]*rt[XX]) * cfz;
1311     fk[YY] = ( a*rjb[ZZ]-rm[XX]*rt[YY]) * cfx + (          -rm[YY]*rt[YY]) * cfy + (-a*rjb[XX]-rm[ZZ]*rt[YY]) * cfz;
1312     fk[ZZ] = (-a*rjb[YY]-rm[XX]*rt[ZZ]) * cfx + ( a*rjb[XX]-rm[YY]*rt[ZZ]) * cfy + (          -rm[ZZ]*rt[ZZ]) * cfz;
1313     /* 36 flops */
1314
1315     cprod(rm, rja, rt);
1316     /* 9 flops */
1317
1318     rt[XX] *= denom*b;
1319     rt[YY] *= denom*b;
1320     rt[ZZ] *= denom*b;
1321     /* 3flops */
1322
1323     fl[XX] = (          -rm[XX]*rt[XX]) * cfx + ( b*rja[ZZ]-rm[YY]*rt[XX]) * cfy + (-b*rja[YY]-rm[ZZ]*rt[XX]) * cfz;
1324     fl[YY] = (-b*rja[ZZ]-rm[XX]*rt[YY]) * cfx + (          -rm[YY]*rt[YY]) * cfy + ( b*rja[XX]-rm[ZZ]*rt[YY]) * cfz;
1325     fl[ZZ] = ( b*rja[YY]-rm[XX]*rt[ZZ]) * cfx + (-b*rja[XX]-rm[YY]*rt[ZZ]) * cfy + (          -rm[ZZ]*rt[ZZ]) * cfz;
1326     /* 36 flops */
1327
1328     f[ai][XX] += fv[XX] - fj[XX] - fk[XX] - fl[XX];
1329     f[ai][YY] += fv[YY] - fj[YY] - fk[YY] - fl[YY];
1330     f[ai][ZZ] += fv[ZZ] - fj[ZZ] - fk[ZZ] - fl[ZZ];
1331     rvec_inc(f[aj], fj);
1332     rvec_inc(f[ak], fk);
1333     rvec_inc(f[al], fl);
1334     /* 21 flops */
1335
1336     if (g)
1337     {
1338         ivec_sub(SHIFT_IVEC(g, av), SHIFT_IVEC(g, ai), di);
1339         svi = IVEC2IS(di);
1340         ivec_sub(SHIFT_IVEC(g, aj), SHIFT_IVEC(g, ai), di);
1341         sij = IVEC2IS(di);
1342         ivec_sub(SHIFT_IVEC(g, ak), SHIFT_IVEC(g, ai), di);
1343         sik = IVEC2IS(di);
1344         ivec_sub(SHIFT_IVEC(g, al), SHIFT_IVEC(g, ai), di);
1345         sil = IVEC2IS(di);
1346     }
1347     else if (pbc)
1348     {
1349         svi = pbc_rvec_sub(pbc, x[av], x[ai], xvi);
1350     }
1351     else
1352     {
1353         svi = CENTRAL;
1354     }
1355
1356     if (fshift && (svi != CENTRAL || sij != CENTRAL || sik != CENTRAL || sil != CENTRAL))
1357     {
1358         rvec_dec(fshift[svi], fv);
1359         fshift[CENTRAL][XX] += fv[XX] - fj[XX] - fk[XX] - fl[XX];
1360         fshift[CENTRAL][YY] += fv[YY] - fj[YY] - fk[YY] - fl[YY];
1361         fshift[CENTRAL][ZZ] += fv[ZZ] - fj[ZZ] - fk[ZZ] - fl[ZZ];
1362         rvec_inc(fshift[sij], fj);
1363         rvec_inc(fshift[sik], fk);
1364         rvec_inc(fshift[sil], fl);
1365     }
1366
1367     if (VirCorr)
1368     {
1369         rvec xiv;
1370         int  i, j;
1371
1372         pbc_rvec_sub(pbc, x[av], x[ai], xiv);
1373
1374         for (i = 0; i < DIM; i++)
1375         {
1376             for (j = 0; j < DIM; j++)
1377             {
1378                 dxdf[i][j] += -xiv[i]*fv[j] + xij[i]*fj[j] + xik[i]*fk[j] + xil[i]*fl[j];
1379             }
1380         }
1381     }
1382
1383     /* Total: 207 flops (Yuck!) */
1384 }
1385
1386
1387 static int spread_vsiten(t_iatom ia[], t_iparams ip[],
1388                          rvec x[], rvec f[], rvec fshift[],
1389                          t_pbc *pbc, t_graph *g)
1390 {
1391     rvec xv, dx, fi;
1392     int  n3, av, i, ai;
1393     real a;
1394     ivec di;
1395     int  siv;
1396
1397     n3 = 3*ip[ia[0]].vsiten.n;
1398     av = ia[1];
1399     copy_rvec(x[av], xv);
1400
1401     for (i = 0; i < n3; i += 3)
1402     {
1403         ai = ia[i+2];
1404         if (g)
1405         {
1406             ivec_sub(SHIFT_IVEC(g, ai), SHIFT_IVEC(g, av), di);
1407             siv = IVEC2IS(di);
1408         }
1409         else if (pbc)
1410         {
1411             siv = pbc_dx_aiuc(pbc, x[ai], xv, dx);
1412         }
1413         else
1414         {
1415             siv = CENTRAL;
1416         }
1417         a = ip[ia[i]].vsiten.a;
1418         svmul(a, f[av], fi);
1419         rvec_inc(f[ai], fi);
1420         if (fshift && siv != CENTRAL)
1421         {
1422             rvec_inc(fshift[siv], fi);
1423             rvec_dec(fshift[CENTRAL], fi);
1424         }
1425         /* 6 Flops */
1426     }
1427
1428     return n3;
1429 }
1430
1431
1432 static int vsite_count(const t_ilist *ilist, int ftype)
1433 {
1434     if (ftype == F_VSITEN)
1435     {
1436         return ilist[ftype].nr/3;
1437     }
1438     else
1439     {
1440         return ilist[ftype].nr/(1 + interaction_function[ftype].nratoms);
1441     }
1442 }
1443
1444 static void spread_vsite_f_thread(gmx_vsite_t *vsite,
1445                                   rvec x[], rvec f[], rvec *fshift,
1446                                   gmx_bool VirCorr, matrix dxdf,
1447                                   t_iparams ip[], t_ilist ilist[],
1448                                   t_graph *g, t_pbc *pbc_null)
1449 {
1450     gmx_bool   bPBCAll;
1451     real       a1, b1, c1;
1452     int        i, inc, m, nra, nr, tp, ftype;
1453     t_iatom   *ia;
1454     t_pbc     *pbc_null2;
1455     int       *vsite_pbc;
1456
1457     if (VirCorr)
1458     {
1459         clear_mat(dxdf);
1460     }
1461
1462     bPBCAll = (pbc_null != NULL && !vsite->bHaveChargeGroups);
1463
1464     /* this loop goes backwards to be able to build *
1465      * higher type vsites from lower types         */
1466     pbc_null2 = NULL;
1467     vsite_pbc = NULL;
1468     for (ftype = F_NRE-1; (ftype >= 0); ftype--)
1469     {
1470         if ((interaction_function[ftype].flags & IF_VSITE) &&
1471             ilist[ftype].nr > 0)
1472         {
1473             nra    = interaction_function[ftype].nratoms;
1474             inc    = 1 + nra;
1475             nr     = ilist[ftype].nr;
1476             ia     = ilist[ftype].iatoms;
1477
1478             if (bPBCAll)
1479             {
1480                 pbc_null2 = pbc_null;
1481             }
1482             else if (pbc_null != NULL)
1483             {
1484                 vsite_pbc = vsite->vsite_pbc_loc[ftype-F_VSITE2];
1485             }
1486
1487             for (i = 0; i < nr; )
1488             {
1489                 if (vsite_pbc != NULL)
1490                 {
1491                     if (vsite_pbc[i/(1+nra)] > -2)
1492                     {
1493                         pbc_null2 = pbc_null;
1494                     }
1495                     else
1496                     {
1497                         pbc_null2 = NULL;
1498                     }
1499                 }
1500
1501                 tp   = ia[0];
1502
1503                 /* Constants for constructing */
1504                 a1   = ip[tp].vsite.a;
1505                 /* Construct the vsite depending on type */
1506                 switch (ftype)
1507                 {
1508                     case F_VSITE2:
1509                         spread_vsite2(ia, a1, x, f, fshift, pbc_null2, g);
1510                         break;
1511                     case F_VSITE3:
1512                         b1 = ip[tp].vsite.b;
1513                         spread_vsite3(ia, a1, b1, x, f, fshift, pbc_null2, g);
1514                         break;
1515                     case F_VSITE3FD:
1516                         b1 = ip[tp].vsite.b;
1517                         spread_vsite3FD(ia, a1, b1, x, f, fshift, VirCorr, dxdf, pbc_null2, g);
1518                         break;
1519                     case F_VSITE3FAD:
1520                         b1 = ip[tp].vsite.b;
1521                         spread_vsite3FAD(ia, a1, b1, x, f, fshift, VirCorr, dxdf, pbc_null2, g);
1522                         break;
1523                     case F_VSITE3OUT:
1524                         b1 = ip[tp].vsite.b;
1525                         c1 = ip[tp].vsite.c;
1526                         spread_vsite3OUT(ia, a1, b1, c1, x, f, fshift, VirCorr, dxdf, pbc_null2, g);
1527                         break;
1528                     case F_VSITE4FD:
1529                         b1 = ip[tp].vsite.b;
1530                         c1 = ip[tp].vsite.c;
1531                         spread_vsite4FD(ia, a1, b1, c1, x, f, fshift, VirCorr, dxdf, pbc_null2, g);
1532                         break;
1533                     case F_VSITE4FDN:
1534                         b1 = ip[tp].vsite.b;
1535                         c1 = ip[tp].vsite.c;
1536                         spread_vsite4FDN(ia, a1, b1, c1, x, f, fshift, VirCorr, dxdf, pbc_null2, g);
1537                         break;
1538                     case F_VSITEN:
1539                         inc = spread_vsiten(ia, ip, x, f, fshift, pbc_null2, g);
1540                         break;
1541                     default:
1542                         gmx_fatal(FARGS, "No such vsite type %d in %s, line %d",
1543                                   ftype, __FILE__, __LINE__);
1544                 }
1545                 clear_rvec(f[ia[1]]);
1546
1547                 /* Increment loop variables */
1548                 i  += inc;
1549                 ia += inc;
1550             }
1551         }
1552     }
1553 }
1554
1555 void spread_vsite_f(gmx_vsite_t *vsite,
1556                     rvec x[], rvec f[], rvec *fshift,
1557                     gmx_bool VirCorr, matrix vir,
1558                     t_nrnb *nrnb, t_idef *idef,
1559                     int ePBC, gmx_bool bMolPBC, t_graph *g, matrix box,
1560                     t_commrec *cr)
1561 {
1562     t_pbc pbc, *pbc_null;
1563     int   th;
1564
1565     /* We only need to do pbc when we have inter-cg vsites */
1566     if ((DOMAINDECOMP(cr) || bMolPBC) && vsite->n_intercg_vsite)
1567     {
1568         /* This is wasting some CPU time as we now do this multiple times
1569          * per MD step. But how often do we have vsites with full pbc?
1570          */
1571         pbc_null = set_pbc_dd(&pbc, ePBC, cr->dd, FALSE, box);
1572     }
1573     else
1574     {
1575         pbc_null = NULL;
1576     }
1577
1578     if (DOMAINDECOMP(cr))
1579     {
1580         dd_clear_f_vsites(cr->dd, f);
1581     }
1582     else if (PARTDECOMP(cr) && vsite->vsitecomm != NULL)
1583     {
1584         pd_clear_nonlocal_constructs(vsite->vsitecomm, f);
1585     }
1586
1587     if (vsite->nthreads == 1)
1588     {
1589         spread_vsite_f_thread(vsite,
1590                               x, f, fshift,
1591                               VirCorr, vsite->tdata[0].dxdf,
1592                               idef->iparams, idef->il,
1593                               g, pbc_null);
1594     }
1595     else
1596     {
1597         /* First spread the vsites that might depend on other vsites */
1598         spread_vsite_f_thread(vsite,
1599                               x, f, fshift,
1600                               VirCorr, vsite->tdata[vsite->nthreads].dxdf,
1601                               idef->iparams,
1602                               vsite->tdata[vsite->nthreads].ilist,
1603                               g, pbc_null);
1604
1605 #pragma omp parallel num_threads(vsite->nthreads)
1606         {
1607             int   thread;
1608             rvec *fshift_t;
1609
1610             thread = gmx_omp_get_thread_num();
1611
1612             if (thread == 0 || fshift == NULL)
1613             {
1614                 fshift_t = fshift;
1615             }
1616             else
1617             {
1618                 int i;
1619
1620                 fshift_t = vsite->tdata[thread].fshift;
1621
1622                 for (i = 0; i < SHIFTS; i++)
1623                 {
1624                     clear_rvec(fshift_t[i]);
1625                 }
1626             }
1627
1628             spread_vsite_f_thread(vsite,
1629                                   x, f, fshift_t,
1630                                   VirCorr, vsite->tdata[thread].dxdf,
1631                                   idef->iparams,
1632                                   vsite->tdata[thread].ilist,
1633                                   g, pbc_null);
1634         }
1635
1636         if (fshift != NULL)
1637         {
1638             int i;
1639
1640             for (th = 1; th < vsite->nthreads; th++)
1641             {
1642                 for (i = 0; i < SHIFTS; i++)
1643                 {
1644                     rvec_inc(fshift[i], vsite->tdata[th].fshift[i]);
1645                 }
1646             }
1647         }
1648     }
1649
1650     if (VirCorr)
1651     {
1652         int i, j;
1653
1654         for (th = 0; th < (vsite->nthreads == 1 ? 1 : vsite->nthreads+1); th++)
1655         {
1656             for (i = 0; i < DIM; i++)
1657             {
1658                 for (j = 0; j < DIM; j++)
1659                 {
1660                     vir[i][j] += -0.5*vsite->tdata[th].dxdf[i][j];
1661                 }
1662             }
1663         }
1664     }
1665
1666     if (DOMAINDECOMP(cr))
1667     {
1668         dd_move_f_vsites(cr->dd, f, fshift);
1669     }
1670     else if (vsite->bPDvsitecomm)
1671     {
1672         /* We only move forces here, and they are independent of shifts */
1673         move_construct_f(vsite->vsitecomm, f, cr);
1674     }
1675
1676     inc_nrnb(nrnb, eNR_VSITE2,   vsite_count(idef->il, F_VSITE2));
1677     inc_nrnb(nrnb, eNR_VSITE3,   vsite_count(idef->il, F_VSITE3));
1678     inc_nrnb(nrnb, eNR_VSITE3FD, vsite_count(idef->il, F_VSITE3FD));
1679     inc_nrnb(nrnb, eNR_VSITE3FAD, vsite_count(idef->il, F_VSITE3FAD));
1680     inc_nrnb(nrnb, eNR_VSITE3OUT, vsite_count(idef->il, F_VSITE3OUT));
1681     inc_nrnb(nrnb, eNR_VSITE4FD, vsite_count(idef->il, F_VSITE4FD));
1682     inc_nrnb(nrnb, eNR_VSITE4FDN, vsite_count(idef->il, F_VSITE4FDN));
1683     inc_nrnb(nrnb, eNR_VSITEN,   vsite_count(idef->il, F_VSITEN));
1684 }
1685
1686 static int *atom2cg(t_block *cgs)
1687 {
1688     int *a2cg, cg, i;
1689
1690     snew(a2cg, cgs->index[cgs->nr]);
1691     for (cg = 0; cg < cgs->nr; cg++)
1692     {
1693         for (i = cgs->index[cg]; i < cgs->index[cg+1]; i++)
1694         {
1695             a2cg[i] = cg;
1696         }
1697     }
1698
1699     return a2cg;
1700 }
1701
1702 static int count_intercg_vsite(gmx_mtop_t *mtop,
1703                                gmx_bool   *bHaveChargeGroups)
1704 {
1705     int             mb, mt, ftype, nral, i, cg, a;
1706     gmx_molblock_t *molb;
1707     gmx_moltype_t  *molt;
1708     int            *a2cg;
1709     t_ilist        *il;
1710     t_iatom        *ia;
1711     int             n_intercg_vsite;
1712
1713     *bHaveChargeGroups = FALSE;
1714
1715     n_intercg_vsite = 0;
1716     for (mb = 0; mb < mtop->nmolblock; mb++)
1717     {
1718         molb = &mtop->molblock[mb];
1719         molt = &mtop->moltype[molb->type];
1720
1721         if (molt->cgs.nr < molt->atoms.nr)
1722         {
1723             *bHaveChargeGroups = TRUE;
1724         }
1725
1726         a2cg = atom2cg(&molt->cgs);
1727         for (ftype = 0; ftype < F_NRE; ftype++)
1728         {
1729             if (interaction_function[ftype].flags & IF_VSITE)
1730             {
1731                 nral = NRAL(ftype);
1732                 il   = &molt->ilist[ftype];
1733                 ia   = il->iatoms;
1734                 for (i = 0; i < il->nr; i += 1+nral)
1735                 {
1736                     cg = a2cg[ia[1+i]];
1737                     for (a = 1; a < nral; a++)
1738                     {
1739                         if (a2cg[ia[1+a]] != cg)
1740                         {
1741                             n_intercg_vsite += molb->nmol;
1742                             break;
1743                         }
1744                     }
1745                 }
1746             }
1747         }
1748         sfree(a2cg);
1749     }
1750
1751     return n_intercg_vsite;
1752 }
1753
1754 static int **get_vsite_pbc(t_iparams *iparams, t_ilist *ilist,
1755                            t_atom *atom, t_mdatoms *md,
1756                            t_block *cgs, int *a2cg)
1757 {
1758     int      ftype, nral, i, j, vsi, vsite, cg_v, cg_c, a, nc3 = 0;
1759     t_ilist *il;
1760     t_iatom *ia;
1761     int    **vsite_pbc, *vsite_pbc_f;
1762     char    *pbc_set;
1763     gmx_bool bViteOnlyCG_and_FirstAtom;
1764
1765     /* Make an array that tells if the pbc of an atom is set */
1766     snew(pbc_set, cgs->index[cgs->nr]);
1767     /* PBC is set for all non vsites */
1768     for (a = 0; a < cgs->index[cgs->nr]; a++)
1769     {
1770         if ((atom && atom[a].ptype != eptVSite) ||
1771             (md   && md->ptype[a]  != eptVSite))
1772         {
1773             pbc_set[a] = 1;
1774         }
1775     }
1776
1777     snew(vsite_pbc, F_VSITEN-F_VSITE2+1);
1778
1779     for (ftype = 0; ftype < F_NRE; ftype++)
1780     {
1781         if (interaction_function[ftype].flags & IF_VSITE)
1782         {
1783             nral = NRAL(ftype);
1784             il   = &ilist[ftype];
1785             ia   = il->iatoms;
1786
1787             snew(vsite_pbc[ftype-F_VSITE2], il->nr/(1+nral));
1788             vsite_pbc_f = vsite_pbc[ftype-F_VSITE2];
1789
1790             i = 0;
1791             while (i < il->nr)
1792             {
1793                 vsi   = i/(1+nral);
1794                 vsite = ia[i+1];
1795                 cg_v  = a2cg[vsite];
1796                 /* A value of -2 signals that this vsite and its contructing
1797                  * atoms are all within the same cg, so no pbc is required.
1798                  */
1799                 vsite_pbc_f[vsi] = -2;
1800                 /* Check if constructing atoms are outside the vsite's cg */
1801                 nc3 = 0;
1802                 if (ftype == F_VSITEN)
1803                 {
1804                     nc3 = 3*iparams[ia[i]].vsiten.n;
1805                     for (j = 0; j < nc3; j += 3)
1806                     {
1807                         if (a2cg[ia[i+j+2]] != cg_v)
1808                         {
1809                             vsite_pbc_f[vsi] = -1;
1810                         }
1811                     }
1812                 }
1813                 else
1814                 {
1815                     for (a = 1; a < nral; a++)
1816                     {
1817                         if (a2cg[ia[i+1+a]] != cg_v)
1818                         {
1819                             vsite_pbc_f[vsi] = -1;
1820                         }
1821                     }
1822                 }
1823                 if (vsite_pbc_f[vsi] == -1)
1824                 {
1825                     /* Check if this is the first processed atom of a vsite only cg */
1826                     bViteOnlyCG_and_FirstAtom = TRUE;
1827                     for (a = cgs->index[cg_v]; a < cgs->index[cg_v+1]; a++)
1828                     {
1829                         /* Non-vsites already have pbc set, so simply check for pbc_set */
1830                         if (pbc_set[a])
1831                         {
1832                             bViteOnlyCG_and_FirstAtom = FALSE;
1833                             break;
1834                         }
1835                     }
1836                     if (bViteOnlyCG_and_FirstAtom)
1837                     {
1838                         /* First processed atom of a vsite only charge group.
1839                          * The pbc of the input coordinates to construct_vsites
1840                          * should be preserved.
1841                          */
1842                         vsite_pbc_f[vsi] = vsite;
1843                     }
1844                     else if (cg_v != a2cg[ia[1+i+1]])
1845                     {
1846                         /* This vsite has a different charge group index
1847                          * than it's first constructing atom
1848                          * and the charge group has more than one atom,
1849                          * search for the first normal particle
1850                          * or vsite that already had its pbc defined.
1851                          * If nothing is found, use full pbc for this vsite.
1852                          */
1853                         for (a = cgs->index[cg_v]; a < cgs->index[cg_v+1]; a++)
1854                         {
1855                             if (a != vsite && pbc_set[a])
1856                             {
1857                                 vsite_pbc_f[vsi] = a;
1858                                 if (gmx_debug_at)
1859                                 {
1860                                     fprintf(debug, "vsite %d match pbc with atom %d\n",
1861                                             vsite+1, a+1);
1862                                 }
1863                                 break;
1864                             }
1865                         }
1866                         if (gmx_debug_at)
1867                         {
1868                             fprintf(debug, "vsite atom %d  cg %d - %d pbc atom %d\n",
1869                                     vsite+1, cgs->index[cg_v]+1, cgs->index[cg_v+1],
1870                                     vsite_pbc_f[vsi]+1);
1871                         }
1872                     }
1873                 }
1874                 if (ftype == F_VSITEN)
1875                 {
1876                     /* The other entries in vsite_pbc_f are not used for center vsites */
1877                     i += nc3;
1878                 }
1879                 else
1880                 {
1881                     i += 1+nral;
1882                 }
1883
1884                 /* This vsite now has its pbc defined */
1885                 pbc_set[vsite] = 1;
1886             }
1887         }
1888     }
1889
1890     sfree(pbc_set);
1891
1892     return vsite_pbc;
1893 }
1894
1895
1896 gmx_vsite_t *init_vsite(gmx_mtop_t *mtop, t_commrec *cr,
1897                         gmx_bool bSerial_NoPBC)
1898 {
1899     int            nvsite, i;
1900     int           *a2cg, cg;
1901     gmx_vsite_t   *vsite;
1902     int            mt;
1903     gmx_moltype_t *molt;
1904     int            nthreads;
1905
1906     /* check if there are vsites */
1907     nvsite = 0;
1908     for (i = 0; i < F_NRE; i++)
1909     {
1910         if (interaction_function[i].flags & IF_VSITE)
1911         {
1912             nvsite += gmx_mtop_ftype_count(mtop, i);
1913         }
1914     }
1915
1916     if (nvsite == 0)
1917     {
1918         return NULL;
1919     }
1920
1921     snew(vsite, 1);
1922
1923     vsite->n_intercg_vsite = count_intercg_vsite(mtop,
1924                                                  &vsite->bHaveChargeGroups);
1925
1926     /* If we don't have charge groups, the vsite follows its own pbc */
1927     if (!bSerial_NoPBC &&
1928         vsite->bHaveChargeGroups &&
1929         vsite->n_intercg_vsite > 0 && DOMAINDECOMP(cr))
1930     {
1931         vsite->nvsite_pbc_molt = mtop->nmoltype;
1932         snew(vsite->vsite_pbc_molt, vsite->nvsite_pbc_molt);
1933         for (mt = 0; mt < mtop->nmoltype; mt++)
1934         {
1935             molt = &mtop->moltype[mt];
1936             /* Make an atom to charge group index */
1937             a2cg = atom2cg(&molt->cgs);
1938             vsite->vsite_pbc_molt[mt] = get_vsite_pbc(mtop->ffparams.iparams,
1939                                                       molt->ilist,
1940                                                       molt->atoms.atom, NULL,
1941                                                       &molt->cgs, a2cg);
1942             sfree(a2cg);
1943         }
1944
1945         snew(vsite->vsite_pbc_loc_nalloc, F_VSITEN-F_VSITE2+1);
1946         snew(vsite->vsite_pbc_loc, F_VSITEN-F_VSITE2+1);
1947     }
1948
1949     if (bSerial_NoPBC)
1950     {
1951         vsite->nthreads = 1;
1952     }
1953     else
1954     {
1955         vsite->nthreads = gmx_omp_nthreads_get(emntVSITE);
1956     }
1957     if (!bSerial_NoPBC)
1958     {
1959         /* We need one extra thread data structure for the overlap vsites */
1960         snew(vsite->tdata, vsite->nthreads+1);
1961     }
1962
1963     vsite->th_ind        = NULL;
1964     vsite->th_ind_nalloc = 0;
1965
1966     return vsite;
1967 }
1968
1969 static void prepare_vsite_thread(const t_ilist      *ilist,
1970                                  gmx_vsite_thread_t *vsite_th)
1971 {
1972     int ftype;
1973
1974     for (ftype = 0; ftype < F_NRE; ftype++)
1975     {
1976         if (interaction_function[ftype].flags & IF_VSITE)
1977         {
1978             if (ilist[ftype].nr > vsite_th->ilist[ftype].nalloc)
1979             {
1980                 vsite_th->ilist[ftype].nalloc = over_alloc_large(ilist[ftype].nr);
1981                 srenew(vsite_th->ilist[ftype].iatoms, vsite_th->ilist[ftype].nalloc);
1982             }
1983
1984             vsite_th->ilist[ftype].nr = 0;
1985         }
1986     }
1987 }
1988
1989 void split_vsites_over_threads(const t_ilist   *ilist,
1990                                const t_mdatoms *mdatoms,
1991                                gmx_bool         bLimitRange,
1992                                gmx_vsite_t     *vsite)
1993 {
1994     int      th;
1995     int      vsite_atom_range, natperthread;
1996     int     *th_ind;
1997     int      ftype;
1998     t_iatom *iat;
1999     t_ilist *il_th;
2000     int      nral1, inc, i, j;
2001
2002     if (vsite->nthreads == 1)
2003     {
2004         /* Nothing to do */
2005         return;
2006     }
2007
2008 #pragma omp parallel for num_threads(vsite->nthreads) schedule(static)
2009     for (th = 0; th < vsite->nthreads; th++)
2010     {
2011         prepare_vsite_thread(ilist, &vsite->tdata[th]);
2012     }
2013     /* Master threads does the (potential) overlap vsites */
2014     prepare_vsite_thread(ilist, &vsite->tdata[vsite->nthreads]);
2015
2016     /* The current way of distributing the vsites over threads in primitive.
2017      * We divide the atom range 0 - natoms_in_vsite uniformly over threads,
2018      * without taking into account how the vsites are distributed.
2019      * Without domain decomposition we bLimitRange=TRUE and we at least
2020      * tighten the upper bound of the range (useful for common systems
2021      * such as a vsite-protein in 3-site water).
2022      */
2023     if (bLimitRange)
2024     {
2025         vsite_atom_range = -1;
2026         for (ftype = 0; ftype < F_NRE; ftype++)
2027         {
2028             if ((interaction_function[ftype].flags & IF_VSITE) &&
2029                 ftype != F_VSITEN)
2030             {
2031                 nral1 = 1 + NRAL(ftype);
2032                 iat   = ilist[ftype].iatoms;
2033                 for (i = 0; i < ilist[ftype].nr; i += nral1)
2034                 {
2035                     for (j = i+1; j < i+nral1; j++)
2036                     {
2037                         vsite_atom_range = max(vsite_atom_range, iat[j]);
2038                     }
2039                 }
2040             }
2041         }
2042         vsite_atom_range++;
2043     }
2044     else
2045     {
2046         vsite_atom_range = mdatoms->homenr;
2047     }
2048     natperthread = (vsite_atom_range + vsite->nthreads - 1)/vsite->nthreads;
2049
2050     if (debug)
2051     {
2052         fprintf(debug, "virtual site thread dist: natoms %d, range %d, natperthread %d\n", mdatoms->nr, vsite_atom_range, natperthread);
2053     }
2054
2055     /* To simplify the vsite assignment, we make an index which tells us
2056      * to which thread particles, both non-vsites and vsites, are assigned.
2057      */
2058     if (mdatoms->nr > vsite->th_ind_nalloc)
2059     {
2060         vsite->th_ind_nalloc = over_alloc_large(mdatoms->nr);
2061         srenew(vsite->th_ind, vsite->th_ind_nalloc);
2062     }
2063     th_ind = vsite->th_ind;
2064     th     = 0;
2065     for (i = 0; i < mdatoms->nr; i++)
2066     {
2067         if (mdatoms->ptype[i] == eptVSite)
2068         {
2069             /* vsites are not assigned to a thread yet */
2070             th_ind[i] = -1;
2071         }
2072         else
2073         {
2074             /* assign non-vsite particles to thread th */
2075             th_ind[i] = th;
2076         }
2077         if (i == (th + 1)*natperthread && th < vsite->nthreads)
2078         {
2079             th++;
2080         }
2081     }
2082
2083     for (ftype = 0; ftype < F_NRE; ftype++)
2084     {
2085         if ((interaction_function[ftype].flags & IF_VSITE) &&
2086             ftype != F_VSITEN)
2087         {
2088             nral1 = 1 + NRAL(ftype);
2089             inc   = nral1;
2090             iat   = ilist[ftype].iatoms;
2091             for (i = 0; i < ilist[ftype].nr; )
2092             {
2093                 th = iat[1+i]/natperthread;
2094                 /* We would like to assign this vsite the thread th,
2095                  * but it might depend on atoms outside the atom range of th
2096                  * or on another vsite not assigned to thread th.
2097                  */
2098                 if (ftype != F_VSITEN)
2099                 {
2100                     for (j = i+2; j < i+nral1; j++)
2101                     {
2102                         if (th_ind[iat[j]] != th)
2103                         {
2104                             /* Some constructing atoms are not assigned to
2105                              * thread th, move this vsite to a separate batch.
2106                              */
2107                             th = vsite->nthreads;
2108                         }
2109                     }
2110                 }
2111                 else
2112                 {
2113                     inc = iat[i];
2114                     for (j = i+2; j < i+inc; j += 3)
2115                     {
2116                         if (th_ind[iat[j]] != th)
2117                         {
2118                             th = vsite->nthreads;
2119                         }
2120                     }
2121                 }
2122                 /* Copy this vsite to the thread data struct of thread th */
2123                 il_th = &vsite->tdata[th].ilist[ftype];
2124                 for (j = i; j < i+inc; j++)
2125                 {
2126                     il_th->iatoms[il_th->nr++] = iat[j];
2127                 }
2128                 /* Update this vsite's thread index entry */
2129                 th_ind[iat[1+i]] = th;
2130
2131                 i += inc;
2132             }
2133         }
2134     }
2135
2136     if (debug)
2137     {
2138         for (ftype = 0; ftype < F_NRE; ftype++)
2139         {
2140             if ((interaction_function[ftype].flags & IF_VSITE) &&
2141                 ilist[ftype].nr > 0)
2142             {
2143                 fprintf(debug, "%-20s thread dist:",
2144                         interaction_function[ftype].longname);
2145                 for (th = 0; th < vsite->nthreads+1; th++)
2146                 {
2147                     fprintf(debug, " %4d", vsite->tdata[th].ilist[ftype].nr);
2148                 }
2149                 fprintf(debug, "\n");
2150             }
2151         }
2152     }
2153 }
2154
2155 void set_vsite_top(gmx_vsite_t *vsite, gmx_localtop_t *top, t_mdatoms *md,
2156                    t_commrec *cr)
2157 {
2158     int *a2cg;
2159
2160     if (vsite->n_intercg_vsite > 0)
2161     {
2162         if (vsite->bHaveChargeGroups)
2163         {
2164             /* Make an atom to charge group index */
2165             a2cg                 = atom2cg(&top->cgs);
2166             vsite->vsite_pbc_loc = get_vsite_pbc(top->idef.iparams,
2167                                                  top->idef.il, NULL, md,
2168                                                  &top->cgs, a2cg);
2169             sfree(a2cg);
2170         }
2171
2172         if (PARTDECOMP(cr))
2173         {
2174             snew(vsite->vsitecomm, 1);
2175             vsite->bPDvsitecomm =
2176                 setup_parallel_vsites(&(top->idef), cr, vsite->vsitecomm);
2177         }
2178     }
2179
2180     if (vsite->nthreads > 1)
2181     {
2182         if (vsite->bHaveChargeGroups || PARTDECOMP(cr))
2183         {
2184             gmx_incons("Can not use threads virtual sites combined with charge groups or particle decomposition");
2185         }
2186
2187         split_vsites_over_threads(top->idef.il, md, !DOMAINDECOMP(cr), vsite);
2188     }
2189 }