src/gromacs/mdlib/sim_util.c
/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 *
 *
 *                This source code is part of
 *
 *                 G   R   O   M   A   C   S
 *
 *          GROningen MAchine for Chemical Simulations
 *
 *                        VERSION 3.2.0
 * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 * Copyright (c) 2001-2004, The GROMACS development team,
 * check out http://www.gromacs.org for more information.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * If you want to redistribute modifications, please consider that
 * scientific software is very special. Version control is crucial -
 * bugs must be traceable. We will be happy to consider code for
 * inclusion in the official distribution, but derived work must not
 * be called official GROMACS. Details are found in the README & COPYING
 * files - if they are missing, get the official version at www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the papers on the package - you can find them in the top README file.
 *
 * For more info, check our website at http://www.gromacs.org
 *
 * And Hey:
 * GROwing Monsters And Cloning Shrimps
 */
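
/* sim_util.c provides high-level force-evaluation driver routines for
 * mdrun: run-time status reporting and the per-step do_force() paths
 * for both the group and the Verlet cut-off schemes.
 */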
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif

#include <stdio.h>
#ifdef HAVE_SYS_TIME_H
#include <sys/time.h>
#endif
#include <math.h>
#include "typedefs.h"
#include "string2.h"
#include "smalloc.h"
#include "names.h"
#include "mvdata.h"
#include "txtdump.h"
#include "pbc.h"
#include "chargegroup.h"
#include "vec.h"
#include "nrnb.h"
#include "mshift.h"
#include "mdrun.h"
#include "sim_util.h"
#include "update.h"
#include "physics.h"
#include "main.h"
#include "mdatoms.h"
#include "force.h"
#include "bondf.h"
#include "pme.h"
#include "disre.h"
#include "orires.h"
#include "network.h"
#include "calcmu.h"
#include "constr.h"
#include "xvgr.h"
#include "copyrite.h"
#include "pull_rotation.h"
#include "gmx_random.h"
#include "domdec.h"
#include "partdec.h"
#include "genborn.h"
#include "nbnxn_atomdata.h"
#include "nbnxn_search.h"
#include "nbnxn_kernels/nbnxn_kernel_ref.h"
#include "nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn.h"
#include "nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn.h"
#include "nbnxn_kernels/nbnxn_kernel_gpu_ref.h"

#include "gromacs/timing/wallcycle.h"
#include "gromacs/timing/walltime_accounting.h"
#include "gromacs/utility/gmxmpi.h"

#include "adress.h"
#include "qmmm.h"

#include "nbnxn_cuda_data_mgmt.h"
#include "nbnxn_cuda/nbnxn_cuda.h"

void print_time(FILE                     *out,
                gmx_walltime_accounting_t walltime_accounting,
                gmx_large_int_t           step,
                t_inputrec               *ir,
                t_commrec gmx_unused     *cr)
{
    time_t finish;
    char   timebuf[STRLEN];
    double dt, elapsed_seconds, time_per_step;
    char   buf[48];

#ifndef GMX_THREAD_MPI
    if (!PAR(cr))
#endif
    {
        fprintf(out, "\r");
    }
    fprintf(out, "step %s", gmx_step_str(step, buf));
    if (step >= ir->nstlist)
    {
        double seconds_since_epoch = gmx_gettime();
        elapsed_seconds = seconds_since_epoch - walltime_accounting_get_start_time_stamp(walltime_accounting);
        time_per_step   = elapsed_seconds/(step - ir->init_step + 1);
        dt              = (ir->nsteps + ir->init_step - step) * time_per_step;

        if (ir->nsteps >= 0)
        {
            if (dt >= 300)
            {
                finish = (time_t) (seconds_since_epoch + dt);
                gmx_ctime_r(&finish, timebuf, STRLEN);
                sprintf(buf, "%s", timebuf);
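                /* gmx_ctime_r leaves a trailing newline; strip it below */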
                buf[strlen(buf)-1] = '\0';
                fprintf(out, ", will finish %s", buf);
            }
            else
            {
                fprintf(out, ", remaining wall clock time: %5d s          ", (int)dt);
            }
        }
        else
        {
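            /* delta_t is in ps/step and time_per_step in s/step:
             * ps/s divided by 1000 gives ns/s, times 86400 gives ns/day */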
            fprintf(out, " performance: %.1f ns/day    ",
                    ir->delta_t/1000*24*60*60/time_per_step);
        }
    }
#ifndef GMX_THREAD_MPI
    if (PAR(cr))
    {
        fprintf(out, "\n");
    }
#endif

    fflush(out);
}
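
/* Illustrative status lines produced by print_time (values made up):
 *   step 4000, will finish Fri Mar  7 21:05:12 2014
 *   step 4000, remaining wall clock time:   270 s
 */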

void print_date_and_time(FILE *fplog, int nodeid, const char *title,
                         const gmx_walltime_accounting_t walltime_accounting)
{
    int    i;
    char   timebuf[STRLEN];
    char   time_string[STRLEN];
    time_t tmptime;

    if (fplog)
    {
        if (walltime_accounting != NULL)
        {
            tmptime = (time_t) walltime_accounting_get_start_time_stamp(walltime_accounting);
            gmx_ctime_r(&tmptime, timebuf, STRLEN);
        }
        else
        {
            tmptime = (time_t) gmx_gettime();
            gmx_ctime_r(&tmptime, timebuf, STRLEN);
        }
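        /* Copy up to the first control character, which also drops the
         * trailing newline that gmx_ctime_r appends */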
        for (i = 0; timebuf[i] >= ' '; i++)
        {
            time_string[i] = timebuf[i];
        }
        time_string[i] = '\0';

        fprintf(fplog, "%s on node %d %s\n", title, nodeid, time_string);
    }
}

static void sum_forces(int start, int end, rvec f[], rvec flr[])
{
    int i;

    if (gmx_debug_at)
    {
        pr_rvecs(debug, 0, "fsr", f+start, end-start);
        pr_rvecs(debug, 0, "flr", flr+start, end-start);
    }
    for (i = start; (i < end); i++)
    {
        rvec_inc(f[i], flr[i]);
    }
}

/*
 * calc_f_el calculates forces due to an electric field.
 *
 * force is kJ mol^-1 nm^-1 = e * kJ mol^-1 nm^-1 / e
 *
 * Et[] contains the parameters for the time-dependent
 * part of the field.
 * Ex[] contains the parameters for
 * the spatially dependent part of the field. You can have cool periodic
 * fields in principle, but only a constant field is supported
 * now.
 * The function should return the energy due to the electric field
 * (if any) but for now returns 0.
 *
 * WARNING:
 * There can be problems with the virial.
 * Since the field is not self-consistent this is unavoidable.
 * For neutral molecules the virial is correct within this approximation.
 * For neutral systems with many charged molecules the error is small.
 * But for systems with a net charge or a few charged molecules
 * the error can be significant when the field is high.
 * Solution: implement a self-consistent electric field into PME.
 */
static void calc_f_el(FILE *fp, int  start, int homenr,
                      real charge[], rvec f[],
                      t_cosines Ex[], t_cosines Et[], double t)
{
    rvec Ext;
    real t0;
    int  i, m;

    for (m = 0; (m < DIM); m++)
    {
        if (Et[m].n > 0)
        {
            if (Et[m].n == 3)
            {
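                /* Gaussian-enveloped cosine pulse: as used below,
                 * a[0] is the angular frequency, a[1] the pulse peak
                 * time t0 and a[2] the pulse width sigma */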
                t0     = Et[m].a[1];
                Ext[m] = cos(Et[m].a[0]*(t-t0))*exp(-sqr(t-t0)/(2.0*sqr(Et[m].a[2])));
            }
            else
            {
                Ext[m] = cos(Et[m].a[0]*t);
            }
        }
        else
        {
            Ext[m] = 1.0;
        }
        if (Ex[m].n > 0)
        {
            /* Convert the field strength from V/nm to MD-units */
            Ext[m] *= Ex[m].a[0]*FIELDFAC;
            for (i = start; (i < start+homenr); i++)
            {
                f[i][m] += charge[i]*Ext[m];
            }
        }
        else
        {
            Ext[m] = 0;
        }
    }
    if (fp != NULL)
    {
        fprintf(fp, "%10g  %10g  %10g  %10g #FIELD\n", t,
                Ext[XX]/FIELDFAC, Ext[YY]/FIELDFAC, Ext[ZZ]/FIELDFAC);
    }
}

static void calc_virial(int start, int homenr, rvec x[], rvec f[],
                        tensor vir_part, t_graph *graph, matrix box,
                        t_nrnb *nrnb, const t_forcerec *fr, int ePBC)
{
    int    i, j;
    tensor virtest;

    /* The short-range virial from surrounding boxes */
    clear_mat(vir_part);
    calc_vir(SHIFTS, fr->shift_vec, fr->fshift, vir_part, ePBC == epbcSCREW, box);
    inc_nrnb(nrnb, eNR_VIRIAL, SHIFTS);

    /* Calculate partial virial, for local atoms only, based on short range.
     * Total virial is computed in global_stat, called from do_md
     */
    f_calc_vir(start, start+homenr, x, f, vir_part, graph, box);
    inc_nrnb(nrnb, eNR_VIRIAL, homenr);

    /* Add position restraint contribution */
    for (i = 0; i < DIM; i++)
    {
        vir_part[i][i] += fr->vir_diag_posres[i];
    }

    /* Add wall contribution */
    for (i = 0; i < DIM; i++)
    {
        vir_part[i][ZZ] += fr->vir_wall_z[i];
    }

    if (debug)
    {
        pr_rvecs(debug, 0, "vir_part", vir_part, DIM);
    }
}

static void posres_wrapper(FILE *fplog,
                           int flags,
                           gmx_bool bSepDVDL,
                           t_inputrec *ir,
                           t_nrnb *nrnb,
                           gmx_localtop_t *top,
                           matrix box, rvec x[],
                           gmx_enerdata_t *enerd,
                           real *lambda,
                           t_forcerec *fr)
{
    t_pbc pbc;
    real  v, dvdl;
    int   i;

    /* Position restraints always require full pbc */
    set_pbc(&pbc, ir->ePBC, box);
    dvdl = 0;
    v    = posres(top->idef.il[F_POSRES].nr, top->idef.il[F_POSRES].iatoms,
                  top->idef.iparams_posres,
                  (const rvec*)x, fr->f_novirsum, fr->vir_diag_posres,
                  ir->ePBC == epbcNONE ? NULL : &pbc,
                  lambda[efptRESTRAINT], &dvdl,
                  fr->rc_scaling, fr->ePBC, fr->posres_com, fr->posres_comB);
    if (bSepDVDL)
    {
        gmx_print_sepdvdl(fplog, interaction_function[F_POSRES].longname, v, dvdl);
    }
    enerd->term[F_POSRES] += v;
    /* If just the force constant changes, the FEP term is linear,
     * but if the reference position changes, it is not.
     */
    enerd->dvdl_nonlin[efptRESTRAINT] += dvdl;
    inc_nrnb(nrnb, eNR_POSRES, top->idef.il[F_POSRES].nr/2);

    if ((ir->fepvals->n_lambda > 0) && (flags & GMX_FORCE_DHDL))
    {
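        /* Also evaluate the restraint energy at all foreign lambda
         * values, so free-energy estimators have dH/dl data */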
        for (i = 0; i < enerd->n_lambda; i++)
        {
            real dvdl_dum = 0, lambda_dum;

            lambda_dum = (i == 0 ? lambda[efptRESTRAINT] : ir->fepvals->all_lambda[efptRESTRAINT][i-1]);
            v          = posres(top->idef.il[F_POSRES].nr, top->idef.il[F_POSRES].iatoms,
                                top->idef.iparams_posres,
                                (const rvec*)x, NULL, NULL,
                                ir->ePBC == epbcNONE ? NULL : &pbc, lambda_dum, &dvdl_dum,
                                fr->rc_scaling, fr->ePBC, fr->posres_com, fr->posres_comB);
            enerd->enerpart_lambda[i] += v;
        }
    }
}

static void pull_potential_wrapper(FILE *fplog,
                                   gmx_bool bSepDVDL,
                                   t_commrec *cr,
                                   t_inputrec *ir,
                                   matrix box, rvec x[],
                                   rvec f[],
                                   tensor vir_force,
                                   t_mdatoms *mdatoms,
                                   gmx_enerdata_t *enerd,
                                   real *lambda,
                                   double t)
{
    t_pbc  pbc;
    real   dvdl;

    /* Calculate the center of mass forces, this requires communication,
     * which is why pull_potential is called close to other communication.
     * The virial contribution is calculated directly,
     * which is why we call pull_potential after calc_virial.
     */
    set_pbc(&pbc, ir->ePBC, box);
    dvdl                     = 0;
    enerd->term[F_COM_PULL] +=
        pull_potential(ir->ePull, ir->pull, mdatoms, &pbc,
                       cr, t, lambda[efptRESTRAINT], x, f, vir_force, &dvdl);
    if (bSepDVDL)
    {
        gmx_print_sepdvdl(fplog, "Com pull", enerd->term[F_COM_PULL], dvdl);
    }
    enerd->dvdl_lin[efptRESTRAINT] += dvdl;
}

static void pme_receive_force_ener(FILE           *fplog,
                                   gmx_bool        bSepDVDL,
                                   t_commrec      *cr,
                                   gmx_wallcycle_t wcycle,
                                   gmx_enerdata_t *enerd,
                                   t_forcerec     *fr)
{
    real   e, v, dvdl;
    float  cycles_ppdpme, cycles_seppme;

    cycles_ppdpme = wallcycle_stop(wcycle, ewcPPDURINGPME);
    dd_cycles_add(cr->dd, cycles_ppdpme, ddCyclPPduringPME);

    /* In case of node-splitting, the PP nodes receive the long-range
     * forces, virial and energy from the PME nodes here.
     */
    wallcycle_start(wcycle, ewcPP_PMEWAITRECVF);
    dvdl = 0;
    gmx_pme_receive_f(cr, fr->f_novirsum, fr->vir_el_recip, &e, &dvdl,
                      &cycles_seppme);
    if (bSepDVDL)
    {
        gmx_print_sepdvdl(fplog, "PME mesh", e, dvdl);
    }
    enerd->term[F_COUL_RECIP] += e;
    enerd->dvdl_lin[efptCOUL] += dvdl;
    if (wcycle)
    {
        dd_cycles_add(cr->dd, cycles_seppme, ddCyclPME);
    }
    wallcycle_stop(wcycle, ewcPP_PMEWAITRECVF);
}

static void print_large_forces(FILE *fp, t_mdatoms *md, t_commrec *cr,
                               gmx_large_int_t step, real pforce, rvec *x, rvec *f)
{
    int  i;
    real pf2, fn2;
    char buf[STEPSTRSIZE];

    pf2 = sqr(pforce);
    for (i = md->start; i < md->start+md->homenr; i++)
    {
        fn2 = norm2(f[i]);
        /* We also catch NAN, if the compiler does not optimize this away. */
        if (fn2 >= pf2 || fn2 != fn2)
        {
            fprintf(fp, "step %s  atom %6d  x %8.3f %8.3f %8.3f  force %12.5e\n",
                    gmx_step_str(step, buf),
                    ddglatnr(cr->dd, i), x[i][XX], x[i][YY], x[i][ZZ], sqrt(fn2));
        }
    }
}

static void post_process_forces(t_commrec *cr,
                                gmx_large_int_t step,
                                t_nrnb *nrnb, gmx_wallcycle_t wcycle,
                                gmx_localtop_t *top,
                                matrix box, rvec x[],
                                rvec f[],
                                tensor vir_force,
                                t_mdatoms *mdatoms,
                                t_graph *graph,
                                t_forcerec *fr, gmx_vsite_t *vsite,
                                int flags)
{
    if (fr->bF_NoVirSum)
    {
        if (vsite)
        {
            /* Spread the mesh force on virtual sites to the other particles...
             * This is parallelized. MPI communication is performed
             * if the constructing atoms aren't local.
             */
            wallcycle_start(wcycle, ewcVSITESPREAD);
            spread_vsite_f(vsite, x, fr->f_novirsum, NULL,
                           (flags & GMX_FORCE_VIRIAL), fr->vir_el_recip,
                           nrnb,
                           &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr);
            wallcycle_stop(wcycle, ewcVSITESPREAD);
        }
        if (flags & GMX_FORCE_VIRIAL)
        {
            /* Now add the forces, this is local */
            if (fr->bDomDec)
            {
                sum_forces(0, fr->f_novirsum_n, f, fr->f_novirsum);
            }
            else
            {
                sum_forces(mdatoms->start, mdatoms->start+mdatoms->homenr,
                           f, fr->f_novirsum);
            }
            if (EEL_FULL(fr->eeltype))
            {
                /* Add the mesh contribution to the virial */
                m_add(vir_force, fr->vir_el_recip, vir_force);
            }
            if (debug)
            {
                pr_rvecs(debug, 0, "vir_force", vir_force, DIM);
            }
        }
    }

    if (fr->print_force >= 0)
    {
        print_large_forces(stderr, mdatoms, cr, step, fr->print_force, x, f);
    }
}

static void do_nb_verlet(t_forcerec *fr,
                         interaction_const_t *ic,
                         gmx_enerdata_t *enerd,
                         int flags, int ilocality,
                         int clearF,
                         t_nrnb *nrnb,
                         gmx_wallcycle_t wcycle)
{
    int                        nnbl, kernel_type, enr_nbnxn_kernel_ljc, enr_nbnxn_kernel_lj;
    char                      *env;
    nonbonded_verlet_group_t  *nbvg;
    gmx_bool                   bCUDA;

    if (!(flags & GMX_FORCE_NONBONDED))
    {
        /* skip non-bonded calculation */
        return;
    }

    nbvg = &fr->nbv->grp[ilocality];

    if (fr->cutoff_scheme != ecutsVERLET)
    {
        gmx_incons("Invalid cut-off scheme passed!");
    }

    bCUDA = (nbvg->kernel_type == nbnxnk8x8x8_CUDA);

    /* CUDA kernel launch overhead is timed separately, so the nonbonded
     * sub-counter is only used for CPU kernels */
    if (!bCUDA)
    {
        wallcycle_sub_start(wcycle, ewcsNONBONDED);
    }
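    /* Dispatch to the kernel flavor chosen at setup: plain-C 4x4
     * reference, 4xN or 2xNN SIMD, CUDA 8x8x8, or the plain-C
     * GPU-emulation kernel */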
    switch (nbvg->kernel_type)
    {
        case nbnxnk4x4_PlainC:
            nbnxn_kernel_ref(&nbvg->nbl_lists,
                             nbvg->nbat, ic,
                             fr->shift_vec,
                             flags,
                             clearF,
                             fr->fshift[0],
                             enerd->grpp.ener[egCOULSR],
                             fr->bBHAM ?
                             enerd->grpp.ener[egBHAMSR] :
                             enerd->grpp.ener[egLJSR]);
            break;

        case nbnxnk4xN_SIMD_4xN:
            nbnxn_kernel_simd_4xn(&nbvg->nbl_lists,
                                  nbvg->nbat, ic,
                                  nbvg->ewald_excl,
                                  fr->shift_vec,
                                  flags,
                                  clearF,
                                  fr->fshift[0],
                                  enerd->grpp.ener[egCOULSR],
                                  fr->bBHAM ?
                                  enerd->grpp.ener[egBHAMSR] :
                                  enerd->grpp.ener[egLJSR]);
            break;
        case nbnxnk4xN_SIMD_2xNN:
            nbnxn_kernel_simd_2xnn(&nbvg->nbl_lists,
                                   nbvg->nbat, ic,
                                   nbvg->ewald_excl,
                                   fr->shift_vec,
                                   flags,
                                   clearF,
                                   fr->fshift[0],
                                   enerd->grpp.ener[egCOULSR],
                                   fr->bBHAM ?
                                   enerd->grpp.ener[egBHAMSR] :
                                   enerd->grpp.ener[egLJSR]);
            break;

        case nbnxnk8x8x8_CUDA:
            nbnxn_cuda_launch_kernel(fr->nbv->cu_nbv, nbvg->nbat, flags, ilocality);
            break;

        case nbnxnk8x8x8_PlainC:
            nbnxn_kernel_gpu_ref(nbvg->nbl_lists.nbl[0],
                                 nbvg->nbat, ic,
                                 fr->shift_vec,
                                 flags,
                                 clearF,
                                 nbvg->nbat->out[0].f,
                                 fr->fshift[0],
                                 enerd->grpp.ener[egCOULSR],
                                 fr->bBHAM ?
                                 enerd->grpp.ener[egBHAMSR] :
                                 enerd->grpp.ener[egLJSR]);
            break;

        default:
            gmx_incons("Invalid nonbonded kernel type passed!");

    }
    if (!bCUDA)
    {
        wallcycle_sub_stop(wcycle, ewcsNONBONDED);
    }

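    /* Book flop counts matching the electrostatics flavor of the
     * kernel that was just run */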
    if (EEL_RF(ic->eeltype) || ic->eeltype == eelCUT)
    {
        enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_RF;
    }
    else if ((!bCUDA && nbvg->ewald_excl == ewaldexclAnalytical) ||
             (bCUDA && nbnxn_cuda_is_kernel_ewald_analytical(fr->nbv->cu_nbv)))
    {
        enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_EWALD;
    }
    else
    {
        enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_TAB;
    }
    enr_nbnxn_kernel_lj = eNR_NBNXN_LJ;
    if (flags & GMX_FORCE_ENERGY)
    {
        /* In eNR_??? the nbnxn F+E kernels are always the F kernel + 1 */
        enr_nbnxn_kernel_ljc += 1;
        enr_nbnxn_kernel_lj  += 1;
    }

    inc_nrnb(nrnb, enr_nbnxn_kernel_ljc,
             nbvg->nbl_lists.natpair_ljq);
    inc_nrnb(nrnb, enr_nbnxn_kernel_lj,
             nbvg->nbl_lists.natpair_lj);
    inc_nrnb(nrnb, enr_nbnxn_kernel_ljc-eNR_NBNXN_LJ_RF+eNR_NBNXN_RF,
             nbvg->nbl_lists.natpair_q);
}

void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
                         t_inputrec *inputrec,
                         gmx_large_int_t step, t_nrnb *nrnb, gmx_wallcycle_t wcycle,
                         gmx_localtop_t *top,
                         gmx_groups_t gmx_unused *groups,
                         matrix box, rvec x[], history_t *hist,
                         rvec f[],
                         tensor vir_force,
                         t_mdatoms *mdatoms,
                         gmx_enerdata_t *enerd, t_fcdata *fcd,
                         real *lambda, t_graph *graph,
                         t_forcerec *fr, interaction_const_t *ic,
                         gmx_vsite_t *vsite, rvec mu_tot,
                         double t, FILE *field, gmx_edsam_t ed,
                         gmx_bool bBornRadii,
                         int flags)
{
    int                 cg0, cg1, i, j;
    int                 start, homenr;
    int                 nb_kernel_type;
    double              mu[2*DIM];
    gmx_bool            bSepDVDL, bStateChanged, bNS, bFillGrid, bCalcCGCM, bBS;
    gmx_bool            bDoLongRange, bDoForces, bSepLRF, bUseGPU, bUseOrEmulGPU;
    gmx_bool            bDiffKernels = FALSE;
    matrix              boxs;
    rvec                vzero, box_diag;
    real                e, v, dvdl;
    float               cycles_pme, cycles_force;
    nonbonded_verlet_t *nbv;

    cycles_force   = 0;
    nbv            = fr->nbv;
    nb_kernel_type = fr->nbv->grp[0].kernel_type;

    start  = mdatoms->start;
    homenr = mdatoms->homenr;

    bSepDVDL = (fr->bSepDVDL && do_per_step(step, inputrec->nstlog));

    clear_mat(vir_force);

    cg0 = 0;
    if (DOMAINDECOMP(cr))
    {
        cg1 = cr->dd->ncg_tot;
    }
    else
    {
        cg1 = top->cgs.nr;
    }
    if (fr->n_tpi > 0)
    {
        cg1--;
    }

    bStateChanged = (flags & GMX_FORCE_STATECHANGED);
    bNS           = (flags & GMX_FORCE_NS) && (fr->bAllvsAll == FALSE);
    bFillGrid     = (bNS && bStateChanged);
    bCalcCGCM     = (bFillGrid && !DOMAINDECOMP(cr));
    bDoLongRange  = (fr->bTwinRange && bNS && (flags & GMX_FORCE_DO_LR));
    bDoForces     = (flags & GMX_FORCE_FORCES);
    bSepLRF       = (bDoLongRange && bDoForces && (flags & GMX_FORCE_SEPLRF));
    bUseGPU       = fr->nbv->bUseGPU;
    bUseOrEmulGPU = bUseGPU || (nbv->grp[0].kernel_type == nbnxnk8x8x8_PlainC);

    if (bStateChanged)
    {
        update_forcerec(fr, box);

        if (NEED_MUTOT(*inputrec))
        {
            /* Calculate total (local) dipole moment in a temporary common array.
             * This makes it possible to sum them over nodes faster.
             */
            calc_mu(start, homenr,
                    x, mdatoms->chargeA, mdatoms->chargeB, mdatoms->nChargePerturbed,
                    mu, mu+DIM);
        }
    }

    if (fr->ePBC != epbcNONE)
    {
        /* Compute shift vectors every step,
         * because of pressure coupling or box deformation!
         */
        if ((flags & GMX_FORCE_DYNAMICBOX) && bStateChanged)
        {
            calc_shifts(box, fr->shift_vec);
        }

        if (bCalcCGCM)
        {
            put_atoms_in_box_omp(fr->ePBC, box, homenr, x);
            inc_nrnb(nrnb, eNR_SHIFTX, homenr);
        }
        else if (EI_ENERGY_MINIMIZATION(inputrec->eI) && graph)
        {
            unshift_self(graph, box, x);
        }
    }

    nbnxn_atomdata_copy_shiftvec(flags & GMX_FORCE_DYNAMICBOX,
                                 fr->shift_vec, nbv->grp[0].nbat);

#ifdef GMX_MPI
    if (!(cr->duty & DUTY_PME))
    {
        /* Send particle coordinates to the pme nodes.
         * Since this is only implemented for domain decomposition
         * and domain decomposition does not use the graph,
         * we do not need to worry about shifting.
         */

        wallcycle_start(wcycle, ewcPP_PMESENDX);

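        /* With two walls, the Ewald sum uses a box elongated in z
         * by wall_ewald_zfac */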
        bBS = (inputrec->nwall == 2);
        if (bBS)
        {
            copy_mat(box, boxs);
            svmul(inputrec->wall_ewald_zfac, boxs[ZZ], boxs[ZZ]);
        }

        gmx_pme_send_x(cr, bBS ? boxs : box, x,
                       mdatoms->nChargePerturbed, lambda[efptCOUL],
                       (flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY)), step);

        wallcycle_stop(wcycle, ewcPP_PMESENDX);
    }
#endif /* GMX_MPI */

    /* do gridding for pair search */
    if (bNS)
    {
        if (graph && bStateChanged)
        {
            /* Calculate intramolecular shift vectors to make molecules whole */
            mk_mshift(fplog, graph, fr->ePBC, box, x);
        }

        clear_rvec(vzero);
        box_diag[XX] = box[XX][XX];
        box_diag[YY] = box[YY][YY];
        box_diag[ZZ] = box[ZZ][ZZ];

        wallcycle_start(wcycle, ewcNS);
        if (!fr->bDomDec)
        {
            wallcycle_sub_start(wcycle, ewcsNBS_GRID_LOCAL);
            nbnxn_put_on_grid(nbv->nbs, fr->ePBC, box,
                              0, vzero, box_diag,
                              0, mdatoms->homenr, -1, fr->cginfo, x,
                              0, NULL,
                              nbv->grp[eintLocal].kernel_type,
                              nbv->grp[eintLocal].nbat);
            wallcycle_sub_stop(wcycle, ewcsNBS_GRID_LOCAL);
        }
        else
        {
            wallcycle_sub_start(wcycle, ewcsNBS_GRID_NONLOCAL);
            nbnxn_put_on_grid_nonlocal(nbv->nbs, domdec_zones(cr->dd),
                                       fr->cginfo, x,
                                       nbv->grp[eintNonlocal].kernel_type,
                                       nbv->grp[eintNonlocal].nbat);
            wallcycle_sub_stop(wcycle, ewcsNBS_GRID_NONLOCAL);
        }

        if (nbv->ngrp == 1 ||
            nbv->grp[eintNonlocal].nbat == nbv->grp[eintLocal].nbat)
        {
            nbnxn_atomdata_set(nbv->grp[eintLocal].nbat, eatAll,
                               nbv->nbs, mdatoms, fr->cginfo);
        }
        else
        {
            nbnxn_atomdata_set(nbv->grp[eintLocal].nbat, eatLocal,
                               nbv->nbs, mdatoms, fr->cginfo);
            nbnxn_atomdata_set(nbv->grp[eintNonlocal].nbat, eatAll,
                               nbv->nbs, mdatoms, fr->cginfo);
        }
        wallcycle_stop(wcycle, ewcNS);
    }

    /* initialize the GPU atom data and copy shift vector */
    if (bUseGPU)
    {
        if (bNS)
        {
            wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU_NB);
            nbnxn_cuda_init_atomdata(nbv->cu_nbv, nbv->grp[eintLocal].nbat);
            wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
        }

        wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU_NB);
        nbnxn_cuda_upload_shiftvec(nbv->cu_nbv, nbv->grp[eintLocal].nbat);
        wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
    }

    /* do local pair search */
    if (bNS)
    {
        wallcycle_start_nocount(wcycle, ewcNS);
        wallcycle_sub_start(wcycle, ewcsNBS_SEARCH_LOCAL);
        nbnxn_make_pairlist(nbv->nbs, nbv->grp[eintLocal].nbat,
                            &top->excls,
                            ic->rlist,
                            nbv->min_ci_balanced,
                            &nbv->grp[eintLocal].nbl_lists,
                            eintLocal,
                            nbv->grp[eintLocal].kernel_type,
                            nrnb);
        wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_LOCAL);

        if (bUseGPU)
        {
            /* initialize local pair-list on the GPU */
            nbnxn_cuda_init_pairlist(nbv->cu_nbv,
                                     nbv->grp[eintLocal].nbl_lists.nbl[0],
                                     eintLocal);
        }
        wallcycle_stop(wcycle, ewcNS);
    }
    else
    {
        wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
        wallcycle_sub_start(wcycle, ewcsNB_X_BUF_OPS);
        nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs, eatLocal, FALSE, x,
                                        nbv->grp[eintLocal].nbat);
        wallcycle_sub_stop(wcycle, ewcsNB_X_BUF_OPS);
        wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
    }

    if (bUseGPU)
    {
        wallcycle_start(wcycle, ewcLAUNCH_GPU_NB);
        /* launch local nonbonded F on GPU */
        do_nb_verlet(fr, ic, enerd, flags, eintLocal, enbvClearFNo,
                     nrnb, wcycle);
        wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
    }

    /* Communicate coordinates and sum dipole if necessary +
       do non-local pair search */
    if (DOMAINDECOMP(cr))
    {
        bDiffKernels = (nbv->grp[eintNonlocal].kernel_type !=
                        nbv->grp[eintLocal].kernel_type);

        if (bDiffKernels)
        {
            /* With GPU+CPU non-bonded calculations we need to copy
             * the local coordinates to the non-local nbat struct
             * (in CPU format) as the non-local kernel call also
             * calculates the local - non-local interactions.
             */
            wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
            wallcycle_sub_start(wcycle, ewcsNB_X_BUF_OPS);
            nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs, eatLocal, TRUE, x,
                                            nbv->grp[eintNonlocal].nbat);
            wallcycle_sub_stop(wcycle, ewcsNB_X_BUF_OPS);
            wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
        }

        if (bNS)
        {
            wallcycle_start_nocount(wcycle, ewcNS);
            wallcycle_sub_start(wcycle, ewcsNBS_SEARCH_NONLOCAL);

            if (bDiffKernels)
            {
                nbnxn_grid_add_simple(nbv->nbs, nbv->grp[eintNonlocal].nbat);
            }

            nbnxn_make_pairlist(nbv->nbs, nbv->grp[eintNonlocal].nbat,
                                &top->excls,
                                ic->rlist,
                                nbv->min_ci_balanced,
                                &nbv->grp[eintNonlocal].nbl_lists,
                                eintNonlocal,
                                nbv->grp[eintNonlocal].kernel_type,
                                nrnb);

            wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_NONLOCAL);

            if (nbv->grp[eintNonlocal].kernel_type == nbnxnk8x8x8_CUDA)
            {
                /* initialize non-local pair-list on the GPU */
                nbnxn_cuda_init_pairlist(nbv->cu_nbv,
                                         nbv->grp[eintNonlocal].nbl_lists.nbl[0],
                                         eintNonlocal);
            }
            wallcycle_stop(wcycle, ewcNS);
        }
        else
        {
            wallcycle_start(wcycle, ewcMOVEX);
            dd_move_x(cr->dd, box, x);

            /* When we don't need the total dipole we sum it in global_stat */
            if (bStateChanged && NEED_MUTOT(*inputrec))
            {
                gmx_sumd(2*DIM, mu, cr);
            }
            wallcycle_stop(wcycle, ewcMOVEX);

            wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
            wallcycle_sub_start(wcycle, ewcsNB_X_BUF_OPS);
            nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs, eatNonlocal, FALSE, x,
                                            nbv->grp[eintNonlocal].nbat);
            wallcycle_sub_stop(wcycle, ewcsNB_X_BUF_OPS);
            cycles_force += wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
        }

        if (bUseGPU && !bDiffKernels)
        {
            wallcycle_start(wcycle, ewcLAUNCH_GPU_NB);
            /* launch non-local nonbonded F on GPU */
            do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFNo,
                         nrnb, wcycle);
            cycles_force += wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
        }
    }

    if (bUseGPU)
    {
        /* launch D2H copy-back F */
        wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU_NB);
        if (DOMAINDECOMP(cr) && !bDiffKernels)
        {
            nbnxn_cuda_launch_cpyback(nbv->cu_nbv, nbv->grp[eintNonlocal].nbat,
                                      flags, eatNonlocal);
        }
        nbnxn_cuda_launch_cpyback(nbv->cu_nbv, nbv->grp[eintLocal].nbat,
                                  flags, eatLocal);
        cycles_force += wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
    }

    if (bStateChanged && NEED_MUTOT(*inputrec))
    {
        if (PAR(cr))
        {
            gmx_sumd(2*DIM, mu, cr);
        }

        for (i = 0; i < 2; i++)
        {
            for (j = 0; j < DIM; j++)
            {
                fr->mu_tot[i][j] = mu[i*DIM + j];
            }
        }
    }
    if (fr->efep == efepNO)
    {
        copy_rvec(fr->mu_tot[0], mu_tot);
    }
    else
    {
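        /* With free-energy perturbation, interpolate the total dipole
         * between the A and B charge states */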
        for (j = 0; j < DIM; j++)
        {
            mu_tot[j] =
                (1.0 - lambda[efptCOUL])*fr->mu_tot[0][j] +
                lambda[efptCOUL]*fr->mu_tot[1][j];
        }
    }

    /* Reset energies */
    reset_enerdata(fr, bNS, enerd, MASTER(cr));
    clear_rvecs(SHIFTS, fr->fshift);

    if (DOMAINDECOMP(cr))
    {
        if (!(cr->duty & DUTY_PME))
        {
            wallcycle_start(wcycle, ewcPPDURINGPME);
            dd_force_flop_start(cr->dd, nrnb);
        }
    }

    if (inputrec->bRot)
    {
        /* Enforced rotation has its own cycle counter that starts after the collective
         * coordinates have been communicated. It is added to ddCyclF to allow
         * for proper load-balancing */
        wallcycle_start(wcycle, ewcROT);
        do_rotation(cr, inputrec, box, x, t, step, wcycle, bNS);
        wallcycle_stop(wcycle, ewcROT);
    }

    /* Start the force cycle counter.
     * This counter is stopped in do_force_lowlevel.
     * No parallel communication should occur while this counter is running,
     * since that will interfere with the dynamic load balancing.
     */
    wallcycle_start(wcycle, ewcFORCE);
    if (bDoForces)
    {
        /* Reset forces for which the virial is calculated separately:
         * PME/Ewald forces if necessary */
        if (fr->bF_NoVirSum)
        {
            if (flags & GMX_FORCE_VIRIAL)
            {
                fr->f_novirsum = fr->f_novirsum_alloc;
                if (fr->bDomDec)
                {
                    clear_rvecs(fr->f_novirsum_n, fr->f_novirsum);
                }
                else
                {
                    clear_rvecs(homenr, fr->f_novirsum+start);
                }
            }
            else
            {
                /* We are not calculating the pressure so we do not need
                 * a separate array for forces that do not contribute
                 * to the pressure.
                 */
                fr->f_novirsum = f;
            }
        }

        /* Clear the short- and long-range forces */
        clear_rvecs(fr->natoms_force_constr, f);
        if (bSepLRF && do_per_step(step, inputrec->nstcalclr))
        {
            clear_rvecs(fr->natoms_force_constr, fr->f_twin);
        }

        clear_rvec(fr->vir_diag_posres);
    }

    if (inputrec->ePull == epullCONSTRAINT)
    {
        clear_pull_forces(inputrec->pull);
    }

    /* We calculate the non-bonded forces, when done on the CPU, here.
     * We do this before calling do_force_lowlevel, since there the bonded
     * forces are calculated before PME, which does communication.
     * With this order, non-bonded and bonded force calculation imbalance
     * can be balanced out by the domain decomposition load balancing.
     */

    if (!bUseOrEmulGPU)
    {
        /* Maybe we should move this into do_force_lowlevel */
        do_nb_verlet(fr, ic, enerd, flags, eintLocal, enbvClearFYes,
                     nrnb, wcycle);
    }

    if (!bUseOrEmulGPU || bDiffKernels)
    {
        int aloc;

        if (DOMAINDECOMP(cr))
        {
            do_nb_verlet(fr, ic, enerd, flags, eintNonlocal,
                         bDiffKernels ? enbvClearFYes : enbvClearFNo,
                         nrnb, wcycle);
        }

        if (!bUseOrEmulGPU)
        {
            aloc = eintLocal;
        }
        else
        {
            aloc = eintNonlocal;
        }

        /* Add all the non-bonded forces to the normal force array.
         * This can be split into a local and a non-local part when overlapping
         * communication with calculation with domain decomposition.
         */
        cycles_force += wallcycle_stop(wcycle, ewcFORCE);
        wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
        wallcycle_sub_start(wcycle, ewcsNB_F_BUF_OPS);
        nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs, eatAll, nbv->grp[aloc].nbat, f);
        wallcycle_sub_stop(wcycle, ewcsNB_F_BUF_OPS);
        cycles_force += wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
        wallcycle_start_nocount(wcycle, ewcFORCE);

        /* if there are multiple fshift output buffers reduce them */
        if ((flags & GMX_FORCE_VIRIAL) &&
            nbv->grp[aloc].nbl_lists.nnbl > 1)
        {
            nbnxn_atomdata_add_nbat_fshift_to_fshift(nbv->grp[aloc].nbat,
                                                     fr->fshift);
        }
    }

    /* update QMMMrec, if necessary */
    if (fr->bQMMM)
    {
        update_QMMMrec(cr, fr, x, mdatoms, box, top);
    }

    if ((flags & GMX_FORCE_BONDED) && top->idef.il[F_POSRES].nr > 0)
    {
        posres_wrapper(fplog, flags, bSepDVDL, inputrec, nrnb, top, box, x,
                       enerd, lambda, fr);
    }

    /* Compute the bonded and non-bonded energies and optionally forces */
    do_force_lowlevel(fplog, step, fr, inputrec, &(top->idef),
                      cr, nrnb, wcycle, mdatoms,
                      x, hist, f, bSepLRF ? fr->f_twin : f, enerd, fcd, top, fr->born,
                      &(top->atomtypes), bBornRadii, box,
                      inputrec->fepvals, lambda, graph, &(top->excls), fr->mu_tot,
                      flags, &cycles_pme);

    if (bSepLRF)
    {
        if (do_per_step(step, inputrec->nstcalclr))
        {
            /* Add the long range forces to the short range forces */
            for (i = 0; i < fr->natoms_force_constr; i++)
            {
                rvec_add(fr->f_twin[i], f[i], f[i]);
            }
        }
    }

    cycles_force += wallcycle_stop(wcycle, ewcFORCE);

    if (ed)
    {
        do_flood(cr, inputrec, x, f, ed, box, step, bNS);
    }

    if (bUseOrEmulGPU && !bDiffKernels)
    {
        /* wait for non-local forces (or calculate in emulation mode) */
        if (DOMAINDECOMP(cr))
        {
            if (bUseGPU)
            {
                wallcycle_start(wcycle, ewcWAIT_GPU_NB_NL);
                nbnxn_cuda_wait_gpu(nbv->cu_nbv,
                                    nbv->grp[eintNonlocal].nbat,
                                    flags, eatNonlocal,
                                    enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
                                    fr->fshift);
                cycles_force += wallcycle_stop(wcycle, ewcWAIT_GPU_NB_NL);
            }
            else
            {
                wallcycle_start_nocount(wcycle, ewcFORCE);
                do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFYes,
                             nrnb, wcycle);
                cycles_force += wallcycle_stop(wcycle, ewcFORCE);
            }
            wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
            wallcycle_sub_start(wcycle, ewcsNB_F_BUF_OPS);
            /* skip the reduction if there was no non-local work to do */
            if (nbv->grp[eintNonlocal].nbl_lists.nbl[0]->nsci > 0)
            {
                nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs, eatNonlocal,
                                               nbv->grp[eintNonlocal].nbat, f);
            }
            wallcycle_sub_stop(wcycle, ewcsNB_F_BUF_OPS);
            cycles_force += wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
        }
    }

    if (bDoForces)
    {
        /* Communicate the forces */
        if (PAR(cr))
        {
            wallcycle_start(wcycle, ewcMOVEF);
            if (DOMAINDECOMP(cr))
            {
                dd_move_f(cr->dd, f, fr->fshift);
                /* Do we need to communicate the separate force array
                 * for terms that do not contribute to the single sum virial?
                 * Position restraints and electric fields do not introduce
                 * inter-cg forces, only full electrostatics methods do.
                 * When we do not calculate the virial, fr->f_novirsum = f,
                 * so we have already communicated these forces.
                 */
                if (EEL_FULL(fr->eeltype) && cr->dd->n_intercg_excl &&
                    (flags & GMX_FORCE_VIRIAL))
                {
                    dd_move_f(cr->dd, fr->f_novirsum, NULL);
                }
                if (bSepLRF)
                {
                    /* We should not update the shift forces here,
                     * since f_twin is already included in f.
                     */
                    dd_move_f(cr->dd, fr->f_twin, NULL);
                }
            }
            wallcycle_stop(wcycle, ewcMOVEF);
        }
    }

    if (bUseOrEmulGPU)
    {
        /* wait for local forces (or calculate in emulation mode) */
        if (bUseGPU)
        {
            wallcycle_start(wcycle, ewcWAIT_GPU_NB_L);
            nbnxn_cuda_wait_gpu(nbv->cu_nbv,
                                nbv->grp[eintLocal].nbat,
                                flags, eatLocal,
                                enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
                                fr->fshift);
            wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);

            /* now clear the GPU outputs while we finish the step on the CPU */

            wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU_NB);
            nbnxn_cuda_clear_outputs(nbv->cu_nbv, flags);
            wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
        }
        else
        {
            wallcycle_start_nocount(wcycle, ewcFORCE);
            do_nb_verlet(fr, ic, enerd, flags, eintLocal,
                         DOMAINDECOMP(cr) ? enbvClearFNo : enbvClearFYes,
                         nrnb, wcycle);
            wallcycle_stop(wcycle, ewcFORCE);
        }
        wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
        wallcycle_sub_start(wcycle, ewcsNB_F_BUF_OPS);
        if (nbv->grp[eintLocal].nbl_lists.nbl[0]->nsci > 0)
        {
            /* skip the reduction if there was no local work to do */
            nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs, eatLocal,
                                           nbv->grp[eintLocal].nbat, f);
        }
        wallcycle_sub_stop(wcycle, ewcsNB_F_BUF_OPS);
        wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
    }

    if (DOMAINDECOMP(cr))
    {
        dd_force_flop_stop(cr->dd, nrnb);
        if (wcycle)
        {
            dd_cycles_add(cr->dd, cycles_force-cycles_pme, ddCyclF);
        }
    }

    if (bDoForces)
    {
        if (IR_ELEC_FIELD(*inputrec))
        {
            /* Compute forces due to electric field */
            calc_f_el(MASTER(cr) ? field : NULL,
                      start, homenr, mdatoms->chargeA, fr->f_novirsum,
                      inputrec->ex, inputrec->et, t);
        }

        /* If we have NoVirSum forces, but we do not calculate the virial,
         * we sum fr->f_novirsum = f later.
         */
        if (vsite && !(fr->bF_NoVirSum && !(flags & GMX_FORCE_VIRIAL)))
        {
            wallcycle_start(wcycle, ewcVSITESPREAD);
            spread_vsite_f(vsite, x, f, fr->fshift, FALSE, NULL, nrnb,
                           &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr);
            wallcycle_stop(wcycle, ewcVSITESPREAD);

            if (bSepLRF)
            {
                wallcycle_start(wcycle, ewcVSITESPREAD);
                spread_vsite_f(vsite, x, fr->f_twin, NULL, FALSE, NULL,
                               nrnb,
                               &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr);
                wallcycle_stop(wcycle, ewcVSITESPREAD);
            }
        }

        if (flags & GMX_FORCE_VIRIAL)
        {
            /* Calculation of the virial must be done after vsites! */
            calc_virial(mdatoms->start, mdatoms->homenr, x, f,
                        vir_force, graph, box, nrnb, fr, inputrec->ePBC);
        }
    }

    if (inputrec->ePull == epullUMBRELLA || inputrec->ePull == epullCONST_F)
    {
        pull_potential_wrapper(fplog, bSepDVDL, cr, inputrec, box, x,
                               f, vir_force, mdatoms, enerd, lambda, t);
    }

    /* Add the forces from enforced rotation potentials (if any) */
    if (inputrec->bRot)
    {
        wallcycle_start(wcycle, ewcROTadd);
        enerd->term[F_COM_PULL] += add_rot_forces(inputrec->rot, f, cr, step, t);
        wallcycle_stop(wcycle, ewcROTadd);
    }

    if (PAR(cr) && !(cr->duty & DUTY_PME))
    {
        /* In case of node-splitting, the PP nodes receive the long-range
         * forces, virial and energy from the PME nodes here.
         */
        pme_receive_force_ener(fplog, bSepDVDL, cr, wcycle, enerd, fr);
    }

    if (bDoForces)
    {
        post_process_forces(cr, step, nrnb, wcycle,
                            top, box, x, f, vir_force, mdatoms, graph, fr, vsite,
                            flags);
    }

    /* Sum the potential energy terms from group contributions */
    sum_epot(&(enerd->grpp), enerd->term);
}

void do_force_cutsGROUP(FILE *fplog, t_commrec *cr,
                        t_inputrec *inputrec,
                        gmx_large_int_t step, t_nrnb *nrnb, gmx_wallcycle_t wcycle,
                        gmx_localtop_t *top,
                        gmx_groups_t *groups,
                        matrix box, rvec x[], history_t *hist,
                        rvec f[],
                        tensor vir_force,
                        t_mdatoms *mdatoms,
                        gmx_enerdata_t *enerd, t_fcdata *fcd,
                        real *lambda, t_graph *graph,
                        t_forcerec *fr, gmx_vsite_t *vsite, rvec mu_tot,
                        double t, FILE *field, gmx_edsam_t ed,
                        gmx_bool bBornRadii,
                        int flags)
{
    int        cg0, cg1, i, j;
    int        start, homenr;
    double     mu[2*DIM];
    gmx_bool   bSepDVDL, bStateChanged, bNS, bFillGrid, bCalcCGCM, bBS;
    gmx_bool   bDoLongRangeNS, bDoForces, bDoPotential, bSepLRF;
    gmx_bool   bDoAdressWF;
    matrix     boxs;
    rvec       vzero, box_diag;
    real       e, v, dvdlambda[efptNR];
    t_pbc      pbc;
    float      cycles_pme, cycles_force;

    start  = mdatoms->start;
    homenr = mdatoms->homenr;

    bSepDVDL = (fr->bSepDVDL && do_per_step(step, inputrec->nstlog));

    clear_mat(vir_force);

    if (PARTDECOMP(cr))
    {
        pd_cg_range(cr, &cg0, &cg1);
    }
    else
    {
        cg0 = 0;
        if (DOMAINDECOMP(cr))
        {
            cg1 = cr->dd->ncg_tot;
        }
        else
        {
            cg1 = top->cgs.nr;
        }
        if (fr->n_tpi > 0)
        {
            cg1--;
        }
    }

    bStateChanged  = (flags & GMX_FORCE_STATECHANGED);
    bNS            = (flags & GMX_FORCE_NS) && (fr->bAllvsAll == FALSE);
    /* Should we update the long-range neighborlists at this step? */
    bDoLongRangeNS = fr->bTwinRange && bNS;
    bFillGrid      = (bNS && bStateChanged);
    bCalcCGCM      = (bFillGrid && !DOMAINDECOMP(cr));
    bDoForces      = (flags & GMX_FORCE_FORCES);
    bDoPotential   = (flags & GMX_FORCE_ENERGY);
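    /* With separate long-range forces, the long-range contributions are
     * accumulated in fr->f_twin and added to f every nstcalclr steps */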
    bSepLRF        = ((inputrec->nstcalclr > 1) && bDoForces &&
                      (flags & GMX_FORCE_SEPLRF) && (flags & GMX_FORCE_DO_LR));
1412
1413     /* should probably move this to the forcerec since it doesn't change */
1414     bDoAdressWF   = ((fr->adress_type != eAdressOff));
1415
1416     if (bStateChanged)
1417     {
1418         update_forcerec(fr, box);
1419
1420         if (NEED_MUTOT(*inputrec))
1421         {
1422             /* Calculate total (local) dipole moment in a temporary common array.
1423              * This makes it possible to sum them over nodes faster.
1424              */
1425             calc_mu(start, homenr,
1426                     x, mdatoms->chargeA, mdatoms->chargeB, mdatoms->nChargePerturbed,
1427                     mu, mu+DIM);
1428         }
1429     }
1430
1431     if (fr->ePBC != epbcNONE)
1432     {
1433         /* Compute shift vectors every step,
1434          * because of pressure coupling or box deformation!
1435          */
1436         if ((flags & GMX_FORCE_DYNAMICBOX) && bStateChanged)
1437         {
1438             calc_shifts(box, fr->shift_vec);
1439         }
1440
1441         if (bCalcCGCM)
1442         {
1443             put_charge_groups_in_box(fplog, cg0, cg1, fr->ePBC, box,
1444                                      &(top->cgs), x, fr->cg_cm);
1445             inc_nrnb(nrnb, eNR_CGCM, homenr);
1446             inc_nrnb(nrnb, eNR_RESETX, cg1-cg0);
1447         }
1448         else if (EI_ENERGY_MINIMIZATION(inputrec->eI) && graph)
1449         {
1450             unshift_self(graph, box, x);
1451         }
1452     }
1453     else if (bCalcCGCM)
1454     {
1455         calc_cgcm(fplog, cg0, cg1, &(top->cgs), x, fr->cg_cm);
1456         inc_nrnb(nrnb, eNR_CGCM, homenr);
1457     }
1458
1459     if (bCalcCGCM)
1460     {
1461         if (PAR(cr))
1462         {
1463             move_cgcm(fplog, cr, fr->cg_cm);
1464         }
1465         if (gmx_debug_at)
1466         {
1467             pr_rvecs(debug, 0, "cgcm", fr->cg_cm, top->cgs.nr);
1468         }
1469     }
1470
1471 #ifdef GMX_MPI
1472     if (!(cr->duty & DUTY_PME))
1473     {
1474         /* Send particle coordinates to the pme nodes.
1475          * Since this is only implemented for domain decomposition
1476          * and domain decomposition does not use the graph,
1477          * we do not need to worry about shifting.
1478          */
1479
1480         wallcycle_start(wcycle, ewcPP_PMESENDX);
1481
1482         bBS = (inputrec->nwall == 2);
1483         if (bBS)
1484         {
1485             copy_mat(box, boxs);
1486             svmul(inputrec->wall_ewald_zfac, boxs[ZZ], boxs[ZZ]);
1487         }
1488
1489         gmx_pme_send_x(cr, bBS ? boxs : box, x,
1490                        mdatoms->nChargePerturbed, lambda[efptCOUL],
1491                        (flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY)), step);
1492
1493         wallcycle_stop(wcycle, ewcPP_PMESENDX);
1494     }
1495 #endif /* GMX_MPI */
1496
1497     /* Communicate coordinates and sum dipole if necessary */
1498     if (PAR(cr))
1499     {
1500         wallcycle_start(wcycle, ewcMOVEX);
1501         if (DOMAINDECOMP(cr))
1502         {
1503             dd_move_x(cr->dd, box, x);
1504         }
1505         else
1506         {
1507             move_x(cr, x, nrnb);
1508         }
1509         wallcycle_stop(wcycle, ewcMOVEX);
1510     }
1511
1512     /* Update the AdResS weights before the force calculation */
1513     if (bStateChanged && bDoAdressWF)
1514     {
1515         /* need pbc for adress weight calculation with pbc_dx */
1516         set_pbc(&pbc, inputrec->ePBC, box);
1517         if (fr->adress_site == eAdressSITEcog)
1518         {
1519             update_adress_weights_cog(top->idef.iparams, top->idef.il, x, fr, mdatoms,
1520                                       inputrec->ePBC == epbcNONE ? NULL : &pbc);
1521         }
1522         else if (fr->adress_site == eAdressSITEcom)
1523         {
1524             update_adress_weights_com(fplog, cg0, cg1, &(top->cgs), x, fr, mdatoms,
1525                                       inputrec->ePBC == epbcNONE ? NULL : &pbc);
1526         }
1527         else if (fr->adress_site == eAdressSITEatomatom)
1528         {
1529             update_adress_weights_atom_per_atom(cg0, cg1, &(top->cgs), x, fr, mdatoms,
1530                                                 inputrec->ePBC == epbcNONE ? NULL : &pbc);
1531         }
1532         else
1533         {
1534             update_adress_weights_atom(cg0, cg1, &(top->cgs), x, fr, mdatoms,
1535                                        inputrec->ePBC == epbcNONE ? NULL : &pbc);
1536         }
1537     }
1538
1539     if (NEED_MUTOT(*inputrec))
1540     {
1541
1542         if (bStateChanged)
1543         {
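                 /* Sum the 2*DIM dipole components (states A and B) over all nodes */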
1544             if (PAR(cr))
1545             {
1546                 gmx_sumd(2*DIM, mu, cr);
1547             }
1548             for (i = 0; i < 2; i++)
1549             {
1550                 for (j = 0; j < DIM; j++)
1551                 {
1552                     fr->mu_tot[i][j] = mu[i*DIM + j];
1553                 }
1554             }
1555         }
1556         if (fr->efep == efepNO)
1557         {
1558             copy_rvec(fr->mu_tot[0], mu_tot);
1559         }
1560         else
1561         {
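                 /* With free energy, interpolate the total dipole linearly
                  * in lambda: mu_tot = (1 - lambda_coul)*mu_A + lambda_coul*mu_B
                  */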
1562             for (j = 0; j < DIM; j++)
1563             {
1564                 mu_tot[j] =
1565                     (1.0 - lambda[efptCOUL])*fr->mu_tot[0][j] + lambda[efptCOUL]*fr->mu_tot[1][j];
1566             }
1567         }
1568     }
1569
1570     /* Reset energies */
1571     reset_enerdata(fr, bNS, enerd, MASTER(cr));
1572     clear_rvecs(SHIFTS, fr->fshift);
1573
1574     if (bNS)
1575     {
1576         wallcycle_start(wcycle, ewcNS);
1577
1578         if (graph && bStateChanged)
1579         {
1580             /* Calculate intramolecular shift vectors to make molecules whole */
1581             mk_mshift(fplog, graph, fr->ePBC, box, x);
1582         }
1583
1584         /* Do the actual neighbor searching */
1585         ns(fplog, fr, box,
1586            groups, top, mdatoms,
1587            cr, nrnb, bFillGrid,
1588            bDoLongRangeNS);
1589
1590         wallcycle_stop(wcycle, ewcNS);
1591     }
1592
1593     if (inputrec->implicit_solvent && bNS)
1594     {
1595         make_gb_nblist(cr, inputrec->gb_algorithm,
1596                        x, box, fr, &top->idef, graph, fr->born);
1597     }
1598
1599     if (DOMAINDECOMP(cr))
1600     {
1601         if (!(cr->duty & DUTY_PME))
1602         {
1603             wallcycle_start(wcycle, ewcPPDURINGPME);
1604             dd_force_flop_start(cr->dd, nrnb);
1605         }
1606     }
1607
1608     if (inputrec->bRot)
1609     {
1610         /* Enforced rotation has its own cycle counter that starts after the collective
1611          * coordinates have been communicated. It is added to ddCyclF to allow
1612          * for proper load-balancing */
1613         wallcycle_start(wcycle, ewcROT);
1614         do_rotation(cr, inputrec, box, x, t, step, wcycle, bNS);
1615         wallcycle_stop(wcycle, ewcROT);
1616     }
1617
1618     /* Start the force cycle counter.
1619      * This counter is stopped in do_forcelow_level.
1620      * No parallel communication should occur while this counter is running,
1621      * since that will interfere with the dynamic load balancing.
1622      */
1623     wallcycle_start(wcycle, ewcFORCE);
1624
1625     if (bDoForces)
1626     {
1627         /* Reset the forces for which the virial is calculated separately,
1628          * i.e. the PME/Ewald forces, if necessary */
1629         if (fr->bF_NoVirSum)
1630         {
1631             if (flags & GMX_FORCE_VIRIAL)
1632             {
1633                 fr->f_novirsum = fr->f_novirsum_alloc;
1634                 if (fr->bDomDec)
1635                 {
1636                     clear_rvecs(fr->f_novirsum_n, fr->f_novirsum);
1637                 }
1638                 else
1639                 {
1640                     clear_rvecs(homenr, fr->f_novirsum+start);
1641                 }
1642             }
1643             else
1644             {
1645                 /* We are not calculating the pressure so we do not need
1646                  * a separate array for forces that do not contribute
1647                  * to the pressure.
1648                  */
1649                 fr->f_novirsum = f;
1650             }
1651         }
1652
1653         /* Clear the short- and long-range forces */
1654         clear_rvecs(fr->natoms_force_constr, f);
1655         if (bSepLRF && do_per_step(step, inputrec->nstcalclr))
1656         {
1657             clear_rvecs(fr->natoms_force_constr, fr->f_twin);
1658         }
1659
1660         clear_rvec(fr->vir_diag_posres);
1661     }
1662     if (inputrec->ePull == epullCONSTRAINT)
1663     {
1664         clear_pull_forces(inputrec->pull);
1665     }
1666
1667     /* update QMMMrec, if necessary */
1668     if (fr->bQMMM)
1669     {
1670         update_QMMMrec(cr, fr, x, mdatoms, box, top);
1671     }
1672
1673     if ((flags & GMX_FORCE_BONDED) && top->idef.il[F_POSRES].nr > 0)
1674     {
1675         posres_wrapper(fplog, flags, bSepDVDL, inputrec, nrnb, top, box, x,
1676                        enerd, lambda, fr);
1677     }
1678
1679     if ((flags & GMX_FORCE_BONDED) && top->idef.il[F_FBPOSRES].nr > 0)
1680     {
1681         /* Flat-bottomed position restraints always require full pbc */
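             /* When the AdResS weights were just updated above, pbc is already set */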
1682         if (!(bStateChanged && bDoAdressWF))
1683         {
1684             set_pbc(&pbc, inputrec->ePBC, box);
1685         }
1686         v = fbposres(top->idef.il[F_FBPOSRES].nr, top->idef.il[F_FBPOSRES].iatoms,
1687                      top->idef.iparams_fbposres,
1688                      (const rvec*)x, fr->f_novirsum, fr->vir_diag_posres,
1689                      inputrec->ePBC == epbcNONE ? NULL : &pbc,
1690                      fr->rc_scaling, fr->ePBC, fr->posres_com);
1691         enerd->term[F_FBPOSRES] += v;
1692         inc_nrnb(nrnb, eNR_FBPOSRES, top->idef.il[F_FBPOSRES].nr/2);
1693     }
1694
1695     /* Compute the bonded and non-bonded energies and optionally forces */
1696     do_force_lowlevel(fplog, step, fr, inputrec, &(top->idef),
1697                       cr, nrnb, wcycle, mdatoms,
1698                       x, hist, f, bSepLRF ? fr->f_twin : f, enerd, fcd, top, fr->born,
1699                       &(top->atomtypes), bBornRadii, box,
1700                       inputrec->fepvals, lambda,
1701                       graph, &(top->excls), fr->mu_tot,
1702                       flags,
1703                       &cycles_pme);
1704
1705     if (bSepLRF)
1706     {
1707         if (do_per_step(step, inputrec->nstcalclr))
1708         {
1709             /* Add the long range forces to the short range forces */
1710             for (i = 0; i < fr->natoms_force_constr; i++)
1711             {
1712                 rvec_add(fr->f_twin[i], f[i], f[i]);
1713             }
1714         }
1715     }
1716
1717     cycles_force = wallcycle_stop(wcycle, ewcFORCE);
1718
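         /* Essential dynamics: add the flooding forces, when enabled */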
1719     if (ed)
1720     {
1721         do_flood(cr, inputrec, x, f, ed, box, step, bNS);
1722     }
1723
1724     if (DOMAINDECOMP(cr))
1725     {
1726         dd_force_flop_stop(cr->dd, nrnb);
1727         if (wcycle)
1728         {
1729             dd_cycles_add(cr->dd, cycles_force-cycles_pme, ddCyclF);
1730         }
1731     }
1732
1733     if (bDoForces)
1734     {
1735         if (IR_ELEC_FIELD(*inputrec))
1736         {
1737             /* Compute forces due to electric field */
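                 /* The field adds F_i = q_i*E(t); only the master rank
                  * passes 'field' for output, the other ranks pass NULL.
                  */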
1738             calc_f_el(MASTER(cr) ? field : NULL,
1739                       start, homenr, mdatoms->chargeA, fr->f_novirsum,
1740                       inputrec->ex, inputrec->et, t);
1741         }
1742
1743         if (bDoAdressWF && fr->adress_icor == eAdressICThermoForce)
1744         {
1745             /* Compute thermodynamic force in hybrid AdResS region */
1746             adress_thermo_force(start, homenr, &(top->cgs), x, fr->f_novirsum, fr, mdatoms,
1747                                 inputrec->ePBC == epbcNONE ? NULL : &pbc);
1748         }
1749
1750         /* Communicate the forces */
1751         if (PAR(cr))
1752         {
1753             wallcycle_start(wcycle, ewcMOVEF);
1754             if (DOMAINDECOMP(cr))
1755             {
1756                 dd_move_f(cr->dd, f, fr->fshift);
1757                 /* Do we need to communicate the separate force array
1758                  * for terms that do not contribute to the single sum virial?
1759                  * Position restraints and electric fields do not introduce
1760                  * inter-cg forces, only full electrostatics methods do.
1761                  * When we do not calculate the virial, fr->f_novirsum = f,
1762                  * so we have already communicated these forces.
1763                  */
1764                 if (EEL_FULL(fr->eeltype) && cr->dd->n_intercg_excl &&
1765                     (flags & GMX_FORCE_VIRIAL))
1766                 {
1767                     dd_move_f(cr->dd, fr->f_novirsum, NULL);
1768                 }
1769                 if (bSepLRF)
1770                 {
1771                     /* We should not update the shift forces here,
1772                      * since f_twin is already included in f.
1773                      */
1774                     dd_move_f(cr->dd, fr->f_twin, NULL);
1775                 }
1776             }
1777             else
1778             {
1779                 pd_move_f(cr, f, nrnb);
1780                 if (bSepLRF)
1781                 {
1782                     pd_move_f(cr, fr->f_twin, nrnb);
1783                 }
1784             }
1785             wallcycle_stop(wcycle, ewcMOVEF);
1786         }
1787
1788         /* If we have NoVirSum forces, but we do not calculate the virial,
1789      * we sum fr->f_novirsum = f later.
1790          */
1791         if (vsite && !(fr->bF_NoVirSum && !(flags & GMX_FORCE_VIRIAL)))
1792         {
1793             wallcycle_start(wcycle, ewcVSITESPREAD);
1794             spread_vsite_f(vsite, x, f, fr->fshift, FALSE, NULL, nrnb,
1795                            &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr);
1796             wallcycle_stop(wcycle, ewcVSITESPREAD);
1797
1798             if (bSepLRF)
1799             {
1800                 wallcycle_start(wcycle, ewcVSITESPREAD);
1801                 spread_vsite_f(vsite, x, fr->f_twin, NULL, FALSE, NULL,
1802                                nrnb,
1803                                &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr);
1804                 wallcycle_stop(wcycle, ewcVSITESPREAD);
1805             }
1806         }
1807
1808         if (flags & GMX_FORCE_VIRIAL)
1809         {
1810             /* Calculation of the virial must be done after vsites! */
1811             calc_virial(mdatoms->start, mdatoms->homenr, x, f,
1812                         vir_force, graph, box, nrnb, fr, inputrec->ePBC);
1813         }
1814     }
1815
1816     if (inputrec->ePull == epullUMBRELLA || inputrec->ePull == epullCONST_F)
1817     {
1818         pull_potential_wrapper(fplog, bSepDVDL, cr, inputrec, box, x,
1819                                f, vir_force, mdatoms, enerd, lambda, t);
1820     }
1821
1822     /* Add the forces from enforced rotation potentials (if any) */
1823     if (inputrec->bRot)
1824     {
1825         wallcycle_start(wcycle, ewcROTadd);
1826         enerd->term[F_COM_PULL] += add_rot_forces(inputrec->rot, f, cr, step, t);
1827         wallcycle_stop(wcycle, ewcROTadd);
1828     }
1829
1830     if (PAR(cr) && !(cr->duty & DUTY_PME))
1831     {
1832         /* In case of node-splitting, the PP nodes receive the long-range
1833          * forces, virial and energy from the PME nodes here.
1834          */
1835         pme_receive_force_ener(fplog, bSepDVDL, cr, wcycle, enerd, fr);
1836     }
1837
1838     if (bDoForces)
1839     {
1840         post_process_forces(cr, step, nrnb, wcycle,
1841                             top, box, x, f, vir_force, mdatoms, graph, fr, vsite,
1842                             flags);
1843     }
1844
1845     /* Sum the potential energy terms from group contributions */
1846     sum_epot(&(enerd->grpp), enerd->term);
1847 }
1848
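     /* do_force() is a thin dispatcher: depending on inputrec->cutoff_scheme
      * it calls either the Verlet-scheme or the group-scheme implementation.
      */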
1849 void do_force(FILE *fplog, t_commrec *cr,
1850               t_inputrec *inputrec,
1851               gmx_large_int_t step, t_nrnb *nrnb, gmx_wallcycle_t wcycle,
1852               gmx_localtop_t *top,
1853               gmx_groups_t *groups,
1854               matrix box, rvec x[], history_t *hist,
1855               rvec f[],
1856               tensor vir_force,
1857               t_mdatoms *mdatoms,
1858               gmx_enerdata_t *enerd, t_fcdata *fcd,
1859               real *lambda, t_graph *graph,
1860               t_forcerec *fr,
1861               gmx_vsite_t *vsite, rvec mu_tot,
1862               double t, FILE *field, gmx_edsam_t ed,
1863               gmx_bool bBornRadii,
1864               int flags)
1865 {
1866     /* modify force flag if not doing nonbonded */
1867     if (!fr->bNonbonded)
1868     {
1869         flags &= ~GMX_FORCE_NONBONDED;
1870     }
1871
1872     switch (inputrec->cutoff_scheme)
1873     {
1874         case ecutsVERLET:
1875             do_force_cutsVERLET(fplog, cr, inputrec,
1876                                 step, nrnb, wcycle,
1877                                 top,
1878                                 groups,
1879                                 box, x, hist,
1880                                 f, vir_force,
1881                                 mdatoms,
1882                                 enerd, fcd,
1883                                 lambda, graph,
1884                                 fr, fr->ic,
1885                                 vsite, mu_tot,
1886                                 t, field, ed,
1887                                 bBornRadii,
1888                                 flags);
1889             break;
1890         case ecutsGROUP:
1891             do_force_cutsGROUP(fplog, cr, inputrec,
1892                                step, nrnb, wcycle,
1893                                top,
1894                                groups,
1895                                box, x, hist,
1896                                f, vir_force,
1897                                mdatoms,
1898                                enerd, fcd,
1899                                lambda, graph,
1900                                fr, vsite, mu_tot,
1901                                t, field, ed,
1902                                bBornRadii,
1903                                flags);
1904             break;
1905         default:
1906             gmx_incons("Invalid cut-off scheme passed!");
1907     }
1908 }
1909
1910
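     /* Constrain the starting configuration: the coordinates at t=0 and,
      * for velocity Verlet, the initial velocities; for the other dynamical
      * integrators also the positions at t=-dt, obtained by stepping the
      * reversed velocities backwards, so that the constrained velocities at
      * t=-dt/2 are recovered.
      */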
1911 void do_constrain_first(FILE *fplog, gmx_constr_t constr,
1912                         t_inputrec *ir, t_mdatoms *md,
1913                         t_state *state, t_commrec *cr, t_nrnb *nrnb,
1914                         t_forcerec *fr, gmx_localtop_t *top)
1915 {
1916     int             i, m, start, end;
1917     gmx_large_int_t step;
1918     real            dt = ir->delta_t;
1919     real            dvdl_dum;
1920     rvec           *savex;
1921
1922     snew(savex, state->natoms);
1923
1924     start = md->start;
1925     end   = md->homenr + start;
1926
1927     if (debug)
1928     {
1929         fprintf(debug, "vcm: start=%d, homenr=%d, end=%d\n",
1930                 start, md->homenr, end);
1931     }
1932     /* Do a first constraining step to reset the particles... */
1933     step = ir->init_step;
1934     if (fplog)
1935     {
1936         char buf[STEPSTRSIZE];
1937         fprintf(fplog, "\nConstraining the starting coordinates (step %s)\n",
1938                 gmx_step_str(step, buf));
1939     }
1940     dvdl_dum = 0;
1941
1942     /* constrain the current position */
1943     constrain(NULL, TRUE, FALSE, constr, &(top->idef),
1944               ir, NULL, cr, step, 0, md,
1945               state->x, state->x, NULL,
1946               fr->bMolPBC, state->box,
1947               state->lambda[efptBONDED], &dvdl_dum,
1948               NULL, NULL, nrnb, econqCoord,
1949               ir->epc == epcMTTK, state->veta, state->veta);
1950     if (EI_VV(ir->eI))
1951     {
1952         /* constrain the initial velocity, and save it */
1953         /* this may also be useful if we need the ekin from the half step for velocity verlet */
1954         /* might not yet treat veta correctly */
1955         constrain(NULL, TRUE, FALSE, constr, &(top->idef),
1956                   ir, NULL, cr, step, 0, md,
1957                   state->x, state->v, state->v,
1958                   fr->bMolPBC, state->box,
1959                   state->lambda[efptBONDED], &dvdl_dum,
1960                   NULL, NULL, nrnb, econqVeloc,
1961                   ir->epc == epcMTTK, state->veta, state->veta);
1962     }
1963     /* constrain the initial velocities at t-dt/2 */
1964     if (EI_STATE_VELOCITY(ir->eI) && ir->eI != eiVV)
1965     {
1966         for (i = start; (i < end); i++)
1967         {
1968             for (m = 0; (m < DIM); m++)
1969             {
1970                 /* Reverse the velocity */
1971                 state->v[i][m] = -state->v[i][m];
1972                 /* Store the position at t-dt in savex */
1973                 savex[i][m] = state->x[i][m] + dt*state->v[i][m];
1974             }
1975         }
1976         /* Shake the positions at t=-dt with the positions at t=0
1977          * as reference coordinates.
1978          */
1979         if (fplog)
1980         {
1981             char buf[STEPSTRSIZE];
1982             fprintf(fplog, "\nConstraining the coordinates at t0-dt (step %s)\n",
1983                     gmx_step_str(step, buf));
1984         }
1985         dvdl_dum = 0;
1986         constrain(NULL, TRUE, FALSE, constr, &(top->idef),
1987                   ir, NULL, cr, step, -1, md,
1988                   state->x, savex, NULL,
1989                   fr->bMolPBC, state->box,
1990                   state->lambda[efptBONDED], &dvdl_dum,
1991                   state->v, NULL, nrnb, econqCoord,
1992                   ir->epc == epcMTTK, state->veta, state->veta);
1993
1994         for (i = start; i < end; i++)
1995         {
1996             for (m = 0; m < DIM; m++)
1997             {
1998                 /* Re-reverse the velocities */
1999                 state->v[i][m] = -state->v[i][m];
2000             }
2001         }
2002     }
2003     sfree(savex);
2004 }
2005
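     /* Precompute the energy and virial differences due to cutting off the
      * Lennard-Jones interaction, i.e. the tail corrections for the r^-6 and
      * r^-12 terms, stored in fr->enerdiffsix/twelve and fr->virdiffsix/twelve.
      * For a plain cut-off the energy tails are -4*pi/(3*rc^3) and
      * 4*pi/(9*rc^9) per unit of pair density and C6/C12; for switched or
      * shifted potentials the part missing inside the cut-off is integrated
      * numerically from the spline tables below.
      */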
2006 void calc_enervirdiff(FILE *fplog, int eDispCorr, t_forcerec *fr)
2007 {
2008     double eners[2], virs[2], enersum, virsum, y0, f, g, h;
2009     double r0, r1, r, rc3, rc9, ea, eb, ec, pa, pb, pc, pd;
2010     double invscale, invscale2, invscale3;
2011     int    ri0, ri1, ri, i, offstart, offset;
2012     real   scale, *vdwtab, tabfactor, tmp;
2013
2014     fr->enershiftsix    = 0;
2015     fr->enershifttwelve = 0;
2016     fr->enerdiffsix     = 0;
2017     fr->enerdifftwelve  = 0;
2018     fr->virdiffsix      = 0;
2019     fr->virdifftwelve   = 0;
2020
2021     if (eDispCorr != edispcNO)
2022     {
2023         for (i = 0; i < 2; i++)
2024         {
2025             eners[i] = 0;
2026             virs[i]  = 0;
2027         }
2028         if ((fr->vdwtype == evdwSWITCH) || (fr->vdwtype == evdwSHIFT))
2029         {
2030             if (fr->rvdw_switch == 0)
2031             {
2032                 gmx_fatal(FARGS,
2033                           "With dispersion correction rvdw-switch cannot be zero "
2034                           "for vdw-type = %s", evdw_names[fr->vdwtype]);
2035             }
2036
2037             scale  = fr->nblists[0].table_elec_vdw.scale;
2038             vdwtab = fr->nblists[0].table_vdw.data;
2039
2040             /* Round the cut-offs to exact table values for precision */
2041             ri0  = floor(fr->rvdw_switch*scale);
2042             ri1  = ceil(fr->rvdw*scale);
2043             r0   = ri0/scale;
2044             r1   = ri1/scale;
2045             rc3  = r0*r0*r0;
2046             rc9  = rc3*rc3*rc3;
2047
2048             if (fr->vdwtype == evdwSHIFT)
2049             {
2050                 /* Determine the constant energy shift below rvdw_switch.
2051                  * The table carries a scale factor: it was scaled down to compensate
2052                  * for scaling up c6/c12 with the derivative factors, which saves flops in the analytical kernels.
2053                  */
2054                 fr->enershiftsix    = (real)(-1.0/(rc3*rc3)) - 6.0*vdwtab[8*ri0];
2055                 fr->enershifttwelve = (real)( 1.0/(rc9*rc3)) - 12.0*vdwtab[8*ri0 + 4];
2056             }
2057             /* Add the constant part from 0 to rvdw_switch.
2058              * This integration from 0 to rvdw_switch overcounts the number
2059              * of interactions by 1, as it also counts the self interaction.
2060              * We will correct for this later.
2061              */
2062             eners[0] += 4.0*M_PI*fr->enershiftsix*rc3/3.0;
2063             eners[1] += 4.0*M_PI*fr->enershifttwelve*rc3/3.0;
2064
2065             invscale  = 1.0/(scale);
2066             invscale2 = invscale*invscale;
2067             invscale3 = invscale*invscale2;
2068
2069             /* following summation derived from cubic spline definition,
2070                Numerical Recipes in C, second edition, p. 113-116.  Exact
2071                for the cubic spline.  We first calculate the negative of
2072                the energy from rvdw to rvdw_switch, assuming that g(r)=1,
2073                and then add the more standard, abrupt cutoff correction to
2074                that result, yielding the long-range correction for a
2075                switched function.  We perform both the pressure and energy
2076                loops at the same time for simplicity, as the computational
2077                cost is low. */
2078
2079             for (i = 0; i < 2; i++)
2080             {
2081                 enersum = 0.0; virsum = 0.0;
2082                 if (i == 0)
2083                 {
2084                     offstart = 0;
2085                     /* Since the dispersion table has been scaled down a factor 6.0 and the repulsion
2086                      * a factor 12.0 to compensate for the c6/c12 parameters inside nbfp[] being scaled
2087                      * up (to save flops in kernels), we need to correct for this.
2088                      */
2089                     tabfactor = 6.0;
2090                 }
2091                 else
2092                 {
2093                     offstart  = 4;
2094                     tabfactor = 12.0;
2095                 }
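                     /* The ea/eb/ec (and pa/pb/pc/pd) factors below are such
                      * that, e.g., y0*(ea/3 + eb/2 + ec) equals the integral
                      * of y0*r^2 over one table interval of width 1/scale;
                      * the f, g and h spline coefficients contribute their
                      * higher moments analogously.
                      */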
2096                 for (ri = ri0; ri < ri1; ri++)
2097                 {
2098                     r  = ri*invscale;
2099                     ea = invscale3;
2100                     eb = 2.0*invscale2*r;
2101                     ec = invscale*r*r;
2102
2103                     pa = invscale3;
2104                     pb = 3.0*invscale2*r;
2105                     pc = 3.0*invscale*r*r;
2106                     pd = r*r*r;
2107
2108                     /* this "8" is from the packing in the vdwtab array - perhaps should be #define'ed? */
2109                     offset = 8*ri + offstart;
2110                     y0     = vdwtab[offset];
2111                     f      = vdwtab[offset+1];
2112                     g      = vdwtab[offset+2];
2113                     h      = vdwtab[offset+3];
2114
2115                     enersum += y0*(ea/3 + eb/2 + ec) + f*(ea/4 + eb/3 + ec/2) + g*(ea/5 + eb/4 + ec/3) + h*(ea/6 + eb/5 + ec/4);
2116                     virsum  += f*(pa/4 + pb/3 + pc/2 + pd) + 2*g*(pa/5 + pb/4 + pc/3 + pd/2) + 3*h*(pa/6 + pb/5 + pc/4 + pd/3);
2117                 }
2118
2119                 enersum  *= 4.0*M_PI*tabfactor;
2120                 virsum   *= 4.0*M_PI*tabfactor;
2121                 eners[i] -= enersum;
2122                 virs[i]  -= virsum;
2123             }
2124
2125             /* now add the correction for rvdw_switch to infinity */
2126             eners[0] += -4.0*M_PI/(3.0*rc3);
2127             eners[1] +=  4.0*M_PI/(9.0*rc9);
2128             virs[0]  +=  8.0*M_PI/rc3;
2129             virs[1]  += -16.0*M_PI/(3.0*rc9);
2130         }
2131         else if ((fr->vdwtype == evdwCUT) || (fr->vdwtype == evdwUSER))
2132         {
2133             if (fr->vdwtype == evdwUSER && fplog)
2134             {
2135                 fprintf(fplog,
2136                         "WARNING: using dispersion correction with user tables\n");
2137             }
2138             rc3  = fr->rvdw*fr->rvdw*fr->rvdw;
2139             rc9  = rc3*rc3*rc3;
2140             /* Contribution beyond the cut-off */
2141             eners[0] += -4.0*M_PI/(3.0*rc3);
2142             eners[1] +=  4.0*M_PI/(9.0*rc9);
2143             if (fr->vdw_modifier == eintmodPOTSHIFT)
2144             {
2145                 /* Contribution within the cut-off */
2146                 eners[0] += -4.0*M_PI/(3.0*rc3);
2147                 eners[1] +=  4.0*M_PI/(3.0*rc9);
2148             }
2149             /* Contribution beyond the cut-off */
2150             virs[0]  +=  8.0*M_PI/rc3;
2151             virs[1]  += -16.0*M_PI/(3.0*rc9);
2152         }
2153         else
2154         {
2155             gmx_fatal(FARGS,
2156                       "Dispersion correction is not implemented for vdw-type = %s",
2157                       evdw_names[fr->vdwtype]);
2158         }
2159         fr->enerdiffsix    = eners[0];
2160         fr->enerdifftwelve = eners[1];
2161         /* The 0.5 is due to the Gromacs definition of the virial */
2162         fr->virdiffsix     = 0.5*virs[0];
2163         fr->virdifftwelve  = 0.5*virs[1];
2164     }
2165 }
2166
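     /* Apply the long-range dispersion corrections precomputed in
      * calc_enervirdiff: given the current volume and particle number, add
      * the corrections to the energy and, if requested, to the pressure,
      * the virial and dV/dlambda.
      */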
2167 void calc_dispcorr(FILE *fplog, t_inputrec *ir, t_forcerec *fr,
2168                    gmx_large_int_t step, int natoms,
2169                    matrix box, real lambda, tensor pres, tensor virial,
2170                    real *prescorr, real *enercorr, real *dvdlcorr)
2171 {
2172     gmx_bool bCorrAll, bCorrPres;
2173     real     dvdlambda, invvol, dens, ninter, avcsix, avctwelve, enerdiff, svir = 0, spres = 0;
2174     int      m;
2175
2176     *prescorr = 0;
2177     *enercorr = 0;
2178     *dvdlcorr = 0;
2179
2180     clear_mat(virial);
2181     clear_mat(pres);
2182
2183     if (ir->eDispCorr != edispcNO)
2184     {
2185         bCorrAll  = (ir->eDispCorr == edispcAllEner ||
2186                      ir->eDispCorr == edispcAllEnerPres);
2187         bCorrPres = (ir->eDispCorr == edispcEnerPres ||
2188                      ir->eDispCorr == edispcAllEnerPres);
2189
2190         invvol = 1/det(box);
2191         if (fr->n_tpi)
2192         {
2193             /* Only correct for the interactions with the inserted molecule */
2194             dens   = (natoms - fr->n_tpi)*invvol;
2195             ninter = fr->n_tpi;
2196         }
2197         else
2198         {
2199             dens   = natoms*invvol;
2200             ninter = 0.5*natoms;
2201         }
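             /* ninter*dens is the pair density entering the tail corrections:
              * N*N/(2V) normally, or n_tpi*(N - n_tpi)/V when correcting only
              * for the interactions of a test-inserted molecule.
              */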
2202
2203         if (ir->efep == efepNO)
2204         {
2205             avcsix    = fr->avcsix[0];
2206             avctwelve = fr->avctwelve[0];
2207         }
2208         else
2209         {
2210             avcsix    = (1 - lambda)*fr->avcsix[0]    + lambda*fr->avcsix[1];
2211             avctwelve = (1 - lambda)*fr->avctwelve[0] + lambda*fr->avctwelve[1];
2212         }
2213
2214         enerdiff   = ninter*(dens*fr->enerdiffsix - fr->enershiftsix);
2215         *enercorr += avcsix*enerdiff;
2216         dvdlambda  = 0.0;
2217         if (ir->efep != efepNO)
2218         {
2219             dvdlambda += (fr->avcsix[1] - fr->avcsix[0])*enerdiff;
2220         }
2221         if (bCorrAll)
2222         {
2223             enerdiff   = ninter*(dens*fr->enerdifftwelve - fr->enershifttwelve);
2224             *enercorr += avctwelve*enerdiff;
2225             if (fr->efep != efepNO)
2226             {
2227                 dvdlambda += (fr->avctwelve[1] - fr->avctwelve[0])*enerdiff;
2228             }
2229         }
2230
2231         if (bCorrPres)
2232         {
2233             svir = ninter*dens*avcsix*fr->virdiffsix/3.0;
2234             if (ir->eDispCorr == edispcAllEnerPres)
2235             {
2236                 svir += ninter*dens*avctwelve*fr->virdifftwelve/3.0;
2237             }
2238             /* The factor 2 is because of the Gromacs virial definition */
2239             spres = -2.0*invvol*svir*PRESFAC;
2240
2241             for (m = 0; m < DIM; m++)
2242             {
2243                 virial[m][m] += svir;
2244                 pres[m][m]   += spres;
2245             }
2246             *prescorr += spres;
2247         }
2248
2249         /* Can't currently control when this prints; for now, just print when debugging */
2250         if (debug)
2251         {
2252             if (bCorrAll)
2253             {
2254                 fprintf(debug, "Long Range LJ corr.: <C6> %10.4e, <C12> %10.4e\n",
2255                         avcsix, avctwelve);
2256             }
2257             if (bCorrPres)
2258             {
2259                 fprintf(debug,
2260                         "Long Range LJ corr.: Epot %10g, Pres: %10g, Vir: %10g\n",
2261                         *enercorr, spres, svir);
2262             }
2263             else
2264             {
2265                 fprintf(debug, "Long Range LJ corr.: Epot %10g\n", *enercorr);
2266             }
2267         }
2268
2269         if (fr->bSepDVDL && do_per_step(step, ir->nstlog))
2270         {
2271             gmx_print_sepdvdl(fplog, "Dispersion correction", *enercorr, dvdlambda);
2272         }
2273         if (fr->efep != efepNO)
2274         {
2275             *dvdlcorr += dvdlambda;
2276         }
2277     }
2278 }
2279
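     /* Remove periodic jumps the first time: compute the shift vectors and,
      * when a connectivity graph is available, use it to make the molecules
      * whole again.
      */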
2280 void do_pbc_first(FILE *fplog, matrix box, t_forcerec *fr,
2281                   t_graph *graph, rvec x[])
2282 {
2283     if (fplog)
2284     {
2285         fprintf(fplog, "Removing pbc first time\n");
2286     }
2287     calc_shifts(box, fr->shift_vec);
2288     if (graph)
2289     {
2290         mk_mshift(fplog, graph, fr->ePBC, box, x);
2291         if (gmx_debug_at)
2292         {
2293             p_graph(debug, "do_pbc_first 1", graph);
2294         }
2295         shift_self(graph, box, x);
2296         /* By doing an extra mk_mshift the molecules that are broken
2297          * because they were e.g. imported from another software
2298          * will be made whole again. Such are the healing powers
2299          * of GROMACS.
2300          */
2301         mk_mshift(fplog, graph, fr->ePBC, box, x);
2302         if (gmx_debug_at)
2303         {
2304             p_graph(debug, "do_pbc_first 2", graph);
2305         }
2306     }
2307     if (fplog)
2308     {
2309         fprintf(fplog, "Done rmpbc\n");
2310     }
2311 }
2312
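     /* Make all molecules in the global topology whole, one molecule type
      * at a time, using a temporary connectivity graph per molecule type.
      */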
2313 static void low_do_pbc_mtop(FILE *fplog, int ePBC, matrix box,
2314                             gmx_mtop_t *mtop, rvec x[],
2315                             gmx_bool bFirst)
2316 {
2317     t_graph        *graph;
2318     int             mb, as, mol;
2319     gmx_molblock_t *molb;
2320
2321     if (bFirst && fplog)
2322     {
2323         fprintf(fplog, "Removing pbc first time\n");
2324     }
2325
2326     snew(graph, 1);
2327     as = 0;
2328     for (mb = 0; mb < mtop->nmolblock; mb++)
2329     {
2330         molb = &mtop->molblock[mb];
2331         if (molb->natoms_mol == 1 ||
2332             (!bFirst && mtop->moltype[molb->type].cgs.nr == 1))
2333         {
2334             /* Just one atom or charge group in the molecule, no PBC required */
2335             as += molb->nmol*molb->natoms_mol;
2336         }
2337         else
2338         {
2339             /* Pass NULL instead of fplog to avoid graph prints for each molecule type */
2340             mk_graph_ilist(NULL, mtop->moltype[molb->type].ilist,
2341                            0, molb->natoms_mol, FALSE, FALSE, graph);
2342
2343             for (mol = 0; mol < molb->nmol; mol++)
2344             {
2345                 mk_mshift(fplog, graph, ePBC, box, x+as);
2346
2347                 shift_self(graph, box, x+as);
2348                 /* The molecule is whole now.
2349                  * We don't need the second mk_mshift call as in do_pbc_first,
2350                  * since we no longer need this graph.
2351                  */
2352
2353                 as += molb->natoms_mol;
2354             }
2355             done_graph(graph);
2356         }
2357     }
2358     sfree(graph);
2359 }
2360
2361 void do_pbc_first_mtop(FILE *fplog, int ePBC, matrix box,
2362                        gmx_mtop_t *mtop, rvec x[])
2363 {
2364     low_do_pbc_mtop(fplog, ePBC, box, mtop, x, TRUE);
2365 }
2366
2367 void do_pbc_mtop(FILE *fplog, int ePBC, matrix box,
2368                  gmx_mtop_t *mtop, rvec x[])
2369 {
2370     low_do_pbc_mtop(fplog, ePBC, box, mtop, x, FALSE);
2371 }
2372
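     /* Sum the flop counters and timings over all ranks and print the final
      * accounting, load-balance, cycle-count and performance statistics.
      */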
2373 void finish_run(FILE *fplog, t_commrec *cr,
2374                 t_inputrec *inputrec,
2375                 t_nrnb nrnb[], gmx_wallcycle_t wcycle,
2376                 gmx_walltime_accounting_t walltime_accounting,
2377                 wallclock_gpu_t *gputimes,
2378                 gmx_bool bWriteStat)
2379 {
2380     int     i, j;
2381     t_nrnb *nrnb_tot = NULL;
2382     real    delta_t;
2383     double  nbfs, mflop;
2384     double  elapsed_time,
2385             elapsed_time_over_all_ranks,
2386             elapsed_time_over_all_threads,
2387             elapsed_time_over_all_threads_over_all_ranks;
2388     wallcycle_sum(cr, wcycle);
2389
2390     if (cr->nnodes > 1)
2391     {
2392         snew(nrnb_tot, 1);
2393 #ifdef GMX_MPI
2394         MPI_Allreduce(nrnb->n, nrnb_tot->n, eNRNB, MPI_DOUBLE, MPI_SUM,
2395                       cr->mpi_comm_mysim);
2396 #endif
2397     }
2398     else
2399     {
2400         nrnb_tot = nrnb;
2401     }
2402
2403     elapsed_time                                 = walltime_accounting_get_elapsed_time(walltime_accounting);
2404     elapsed_time_over_all_ranks                  = elapsed_time;
2405     elapsed_time_over_all_threads                = walltime_accounting_get_elapsed_time_over_all_threads(walltime_accounting);
2406     elapsed_time_over_all_threads_over_all_ranks = elapsed_time_over_all_threads;
2407 #ifdef GMX_MPI
2408     if (cr->nnodes > 1)
2409     {
2410         /* reduce elapsed_time over all MPI ranks in the current simulation */
2411         MPI_Allreduce(&elapsed_time,
2412                       &elapsed_time_over_all_ranks,
2413                       1, MPI_DOUBLE, MPI_SUM,
2414                       cr->mpi_comm_mysim);
2415         elapsed_time_over_all_ranks /= cr->nnodes;
2416         /* Reduce elapsed_time_over_all_threads over all MPI ranks in the
2417          * current simulation. */
2418         MPI_Allreduce(&elapsed_time_over_all_threads,
2419                       &elapsed_time_over_all_threads_over_all_ranks,
2420                       1, MPI_DOUBLE, MPI_SUM,
2421                       cr->mpi_comm_mysim);
2422     }
2423 #endif
2424
2425     if (SIMMASTER(cr))
2426     {
2427         print_flop(fplog, nrnb_tot, &nbfs, &mflop);
2428     }
2429     if (cr->nnodes > 1)
2430     {
2431         sfree(nrnb_tot);
2432     }
2433
2434     if ((cr->duty & DUTY_PP) && DOMAINDECOMP(cr))
2435     {
2436         print_dd_statistics(cr, inputrec, fplog);
2437     }
2438
2439 #ifdef GMX_MPI
2440     if (PARTDECOMP(cr))
2441     {
2442         if (MASTER(cr))
2443         {
2444             t_nrnb     *nrnb_all;
2445             int         s;
2446             MPI_Status  stat;
2447
2448             snew(nrnb_all, cr->nnodes);
2449             nrnb_all[0] = *nrnb;
2450             for (s = 1; s < cr->nnodes; s++)
2451             {
2452                 MPI_Recv(nrnb_all[s].n, eNRNB, MPI_DOUBLE, s, 0,
2453                          cr->mpi_comm_mysim, &stat);
2454             }
2455             pr_load(fplog, cr, nrnb_all);
2456             sfree(nrnb_all);
2457         }
2458         else
2459         {
2460             MPI_Send(nrnb->n, eNRNB, MPI_DOUBLE, MASTERRANK(cr), 0,
2461                      cr->mpi_comm_mysim);
2462         }
2463     }
2464 #endif
2465
2466     if (SIMMASTER(cr))
2467     {
2468         wallcycle_print(fplog, cr->nnodes, cr->npmenodes,
2469                         elapsed_time_over_all_ranks,
2470                         wcycle, gputimes);
2471
2472         if (EI_DYNAMICS(inputrec->eI))
2473         {
2474             delta_t = inputrec->delta_t;
2475         }
2476         else
2477         {
2478             delta_t = 0;
2479         }
2480
2481         if (fplog)
2482         {
2483             print_perf(fplog, elapsed_time_over_all_threads_over_all_ranks,
2484                        elapsed_time_over_all_ranks,
2485                        walltime_accounting_get_nsteps_done(walltime_accounting),
2486                        delta_t, nbfs, mflop);
2487         }
2488         if (bWriteStat)
2489         {
2490             print_perf(stderr, elapsed_time_over_all_threads_over_all_ranks,
2491                        elapsed_time_over_all_ranks,
2492                        walltime_accounting_get_nsteps_done(walltime_accounting),
2493                        delta_t, nbfs, mflop);
2494         }
2495     }
2496 }
2497
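     /* Set the initial lambda vector (and optionally lam0): all zero without
      * free energy or simulated tempering; otherwise taken from init_lambda
      * (backwards compatibility) or from the all_lambda table at the initial
      * fep state. With simulated tempering the reference temperatures are
      * also rescaled to match the initial state.
      */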
2498 extern void initialize_lambdas(FILE *fplog, t_inputrec *ir, int *fep_state, real *lambda, double *lam0)
2499 {
2500     /* this function works, but could probably use a logic rewrite to keep all the different
2501        types of efep straight. */
2502
2503     int       i;
2504     t_lambda *fep = ir->fepvals;
2505
2506     if ((ir->efep == efepNO) && (ir->bSimTemp == FALSE))
2507     {
2508         for (i = 0; i < efptNR; i++)
2509         {
2510             lambda[i] = 0.0;
2511             if (lam0)
2512             {
2513                 lam0[i] = 0.0;
2514             }
2515         }
2516         return;
2517     }
2518     else
2519     {
2520         *fep_state = fep->init_fep_state; /* this might overwrite the value read
2521                                              from a checkpoint -- for now a kludge
2522                                              is in place to prevent this. */
2523         for (i = 0; i < efptNR; i++)
2524         {
2525             /* for now, overwrite the lambda state with init_lambda for backwards compatibility */
2526             if (fep->init_lambda >= 0) /* if it's -1, it was never initialized */
2527             {
2528                 lambda[i] = fep->init_lambda;
2529                 if (lam0)
2530                 {
2531                     lam0[i] = lambda[i];
2532                 }
2533             }
2534             else
2535             {
2536                 lambda[i] = fep->all_lambda[i][*fep_state];
2537                 if (lam0)
2538                 {
2539                     lam0[i] = lambda[i];
2540                 }
2541             }
2542         }
2543         if (ir->bSimTemp)
2544         {
2545             /* need to rescale control temperatures to match current state */
2546             for (i = 0; i < ir->opts.ngtc; i++)
2547             {
2548                 if (ir->opts.ref_t[i] > 0)
2549                 {
2550                     ir->opts.ref_t[i] = ir->simtempvals->temperatures[*fep_state];
2551                 }
2552             }
2553         }
2554     }
2555
2556     /* Send to the log the information on the current lambdas */
2557     if (fplog != NULL)
2558     {
2559         fprintf(fplog, "Initial vector of lambda components:[ ");
2560         for (i = 0; i < efptNR; i++)
2561         {
2562             fprintf(fplog, "%10.4f ", lambda[i]);
2563         }
2564         fprintf(fplog, "]\n");
2565     }
2566     return;
2567 }
2568
2569
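     /* Set up the bookkeeping for a run: initial time, simulated annealing,
      * lambdas, the update and center-of-mass motion removal data,
      * citations, flop counters and, when output files are requested
      * (nfile != -1), the output and energy-term handlers.
      */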
2570 void init_md(FILE *fplog,
2571              t_commrec *cr, t_inputrec *ir, const output_env_t oenv,
2572              double *t, double *t0,
2573              real *lambda, int *fep_state, double *lam0,
2574              t_nrnb *nrnb, gmx_mtop_t *mtop,
2575              gmx_update_t *upd,
2576              int nfile, const t_filenm fnm[],
2577              gmx_mdoutf_t **outf, t_mdebin **mdebin,
2578              tensor force_vir, tensor shake_vir, rvec mu_tot,
2579              gmx_bool *bSimAnn, t_vcm **vcm, unsigned long Flags)
2580 {
2581     int  i, j, n;
2582     real tmpt, mod;
2583
2584     /* Initial values */
2585     *t = *t0       = ir->init_t;
2586
2587     *bSimAnn = FALSE;
2588     for (i = 0; i < ir->opts.ngtc; i++)
2589     {
2590         /* set bSimAnn if any group is being annealed */
2591         if (ir->opts.annealing[i] != eannNO)
2592         {
2593             *bSimAnn = TRUE;
2594         }
2595     }
2596     if (*bSimAnn)
2597     {
2598         update_annealing_target_temp(&(ir->opts), ir->init_t);
2599     }
2600
2601     /* Initialize lambda variables */
2602     initialize_lambdas(fplog, ir, fep_state, lambda, lam0);
2603
2604     if (upd)
2605     {
2606         *upd = init_update(ir);
2607     }
2608
2609
2610     if (vcm != NULL)
2611     {
2612         *vcm = init_vcm(fplog, &mtop->groups, ir);
2613     }
2614
2615     if (EI_DYNAMICS(ir->eI) && !(Flags & MD_APPENDFILES))
2616     {
2617         if (ir->etc == etcBERENDSEN)
2618         {
2619             please_cite(fplog, "Berendsen84a");
2620         }
2621         if (ir->etc == etcVRESCALE)
2622         {
2623             please_cite(fplog, "Bussi2007a");
2624         }
2625     }
2626
2627     init_nrnb(nrnb);
2628
2629     if (nfile != -1)
2630     {
2631         *outf = init_mdoutf(nfile, fnm, Flags, cr, ir, oenv);
2632
2633         *mdebin = init_mdebin((Flags & MD_APPENDFILES) ? NULL : (*outf)->fp_ene,
2634                               mtop, ir, (*outf)->fp_dhdl);
2635     }
2636
2637     if (ir->bAdress)
2638     {
2639         please_cite(fplog, "Fritsch12");
2640         please_cite(fplog, "Junghans10");
2641     }
2642     /* Initialize variables */
2643     clear_mat(force_vir);
2644     clear_mat(shake_vir);
2645     clear_rvec(mu_tot);
2646
2647     debug_gmx();
2648 }