Merge branch 'release-4-6'
[alexxy/gromacs.git] / src / gromacs / mdlib / sim_util.c
1 /* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
2  *
3  *
4  *                This source code is part of
5  *
6  *                 G   R   O   M   A   C   S
7  *
8  *          GROningen MAchine for Chemical Simulations
9  *
10  *                        VERSION 3.2.0
11  * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
12  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
13  * Copyright (c) 2001-2004, The GROMACS development team,
14  * check out http://www.gromacs.org for more information.
15
16  * This program is free software; you can redistribute it and/or
17  * modify it under the terms of the GNU General Public License
18  * as published by the Free Software Foundation; either version 2
19  * of the License, or (at your option) any later version.
20  *
21  * If you want to redistribute modifications, please consider that
22  * scientific software is very special. Version control is crucial -
23  * bugs must be traceable. We will be happy to consider code for
24  * inclusion in the official distribution, but derived work must not
25  * be called official GROMACS. Details are found in the README & COPYING
26  * files - if they are missing, get the official version at www.gromacs.org.
27  *
28  * To help us fund GROMACS development, we humbly ask that you cite
29  * the papers on the package - you can find them in the top README file.
30  *
31  * For more info, check our website at http://www.gromacs.org
32  *
33  * And Hey:
34  * GROwing Monsters And Cloning Shrimps
35  */
36 #ifdef HAVE_CONFIG_H
37 #include <config.h>
38 #endif
39
40 #include <stdio.h>
41 #ifdef HAVE_SYS_TIME_H
42 #include <sys/time.h>
43 #endif
44 #include <math.h>
45 #include "typedefs.h"
46 #include "string2.h"
47 #include "smalloc.h"
48 #include "names.h"
49 #include "mvdata.h"
50 #include "txtdump.h"
51 #include "pbc.h"
52 #include "chargegroup.h"
53 #include "vec.h"
54 #include "nrnb.h"
55 #include "mshift.h"
56 #include "mdrun.h"
57 #include "sim_util.h"
58 #include "update.h"
59 #include "physics.h"
60 #include "main.h"
61 #include "mdatoms.h"
62 #include "force.h"
63 #include "bondf.h"
64 #include "pme.h"
65 #include "disre.h"
66 #include "orires.h"
67 #include "network.h"
68 #include "calcmu.h"
69 #include "constr.h"
70 #include "xvgr.h"
71 #include "copyrite.h"
72 #include "pull_rotation.h"
73 #include "gmx_random.h"
74 #include "domdec.h"
75 #include "partdec.h"
76 #include "genborn.h"
77 #include "nbnxn_atomdata.h"
78 #include "nbnxn_search.h"
79 #include "nbnxn_kernels/nbnxn_kernel_ref.h"
80 #include "nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn.h"
81 #include "nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn.h"
82 #include "nbnxn_kernels/nbnxn_kernel_gpu_ref.h"
83
84 #include "gromacs/timing/wallcycle.h"
85 #include "gromacs/timing/walltime_accounting.h"
86 #include "gromacs/utility/gmxmpi.h"
87
88 #include "adress.h"
89 #include "qmmm.h"
90
91 #include "nbnxn_cuda_data_mgmt.h"
92 #include "nbnxn_cuda/nbnxn_cuda.h"
93
94 void print_time(FILE                     *out,
95                 gmx_walltime_accounting_t walltime_accounting,
96                 gmx_large_int_t           step,
97                 t_inputrec               *ir,
98                 t_commrec gmx_unused     *cr)
99 {
100     time_t finish;
101     char   timebuf[STRLEN];
102     double dt, elapsed_seconds, time_per_step;
103     char   buf[48];
104
105 #ifndef GMX_THREAD_MPI
106     if (!PAR(cr))
107 #endif
108     {
109         fprintf(out, "\r");
110     }
111     fprintf(out, "step %s", gmx_step_str(step, buf));
112     if ((step >= ir->nstlist))
113     {
114         double seconds_since_epoch = gmx_gettime();
115         elapsed_seconds = seconds_since_epoch - walltime_accounting_get_start_time_stamp(walltime_accounting);
116         time_per_step   = elapsed_seconds/(step - ir->init_step + 1);
117         dt              = (ir->nsteps + ir->init_step - step) * time_per_step;
118
119         if (ir->nsteps >= 0)
120         {
121             if (dt >= 300)
122             {
123                 finish = (time_t) (seconds_since_epoch + dt);
124                 gmx_ctime_r(&finish, timebuf, STRLEN);
125                 sprintf(buf, "%s", timebuf);
126                 buf[strlen(buf)-1] = '\0';
127                 fprintf(out, ", will finish %s", buf);
128             }
129             else
130             {
131                 fprintf(out, ", remaining wall clock time: %5d s          ", (int)dt);
132             }
133         }
134         else
135         {
136             fprintf(out, " performance: %.1f ns/day    ",
137                     ir->delta_t/1000*24*60*60/time_per_step);
138         }
139     }
140 #ifndef GMX_THREAD_MPI
141     if (PAR(cr))
142     {
143         fprintf(out, "\n");
144     }
145 #endif
146
147     fflush(out);
148 }
149
150 void print_date_and_time(FILE *fplog, int nodeid, const char *title,
151                          const gmx_walltime_accounting_t walltime_accounting)
152 {
153     int    i;
154     char   timebuf[STRLEN];
155     char   time_string[STRLEN];
156     time_t tmptime;
157
158     if (fplog)
159     {
160         if (walltime_accounting != NULL)
161         {
162             tmptime = (time_t) walltime_accounting_get_start_time_stamp(walltime_accounting);
163             gmx_ctime_r(&tmptime, timebuf, STRLEN);
164         }
165         else
166         {
167             tmptime = (time_t) gmx_gettime();
168             gmx_ctime_r(&tmptime, timebuf, STRLEN);
169         }
170         for (i = 0; timebuf[i] >= ' '; i++)
171         {
172             time_string[i] = timebuf[i];
173         }
174         time_string[i] = '\0';
175
176         fprintf(fplog, "%s on node %d %s\n", title, nodeid, time_string);
177     }
178 }
179
180 static void sum_forces(int start, int end, rvec f[], rvec flr[])
181 {
182     int i;
183
184     if (gmx_debug_at)
185     {
186         pr_rvecs(debug, 0, "fsr", f+start, end-start);
187         pr_rvecs(debug, 0, "flr", flr+start, end-start);
188     }
189     for (i = start; (i < end); i++)
190     {
191         rvec_inc(f[i], flr[i]);
192     }
193 }
194
195 /*
196  * calc_f_el calculates forces due to an electric field.
197  *
198  * force is kJ mol^-1 nm^-1 = e * kJ mol^-1 nm^-1 / e
199  *
200  * Et[] contains the parameters for the time dependent
201  * part of the field (not yet used).
202  * Ex[] contains the parameters for
203  * the spatial dependent part of the field. You can have cool periodic
204  * fields in principle, but only a constant field is supported
205  * now.
206  * The function should return the energy due to the electric field
207  * (if any) but for now returns 0.
208  *
209  * WARNING:
210  * There can be problems with the virial.
211  * Since the field is not self-consistent this is unavoidable.
212  * For neutral molecules the virial is correct within this approximation.
213  * For neutral systems with many charged molecules the error is small.
214  * But for systems with a net charge or a few charged molecules
215  * the error can be significant when the field is high.
216  * Solution: implement a self-consitent electric field into PME.
217  */
218 static void calc_f_el(FILE *fp, int  start, int homenr,
219                       real charge[], rvec f[],
220                       t_cosines Ex[], t_cosines Et[], double t)
221 {
222     rvec Ext;
223     real t0;
224     int  i, m;
225
226     for (m = 0; (m < DIM); m++)
227     {
228         if (Et[m].n > 0)
229         {
230             if (Et[m].n == 3)
231             {
232                 t0     = Et[m].a[1];
233                 Ext[m] = cos(Et[m].a[0]*(t-t0))*exp(-sqr(t-t0)/(2.0*sqr(Et[m].a[2])));
234             }
235             else
236             {
237                 Ext[m] = cos(Et[m].a[0]*t);
238             }
239         }
240         else
241         {
242             Ext[m] = 1.0;
243         }
244         if (Ex[m].n > 0)
245         {
246             /* Convert the field strength from V/nm to MD-units */
247             Ext[m] *= Ex[m].a[0]*FIELDFAC;
248             for (i = start; (i < start+homenr); i++)
249             {
250                 f[i][m] += charge[i]*Ext[m];
251             }
252         }
253         else
254         {
255             Ext[m] = 0;
256         }
257     }
258     if (fp != NULL)
259     {
260         fprintf(fp, "%10g  %10g  %10g  %10g #FIELD\n", t,
261                 Ext[XX]/FIELDFAC, Ext[YY]/FIELDFAC, Ext[ZZ]/FIELDFAC);
262     }
263 }
264
265 static void calc_virial(int start, int homenr, rvec x[], rvec f[],
266                         tensor vir_part, t_graph *graph, matrix box,
267                         t_nrnb *nrnb, const t_forcerec *fr, int ePBC)
268 {
269     int    i, j;
270     tensor virtest;
271
272     /* The short-range virial from surrounding boxes */
273     clear_mat(vir_part);
274     calc_vir(SHIFTS, fr->shift_vec, fr->fshift, vir_part, ePBC == epbcSCREW, box);
275     inc_nrnb(nrnb, eNR_VIRIAL, SHIFTS);
276
277     /* Calculate partial virial, for local atoms only, based on short range.
278      * Total virial is computed in global_stat, called from do_md
279      */
280     f_calc_vir(start, start+homenr, x, f, vir_part, graph, box);
281     inc_nrnb(nrnb, eNR_VIRIAL, homenr);
282
283     /* Add position restraint contribution */
284     for (i = 0; i < DIM; i++)
285     {
286         vir_part[i][i] += fr->vir_diag_posres[i];
287     }
288
289     /* Add wall contribution */
290     for (i = 0; i < DIM; i++)
291     {
292         vir_part[i][ZZ] += fr->vir_wall_z[i];
293     }
294
295     if (debug)
296     {
297         pr_rvecs(debug, 0, "vir_part", vir_part, DIM);
298     }
299 }
300
301 static void posres_wrapper(FILE *fplog,
302                            int flags,
303                            gmx_bool bSepDVDL,
304                            t_inputrec *ir,
305                            t_nrnb *nrnb,
306                            gmx_localtop_t *top,
307                            matrix box, rvec x[],
308                            gmx_enerdata_t *enerd,
309                            real *lambda,
310                            t_forcerec *fr)
311 {
312     t_pbc pbc;
313     real  v, dvdl;
314     int   i;
315
316     /* Position restraints always require full pbc */
317     set_pbc(&pbc, ir->ePBC, box);
318     dvdl = 0;
319     v    = posres(top->idef.il[F_POSRES].nr, top->idef.il[F_POSRES].iatoms,
320                   top->idef.iparams_posres,
321                   (const rvec*)x, fr->f_novirsum, fr->vir_diag_posres,
322                   ir->ePBC == epbcNONE ? NULL : &pbc,
323                   lambda[efptRESTRAINT], &dvdl,
324                   fr->rc_scaling, fr->ePBC, fr->posres_com, fr->posres_comB);
325     if (bSepDVDL)
326     {
327         gmx_print_sepdvdl(fplog, interaction_function[F_POSRES].longname, v, dvdl);
328     }
329     enerd->term[F_POSRES] += v;
330     /* If just the force constant changes, the FEP term is linear,
331      * but if k changes, it is not.
332      */
333     enerd->dvdl_nonlin[efptRESTRAINT] += dvdl;
334     inc_nrnb(nrnb, eNR_POSRES, top->idef.il[F_POSRES].nr/2);
335
336     if ((ir->fepvals->n_lambda > 0) && (flags & GMX_FORCE_DHDL))
337     {
338         for (i = 0; i < enerd->n_lambda; i++)
339         {
340             real dvdl_dum, lambda_dum;
341
342             lambda_dum = (i == 0 ? lambda[efptRESTRAINT] : ir->fepvals->all_lambda[efptRESTRAINT][i-1]);
343             v          = posres(top->idef.il[F_POSRES].nr, top->idef.il[F_POSRES].iatoms,
344                                 top->idef.iparams_posres,
345                                 (const rvec*)x, NULL, NULL,
346                                 ir->ePBC == epbcNONE ? NULL : &pbc, lambda_dum, &dvdl,
347                                 fr->rc_scaling, fr->ePBC, fr->posres_com, fr->posres_comB);
348             enerd->enerpart_lambda[i] += v;
349         }
350     }
351 }
352
353 static void pull_potential_wrapper(FILE *fplog,
354                                    gmx_bool bSepDVDL,
355                                    t_commrec *cr,
356                                    t_inputrec *ir,
357                                    matrix box, rvec x[],
358                                    rvec f[],
359                                    tensor vir_force,
360                                    t_mdatoms *mdatoms,
361                                    gmx_enerdata_t *enerd,
362                                    real *lambda,
363                                    double t)
364 {
365     t_pbc  pbc;
366     real   dvdl;
367
368     /* Calculate the center of mass forces, this requires communication,
369      * which is why pull_potential is called close to other communication.
370      * The virial contribution is calculated directly,
371      * which is why we call pull_potential after calc_virial.
372      */
373     set_pbc(&pbc, ir->ePBC, box);
374     dvdl                     = 0;
375     enerd->term[F_COM_PULL] +=
376         pull_potential(ir->ePull, ir->pull, mdatoms, &pbc,
377                        cr, t, lambda[efptRESTRAINT], x, f, vir_force, &dvdl);
378     if (bSepDVDL)
379     {
380         gmx_print_sepdvdl(fplog, "Com pull", enerd->term[F_COM_PULL], dvdl);
381     }
382     enerd->dvdl_lin[efptRESTRAINT] += dvdl;
383 }
384
385 static void pme_receive_force_ener(FILE           *fplog,
386                                    gmx_bool        bSepDVDL,
387                                    t_commrec      *cr,
388                                    gmx_wallcycle_t wcycle,
389                                    gmx_enerdata_t *enerd,
390                                    t_forcerec     *fr)
391 {
392     real   e, v, dvdl;
393     float  cycles_ppdpme, cycles_seppme;
394
395     cycles_ppdpme = wallcycle_stop(wcycle, ewcPPDURINGPME);
396     dd_cycles_add(cr->dd, cycles_ppdpme, ddCyclPPduringPME);
397
398     /* In case of node-splitting, the PP nodes receive the long-range
399      * forces, virial and energy from the PME nodes here.
400      */
401     wallcycle_start(wcycle, ewcPP_PMEWAITRECVF);
402     dvdl = 0;
403     gmx_pme_receive_f(cr, fr->f_novirsum, fr->vir_el_recip, &e, &dvdl,
404                       &cycles_seppme);
405     if (bSepDVDL)
406     {
407         gmx_print_sepdvdl(fplog, "PME mesh", e, dvdl);
408     }
409     enerd->term[F_COUL_RECIP] += e;
410     enerd->dvdl_lin[efptCOUL] += dvdl;
411     if (wcycle)
412     {
413         dd_cycles_add(cr->dd, cycles_seppme, ddCyclPME);
414     }
415     wallcycle_stop(wcycle, ewcPP_PMEWAITRECVF);
416 }
417
418 static void print_large_forces(FILE *fp, t_mdatoms *md, t_commrec *cr,
419                                gmx_large_int_t step, real pforce, rvec *x, rvec *f)
420 {
421     int  i;
422     real pf2, fn2;
423     char buf[STEPSTRSIZE];
424
425     pf2 = sqr(pforce);
426     for (i = md->start; i < md->start+md->homenr; i++)
427     {
428         fn2 = norm2(f[i]);
429         /* We also catch NAN, if the compiler does not optimize this away. */
430         if (fn2 >= pf2 || fn2 != fn2)
431         {
432             fprintf(fp, "step %s  atom %6d  x %8.3f %8.3f %8.3f  force %12.5e\n",
433                     gmx_step_str(step, buf),
434                     ddglatnr(cr->dd, i), x[i][XX], x[i][YY], x[i][ZZ], sqrt(fn2));
435         }
436     }
437 }
438
439 static void post_process_forces(t_commrec *cr,
440                                 gmx_large_int_t step,
441                                 t_nrnb *nrnb, gmx_wallcycle_t wcycle,
442                                 gmx_localtop_t *top,
443                                 matrix box, rvec x[],
444                                 rvec f[],
445                                 tensor vir_force,
446                                 t_mdatoms *mdatoms,
447                                 t_graph *graph,
448                                 t_forcerec *fr, gmx_vsite_t *vsite,
449                                 int flags)
450 {
451     if (fr->bF_NoVirSum)
452     {
453         if (vsite)
454         {
455             /* Spread the mesh force on virtual sites to the other particles...
456              * This is parallellized. MPI communication is performed
457              * if the constructing atoms aren't local.
458              */
459             wallcycle_start(wcycle, ewcVSITESPREAD);
460             spread_vsite_f(vsite, x, fr->f_novirsum, NULL,
461                            (flags & GMX_FORCE_VIRIAL), fr->vir_el_recip,
462                            nrnb,
463                            &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr);
464             wallcycle_stop(wcycle, ewcVSITESPREAD);
465         }
466         if (flags & GMX_FORCE_VIRIAL)
467         {
468             /* Now add the forces, this is local */
469             if (fr->bDomDec)
470             {
471                 sum_forces(0, fr->f_novirsum_n, f, fr->f_novirsum);
472             }
473             else
474             {
475                 sum_forces(mdatoms->start, mdatoms->start+mdatoms->homenr,
476                            f, fr->f_novirsum);
477             }
478             if (EEL_FULL(fr->eeltype))
479             {
480                 /* Add the mesh contribution to the virial */
481                 m_add(vir_force, fr->vir_el_recip, vir_force);
482             }
483             if (debug)
484             {
485                 pr_rvecs(debug, 0, "vir_force", vir_force, DIM);
486             }
487         }
488     }
489
490     if (fr->print_force >= 0)
491     {
492         print_large_forces(stderr, mdatoms, cr, step, fr->print_force, x, f);
493     }
494 }
495
496 static void do_nb_verlet(t_forcerec *fr,
497                          interaction_const_t *ic,
498                          gmx_enerdata_t *enerd,
499                          int flags, int ilocality,
500                          int clearF,
501                          t_nrnb *nrnb,
502                          gmx_wallcycle_t wcycle)
503 {
504     int                        nnbl, kernel_type, enr_nbnxn_kernel_ljc, enr_nbnxn_kernel_lj;
505     char                      *env;
506     nonbonded_verlet_group_t  *nbvg;
507     gmx_bool                   bCUDA;
508
509     if (!(flags & GMX_FORCE_NONBONDED))
510     {
511         /* skip non-bonded calculation */
512         return;
513     }
514
515     nbvg = &fr->nbv->grp[ilocality];
516
517     /* CUDA kernel launch overhead is already timed separately */
518     if (fr->cutoff_scheme != ecutsVERLET)
519     {
520         gmx_incons("Invalid cut-off scheme passed!");
521     }
522
523     bCUDA = (nbvg->kernel_type == nbnxnk8x8x8_CUDA);
524
525     if (!bCUDA)
526     {
527         wallcycle_sub_start(wcycle, ewcsNONBONDED);
528     }
529     switch (nbvg->kernel_type)
530     {
531         case nbnxnk4x4_PlainC:
532             nbnxn_kernel_ref(&nbvg->nbl_lists,
533                              nbvg->nbat, ic,
534                              fr->shift_vec,
535                              flags,
536                              clearF,
537                              fr->fshift[0],
538                              enerd->grpp.ener[egCOULSR],
539                              fr->bBHAM ?
540                              enerd->grpp.ener[egBHAMSR] :
541                              enerd->grpp.ener[egLJSR]);
542             break;
543
544         case nbnxnk4xN_SIMD_4xN:
545             nbnxn_kernel_simd_4xn(&nbvg->nbl_lists,
546                                   nbvg->nbat, ic,
547                                   nbvg->ewald_excl,
548                                   fr->shift_vec,
549                                   flags,
550                                   clearF,
551                                   fr->fshift[0],
552                                   enerd->grpp.ener[egCOULSR],
553                                   fr->bBHAM ?
554                                   enerd->grpp.ener[egBHAMSR] :
555                                   enerd->grpp.ener[egLJSR]);
556             break;
557         case nbnxnk4xN_SIMD_2xNN:
558             nbnxn_kernel_simd_2xnn(&nbvg->nbl_lists,
559                                    nbvg->nbat, ic,
560                                    nbvg->ewald_excl,
561                                    fr->shift_vec,
562                                    flags,
563                                    clearF,
564                                    fr->fshift[0],
565                                    enerd->grpp.ener[egCOULSR],
566                                    fr->bBHAM ?
567                                    enerd->grpp.ener[egBHAMSR] :
568                                    enerd->grpp.ener[egLJSR]);
569             break;
570
571         case nbnxnk8x8x8_CUDA:
572             nbnxn_cuda_launch_kernel(fr->nbv->cu_nbv, nbvg->nbat, flags, ilocality);
573             break;
574
575         case nbnxnk8x8x8_PlainC:
576             nbnxn_kernel_gpu_ref(nbvg->nbl_lists.nbl[0],
577                                  nbvg->nbat, ic,
578                                  fr->shift_vec,
579                                  flags,
580                                  clearF,
581                                  nbvg->nbat->out[0].f,
582                                  fr->fshift[0],
583                                  enerd->grpp.ener[egCOULSR],
584                                  fr->bBHAM ?
585                                  enerd->grpp.ener[egBHAMSR] :
586                                  enerd->grpp.ener[egLJSR]);
587             break;
588
589         default:
590             gmx_incons("Invalid nonbonded kernel type passed!");
591
592     }
593     if (!bCUDA)
594     {
595         wallcycle_sub_stop(wcycle, ewcsNONBONDED);
596     }
597
598     if (EEL_RF(ic->eeltype) || ic->eeltype == eelCUT)
599     {
600         enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_RF;
601     }
602     else if ((!bCUDA && nbvg->ewald_excl == ewaldexclAnalytical) ||
603              (bCUDA && nbnxn_cuda_is_kernel_ewald_analytical(fr->nbv->cu_nbv)))
604     {
605         enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_EWALD;
606     }
607     else
608     {
609         enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_TAB;
610     }
611     enr_nbnxn_kernel_lj = eNR_NBNXN_LJ;
612     if (flags & GMX_FORCE_ENERGY)
613     {
614         /* In eNR_??? the nbnxn F+E kernels are always the F kernel + 1 */
615         enr_nbnxn_kernel_ljc += 1;
616         enr_nbnxn_kernel_lj  += 1;
617     }
618
619     inc_nrnb(nrnb, enr_nbnxn_kernel_ljc,
620              nbvg->nbl_lists.natpair_ljq);
621     inc_nrnb(nrnb, enr_nbnxn_kernel_lj,
622              nbvg->nbl_lists.natpair_lj);
623     inc_nrnb(nrnb, enr_nbnxn_kernel_ljc-eNR_NBNXN_LJ_RF+eNR_NBNXN_RF,
624              nbvg->nbl_lists.natpair_q);
625 }
626
627 void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
628                          t_inputrec *inputrec,
629                          gmx_large_int_t step, t_nrnb *nrnb, gmx_wallcycle_t wcycle,
630                          gmx_localtop_t *top,
631                          gmx_groups_t gmx_unused *groups,
632                          matrix box, rvec x[], history_t *hist,
633                          rvec f[],
634                          tensor vir_force,
635                          t_mdatoms *mdatoms,
636                          gmx_enerdata_t *enerd, t_fcdata *fcd,
637                          real *lambda, t_graph *graph,
638                          t_forcerec *fr, interaction_const_t *ic,
639                          gmx_vsite_t *vsite, rvec mu_tot,
640                          double t, FILE *field, gmx_edsam_t ed,
641                          gmx_bool bBornRadii,
642                          int flags)
643 {
644     int                 cg0, cg1, i, j;
645     int                 start, homenr;
646     int                 nb_kernel_type;
647     double              mu[2*DIM];
648     gmx_bool            bSepDVDL, bStateChanged, bNS, bFillGrid, bCalcCGCM, bBS;
649     gmx_bool            bDoLongRange, bDoForces, bSepLRF, bUseGPU, bUseOrEmulGPU;
650     gmx_bool            bDiffKernels = FALSE;
651     matrix              boxs;
652     rvec                vzero, box_diag;
653     real                e, v, dvdl;
654     float               cycles_pme, cycles_force, cycles_wait_gpu;
655     nonbonded_verlet_t *nbv;
656
657     cycles_force    = 0;
658     cycles_wait_gpu = 0;
659     nbv             = fr->nbv;
660     nb_kernel_type  = fr->nbv->grp[0].kernel_type;
661
662     start  = mdatoms->start;
663     homenr = mdatoms->homenr;
664
665     bSepDVDL = (fr->bSepDVDL && do_per_step(step, inputrec->nstlog));
666
667     clear_mat(vir_force);
668
669     cg0 = 0;
670     if (DOMAINDECOMP(cr))
671     {
672         cg1 = cr->dd->ncg_tot;
673     }
674     else
675     {
676         cg1 = top->cgs.nr;
677     }
678     if (fr->n_tpi > 0)
679     {
680         cg1--;
681     }
682
683     bStateChanged = (flags & GMX_FORCE_STATECHANGED);
684     bNS           = (flags & GMX_FORCE_NS) && (fr->bAllvsAll == FALSE);
685     bFillGrid     = (bNS && bStateChanged);
686     bCalcCGCM     = (bFillGrid && !DOMAINDECOMP(cr));
687     bDoLongRange  = (fr->bTwinRange && bNS && (flags & GMX_FORCE_DO_LR));
688     bDoForces     = (flags & GMX_FORCE_FORCES);
689     bSepLRF       = (bDoLongRange && bDoForces && (flags & GMX_FORCE_SEPLRF));
690     bUseGPU       = fr->nbv->bUseGPU;
691     bUseOrEmulGPU = bUseGPU || (nbv->grp[0].kernel_type == nbnxnk8x8x8_PlainC);
692
693     if (bStateChanged)
694     {
695         update_forcerec(fr, box);
696
697         if (NEED_MUTOT(*inputrec))
698         {
699             /* Calculate total (local) dipole moment in a temporary common array.
700              * This makes it possible to sum them over nodes faster.
701              */
702             calc_mu(start, homenr,
703                     x, mdatoms->chargeA, mdatoms->chargeB, mdatoms->nChargePerturbed,
704                     mu, mu+DIM);
705         }
706     }
707
708     if (fr->ePBC != epbcNONE)
709     {
710         /* Compute shift vectors every step,
711          * because of pressure coupling or box deformation!
712          */
713         if ((flags & GMX_FORCE_DYNAMICBOX) && bStateChanged)
714         {
715             calc_shifts(box, fr->shift_vec);
716         }
717
718         if (bCalcCGCM)
719         {
720             put_atoms_in_box_omp(fr->ePBC, box, homenr, x);
721             inc_nrnb(nrnb, eNR_SHIFTX, homenr);
722         }
723         else if (EI_ENERGY_MINIMIZATION(inputrec->eI) && graph)
724         {
725             unshift_self(graph, box, x);
726         }
727     }
728
729     nbnxn_atomdata_copy_shiftvec(flags & GMX_FORCE_DYNAMICBOX,
730                                  fr->shift_vec, nbv->grp[0].nbat);
731
732 #ifdef GMX_MPI
733     if (!(cr->duty & DUTY_PME))
734     {
735         /* Send particle coordinates to the pme nodes.
736          * Since this is only implemented for domain decomposition
737          * and domain decomposition does not use the graph,
738          * we do not need to worry about shifting.
739          */
740
741         wallcycle_start(wcycle, ewcPP_PMESENDX);
742
743         bBS = (inputrec->nwall == 2);
744         if (bBS)
745         {
746             copy_mat(box, boxs);
747             svmul(inputrec->wall_ewald_zfac, boxs[ZZ], boxs[ZZ]);
748         }
749
750         gmx_pme_send_x(cr, bBS ? boxs : box, x,
751                        mdatoms->nChargePerturbed, lambda[efptCOUL],
752                        (flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY)), step);
753
754         wallcycle_stop(wcycle, ewcPP_PMESENDX);
755     }
756 #endif /* GMX_MPI */
757
758     /* do gridding for pair search */
759     if (bNS)
760     {
761         if (graph && bStateChanged)
762         {
763             /* Calculate intramolecular shift vectors to make molecules whole */
764             mk_mshift(fplog, graph, fr->ePBC, box, x);
765         }
766
767         clear_rvec(vzero);
768         box_diag[XX] = box[XX][XX];
769         box_diag[YY] = box[YY][YY];
770         box_diag[ZZ] = box[ZZ][ZZ];
771
772         wallcycle_start(wcycle, ewcNS);
773         if (!fr->bDomDec)
774         {
775             wallcycle_sub_start(wcycle, ewcsNBS_GRID_LOCAL);
776             nbnxn_put_on_grid(nbv->nbs, fr->ePBC, box,
777                               0, vzero, box_diag,
778                               0, mdatoms->homenr, -1, fr->cginfo, x,
779                               0, NULL,
780                               nbv->grp[eintLocal].kernel_type,
781                               nbv->grp[eintLocal].nbat);
782             wallcycle_sub_stop(wcycle, ewcsNBS_GRID_LOCAL);
783         }
784         else
785         {
786             wallcycle_sub_start(wcycle, ewcsNBS_GRID_NONLOCAL);
787             nbnxn_put_on_grid_nonlocal(nbv->nbs, domdec_zones(cr->dd),
788                                        fr->cginfo, x,
789                                        nbv->grp[eintNonlocal].kernel_type,
790                                        nbv->grp[eintNonlocal].nbat);
791             wallcycle_sub_stop(wcycle, ewcsNBS_GRID_NONLOCAL);
792         }
793
794         if (nbv->ngrp == 1 ||
795             nbv->grp[eintNonlocal].nbat == nbv->grp[eintLocal].nbat)
796         {
797             nbnxn_atomdata_set(nbv->grp[eintLocal].nbat, eatAll,
798                                nbv->nbs, mdatoms, fr->cginfo);
799         }
800         else
801         {
802             nbnxn_atomdata_set(nbv->grp[eintLocal].nbat, eatLocal,
803                                nbv->nbs, mdatoms, fr->cginfo);
804             nbnxn_atomdata_set(nbv->grp[eintNonlocal].nbat, eatAll,
805                                nbv->nbs, mdatoms, fr->cginfo);
806         }
807         wallcycle_stop(wcycle, ewcNS);
808     }
809
810     /* initialize the GPU atom data and copy shift vector */
811     if (bUseGPU)
812     {
813         if (bNS)
814         {
815             wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU_NB);
816             nbnxn_cuda_init_atomdata(nbv->cu_nbv, nbv->grp[eintLocal].nbat);
817             wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
818         }
819
820         wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU_NB);
821         nbnxn_cuda_upload_shiftvec(nbv->cu_nbv, nbv->grp[eintLocal].nbat);
822         wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
823     }
824
825     /* do local pair search */
826     if (bNS)
827     {
828         wallcycle_start_nocount(wcycle, ewcNS);
829         wallcycle_sub_start(wcycle, ewcsNBS_SEARCH_LOCAL);
830         nbnxn_make_pairlist(nbv->nbs, nbv->grp[eintLocal].nbat,
831                             &top->excls,
832                             ic->rlist,
833                             nbv->min_ci_balanced,
834                             &nbv->grp[eintLocal].nbl_lists,
835                             eintLocal,
836                             nbv->grp[eintLocal].kernel_type,
837                             nrnb);
838         wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_LOCAL);
839
840         if (bUseGPU)
841         {
842             /* initialize local pair-list on the GPU */
843             nbnxn_cuda_init_pairlist(nbv->cu_nbv,
844                                      nbv->grp[eintLocal].nbl_lists.nbl[0],
845                                      eintLocal);
846         }
847         wallcycle_stop(wcycle, ewcNS);
848     }
849     else
850     {
851         wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
852         wallcycle_sub_start(wcycle, ewcsNB_X_BUF_OPS);
853         nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs, eatLocal, FALSE, x,
854                                         nbv->grp[eintLocal].nbat);
855         wallcycle_sub_stop(wcycle, ewcsNB_X_BUF_OPS);
856         wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
857     }
858
859     if (bUseGPU)
860     {
861         wallcycle_start(wcycle, ewcLAUNCH_GPU_NB);
862         /* launch local nonbonded F on GPU */
863         do_nb_verlet(fr, ic, enerd, flags, eintLocal, enbvClearFNo,
864                      nrnb, wcycle);
865         wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
866     }
867
868     /* Communicate coordinates and sum dipole if necessary +
869        do non-local pair search */
870     if (DOMAINDECOMP(cr))
871     {
872         bDiffKernels = (nbv->grp[eintNonlocal].kernel_type !=
873                         nbv->grp[eintLocal].kernel_type);
874
875         if (bDiffKernels)
876         {
877             /* With GPU+CPU non-bonded calculations we need to copy
878              * the local coordinates to the non-local nbat struct
879              * (in CPU format) as the non-local kernel call also
880              * calculates the local - non-local interactions.
881              */
882             wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
883             wallcycle_sub_start(wcycle, ewcsNB_X_BUF_OPS);
884             nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs, eatLocal, TRUE, x,
885                                             nbv->grp[eintNonlocal].nbat);
886             wallcycle_sub_stop(wcycle, ewcsNB_X_BUF_OPS);
887             wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
888         }
889
890         if (bNS)
891         {
892             wallcycle_start_nocount(wcycle, ewcNS);
893             wallcycle_sub_start(wcycle, ewcsNBS_SEARCH_NONLOCAL);
894
895             if (bDiffKernels)
896             {
897                 nbnxn_grid_add_simple(nbv->nbs, nbv->grp[eintNonlocal].nbat);
898             }
899
900             nbnxn_make_pairlist(nbv->nbs, nbv->grp[eintNonlocal].nbat,
901                                 &top->excls,
902                                 ic->rlist,
903                                 nbv->min_ci_balanced,
904                                 &nbv->grp[eintNonlocal].nbl_lists,
905                                 eintNonlocal,
906                                 nbv->grp[eintNonlocal].kernel_type,
907                                 nrnb);
908
909             wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_NONLOCAL);
910
911             if (nbv->grp[eintNonlocal].kernel_type == nbnxnk8x8x8_CUDA)
912             {
913                 /* initialize non-local pair-list on the GPU */
914                 nbnxn_cuda_init_pairlist(nbv->cu_nbv,
915                                          nbv->grp[eintNonlocal].nbl_lists.nbl[0],
916                                          eintNonlocal);
917             }
918             wallcycle_stop(wcycle, ewcNS);
919         }
920         else
921         {
922             wallcycle_start(wcycle, ewcMOVEX);
923             dd_move_x(cr->dd, box, x);
924
925             /* When we don't need the total dipole we sum it in global_stat */
926             if (bStateChanged && NEED_MUTOT(*inputrec))
927             {
928                 gmx_sumd(2*DIM, mu, cr);
929             }
930             wallcycle_stop(wcycle, ewcMOVEX);
931
932             wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
933             wallcycle_sub_start(wcycle, ewcsNB_X_BUF_OPS);
934             nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs, eatNonlocal, FALSE, x,
935                                             nbv->grp[eintNonlocal].nbat);
936             wallcycle_sub_stop(wcycle, ewcsNB_X_BUF_OPS);
937             cycles_force += wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
938         }
939
940         if (bUseGPU && !bDiffKernels)
941         {
942             wallcycle_start(wcycle, ewcLAUNCH_GPU_NB);
943             /* launch non-local nonbonded F on GPU */
944             do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFNo,
945                          nrnb, wcycle);
946             cycles_force += wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
947         }
948     }
949
950     if (bUseGPU)
951     {
952         /* launch D2H copy-back F */
953         wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU_NB);
954         if (DOMAINDECOMP(cr) && !bDiffKernels)
955         {
956             nbnxn_cuda_launch_cpyback(nbv->cu_nbv, nbv->grp[eintNonlocal].nbat,
957                                       flags, eatNonlocal);
958         }
959         nbnxn_cuda_launch_cpyback(nbv->cu_nbv, nbv->grp[eintLocal].nbat,
960                                   flags, eatLocal);
961         cycles_force += wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
962     }
963
964     if (bStateChanged && NEED_MUTOT(*inputrec))
965     {
966         if (PAR(cr))
967         {
968             gmx_sumd(2*DIM, mu, cr);
969         }
970
971         for (i = 0; i < 2; i++)
972         {
973             for (j = 0; j < DIM; j++)
974             {
975                 fr->mu_tot[i][j] = mu[i*DIM + j];
976             }
977         }
978     }
979     if (fr->efep == efepNO)
980     {
981         copy_rvec(fr->mu_tot[0], mu_tot);
982     }
983     else
984     {
985         for (j = 0; j < DIM; j++)
986         {
987             mu_tot[j] =
988                 (1.0 - lambda[efptCOUL])*fr->mu_tot[0][j] +
989                 lambda[efptCOUL]*fr->mu_tot[1][j];
990         }
991     }
992
993     /* Reset energies */
994     reset_enerdata(fr, bNS, enerd, MASTER(cr));
995     clear_rvecs(SHIFTS, fr->fshift);
996
997     if (DOMAINDECOMP(cr))
998     {
999         if (!(cr->duty & DUTY_PME))
1000         {
1001             wallcycle_start(wcycle, ewcPPDURINGPME);
1002             dd_force_flop_start(cr->dd, nrnb);
1003         }
1004     }
1005
1006     if (inputrec->bRot)
1007     {
1008         /* Enforced rotation has its own cycle counter that starts after the collective
1009          * coordinates have been communicated. It is added to ddCyclF to allow
1010          * for proper load-balancing */
1011         wallcycle_start(wcycle, ewcROT);
1012         do_rotation(cr, inputrec, box, x, t, step, wcycle, bNS);
1013         wallcycle_stop(wcycle, ewcROT);
1014     }
1015
1016     /* Start the force cycle counter.
1017      * This counter is stopped in do_forcelow_level.
1018      * No parallel communication should occur while this counter is running,
1019      * since that will interfere with the dynamic load balancing.
1020      */
1021     wallcycle_start(wcycle, ewcFORCE);
1022     if (bDoForces)
1023     {
1024         /* Reset forces for which the virial is calculated separately:
1025          * PME/Ewald forces if necessary */
1026         if (fr->bF_NoVirSum)
1027         {
1028             if (flags & GMX_FORCE_VIRIAL)
1029             {
1030                 fr->f_novirsum = fr->f_novirsum_alloc;
1031                 if (fr->bDomDec)
1032                 {
1033                     clear_rvecs(fr->f_novirsum_n, fr->f_novirsum);
1034                 }
1035                 else
1036                 {
1037                     clear_rvecs(homenr, fr->f_novirsum+start);
1038                 }
1039             }
1040             else
1041             {
1042                 /* We are not calculating the pressure so we do not need
1043                  * a separate array for forces that do not contribute
1044                  * to the pressure.
1045                  */
1046                 fr->f_novirsum = f;
1047             }
1048         }
1049
1050         /* Clear the short- and long-range forces */
1051         clear_rvecs(fr->natoms_force_constr, f);
1052         if (bSepLRF && do_per_step(step, inputrec->nstcalclr))
1053         {
1054             clear_rvecs(fr->natoms_force_constr, fr->f_twin);
1055         }
1056
1057         clear_rvec(fr->vir_diag_posres);
1058     }
1059
1060     if (inputrec->ePull == epullCONSTRAINT)
1061     {
1062         clear_pull_forces(inputrec->pull);
1063     }
1064
1065     /* We calculate the non-bonded forces, when done on the CPU, here.
1066      * We do this before calling do_force_lowlevel, as in there bondeds
1067      * forces are calculated before PME, which does communication.
1068      * With this order, non-bonded and bonded force calculation imbalance
1069      * can be balanced out by the domain decomposition load balancing.
1070      */
1071
1072     if (!bUseOrEmulGPU)
1073     {
1074         /* Maybe we should move this into do_force_lowlevel */
1075         do_nb_verlet(fr, ic, enerd, flags, eintLocal, enbvClearFYes,
1076                      nrnb, wcycle);
1077     }
1078
1079     if (!bUseOrEmulGPU || bDiffKernels)
1080     {
1081         int aloc;
1082
1083         if (DOMAINDECOMP(cr))
1084         {
1085             do_nb_verlet(fr, ic, enerd, flags, eintNonlocal,
1086                          bDiffKernels ? enbvClearFYes : enbvClearFNo,
1087                          nrnb, wcycle);
1088         }
1089
1090         if (!bUseOrEmulGPU)
1091         {
1092             aloc = eintLocal;
1093         }
1094         else
1095         {
1096             aloc = eintNonlocal;
1097         }
1098
1099         /* Add all the non-bonded force to the normal force array.
1100          * This can be split into a local a non-local part when overlapping
1101          * communication with calculation with domain decomposition.
1102          */
1103         cycles_force += wallcycle_stop(wcycle, ewcFORCE);
1104         wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
1105         wallcycle_sub_start(wcycle, ewcsNB_F_BUF_OPS);
1106         nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs, eatAll, nbv->grp[aloc].nbat, f);
1107         wallcycle_sub_stop(wcycle, ewcsNB_F_BUF_OPS);
1108         cycles_force += wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
1109         wallcycle_start_nocount(wcycle, ewcFORCE);
1110
1111         /* if there are multiple fshift output buffers reduce them */
1112         if ((flags & GMX_FORCE_VIRIAL) &&
1113             nbv->grp[aloc].nbl_lists.nnbl > 1)
1114         {
1115             nbnxn_atomdata_add_nbat_fshift_to_fshift(nbv->grp[aloc].nbat,
1116                                                      fr->fshift);
1117         }
1118     }
1119
1120     /* update QMMMrec, if necessary */
1121     if (fr->bQMMM)
1122     {
1123         update_QMMMrec(cr, fr, x, mdatoms, box, top);
1124     }
1125
1126     if ((flags & GMX_FORCE_BONDED) && top->idef.il[F_POSRES].nr > 0)
1127     {
1128         posres_wrapper(fplog, flags, bSepDVDL, inputrec, nrnb, top, box, x,
1129                        enerd, lambda, fr);
1130     }
1131
1132     /* Compute the bonded and non-bonded energies and optionally forces */
1133     do_force_lowlevel(fplog, step, fr, inputrec, &(top->idef),
1134                       cr, nrnb, wcycle, mdatoms,
1135                       x, hist, f, bSepLRF ? fr->f_twin : f, enerd, fcd, top, fr->born,
1136                       &(top->atomtypes), bBornRadii, box,
1137                       inputrec->fepvals, lambda, graph, &(top->excls), fr->mu_tot,
1138                       flags, &cycles_pme);
1139
1140     if (bSepLRF)
1141     {
1142         if (do_per_step(step, inputrec->nstcalclr))
1143         {
1144             /* Add the long range forces to the short range forces */
1145             for (i = 0; i < fr->natoms_force_constr; i++)
1146             {
1147                 rvec_add(fr->f_twin[i], f[i], f[i]);
1148             }
1149         }
1150     }
1151
1152     cycles_force += wallcycle_stop(wcycle, ewcFORCE);
1153
1154     if (ed)
1155     {
1156         do_flood(cr, inputrec, x, f, ed, box, step, bNS);
1157     }
1158
1159     if (bUseOrEmulGPU && !bDiffKernels)
1160     {
1161         /* wait for non-local forces (or calculate in emulation mode) */
1162         if (DOMAINDECOMP(cr))
1163         {
1164             if (bUseGPU)
1165             {
1166                 float cycles_tmp;
1167
1168                 wallcycle_start(wcycle, ewcWAIT_GPU_NB_NL);
1169                 nbnxn_cuda_wait_gpu(nbv->cu_nbv,
1170                                     nbv->grp[eintNonlocal].nbat,
1171                                     flags, eatNonlocal,
1172                                     enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
1173                                     fr->fshift);
1174                 cycles_tmp       = wallcycle_stop(wcycle, ewcWAIT_GPU_NB_NL);
1175                 cycles_wait_gpu += cycles_tmp;
1176                 cycles_force    += cycles_tmp;
1177             }
1178             else
1179             {
1180                 wallcycle_start_nocount(wcycle, ewcFORCE);
1181                 do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFYes,
1182                              nrnb, wcycle);
1183                 cycles_force += wallcycle_stop(wcycle, ewcFORCE);
1184             }
1185             wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
1186             wallcycle_sub_start(wcycle, ewcsNB_F_BUF_OPS);
1187             /* skip the reduction if there was no non-local work to do */
1188             if (nbv->grp[eintLocal].nbl_lists.nbl[0]->nsci > 0)
1189             {
1190                 nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs, eatNonlocal,
1191                                                nbv->grp[eintNonlocal].nbat, f);
1192             }
1193             wallcycle_sub_stop(wcycle, ewcsNB_F_BUF_OPS);
1194             cycles_force += wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
1195         }
1196     }
1197
1198     if (bDoForces)
1199     {
1200         /* Communicate the forces */
1201         if (PAR(cr))
1202         {
1203             wallcycle_start(wcycle, ewcMOVEF);
1204             if (DOMAINDECOMP(cr))
1205             {
1206                 dd_move_f(cr->dd, f, fr->fshift);
1207                 /* Do we need to communicate the separate force array
1208                  * for terms that do not contribute to the single sum virial?
1209                  * Position restraints and electric fields do not introduce
1210                  * inter-cg forces, only full electrostatics methods do.
1211                  * When we do not calculate the virial, fr->f_novirsum = f,
1212                  * so we have already communicated these forces.
1213                  */
1214                 if (EEL_FULL(fr->eeltype) && cr->dd->n_intercg_excl &&
1215                     (flags & GMX_FORCE_VIRIAL))
1216                 {
1217                     dd_move_f(cr->dd, fr->f_novirsum, NULL);
1218                 }
1219                 if (bSepLRF)
1220                 {
1221                     /* We should not update the shift forces here,
1222                      * since f_twin is already included in f.
1223                      */
1224                     dd_move_f(cr->dd, fr->f_twin, NULL);
1225                 }
1226             }
1227             wallcycle_stop(wcycle, ewcMOVEF);
1228         }
1229     }
1230
1231     if (bUseOrEmulGPU)
1232     {
1233         /* wait for local forces (or calculate in emulation mode) */
1234         if (bUseGPU)
1235         {
1236             wallcycle_start(wcycle, ewcWAIT_GPU_NB_L);
1237             nbnxn_cuda_wait_gpu(nbv->cu_nbv,
1238                                 nbv->grp[eintLocal].nbat,
1239                                 flags, eatLocal,
1240                                 enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
1241                                 fr->fshift);
1242             cycles_wait_gpu += wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
1243
1244             /* now clear the GPU outputs while we finish the step on the CPU */
1245
1246             wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU_NB);
1247             nbnxn_cuda_clear_outputs(nbv->cu_nbv, flags);
1248             wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
1249         }
1250         else
1251         {
1252             wallcycle_start_nocount(wcycle, ewcFORCE);
1253             do_nb_verlet(fr, ic, enerd, flags, eintLocal,
1254                          DOMAINDECOMP(cr) ? enbvClearFNo : enbvClearFYes,
1255                          nrnb, wcycle);
1256             wallcycle_stop(wcycle, ewcFORCE);
1257         }
1258         wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
1259         wallcycle_sub_start(wcycle, ewcsNB_F_BUF_OPS);
1260         if (nbv->grp[eintLocal].nbl_lists.nbl[0]->nsci > 0)
1261         {
1262             /* skip the reduction if there was no non-local work to do */
1263             nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs, eatLocal,
1264                                            nbv->grp[eintLocal].nbat, f);
1265         }
1266         wallcycle_sub_stop(wcycle, ewcsNB_F_BUF_OPS);
1267         wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
1268     }
1269
1270     if (DOMAINDECOMP(cr))
1271     {
1272         dd_force_flop_stop(cr->dd, nrnb);
1273         if (wcycle)
1274         {
1275             dd_cycles_add(cr->dd, cycles_force-cycles_pme, ddCyclF);
1276             if (bUseGPU)
1277             {
1278                 dd_cycles_add(cr->dd, cycles_wait_gpu, ddCyclWaitGPU);
1279             }
1280         }
1281     }
1282
1283     if (bDoForces)
1284     {
1285         if (IR_ELEC_FIELD(*inputrec))
1286         {
1287             /* Compute forces due to electric field */
1288             calc_f_el(MASTER(cr) ? field : NULL,
1289                       start, homenr, mdatoms->chargeA, fr->f_novirsum,
1290                       inputrec->ex, inputrec->et, t);
1291         }
1292
1293         /* If we have NoVirSum forces, but we do not calculate the virial,
1294          * we sum fr->f_novirum=f later.
1295          */
1296         if (vsite && !(fr->bF_NoVirSum && !(flags & GMX_FORCE_VIRIAL)))
1297         {
1298             wallcycle_start(wcycle, ewcVSITESPREAD);
1299             spread_vsite_f(vsite, x, f, fr->fshift, FALSE, NULL, nrnb,
1300                            &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr);
1301             wallcycle_stop(wcycle, ewcVSITESPREAD);
1302
1303             if (bSepLRF)
1304             {
1305                 wallcycle_start(wcycle, ewcVSITESPREAD);
1306                 spread_vsite_f(vsite, x, fr->f_twin, NULL, FALSE, NULL,
1307                                nrnb,
1308                                &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr);
1309                 wallcycle_stop(wcycle, ewcVSITESPREAD);
1310             }
1311         }
1312
1313         if (flags & GMX_FORCE_VIRIAL)
1314         {
1315             /* Calculation of the virial must be done after vsites! */
1316             calc_virial(mdatoms->start, mdatoms->homenr, x, f,
1317                         vir_force, graph, box, nrnb, fr, inputrec->ePBC);
1318         }
1319     }
1320
1321     if (inputrec->ePull == epullUMBRELLA || inputrec->ePull == epullCONST_F)
1322     {
1323         pull_potential_wrapper(fplog, bSepDVDL, cr, inputrec, box, x,
1324                                f, vir_force, mdatoms, enerd, lambda, t);
1325     }
1326
1327     /* Add the forces from enforced rotation potentials (if any) */
1328     if (inputrec->bRot)
1329     {
1330         wallcycle_start(wcycle, ewcROTadd);
1331         enerd->term[F_COM_PULL] += add_rot_forces(inputrec->rot, f, cr, step, t);
1332         wallcycle_stop(wcycle, ewcROTadd);
1333     }
1334
1335     if (PAR(cr) && !(cr->duty & DUTY_PME))
1336     {
1337         /* In case of node-splitting, the PP nodes receive the long-range
1338          * forces, virial and energy from the PME nodes here.
1339          */
1340         pme_receive_force_ener(fplog, bSepDVDL, cr, wcycle, enerd, fr);
1341     }
1342
1343     if (bDoForces)
1344     {
1345         post_process_forces(cr, step, nrnb, wcycle,
1346                             top, box, x, f, vir_force, mdatoms, graph, fr, vsite,
1347                             flags);
1348     }
1349
1350     /* Sum the potential energy terms from group contributions */
1351     sum_epot(&(enerd->grpp), enerd->term);
1352 }
1353
1354 void do_force_cutsGROUP(FILE *fplog, t_commrec *cr,
1355                         t_inputrec *inputrec,
1356                         gmx_large_int_t step, t_nrnb *nrnb, gmx_wallcycle_t wcycle,
1357                         gmx_localtop_t *top,
1358                         gmx_groups_t *groups,
1359                         matrix box, rvec x[], history_t *hist,
1360                         rvec f[],
1361                         tensor vir_force,
1362                         t_mdatoms *mdatoms,
1363                         gmx_enerdata_t *enerd, t_fcdata *fcd,
1364                         real *lambda, t_graph *graph,
1365                         t_forcerec *fr, gmx_vsite_t *vsite, rvec mu_tot,
1366                         double t, FILE *field, gmx_edsam_t ed,
1367                         gmx_bool bBornRadii,
1368                         int flags)
1369 {
1370     int        cg0, cg1, i, j;
1371     int        start, homenr;
1372     double     mu[2*DIM];
1373     gmx_bool   bSepDVDL, bStateChanged, bNS, bFillGrid, bCalcCGCM, bBS;
1374     gmx_bool   bDoLongRangeNS, bDoForces, bDoPotential, bSepLRF;
1375     gmx_bool   bDoAdressWF;
1376     matrix     boxs;
1377     rvec       vzero, box_diag;
1378     real       e, v, dvdlambda[efptNR];
1379     t_pbc      pbc;
1380     float      cycles_pme, cycles_force;
1381
1382     start  = mdatoms->start;
1383     homenr = mdatoms->homenr;
1384
1385     bSepDVDL = (fr->bSepDVDL && do_per_step(step, inputrec->nstlog));
1386
1387     clear_mat(vir_force);
1388
1389     if (PARTDECOMP(cr))
1390     {
1391         pd_cg_range(cr, &cg0, &cg1);
1392     }
1393     else
1394     {
1395         cg0 = 0;
1396         if (DOMAINDECOMP(cr))
1397         {
1398             cg1 = cr->dd->ncg_tot;
1399         }
1400         else
1401         {
1402             cg1 = top->cgs.nr;
1403         }
1404         if (fr->n_tpi > 0)
1405         {
1406             cg1--;
1407         }
1408     }
1409
1410     bStateChanged  = (flags & GMX_FORCE_STATECHANGED);
1411     bNS            = (flags & GMX_FORCE_NS) && (fr->bAllvsAll == FALSE);
1412     /* Should we update the long-range neighborlists at this step? */
1413     bDoLongRangeNS = fr->bTwinRange && bNS;
1414     /* Should we perform the long-range nonbonded evaluation inside the neighborsearching? */
1415     bFillGrid      = (bNS && bStateChanged);
1416     bCalcCGCM      = (bFillGrid && !DOMAINDECOMP(cr));
1417     bDoForces      = (flags & GMX_FORCE_FORCES);
1418     bDoPotential   = (flags & GMX_FORCE_ENERGY);
1419     bSepLRF        = ((inputrec->nstcalclr > 1) && bDoForces &&
1420                       (flags & GMX_FORCE_SEPLRF) && (flags & GMX_FORCE_DO_LR));
1421
1422     /* should probably move this to the forcerec since it doesn't change */
1423     bDoAdressWF   = ((fr->adress_type != eAdressOff));
1424
1425     if (bStateChanged)
1426     {
1427         update_forcerec(fr, box);
1428
1429         if (NEED_MUTOT(*inputrec))
1430         {
1431             /* Calculate total (local) dipole moment in a temporary common array.
1432              * This makes it possible to sum them over nodes faster.
1433              */
1434             calc_mu(start, homenr,
1435                     x, mdatoms->chargeA, mdatoms->chargeB, mdatoms->nChargePerturbed,
1436                     mu, mu+DIM);
1437         }
1438     }
1439
1440     if (fr->ePBC != epbcNONE)
1441     {
1442         /* Compute shift vectors every step,
1443          * because of pressure coupling or box deformation!
1444          */
1445         if ((flags & GMX_FORCE_DYNAMICBOX) && bStateChanged)
1446         {
1447             calc_shifts(box, fr->shift_vec);
1448         }
1449
1450         if (bCalcCGCM)
1451         {
1452             put_charge_groups_in_box(fplog, cg0, cg1, fr->ePBC, box,
1453                                      &(top->cgs), x, fr->cg_cm);
1454             inc_nrnb(nrnb, eNR_CGCM, homenr);
1455             inc_nrnb(nrnb, eNR_RESETX, cg1-cg0);
1456         }
1457         else if (EI_ENERGY_MINIMIZATION(inputrec->eI) && graph)
1458         {
1459             unshift_self(graph, box, x);
1460         }
1461     }
1462     else if (bCalcCGCM)
1463     {
1464         calc_cgcm(fplog, cg0, cg1, &(top->cgs), x, fr->cg_cm);
1465         inc_nrnb(nrnb, eNR_CGCM, homenr);
1466     }
1467
1468     if (bCalcCGCM)
1469     {
1470         if (PAR(cr))
1471         {
1472             move_cgcm(fplog, cr, fr->cg_cm);
1473         }
1474         if (gmx_debug_at)
1475         {
1476             pr_rvecs(debug, 0, "cgcm", fr->cg_cm, top->cgs.nr);
1477         }
1478     }
1479
1480 #ifdef GMX_MPI
1481     if (!(cr->duty & DUTY_PME))
1482     {
1483         /* Send particle coordinates to the pme nodes.
1484          * Since this is only implemented for domain decomposition
1485          * and domain decomposition does not use the graph,
1486          * we do not need to worry about shifting.
1487          */
1488
1489         wallcycle_start(wcycle, ewcPP_PMESENDX);
1490
1491         bBS = (inputrec->nwall == 2);
1492         if (bBS)
1493         {
1494             copy_mat(box, boxs);
1495             svmul(inputrec->wall_ewald_zfac, boxs[ZZ], boxs[ZZ]);
1496         }
1497
1498         gmx_pme_send_x(cr, bBS ? boxs : box, x,
1499                        mdatoms->nChargePerturbed, lambda[efptCOUL],
1500                        (flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY)), step);
1501
1502         wallcycle_stop(wcycle, ewcPP_PMESENDX);
1503     }
1504 #endif /* GMX_MPI */
1505
1506     /* Communicate coordinates and sum dipole if necessary */
1507     if (PAR(cr))
1508     {
1509         wallcycle_start(wcycle, ewcMOVEX);
1510         if (DOMAINDECOMP(cr))
1511         {
1512             dd_move_x(cr->dd, box, x);
1513         }
1514         else
1515         {
1516             move_x(cr, x, nrnb);
1517         }
1518         wallcycle_stop(wcycle, ewcMOVEX);
1519     }
1520
1521     /* Update the AdResS weighting function beforehand */
1522     if (bStateChanged && bDoAdressWF)
1523     {
1524         /* We need PBC for the AdResS weight calculation with pbc_dx */
1525         set_pbc(&pbc, inputrec->ePBC, box);
1526         if (fr->adress_site == eAdressSITEcog)
1527         {
1528             update_adress_weights_cog(top->idef.iparams, top->idef.il, x, fr, mdatoms,
1529                                       inputrec->ePBC == epbcNONE ? NULL : &pbc);
1530         }
1531         else if (fr->adress_site == eAdressSITEcom)
1532         {
1533             update_adress_weights_com(fplog, cg0, cg1, &(top->cgs), x, fr, mdatoms,
1534                                       inputrec->ePBC == epbcNONE ? NULL : &pbc);
1535         }
1536         else if (fr->adress_site == eAdressSITEatomatom)
1537         {
1538             update_adress_weights_atom_per_atom(cg0, cg1, &(top->cgs), x, fr, mdatoms,
1539                                                 inputrec->ePBC == epbcNONE ? NULL : &pbc);
1540         }
1541         else
1542         {
1543             update_adress_weights_atom(cg0, cg1, &(top->cgs), x, fr, mdatoms,
1544                                        inputrec->ePBC == epbcNONE ? NULL : &pbc);
1545         }
1546     }
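         /* The AdResS weight w is 1 in the explicit (atomistic) region, 0 in
          * the coarse-grained region and interpolates smoothly in the hybrid
          * zone; the mapping point it is evaluated at (COG, COM, or per atom)
          * is the choice made in the branches above.
          */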
1547
1548     if (NEED_MUTOT(*inputrec))
1549     {
1550
1551         if (bStateChanged)
1552         {
1553             if (PAR(cr))
1554             {
1555                 gmx_sumd(2*DIM, mu, cr);
1556             }
1557             for (i = 0; i < 2; i++)
1558             {
1559                 for (j = 0; j < DIM; j++)
1560                 {
1561                     fr->mu_tot[i][j] = mu[i*DIM + j];
1562                 }
1563             }
1564         }
1565         if (fr->efep == efepNO)
1566         {
1567             copy_rvec(fr->mu_tot[0], mu_tot);
1568         }
1569         else
1570         {
1571             for (j = 0; j < DIM; j++)
1572             {
1573                 mu_tot[j] =
1574                     (1.0 - lambda[efptCOUL])*fr->mu_tot[0][j] + lambda[efptCOUL]*fr->mu_tot[1][j];
1575             }
1576         }
1577     }
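         /* For free-energy runs the total dipole is interpolated linearly in
          * the coupling parameter: mu_tot = (1 - lambda)*mu_A + lambda*mu_B,
          * consistent with the linear interpolation of the charges themselves.
          */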
1578
1579     /* Reset energies */
1580     reset_enerdata(fr, bNS, enerd, MASTER(cr));
1581     clear_rvecs(SHIFTS, fr->fshift);
1582
1583     if (bNS)
1584     {
1585         wallcycle_start(wcycle, ewcNS);
1586
1587         if (graph && bStateChanged)
1588         {
1589             /* Calculate intramolecular shift vectors to make molecules whole */
1590             mk_mshift(fplog, graph, fr->ePBC, box, x);
1591         }
1592
1593         /* Do the actual neighbor searching */
1594         ns(fplog, fr, box,
1595            groups, top, mdatoms,
1596            cr, nrnb, bFillGrid,
1597            bDoLongRangeNS);
1598
1599         wallcycle_stop(wcycle, ewcNS);
1600     }
1601
1602     if (inputrec->implicit_solvent && bNS)
1603     {
1604         make_gb_nblist(cr, inputrec->gb_algorithm,
1605                        x, box, fr, &top->idef, graph, fr->born);
1606     }
1607
1608     if (DOMAINDECOMP(cr))
1609     {
1610         if (!(cr->duty & DUTY_PME))
1611         {
1612             wallcycle_start(wcycle, ewcPPDURINGPME);
1613             dd_force_flop_start(cr->dd, nrnb);
1614         }
1615     }
1616
1617     if (inputrec->bRot)
1618     {
1619         /* Enforced rotation has its own cycle counter that starts after the collective
1620          * coordinates have been communicated. It is added to ddCyclF to allow
1621          * for proper load-balancing */
1622         wallcycle_start(wcycle, ewcROT);
1623         do_rotation(cr, inputrec, box, x, t, step, wcycle, bNS);
1624         wallcycle_stop(wcycle, ewcROT);
1625     }
1626
1627     /* Start the force cycle counter.
1628      * This counter is stopped in do_force_lowlevel.
1629      * No parallel communication should occur while this counter is running,
1630      * since that will interfere with the dynamic load balancing.
1631      */
1632     wallcycle_start(wcycle, ewcFORCE);
1633
1634     if (bDoForces)
1635     {
1636         /* Reset forces for which the virial is calculated separately:
1637          * PME/Ewald forces if necessary */
1638         if (fr->bF_NoVirSum)
1639         {
1640             if (flags & GMX_FORCE_VIRIAL)
1641             {
1642                 fr->f_novirsum = fr->f_novirsum_alloc;
1643                 if (fr->bDomDec)
1644                 {
1645                     clear_rvecs(fr->f_novirsum_n, fr->f_novirsum);
1646                 }
1647                 else
1648                 {
1649                     clear_rvecs(homenr, fr->f_novirsum+start);
1650                 }
1651             }
1652             else
1653             {
1654                 /* We are not calculating the pressure so we do not need
1655                  * a separate array for forces that do not contribute
1656                  * to the pressure.
1657                  */
1658                 fr->f_novirsum = f;
1659             }
1660         }
1661
1662         /* Clear the short- and long-range forces */
1663         clear_rvecs(fr->natoms_force_constr, f);
1664         if (bSepLRF && do_per_step(step, inputrec->nstcalclr))
1665         {
1666             clear_rvecs(fr->natoms_force_constr, fr->f_twin);
1667         }
1668
1669         clear_rvec(fr->vir_diag_posres);
1670     }
1671     if (inputrec->ePull == epullCONSTRAINT)
1672     {
1673         clear_pull_forces(inputrec->pull);
1674     }
1675
1676     /* update QMMMrec, if necessary */
1677     if (fr->bQMMM)
1678     {
1679         update_QMMMrec(cr, fr, x, mdatoms, box, top);
1680     }
1681
1682     if ((flags & GMX_FORCE_BONDED) && top->idef.il[F_POSRES].nr > 0)
1683     {
1684         posres_wrapper(fplog, flags, bSepDVDL, inputrec, nrnb, top, box, x,
1685                        enerd, lambda, fr);
1686     }
1687
1688     if ((flags & GMX_FORCE_BONDED) && top->idef.il[F_FBPOSRES].nr > 0)
1689     {
1690         /* Flat-bottomed position restraints always require full pbc */
1691         if (!(bStateChanged && bDoAdressWF))
1692         {
1693             set_pbc(&pbc, inputrec->ePBC, box);
1694         }
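             /* For reference: in the simplest (spherical) geometry the
              * flat-bottomed restraint is V(d) = 0.5*k*(d - r_fb)^2 for
              * d = |x - x_ref| > r_fb and zero inside the flat bottom.
              */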
1695         v = fbposres(top->idef.il[F_FBPOSRES].nr, top->idef.il[F_FBPOSRES].iatoms,
1696                      top->idef.iparams_fbposres,
1697                      (const rvec*)x, fr->f_novirsum, fr->vir_diag_posres,
1698                      inputrec->ePBC == epbcNONE ? NULL : &pbc,
1699                      fr->rc_scaling, fr->ePBC, fr->posres_com);
1700         enerd->term[F_FBPOSRES] += v;
1701         inc_nrnb(nrnb, eNR_FBPOSRES, top->idef.il[F_FBPOSRES].nr/2);
1702     }
1703
1704     /* Compute the bonded and non-bonded energies and optionally forces */
1705     do_force_lowlevel(fplog, step, fr, inputrec, &(top->idef),
1706                       cr, nrnb, wcycle, mdatoms,
1707                       x, hist, f, bSepLRF ? fr->f_twin : f, enerd, fcd, top, fr->born,
1708                       &(top->atomtypes), bBornRadii, box,
1709                       inputrec->fepvals, lambda,
1710                       graph, &(top->excls), fr->mu_tot,
1711                       flags,
1712                       &cycles_pme);
1713
1714     if (bSepLRF)
1715     {
1716         if (do_per_step(step, inputrec->nstcalclr))
1717         {
1718             /* Add the long-range forces to the short-range forces */
1719             for (i = 0; i < fr->natoms_force_constr; i++)
1720             {
1721                 rvec_add(fr->f_twin[i], f[i], f[i]);
1722             }
1723         }
1724     }
1725
1726     cycles_force = wallcycle_stop(wcycle, ewcFORCE);
1727
1728     if (ed)
1729     {
1730         do_flood(cr, inputrec, x, f, ed, box, step, bNS);
1731     }
1732
1733     if (DOMAINDECOMP(cr))
1734     {
1735         dd_force_flop_stop(cr->dd, nrnb);
1736         if (wcycle)
1737         {
1738             dd_cycles_add(cr->dd, cycles_force-cycles_pme, ddCyclF);
1739         }
1740     }
1741
1742     if (bDoForces)
1743     {
1744         if (IR_ELEC_FIELD(*inputrec))
1745         {
1746             /* Compute forces due to electric field */
1747             calc_f_el(MASTER(cr) ? field : NULL,
1748                       start, homenr, mdatoms->chargeA, fr->f_novirsum,
1749                       inputrec->ex, inputrec->et, t);
1750         }
1751
1752         if (bDoAdressWF && fr->adress_icor == eAdressICThermoForce)
1753         {
1754             /* Compute thermodynamic force in hybrid AdResS region */
1755             adress_thermo_force(start, homenr, &(top->cgs), x, fr->f_novirsum, fr, mdatoms,
1756                                 inputrec->ePBC == epbcNONE ? NULL : &pbc);
1757         }
1758
1759         /* Communicate the forces */
1760         if (PAR(cr))
1761         {
1762             wallcycle_start(wcycle, ewcMOVEF);
1763             if (DOMAINDECOMP(cr))
1764             {
1765                 dd_move_f(cr->dd, f, fr->fshift);
1766                 /* Do we need to communicate the separate force array
1767                  * for terms that do not contribute to the single sum virial?
1768                  * Position restraints and electric fields do not introduce
1769                  * inter-cg forces, only full electrostatics methods do.
1770                  * When we do not calculate the virial, fr->f_novirsum = f,
1771                  * so we have already communicated these forces.
1772                  */
1773                 if (EEL_FULL(fr->eeltype) && cr->dd->n_intercg_excl &&
1774                     (flags & GMX_FORCE_VIRIAL))
1775                 {
1776                     dd_move_f(cr->dd, fr->f_novirsum, NULL);
1777                 }
1778                 if (bSepLRF)
1779                 {
1780                     /* We should not update the shift forces here,
1781                      * since f_twin is already included in f.
1782                      */
1783                     dd_move_f(cr->dd, fr->f_twin, NULL);
1784                 }
1785             }
1786             else
1787             {
1788                 pd_move_f(cr, f, nrnb);
1789                 if (bSepLRF)
1790                 {
1791                     pd_move_f(cr, fr->f_twin, nrnb);
1792                 }
1793             }
1794             wallcycle_stop(wcycle, ewcMOVEF);
1795         }
1796
1797         /* If we have NoVirSum forces, but we do not calculate the virial,
1798          * we sum fr->f_novirsum = f later.
1799          */
1800         if (vsite && !(fr->bF_NoVirSum && !(flags & GMX_FORCE_VIRIAL)))
1801         {
1802             wallcycle_start(wcycle, ewcVSITESPREAD);
1803             spread_vsite_f(vsite, x, f, fr->fshift, FALSE, NULL, nrnb,
1804                            &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr);
1805             wallcycle_stop(wcycle, ewcVSITESPREAD);
1806
1807             if (bSepLRF)
1808             {
1809                 wallcycle_start(wcycle, ewcVSITESPREAD);
1810                 spread_vsite_f(vsite, x, fr->f_twin, NULL, FALSE, NULL,
1811                                nrnb,
1812                                &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr);
1813                 wallcycle_stop(wcycle, ewcVSITESPREAD);
1814             }
1815         }
1816
1817         if (flags & GMX_FORCE_VIRIAL)
1818         {
1819             /* Calculation of the virial must be done after vsites! */
1820             calc_virial(mdatoms->start, mdatoms->homenr, x, f,
1821                         vir_force, graph, box, nrnb, fr, inputrec->ePBC);
1822         }
1823     }
1824
1825     if (inputrec->ePull == epullUMBRELLA || inputrec->ePull == epullCONST_F)
1826     {
1827         pull_potential_wrapper(fplog, bSepDVDL, cr, inputrec, box, x,
1828                                f, vir_force, mdatoms, enerd, lambda, t);
1829     }
1830
1831     /* Add the forces from enforced rotation potentials (if any) */
1832     if (inputrec->bRot)
1833     {
1834         wallcycle_start(wcycle, ewcROTadd);
1835         enerd->term[F_COM_PULL] += add_rot_forces(inputrec->rot, f, cr, step, t);
1836         wallcycle_stop(wcycle, ewcROTadd);
1837     }
1838
1839     if (PAR(cr) && !(cr->duty & DUTY_PME))
1840     {
1841         /* In case of node-splitting, the PP nodes receive the long-range
1842          * forces, virial and energy from the PME nodes here.
1843          */
1844         pme_receive_force_ener(fplog, bSepDVDL, cr, wcycle, enerd, fr);
1845     }
1846
1847     if (bDoForces)
1848     {
1849         post_process_forces(cr, step, nrnb, wcycle,
1850                             top, box, x, f, vir_force, mdatoms, graph, fr, vsite,
1851                             flags);
1852     }
1853
1854     /* Sum the potential energy terms from group contributions */
1855     sum_epot(&(enerd->grpp), enerd->term);
1856 }
1857
1858 void do_force(FILE *fplog, t_commrec *cr,
1859               t_inputrec *inputrec,
1860               gmx_large_int_t step, t_nrnb *nrnb, gmx_wallcycle_t wcycle,
1861               gmx_localtop_t *top,
1862               gmx_groups_t *groups,
1863               matrix box, rvec x[], history_t *hist,
1864               rvec f[],
1865               tensor vir_force,
1866               t_mdatoms *mdatoms,
1867               gmx_enerdata_t *enerd, t_fcdata *fcd,
1868               real *lambda, t_graph *graph,
1869               t_forcerec *fr,
1870               gmx_vsite_t *vsite, rvec mu_tot,
1871               double t, FILE *field, gmx_edsam_t ed,
1872               gmx_bool bBornRadii,
1873               int flags)
1874 {
1875     /* modify force flag if not doing nonbonded */
1876     if (!fr->bNonbonded)
1877     {
1878         flags &= ~GMX_FORCE_NONBONDED;
1879     }
1880
1881     switch (inputrec->cutoff_scheme)
1882     {
1883         case ecutsVERLET:
1884             do_force_cutsVERLET(fplog, cr, inputrec,
1885                                 step, nrnb, wcycle,
1886                                 top,
1887                                 groups,
1888                                 box, x, hist,
1889                                 f, vir_force,
1890                                 mdatoms,
1891                                 enerd, fcd,
1892                                 lambda, graph,
1893                                 fr, fr->ic,
1894                                 vsite, mu_tot,
1895                                 t, field, ed,
1896                                 bBornRadii,
1897                                 flags);
1898             break;
1899         case ecutsGROUP:
1900             do_force_cutsGROUP(fplog, cr, inputrec,
1901                                step, nrnb, wcycle,
1902                                top,
1903                                groups,
1904                                box, x, hist,
1905                                f, vir_force,
1906                                mdatoms,
1907                                enerd, fcd,
1908                                lambda, graph,
1909                                fr, vsite, mu_tot,
1910                                t, field, ed,
1911                                bBornRadii,
1912                                flags);
1913             break;
1914         default:
1915             gmx_incons("Invalid cut-off scheme passed!");
1916     }
1917 }
1918
1919
1920 void do_constrain_first(FILE *fplog, gmx_constr_t constr,
1921                         t_inputrec *ir, t_mdatoms *md,
1922                         t_state *state, t_commrec *cr, t_nrnb *nrnb,
1923                         t_forcerec *fr, gmx_localtop_t *top)
1924 {
1925     int             i, m, start, end;
1926     gmx_large_int_t step;
1927     real            dt = ir->delta_t;
1928     real            dvdl_dum;
1929     rvec           *savex;
1930
1931     snew(savex, state->natoms);
1932
1933     start = md->start;
1934     end   = md->homenr + start;
1935
1936     if (debug)
1937     {
1938         fprintf(debug, "vcm: start=%d, homenr=%d, end=%d\n",
1939                 start, md->homenr, end);
1940     }
1941     /* Do a first constraint pass to reset the particles... */
1942     step = ir->init_step;
1943     if (fplog)
1944     {
1945         char buf[STEPSTRSIZE];
1946         fprintf(fplog, "\nConstraining the starting coordinates (step %s)\n",
1947                 gmx_step_str(step, buf));
1948     }
1949     dvdl_dum = 0;
1950
1951     /* constrain the current position */
1952     constrain(NULL, TRUE, FALSE, constr, &(top->idef),
1953               ir, NULL, cr, step, 0, md,
1954               state->x, state->x, NULL,
1955               fr->bMolPBC, state->box,
1956               state->lambda[efptBONDED], &dvdl_dum,
1957               NULL, NULL, nrnb, econqCoord,
1958               ir->epc == epcMTTK, state->veta, state->veta);
1959     if (EI_VV(ir->eI))
1960     {
1961         /* constrain the initial velocity, and save it */
1962         /* this may also be useful if we need the ekin from the half step for velocity verlet */
1963         /* might not yet treat veta correctly */
1964         constrain(NULL, TRUE, FALSE, constr, &(top->idef),
1965                   ir, NULL, cr, step, 0, md,
1966                   state->x, state->v, state->v,
1967                   fr->bMolPBC, state->box,
1968                   state->lambda[efptBONDED], &dvdl_dum,
1969                   NULL, NULL, nrnb, econqVeloc,
1970                   ir->epc == epcMTTK, state->veta, state->veta);
1971     }
1972     /* constrain the initial velocities at t-dt/2 */
1973     if (EI_STATE_VELOCITY(ir->eI) && ir->eI != eiVV)
1974     {
1975         for (i = start; (i < end); i++)
1976         {
1977             for (m = 0; (m < DIM); m++)
1978             {
1979                 /* Reverse the velocity */
1980                 state->v[i][m] = -state->v[i][m];
1981                 /* Store the position at t-dt in buf */
1982                 savex[i][m] = state->x[i][m] + dt*state->v[i][m];
1983             }
1984         }
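             /* To first order x(-dt) = x(0) - dt*v(0); since v was just
              * negated, savex = x + dt*v is exactly this estimate of the
              * positions at t0-dt, which is then constrained below.
              */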
1985         /* Shake the positions at t=-dt with the positions at t=0
1986          * as reference coordinates.
1987          */
1988         if (fplog)
1989         {
1990             char buf[STEPSTRSIZE];
1991             fprintf(fplog, "\nConstraining the coordinates at t0-dt (step %s)\n",
1992                     gmx_step_str(step, buf));
1993         }
1994         dvdl_dum = 0;
1995         constrain(NULL, TRUE, FALSE, constr, &(top->idef),
1996                   ir, NULL, cr, step, -1, md,
1997                   state->x, savex, NULL,
1998                   fr->bMolPBC, state->box,
1999                   state->lambda[efptBONDED], &dvdl_dum,
2000                   state->v, NULL, nrnb, econqCoord,
2001                   ir->epc == epcMTTK, state->veta, state->veta);
2002
2003         for (i = start; i < end; i++)
2004         {
2005             for (m = 0; m < DIM; m++)
2006             {
2007                 /* Re-reverse the velocities */
2008                 state->v[i][m] = -state->v[i][m];
2009             }
2010         }
2011     }
2012     sfree(savex);
2013 }
2014
2015 void calc_enervirdiff(FILE *fplog, int eDispCorr, t_forcerec *fr)
2016 {
2017     double eners[2], virs[2], enersum, virsum, y0, f, g, h;
2018     double r0, r1, r, rc3, rc9, ea, eb, ec, pa, pb, pc, pd;
2019     double invscale, invscale2, invscale3;
2020     int    ri0, ri1, ri, i, offstart, offset;
2021     real   scale, *vdwtab, tabfactor, tmp;
2022
2023     fr->enershiftsix    = 0;
2024     fr->enershifttwelve = 0;
2025     fr->enerdiffsix     = 0;
2026     fr->enerdifftwelve  = 0;
2027     fr->virdiffsix      = 0;
2028     fr->virdifftwelve   = 0;
2029
2030     if (eDispCorr != edispcNO)
2031     {
2032         for (i = 0; i < 2; i++)
2033         {
2034             eners[i] = 0;
2035             virs[i]  = 0;
2036         }
2037         if ((fr->vdwtype == evdwSWITCH) || (fr->vdwtype == evdwSHIFT))
2038         {
2039             if (fr->rvdw_switch == 0)
2040             {
2041                 gmx_fatal(FARGS,
2042                           "With dispersion correction rvdw-switch cannot be zero "
2043                           "for vdw-type = %s", evdw_names[fr->vdwtype]);
2044             }
2045
2046             scale  = fr->nblists[0].table_elec_vdw.scale;
2047             vdwtab = fr->nblists[0].table_vdw.data;
2048
2049             /* Round the cut-offs to exact table values for precision */
2050             ri0  = floor(fr->rvdw_switch*scale);
2051             ri1  = ceil(fr->rvdw*scale);
2052             r0   = ri0/scale;
2053             r1   = ri1/scale;
2054             rc3  = r0*r0*r0;
2055             rc9  = rc3*rc3*rc3;
2056
2057             if (fr->vdwtype == evdwSHIFT)
2058             {
2059                 /* Determine the constant energy shift below rvdw_switch.
2060                  * The table carries a scale factor, since it was scaled down to compensate
2061                  * for the c6/c12 parameters being scaled up by the derivative factors, which saves flops in the analytical kernels.
2062                  */
2063                 fr->enershiftsix    = (real)(-1.0/(rc3*rc3)) - 6.0*vdwtab[8*ri0];
2064                 fr->enershifttwelve = (real)( 1.0/(rc9*rc3)) - 12.0*vdwtab[8*ri0 + 4];
2065             }
2066             /* Add the constant part from 0 to rvdw_switch.
2067              * This integration from 0 to rvdw_switch overcounts the number
2068              * of interactions by 1, as it also counts the self interaction.
2069              * We will correct for this later.
2070              */
2071             eners[0] += 4.0*M_PI*fr->enershiftsix*rc3/3.0;
2072             eners[1] += 4.0*M_PI*fr->enershifttwelve*rc3/3.0;
2073
2074             invscale  = 1.0/(scale);
2075             invscale2 = invscale*invscale;
2076             invscale3 = invscale*invscale2;
2077
2078             /* following summation derived from cubic spline definition,
2079                Numerical Recipes in C, second edition, p. 113-116.  Exact
2080                for the cubic spline.  We first calculate the negative of
2081                the energy from rvdw to rvdw_switch, assuming that g(r)=1,
2082                and then add the more standard, abrupt cutoff correction to
2083                that result, yielding the long-range correction for a
2084                switched function.  We perform both the pressure and energy
2085                loops at the same time for simplicity, as the computational
2086                cost is low. */
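                 /* Sketch of the integration: around table point ri the spline
                  * is V(eps) = y0 + f*eps + g*eps^2 + h*eps^3 with eps in [0,1)
                  * and r = (ri + eps)/scale, so r^2 dr = (ea*eps^2 + eb*eps + ec) deps.
                  * Integrating eps^n times this term by term produces the
                  * ea/3 + eb/2 + ec style factors accumulated in the loop below
                  * (and analogously pa..pd for the virial).
                  */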
2087
2088             for (i = 0; i < 2; i++)
2089             {
2090                 enersum = 0.0; virsum = 0.0;
2091                 if (i == 0)
2092                 {
2093                     offstart = 0;
2094                     /* Since the dispersion table has been scaled down a factor 6.0 and the repulsion
2095                      * a factor 12.0 to compensate for the c6/c12 parameters inside nbfp[] being scaled
2096                      * up (to save flops in kernels), we need to correct for this.
2097                      */
2098                     tabfactor = 6.0;
2099                 }
2100                 else
2101                 {
2102                     offstart  = 4;
2103                     tabfactor = 12.0;
2104                 }
2105                 for (ri = ri0; ri < ri1; ri++)
2106                 {
2107                     r  = ri*invscale;
2108                     ea = invscale3;
2109                     eb = 2.0*invscale2*r;
2110                     ec = invscale*r*r;
2111
2112                     pa = invscale3;
2113                     pb = 3.0*invscale2*r;
2114                     pc = 3.0*invscale*r*r;
2115                     pd = r*r*r;
2116
2117                     /* this "8" is from the packing in the vdwtab array - perhaps should be #define'ed? */
2118                     offset = 8*ri + offstart;
2119                     y0     = vdwtab[offset];
2120                     f      = vdwtab[offset+1];
2121                     g      = vdwtab[offset+2];
2122                     h      = vdwtab[offset+3];
2123
2124                     enersum += y0*(ea/3 + eb/2 + ec) + f*(ea/4 + eb/3 + ec/2) + g*(ea/5 + eb/4 + ec/3) + h*(ea/6 + eb/5 + ec/4);
2125                     virsum  += f*(pa/4 + pb/3 + pc/2 + pd) + 2*g*(pa/5 + pb/4 + pc/3 + pd/2) + 3*h*(pa/6 + pb/5 + pc/4 + pd/3);
2126                 }
2127
2128                 enersum  *= 4.0*M_PI*tabfactor;
2129                 virsum   *= 4.0*M_PI*tabfactor;
2130                 eners[i] -= enersum;
2131                 virs[i]  -= virsum;
2132             }
2133
2134             /* now add the correction for rvdw_switch to infinity */
2135             eners[0] += -4.0*M_PI/(3.0*rc3);
2136             eners[1] +=  4.0*M_PI/(9.0*rc9);
2137             virs[0]  +=  8.0*M_PI/rc3;
2138             virs[1]  += -16.0*M_PI/(3.0*rc9);
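                 /* For reference, the analytic tail integrals used above are
                  *   int_rc^inf 4*pi*r^2 * (-1/r^6)  dr = -4*pi/(3*rc^3)
                  *   int_rc^inf 4*pi*r^2 * ( 1/r^12) dr =  4*pi/(9*rc^9)
                  * with the virial terms following from r*dV/dr under the
                  * GROMACS sign conventions.
                  */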
2139         }
2140         else if ((fr->vdwtype == evdwCUT) || (fr->vdwtype == evdwUSER))
2141         {
2142             if (fr->vdwtype == evdwUSER && fplog)
2143             {
2144                 fprintf(fplog,
2145                         "WARNING: using dispersion correction with user tables\n");
2146             }
2147             rc3  = fr->rvdw*fr->rvdw*fr->rvdw;
2148             rc9  = rc3*rc3*rc3;
2149             /* Contribution beyond the cut-off */
2150             eners[0] += -4.0*M_PI/(3.0*rc3);
2151             eners[1] +=  4.0*M_PI/(9.0*rc9);
2152             if (fr->vdw_modifier == eintmodPOTSHIFT)
2153             {
2154                 /* Contribution within the cut-off */
2155                 eners[0] += -4.0*M_PI/(3.0*rc3);
2156                 eners[1] +=  4.0*M_PI/(3.0*rc9);
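                     /* The shifted potential V(r) - V(rc) misses V(rc) inside
                      * the cut-off, so the correction adds
                      * int_0^rc 4*pi*r^2 * V(rc) dr, i.e. -4*pi/(3*rc^3) for
                      * dispersion and +4*pi/(3*rc^9) for repulsion.
                      */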
2157             }
2158             /* Contribution beyond the cut-off */
2159             virs[0]  +=  8.0*M_PI/rc3;
2160             virs[1]  += -16.0*M_PI/(3.0*rc9);
2161         }
2162         else
2163         {
2164             gmx_fatal(FARGS,
2165                       "Dispersion correction is not implemented for vdw-type = %s",
2166                       evdw_names[fr->vdwtype]);
2167         }
2168         fr->enerdiffsix    = eners[0];
2169         fr->enerdifftwelve = eners[1];
2170         /* The 0.5 is due to the Gromacs definition of the virial */
2171         fr->virdiffsix     = 0.5*virs[0];
2172         fr->virdifftwelve  = 0.5*virs[1];
2173     }
2174 }
2175
2176 void calc_dispcorr(FILE *fplog, t_inputrec *ir, t_forcerec *fr,
2177                    gmx_large_int_t step, int natoms,
2178                    matrix box, real lambda, tensor pres, tensor virial,
2179                    real *prescorr, real *enercorr, real *dvdlcorr)
2180 {
2181     gmx_bool bCorrAll, bCorrPres;
2182     real     dvdlambda, invvol, dens, ninter, avcsix, avctwelve, enerdiff, svir = 0, spres = 0;
2183     int      m;
2184
2185     *prescorr = 0;
2186     *enercorr = 0;
2187     *dvdlcorr = 0;
2188
2189     clear_mat(virial);
2190     clear_mat(pres);
2191
2192     if (ir->eDispCorr != edispcNO)
2193     {
2194         bCorrAll  = (ir->eDispCorr == edispcAllEner ||
2195                      ir->eDispCorr == edispcAllEnerPres);
2196         bCorrPres = (ir->eDispCorr == edispcEnerPres ||
2197                      ir->eDispCorr == edispcAllEnerPres);
2198
2199         invvol = 1/det(box);
2200         if (fr->n_tpi)
2201         {
2202             /* Only correct for the interactions with the inserted molecule */
2203             dens   = (natoms - fr->n_tpi)*invvol;
2204             ninter = fr->n_tpi;
2205         }
2206         else
2207         {
2208             dens   = natoms*invvol;
2209             ninter = 0.5*natoms;
2210         }
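             /* With N homogeneous particles there are N*(N-1)/2 ~ N^2/2 pairs,
              * so ninter = N/2 together with dens = N/V yields the familiar
              * N^2/(2V) prefactor of the long-range correction.
              */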
2211
2212         if (ir->efep == efepNO)
2213         {
2214             avcsix    = fr->avcsix[0];
2215             avctwelve = fr->avctwelve[0];
2216         }
2217         else
2218         {
2219             avcsix    = (1 - lambda)*fr->avcsix[0]    + lambda*fr->avcsix[1];
2220             avctwelve = (1 - lambda)*fr->avctwelve[0] + lambda*fr->avctwelve[1];
2221         }
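             /* The average C6/C12 are interpolated linearly in lambda,
              * <C6>(lambda) = (1 - lambda)*<C6>_A + lambda*<C6>_B, so
              * d<C6>/dlambda = <C6>_B - <C6>_A, which is what enters the
              * dvdlambda terms accumulated below.
              */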
2222
2223         enerdiff   = ninter*(dens*fr->enerdiffsix - fr->enershiftsix);
2224         *enercorr += avcsix*enerdiff;
2225         dvdlambda  = 0.0;
2226         if (ir->efep != efepNO)
2227         {
2228             dvdlambda += (fr->avcsix[1] - fr->avcsix[0])*enerdiff;
2229         }
2230         if (bCorrAll)
2231         {
2232             enerdiff   = ninter*(dens*fr->enerdifftwelve - fr->enershifttwelve);
2233             *enercorr += avctwelve*enerdiff;
2234             if (fr->efep != efepNO)
2235             {
2236                 dvdlambda += (fr->avctwelve[1] - fr->avctwelve[0])*enerdiff;
2237             }
2238         }
2239
2240         if (bCorrPres)
2241         {
2242             svir = ninter*dens*avcsix*fr->virdiffsix/3.0;
2243             if (ir->eDispCorr == edispcAllEnerPres)
2244             {
2245                 svir += ninter*dens*avctwelve*fr->virdifftwelve/3.0;
2246             }
2247             /* The factor 2 is because of the Gromacs virial definition */
2248             spres = -2.0*invvol*svir*PRESFAC;
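                 /* In GROMACS conventions P = (2/V)*(Ekin - Xi), so the virial
                  * correction svir on each diagonal element gives
                  * spres = -2*svir*PRESFAC/V, with PRESFAC converting the
                  * internal energy density units to bar.
                  */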
2249
2250             for (m = 0; m < DIM; m++)
2251             {
2252                 virial[m][m] += svir;
2253                 pres[m][m]   += spres;
2254             }
2255             *prescorr += spres;
2256         }
2257
2258         /* Can't currently control when it prints; for now, just print when debugging */
2259         if (debug)
2260         {
2261             if (bCorrAll)
2262             {
2263                 fprintf(debug, "Long Range LJ corr.: <C6> %10.4e, <C12> %10.4e\n",
2264                         avcsix, avctwelve);
2265             }
2266             if (bCorrPres)
2267             {
2268                 fprintf(debug,
2269                         "Long Range LJ corr.: Epot %10g, Pres: %10g, Vir: %10g\n",
2270                         *enercorr, spres, svir);
2271             }
2272             else
2273             {
2274                 fprintf(debug, "Long Range LJ corr.: Epot %10g\n", *enercorr);
2275             }
2276         }
2277
2278         if (fr->bSepDVDL && do_per_step(step, ir->nstlog))
2279         {
2280             gmx_print_sepdvdl(fplog, "Dispersion correction", *enercorr, dvdlambda);
2281         }
2282         if (fr->efep != efepNO)
2283         {
2284             *dvdlcorr += dvdlambda;
2285         }
2286     }
2287 }
2288
2289 void do_pbc_first(FILE *fplog, matrix box, t_forcerec *fr,
2290                   t_graph *graph, rvec x[])
2291 {
2292     if (fplog)
2293     {
2294         fprintf(fplog, "Removing pbc first time\n");
2295     }
2296     calc_shifts(box, fr->shift_vec);
2297     if (graph)
2298     {
2299         mk_mshift(fplog, graph, fr->ePBC, box, x);
2300         if (gmx_debug_at)
2301         {
2302             p_graph(debug, "do_pbc_first 1", graph);
2303         }
2304         shift_self(graph, box, x);
2305         /* By doing an extra mk_mshift the molecules that are broken
2306          * because they were e.g. imported from another software
2307          * will be made whole again. Such are the healing powers
2308          * of GROMACS.
2309          */
2310         mk_mshift(fplog, graph, fr->ePBC, box, x);
2311         if (gmx_debug_at)
2312         {
2313             p_graph(debug, "do_pbc_first 2", graph);
2314         }
2315     }
2316     if (fplog)
2317     {
2318         fprintf(fplog, "Done rmpbc\n");
2319     }
2320 }
2321
2322 static void low_do_pbc_mtop(FILE *fplog, int ePBC, matrix box,
2323                             gmx_mtop_t *mtop, rvec x[],
2324                             gmx_bool bFirst)
2325 {
2326     t_graph        *graph;
2327     int             mb, as, mol;
2328     gmx_molblock_t *molb;
2329
2330     if (bFirst && fplog)
2331     {
2332         fprintf(fplog, "Removing pbc first time\n");
2333     }
2334
2335     snew(graph, 1);
2336     as = 0;
2337     for (mb = 0; mb < mtop->nmolblock; mb++)
2338     {
2339         molb = &mtop->molblock[mb];
2340         if (molb->natoms_mol == 1 ||
2341             (!bFirst && mtop->moltype[molb->type].cgs.nr == 1))
2342         {
2343             /* Just one atom or charge group in the molecule, no PBC required */
2344             as += molb->nmol*molb->natoms_mol;
2345         }
2346         else
2347         {
2348             /* Pass NULL instead of fplog to avoid graph prints for each molecule type */
2349             mk_graph_ilist(NULL, mtop->moltype[molb->type].ilist,
2350                            0, molb->natoms_mol, FALSE, FALSE, graph);
2351
2352             for (mol = 0; mol < molb->nmol; mol++)
2353             {
2354                 mk_mshift(fplog, graph, ePBC, box, x+as);
2355
2356                 shift_self(graph, box, x+as);
2357                 /* The molecule is whole now.
2358                  * We don't need the second mk_mshift call as in do_pbc_first,
2359                  * since we no longer need this graph.
2360                  */
2361
2362                 as += molb->natoms_mol;
2363             }
2364             done_graph(graph);
2365         }
2366     }
2367     sfree(graph);
2368 }
2369
2370 void do_pbc_first_mtop(FILE *fplog, int ePBC, matrix box,
2371                        gmx_mtop_t *mtop, rvec x[])
2372 {
2373     low_do_pbc_mtop(fplog, ePBC, box, mtop, x, TRUE);
2374 }
2375
2376 void do_pbc_mtop(FILE *fplog, int ePBC, matrix box,
2377                  gmx_mtop_t *mtop, rvec x[])
2378 {
2379     low_do_pbc_mtop(fplog, ePBC, box, mtop, x, FALSE);
2380 }
2381
2382 void finish_run(FILE *fplog, t_commrec *cr,
2383                 t_inputrec *inputrec,
2384                 t_nrnb nrnb[], gmx_wallcycle_t wcycle,
2385                 gmx_walltime_accounting_t walltime_accounting,
2386                 wallclock_gpu_t *gputimes,
2387                 gmx_bool bWriteStat)
2388 {
2389     int     i, j;
2390     t_nrnb *nrnb_tot = NULL;
2391     real    delta_t;
2392     double  nbfs, mflop;
2393     double  elapsed_time,
2394             elapsed_time_over_all_ranks,
2395             elapsed_time_over_all_threads,
2396             elapsed_time_over_all_threads_over_all_ranks;
2397     wallcycle_sum(cr, wcycle);
2398
2399     if (cr->nnodes > 1)
2400     {
2401         snew(nrnb_tot, 1);
2402 #ifdef GMX_MPI
2403         MPI_Allreduce(nrnb->n, nrnb_tot->n, eNRNB, MPI_DOUBLE, MPI_SUM,
2404                       cr->mpi_comm_mysim);
2405 #endif
2406     }
2407     else
2408     {
2409         nrnb_tot = nrnb;
2410     }
2411
2412     elapsed_time                                 = walltime_accounting_get_elapsed_time(walltime_accounting);
2413     elapsed_time_over_all_ranks                  = elapsed_time;
2414     elapsed_time_over_all_threads                = walltime_accounting_get_elapsed_time_over_all_threads(walltime_accounting);
2415     elapsed_time_over_all_threads_over_all_ranks = elapsed_time_over_all_threads;
2416 #ifdef GMX_MPI
2417     if (cr->nnodes > 1)
2418     {
2419         /* reduce elapsed_time over all MPI ranks in the current simulation */
2420         MPI_Allreduce(&elapsed_time,
2421                       &elapsed_time_over_all_ranks,
2422                       1, MPI_DOUBLE, MPI_SUM,
2423                       cr->mpi_comm_mysim);
2424         elapsed_time_over_all_ranks /= cr->nnodes;
2425         /* Reduce elapsed_time_over_all_threads over all MPI ranks in the
2426          * current simulation. */
2427         MPI_Allreduce(&elapsed_time_over_all_threads,
2428                       &elapsed_time_over_all_threads_over_all_ranks,
2429                       1, MPI_DOUBLE, MPI_SUM,
2430                       cr->mpi_comm_mysim);
2431     }
2432 #endif
2433
2434     if (SIMMASTER(cr))
2435     {
2436         print_flop(fplog, nrnb_tot, &nbfs, &mflop);
2437     }
2438     if (cr->nnodes > 1)
2439     {
2440         sfree(nrnb_tot);
2441     }
2442
2443     if ((cr->duty & DUTY_PP) && DOMAINDECOMP(cr))
2444     {
2445         print_dd_statistics(cr, inputrec, fplog);
2446     }
2447
2448 #ifdef GMX_MPI
2449     if (PARTDECOMP(cr))
2450     {
2451         if (MASTER(cr))
2452         {
2453             t_nrnb     *nrnb_all;
2454             int         s;
2455             MPI_Status  stat;
2456
2457             snew(nrnb_all, cr->nnodes);
2458             nrnb_all[0] = *nrnb;
2459             for (s = 1; s < cr->nnodes; s++)
2460             {
2461                 MPI_Recv(nrnb_all[s].n, eNRNB, MPI_DOUBLE, s, 0,
2462                          cr->mpi_comm_mysim, &stat);
2463             }
2464             pr_load(fplog, cr, nrnb_all);
2465             sfree(nrnb_all);
2466         }
2467         else
2468         {
2469             MPI_Send(nrnb->n, eNRNB, MPI_DOUBLE, MASTERRANK(cr), 0,
2470                      cr->mpi_comm_mysim);
2471         }
2472     }
2473 #endif
2474
2475     if (SIMMASTER(cr))
2476     {
2477         wallcycle_print(fplog, cr->nnodes, cr->npmenodes,
2478                         elapsed_time_over_all_ranks,
2479                         wcycle, gputimes);
2480
2481         if (EI_DYNAMICS(inputrec->eI))
2482         {
2483             delta_t = inputrec->delta_t;
2484         }
2485         else
2486         {
2487             delta_t = 0;
2488         }
2489
2490         if (fplog)
2491         {
2492             print_perf(fplog, elapsed_time_over_all_threads_over_all_ranks,
2493                        elapsed_time_over_all_ranks,
2494                        walltime_accounting_get_nsteps_done(walltime_accounting),
2495                        delta_t, nbfs, mflop);
2496         }
2497         if (bWriteStat)
2498         {
2499             print_perf(stderr, elapsed_time_over_all_threads_over_all_ranks,
2500                        elapsed_time_over_all_ranks,
2501                        walltime_accounting_get_nsteps_done(walltime_accounting),
2502                        delta_t, nbfs, mflop);
2503         }
2504     }
2505 }
2506
2507 extern void initialize_lambdas(FILE *fplog, t_inputrec *ir, int *fep_state, real *lambda, double *lam0)
2508 {
2509     /* this function works, but could probably use a logic rewrite to keep all the different
2510        types of efep straight. */
2511
2512     int       i;
2513     t_lambda *fep = ir->fepvals;
2514
2515     if ((ir->efep == efepNO) && (ir->bSimTemp == FALSE))
2516     {
2517         for (i = 0; i < efptNR; i++)
2518         {
2519             lambda[i] = 0.0;
2520             if (lam0)
2521             {
2522                 lam0[i] = 0.0;
2523             }
2524         }
2525         return;
2526     }
2527     else
2528     {
2529         *fep_state = fep->init_fep_state; /* this might overwrite the state read from
2530                                              a checkpoint; for now a kludge is in place
2531                                              to prevent this. */
2532         for (i = 0; i < efptNR; i++)
2533         {
2534             /* for now, overwrite the lambda state with init_lambda for backwards compatibility */
2535             if (fep->init_lambda >= 0) /* if it's -1, it was never initialized */
2536             {
2537                 lambda[i] = fep->init_lambda;
2538                 if (lam0)
2539                 {
2540                     lam0[i] = lambda[i];
2541                 }
2542             }
2543             else
2544             {
2545                 lambda[i] = fep->all_lambda[i][*fep_state];
2546                 if (lam0)
2547                 {
2548                     lam0[i] = lambda[i];
2549                 }
2550             }
2551         }
2552         if (ir->bSimTemp)
2553         {
2554             /* need to rescale control temperatures to match current state */
2555             for (i = 0; i < ir->opts.ngtc; i++)
2556             {
2557                 if (ir->opts.ref_t[i] > 0)
2558                 {
2559                     ir->opts.ref_t[i] = ir->simtempvals->temperatures[*fep_state];
2560                 }
2561             }
2562         }
2563     }
2564
2565     /* Write the initial lambda components to the log */
2566     if (fplog != NULL)
2567     {
2568         fprintf(fplog, "Initial vector of lambda components:[ ");
2569         for (i = 0; i < efptNR; i++)
2570         {
2571             fprintf(fplog, "%10.4f ", lambda[i]);
2572         }
2573         fprintf(fplog, "]\n");
2574     }
2575     return;
2576 }
2577
2578
2579 void init_md(FILE *fplog,
2580              t_commrec *cr, t_inputrec *ir, const output_env_t oenv,
2581              double *t, double *t0,
2582              real *lambda, int *fep_state, double *lam0,
2583              t_nrnb *nrnb, gmx_mtop_t *mtop,
2584              gmx_update_t *upd,
2585              int nfile, const t_filenm fnm[],
2586              gmx_mdoutf_t **outf, t_mdebin **mdebin,
2587              tensor force_vir, tensor shake_vir, rvec mu_tot,
2588              gmx_bool *bSimAnn, t_vcm **vcm, unsigned long Flags)
2589 {
2590     int  i, j, n;
2591     real tmpt, mod;
2592
2593     /* Initial values */
2594     *t = *t0       = ir->init_t;
2595
2596     *bSimAnn = FALSE;
2597     for (i = 0; i < ir->opts.ngtc; i++)
2598     {
2599         /* set bSimAnn if any group is being annealed */
2600         if (ir->opts.annealing[i] != eannNO)
2601         {
2602             *bSimAnn = TRUE;
2603         }
2604     }
2605     if (*bSimAnn)
2606     {
2607         update_annealing_target_temp(&(ir->opts), ir->init_t);
2608     }
2609
2610     /* Initialize lambda variables */
2611     initialize_lambdas(fplog, ir, fep_state, lambda, lam0);
2612
2613     if (upd)
2614     {
2615         *upd = init_update(ir);
2616     }
2617
2618
2619     if (vcm != NULL)
2620     {
2621         *vcm = init_vcm(fplog, &mtop->groups, ir);
2622     }
2623
2624     if (EI_DYNAMICS(ir->eI) && !(Flags & MD_APPENDFILES))
2625     {
2626         if (ir->etc == etcBERENDSEN)
2627         {
2628             please_cite(fplog, "Berendsen84a");
2629         }
2630         if (ir->etc == etcVRESCALE)
2631         {
2632             please_cite(fplog, "Bussi2007a");
2633         }
2634     }
2635
2636     init_nrnb(nrnb);
2637
2638     if (nfile != -1)
2639     {
2640         *outf = init_mdoutf(nfile, fnm, Flags, cr, ir, oenv);
2641
2642         *mdebin = init_mdebin((Flags & MD_APPENDFILES) ? NULL : (*outf)->fp_ene,
2643                               mtop, ir, (*outf)->fp_dhdl);
2644     }
2645
2646     if (ir->bAdress)
2647     {
2648         please_cite(fplog, "Fritsch12");
2649         please_cite(fplog, "Junghans10");
2650     }
2651     /* Initialize variables */
2652     clear_mat(force_vir);
2653     clear_mat(shake_vir);
2654     clear_rvec(mu_tot);
2655
2656     debug_gmx();
2657 }