Merge branch release-5-0 into release-5-1
authorMark Abraham <mark.j.abraham@gmail.com>
Fri, 11 Mar 2016 00:37:02 +0000 (01:37 +0100)
committerMark Abraham <mark.j.abraham@gmail.com>
Fri, 11 Mar 2016 00:37:55 +0000 (01:37 +0100)
Change-Id: Icb8d7bc8d5639a824fe5f45e72eede3ec7b9d3c9

1  2 
src/gromacs/mdlib/sim_util.cpp

index b2d98a11a017e75e3b4021aab4b9b7b85371b075,13b8fc7a9a81dc5e5b800c4231fd68ffa7756040..ac13d64b9392aef451aa29cd444b2331a266b1af
@@@ -3,7 -3,7 +3,7 @@@
   *
   * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   * Copyright (c) 2001-2004, The GROMACS development team.
-  * Copyright (c) 2013,2014,2015, by the GROMACS development team, led by
+  * Copyright (c) 2013,2014,2015,2016, by the GROMACS development team, led by
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
   * To help us fund GROMACS development, we humbly ask that you cite
   * the research papers on the package. Check out http://www.gromacs.org.
   */
 -#ifdef HAVE_CONFIG_H
 -#include <config.h>
 -#endif
 +#include "gmxpre.h"
 +
 +#include "gromacs/legacyheaders/sim_util.h"
 +
 +#include "config.h"
  
  #include <assert.h>
  #include <math.h>
  #include <stdio.h>
  #include <string.h>
 -#ifdef HAVE_SYS_TIME_H
 -#include <sys/time.h>
 -#endif
 -
 -#include "typedefs.h"
 -#include "gromacs/utility/cstringutil.h"
 -#include "gromacs/utility/smalloc.h"
 -#include "names.h"
 -#include "txtdump.h"
 -#include "pbc.h"
 -#include "chargegroup.h"
 -#include "vec.h"
 -#include "nrnb.h"
 -#include "mshift.h"
 -#include "mdrun.h"
 -#include "sim_util.h"
 -#include "update.h"
 -#include "physics.h"
 -#include "main.h"
 -#include "mdatoms.h"
 -#include "force.h"
 -#include "bondf.h"
 -#include "pme.h"
 -#include "disre.h"
 -#include "orires.h"
 -#include "network.h"
 -#include "calcmu.h"
 -#include "constr.h"
 -#include "xvgr.h"
 -#include "copyrite.h"
 -#include "domdec.h"
 -#include "genborn.h"
 -#include "nbnxn_atomdata.h"
 -#include "nbnxn_search.h"
 -#include "nbnxn_kernels/nbnxn_kernel_ref.h"
 -#include "nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn.h"
 -#include "nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn.h"
 -#include "nbnxn_kernels/nbnxn_kernel_gpu_ref.h"
 -#include "nonbonded.h"
 -#include "../gmxlib/nonbonded/nb_kernel.h"
 -#include "../gmxlib/nonbonded/nb_free_energy.h"
  
 -#include "gromacs/timing/wallcycle.h"
 -#include "gromacs/timing/walltime_accounting.h"
 -#include "gromacs/utility/gmxmpi.h"
 +#include "gromacs/domdec/domdec.h"
  #include "gromacs/essentialdynamics/edsam.h"
 +#include "gromacs/ewald/pme.h"
 +#include "gromacs/gmxlib/nonbonded/nb_free_energy.h"
 +#include "gromacs/gmxlib/nonbonded/nb_kernel.h"
 +#include "gromacs/imd/imd.h"
 +#include "gromacs/legacyheaders/calcmu.h"
 +#include "gromacs/legacyheaders/chargegroup.h"
 +#include "gromacs/legacyheaders/constr.h"
 +#include "gromacs/legacyheaders/copyrite.h"
 +#include "gromacs/legacyheaders/disre.h"
 +#include "gromacs/legacyheaders/force.h"
 +#include "gromacs/legacyheaders/genborn.h"
 +#include "gromacs/legacyheaders/gmx_omp_nthreads.h"
 +#include "gromacs/legacyheaders/mdatoms.h"
 +#include "gromacs/legacyheaders/mdrun.h"
 +#include "gromacs/legacyheaders/names.h"
 +#include "gromacs/legacyheaders/network.h"
 +#include "gromacs/legacyheaders/nonbonded.h"
 +#include "gromacs/legacyheaders/nrnb.h"
 +#include "gromacs/legacyheaders/orires.h"
 +#include "gromacs/legacyheaders/qmmm.h"
 +#include "gromacs/legacyheaders/txtdump.h"
 +#include "gromacs/legacyheaders/typedefs.h"
 +#include "gromacs/legacyheaders/update.h"
 +#include "gromacs/legacyheaders/types/commrec.h"
 +#include "gromacs/listed-forces/bonded.h"
 +#include "gromacs/math/units.h"
 +#include "gromacs/math/vec.h"
 +#include "gromacs/mdlib/nb_verlet.h"
 +#include "gromacs/mdlib/nbnxn_atomdata.h"
 +#include "gromacs/mdlib/nbnxn_gpu_data_mgmt.h"
 +#include "gromacs/mdlib/nbnxn_search.h"
 +#include "gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_gpu_ref.h"
 +#include "gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_ref.h"
 +#include "gromacs/mdlib/nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn.h"
 +#include "gromacs/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn.h"
 +#include "gromacs/pbcutil/ishift.h"
 +#include "gromacs/pbcutil/mshift.h"
 +#include "gromacs/pbcutil/pbc.h"
  #include "gromacs/pulling/pull.h"
  #include "gromacs/pulling/pull_rotation.h"
 -#include "gromacs/imd/imd.h"
 -#include "adress.h"
 -#include "qmmm.h"
 -
 -#include "gmx_omp_nthreads.h"
 +#include "gromacs/timing/gpu_timing.h"
 +#include "gromacs/timing/wallcycle.h"
 +#include "gromacs/timing/walltime_accounting.h"
 +#include "gromacs/utility/cstringutil.h"
 +#include "gromacs/utility/gmxmpi.h"
 +#include "gromacs/utility/smalloc.h"
 +#include "gromacs/utility/sysinfo.h"
  
 -#include "nbnxn_cuda_data_mgmt.h"
 -#include "nbnxn_cuda/nbnxn_cuda.h"
 +#include "adress.h"
 +#include "nbnxn_gpu.h"
  
  void print_time(FILE                     *out,
                  gmx_walltime_accounting_t walltime_accounting,
@@@ -212,9 -213,11 +212,9 @@@ static void sum_forces(int start, int e
   * force is kJ mol^-1 nm^-1 = e * kJ mol^-1 nm^-1 / e
   *
   * Et[] contains the parameters for the time dependent
 - * part of the field (not yet used).
 + * part of the field.
   * Ex[] contains the parameters for
 - * the spatial dependent part of the field. You can have cool periodic
 - * fields in principle, but only a constant field is supported
 - * now.
 + * the spatial dependent part of the field.
   * The function should return the energy due to the electric field
   * (if any) but for now returns 0.
   *
   * For neutral systems with many charged molecules the error is small.
   * But for systems with a net charge or a few charged molecules
   * the error can be significant when the field is high.
 - * Solution: implement a self-consitent electric field into PME.
 + * Solution: implement a self-consistent electric field into PME.
   */
  static void calc_f_el(FILE *fp, int  start, int homenr,
                        real charge[], rvec f[],
@@@ -278,7 -281,8 +278,7 @@@ static void calc_virial(int start, int 
                          tensor vir_part, t_graph *graph, matrix box,
                          t_nrnb *nrnb, const t_forcerec *fr, int ePBC)
  {
 -    int    i, j;
 -    tensor virtest;
 +    int    i;
  
      /* The short-range virial from surrounding boxes */
      clear_mat(vir_part);
      }
  }
  
 -static void posres_wrapper(FILE *fplog,
 -                           int flags,
 -                           gmx_bool bSepDVDL,
 -                           t_inputrec *ir,
 -                           t_nrnb *nrnb,
 -                           gmx_localtop_t *top,
 -                           matrix box, rvec x[],
 -                           gmx_enerdata_t *enerd,
 -                           real *lambda,
 -                           t_forcerec *fr)
 -{
 -    t_pbc pbc;
 -    real  v, dvdl;
 -    int   i;
 -
 -    /* Position restraints always require full pbc */
 -    set_pbc(&pbc, ir->ePBC, box);
 -    dvdl = 0;
 -    v    = posres(top->idef.il[F_POSRES].nr, top->idef.il[F_POSRES].iatoms,
 -                  top->idef.iparams_posres,
 -                  (const rvec*)x, fr->f_novirsum, fr->vir_diag_posres,
 -                  ir->ePBC == epbcNONE ? NULL : &pbc,
 -                  lambda[efptRESTRAINT], &dvdl,
 -                  fr->rc_scaling, fr->ePBC, fr->posres_com, fr->posres_comB);
 -    if (bSepDVDL)
 -    {
 -        gmx_print_sepdvdl(fplog, interaction_function[F_POSRES].longname, v, dvdl);
 -    }
 -    enerd->term[F_POSRES] += v;
 -    /* If just the force constant changes, the FEP term is linear,
 -     * but if k changes, it is not.
 -     */
 -    enerd->dvdl_nonlin[efptRESTRAINT] += dvdl;
 -    inc_nrnb(nrnb, eNR_POSRES, top->idef.il[F_POSRES].nr/2);
 -
 -    if ((ir->fepvals->n_lambda > 0) && (flags & GMX_FORCE_DHDL))
 -    {
 -        for (i = 0; i < enerd->n_lambda; i++)
 -        {
 -            real dvdl_dum, lambda_dum;
 -
 -            lambda_dum = (i == 0 ? lambda[efptRESTRAINT] : ir->fepvals->all_lambda[efptRESTRAINT][i-1]);
 -            v          = posres(top->idef.il[F_POSRES].nr, top->idef.il[F_POSRES].iatoms,
 -                                top->idef.iparams_posres,
 -                                (const rvec*)x, NULL, NULL,
 -                                ir->ePBC == epbcNONE ? NULL : &pbc, lambda_dum, &dvdl,
 -                                fr->rc_scaling, fr->ePBC, fr->posres_com, fr->posres_comB);
 -            enerd->enerpart_lambda[i] += v;
 -        }
 -    }
 -}
 -
 -static void fbposres_wrapper(t_inputrec *ir,
 -                             t_nrnb *nrnb,
 -                             gmx_localtop_t *top,
 -                             matrix box, rvec x[],
 -                             gmx_enerdata_t *enerd,
 -                             t_forcerec *fr)
 -{
 -    t_pbc pbc;
 -    real  v;
 -
 -    /* Flat-bottomed position restraints always require full pbc */
 -    set_pbc(&pbc, ir->ePBC, box);
 -    v = fbposres(top->idef.il[F_FBPOSRES].nr, top->idef.il[F_FBPOSRES].iatoms,
 -                 top->idef.iparams_fbposres,
 -                 (const rvec*)x, fr->f_novirsum, fr->vir_diag_posres,
 -                 ir->ePBC == epbcNONE ? NULL : &pbc,
 -                 fr->rc_scaling, fr->ePBC, fr->posres_com);
 -    enerd->term[F_FBPOSRES] += v;
 -    inc_nrnb(nrnb, eNR_FBPOSRES, top->idef.il[F_FBPOSRES].nr/2);
 -}
 -
 -static void pull_potential_wrapper(FILE *fplog,
 -                                   gmx_bool bSepDVDL,
 -                                   t_commrec *cr,
 +static void pull_potential_wrapper(t_commrec *cr,
                                     t_inputrec *ir,
                                     matrix box, rvec x[],
                                     rvec f[],
      set_pbc(&pbc, ir->ePBC, box);
      dvdl                     = 0;
      enerd->term[F_COM_PULL] +=
 -        pull_potential(ir->ePull, ir->pull, mdatoms, &pbc,
 +        pull_potential(ir->pull_work, mdatoms, &pbc,
                         cr, t, lambda[efptRESTRAINT], x, f, vir_force, &dvdl);
 -    if (bSepDVDL)
 -    {
 -        gmx_print_sepdvdl(fplog, "Com pull", enerd->term[F_COM_PULL], dvdl);
 -    }
      enerd->dvdl_lin[efptRESTRAINT] += dvdl;
      wallcycle_stop(wcycle, ewcPULLPOT);
  }
  
 -static void pme_receive_force_ener(FILE           *fplog,
 -                                   gmx_bool        bSepDVDL,
 -                                   t_commrec      *cr,
 +static void pme_receive_force_ener(t_commrec      *cr,
                                     gmx_wallcycle_t wcycle,
                                     gmx_enerdata_t *enerd,
                                     t_forcerec     *fr)
  {
 -    real   e_q, e_lj, v, dvdl_q, dvdl_lj;
 +    real   e_q, e_lj, dvdl_q, dvdl_lj;
      float  cycles_ppdpme, cycles_seppme;
  
      cycles_ppdpme = wallcycle_stop(wcycle, ewcPPDURINGPME);
      gmx_pme_receive_f(cr, fr->f_novirsum, fr->vir_el_recip, &e_q,
                        fr->vir_lj_recip, &e_lj, &dvdl_q, &dvdl_lj,
                        &cycles_seppme);
 -    if (bSepDVDL)
 -    {
 -        gmx_print_sepdvdl(fplog, "Electrostatic PME mesh", e_q, dvdl_q);
 -        gmx_print_sepdvdl(fplog, "Lennard-Jones PME mesh", e_lj, dvdl_lj);
 -    }
      enerd->term[F_COUL_RECIP] += e_q;
      enerd->term[F_LJ_RECIP]   += e_lj;
      enerd->dvdl_lin[efptCOUL] += dvdl_q;
@@@ -461,9 -551,10 +461,9 @@@ static void do_nb_verlet(t_forcerec *fr
                           t_nrnb *nrnb,
                           gmx_wallcycle_t wcycle)
  {
 -    int                        nnbl, kernel_type, enr_nbnxn_kernel_ljc, enr_nbnxn_kernel_lj;
 -    char                      *env;
 +    int                        enr_nbnxn_kernel_ljc, enr_nbnxn_kernel_lj;
      nonbonded_verlet_group_t  *nbvg;
 -    gmx_bool                   bCUDA;
 +    gmx_bool                   bUsingGpuKernels;
  
      if (!(flags & GMX_FORCE_NONBONDED))
      {
  
      nbvg = &fr->nbv->grp[ilocality];
  
 -    /* CUDA kernel launch overhead is already timed separately */
 +    /* GPU kernel launch overhead is already timed separately */
      if (fr->cutoff_scheme != ecutsVERLET)
      {
          gmx_incons("Invalid cut-off scheme passed!");
      }
  
 -    bCUDA = (nbvg->kernel_type == nbnxnk8x8x8_CUDA);
 +    bUsingGpuKernels = (nbvg->kernel_type == nbnxnk8x8x8_GPU);
  
 -    if (!bCUDA)
 +    if (!bUsingGpuKernels)
      {
          wallcycle_sub_start(wcycle, ewcsNONBONDED);
      }
                                     enerd->grpp.ener[egLJSR]);
              break;
  
 -        case nbnxnk8x8x8_CUDA:
 -            nbnxn_cuda_launch_kernel(fr->nbv->cu_nbv, nbvg->nbat, flags, ilocality);
 +        case nbnxnk8x8x8_GPU:
 +            nbnxn_gpu_launch_kernel(fr->nbv->gpu_nbv, nbvg->nbat, flags, ilocality);
              break;
  
          case nbnxnk8x8x8_PlainC:
              gmx_incons("Invalid nonbonded kernel type passed!");
  
      }
 -    if (!bCUDA)
 +    if (!bUsingGpuKernels)
      {
          wallcycle_sub_stop(wcycle, ewcsNONBONDED);
      }
      {
          enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_RF;
      }
 -    else if ((!bCUDA && nbvg->ewald_excl == ewaldexclAnalytical) ||
 -             (bCUDA && nbnxn_cuda_is_kernel_ewald_analytical(fr->nbv->cu_nbv)))
 +    else if ((!bUsingGpuKernels && nbvg->ewald_excl == ewaldexclAnalytical) ||
 +             (bUsingGpuKernels && nbnxn_gpu_is_kernel_ewald_analytical(fr->nbv->gpu_nbv)))
      {
          enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_EWALD;
      }
@@@ -711,11 -802,6 +711,11 @@@ static void do_nb_verlet_fep(nbnxn_pair
      wallcycle_sub_stop(wcycle, ewcsNONBONDED);
  }
  
 +gmx_bool use_GPU(const nonbonded_verlet_t *nbv)
 +{
 +    return nbv != NULL && nbv->bUseGPU;
 +}
 +
  void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
                           t_inputrec *inputrec,
                           gmx_int64_t step, t_nrnb *nrnb, gmx_wallcycle_t wcycle,
                           gmx_bool bBornRadii,
                           int flags)
  {
 -    int                 cg0, cg1, i, j;
 +    int                 cg1, i, j;
      int                 start, homenr;
 -    int                 nb_kernel_type;
      double              mu[2*DIM];
 -    gmx_bool            bSepDVDL, bStateChanged, bNS, bFillGrid, bCalcCGCM, bBS;
 +    gmx_bool            bStateChanged, bNS, bFillGrid, bCalcCGCM;
      gmx_bool            bDoLongRange, bDoForces, bSepLRF, bUseGPU, bUseOrEmulGPU;
      gmx_bool            bDiffKernels = FALSE;
 -    matrix              boxs;
      rvec                vzero, box_diag;
 -    real                e, v, dvdl;
      float               cycles_pme, cycles_force, cycles_wait_gpu;
      nonbonded_verlet_t *nbv;
  
      cycles_force    = 0;
      cycles_wait_gpu = 0;
      nbv             = fr->nbv;
 -    nb_kernel_type  = fr->nbv->grp[0].kernel_type;
  
      start  = 0;
      homenr = mdatoms->homenr;
  
 -    bSepDVDL = (fr->bSepDVDL && do_per_step(step, inputrec->nstlog));
 -
      clear_mat(vir_force);
  
 -    cg0 = 0;
      if (DOMAINDECOMP(cr))
      {
          cg1 = cr->dd->ncg_tot;
  #ifdef GMX_MPI
      if (!(cr->duty & DUTY_PME))
      {
 +        gmx_bool bBS;
 +        matrix   boxs;
 +
          /* Send particle coordinates to the pme nodes.
           * Since this is only implemented for domain decomposition
           * and domain decomposition does not use the graph,
          if (bNS)
          {
              wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU_NB);
 -            nbnxn_cuda_init_atomdata(nbv->cu_nbv, nbv->grp[eintLocal].nbat);
 +            nbnxn_gpu_init_atomdata(nbv->gpu_nbv, nbv->grp[eintLocal].nbat);
              wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
          }
  
          wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU_NB);
 -        nbnxn_cuda_upload_shiftvec(nbv->cu_nbv, nbv->grp[eintLocal].nbat);
 +        nbnxn_gpu_upload_shiftvec(nbv->gpu_nbv, nbv->grp[eintLocal].nbat);
          wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
      }
  
          if (bUseGPU)
          {
              /* initialize local pair-list on the GPU */
 -            nbnxn_cuda_init_pairlist(nbv->cu_nbv,
 -                                     nbv->grp[eintLocal].nbl_lists.nbl[0],
 -                                     eintLocal);
 +            nbnxn_gpu_init_pairlist(nbv->gpu_nbv,
 +                                    nbv->grp[eintLocal].nbl_lists.nbl[0],
 +                                    eintLocal);
          }
          wallcycle_stop(wcycle, ewcNS);
      }
  
              wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_NONLOCAL);
  
 -            if (nbv->grp[eintNonlocal].kernel_type == nbnxnk8x8x8_CUDA)
 +            if (nbv->grp[eintNonlocal].kernel_type == nbnxnk8x8x8_GPU)
              {
                  /* initialize non-local pair-list on the GPU */
 -                nbnxn_cuda_init_pairlist(nbv->cu_nbv,
 -                                         nbv->grp[eintNonlocal].nbl_lists.nbl[0],
 -                                         eintNonlocal);
 +                nbnxn_gpu_init_pairlist(nbv->gpu_nbv,
 +                                        nbv->grp[eintNonlocal].nbl_lists.nbl[0],
 +                                        eintNonlocal);
              }
              wallcycle_stop(wcycle, ewcNS);
          }
          {
              wallcycle_start(wcycle, ewcMOVEX);
              dd_move_x(cr->dd, box, x);
-             /* When we don't need the total dipole we sum it in global_stat */
-             if (bStateChanged && NEED_MUTOT(*inputrec))
-             {
-                 gmx_sumd(2*DIM, mu, cr);
-             }
              wallcycle_stop(wcycle, ewcMOVEX);
  
              wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
          wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU_NB);
          if (DOMAINDECOMP(cr) && !bDiffKernels)
          {
 -            nbnxn_cuda_launch_cpyback(nbv->cu_nbv, nbv->grp[eintNonlocal].nbat,
 -                                      flags, eatNonlocal);
 +            nbnxn_gpu_launch_cpyback(nbv->gpu_nbv, nbv->grp[eintNonlocal].nbat,
 +                                     flags, eatNonlocal);
          }
 -        nbnxn_cuda_launch_cpyback(nbv->cu_nbv, nbv->grp[eintLocal].nbat,
 -                                  flags, eatLocal);
 +        nbnxn_gpu_launch_cpyback(nbv->gpu_nbv, nbv->grp[eintLocal].nbat,
 +                                 flags, eatLocal);
          cycles_force += wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
      }
  
      }
  
      /* Start the force cycle counter.
 -     * This counter is stopped in do_forcelow_level.
 +     * This counter is stopped after do_force_lowlevel.
       * No parallel communication should occur while this counter is running,
       * since that will interfere with the dynamic load balancing.
       */
          clear_rvec(fr->vir_diag_posres);
      }
  
 -    if (inputrec->ePull == epullCONSTRAINT)
 +    if (inputrec->bPull && pull_have_constraint(inputrec->pull_work))
      {
 -        clear_pull_forces(inputrec->pull);
 +        clear_pull_forces(inputrec->pull_work);
      }
  
      /* We calculate the non-bonded forces, when done on the CPU, here.
 -     * We do this before calling do_force_lowlevel, as in there bondeds
 -     * forces are calculated before PME, which does communication.
 -     * With this order, non-bonded and bonded force calculation imbalance
 -     * can be balanced out by the domain decomposition load balancing.
 +     * We do this before calling do_force_lowlevel, because in that
 +     * function, the listed forces are calculated before PME, which
 +     * does communication.  With this order, non-bonded and listed
 +     * force calculation imbalance can be balanced out by the domain
 +     * decomposition load balancing.
       */
  
      if (!bUseOrEmulGPU)
          }
  
          /* Add all the non-bonded force to the normal force array.
 -         * This can be split into a local a non-local part when overlapping
 +         * This can be split into a local and a non-local part when overlapping
           * communication with calculation with domain decomposition.
           */
          cycles_force += wallcycle_stop(wcycle, ewcFORCE);
          if ((flags & GMX_FORCE_VIRIAL) &&
              nbv->grp[aloc].nbl_lists.nnbl > 1)
          {
 +            /* This is not in a subcounter because it takes a
 +               negligible and constant-sized amount of time */
              nbnxn_atomdata_add_nbat_fshift_to_fshift(nbv->grp[aloc].nbat,
                                                       fr->fshift);
          }
          update_QMMMrec(cr, fr, x, mdatoms, box, top);
      }
  
 -    if ((flags & GMX_FORCE_BONDED) && top->idef.il[F_POSRES].nr > 0)
 -    {
 -        posres_wrapper(fplog, flags, bSepDVDL, inputrec, nrnb, top, box, x,
 -                       enerd, lambda, fr);
 -    }
 -
 -    if ((flags & GMX_FORCE_BONDED) && top->idef.il[F_FBPOSRES].nr > 0)
 -    {
 -        fbposres_wrapper(inputrec, nrnb, top, box, x, enerd, fr);
 -    }
 -
      /* Compute the bonded and non-bonded energies and optionally forces */
 -    do_force_lowlevel(fplog, step, fr, inputrec, &(top->idef),
 +    do_force_lowlevel(fr, inputrec, &(top->idef),
                        cr, nrnb, wcycle, mdatoms,
                        x, hist, f, bSepLRF ? fr->f_twin : f, enerd, fcd, top, fr->born,
 -                      &(top->atomtypes), bBornRadii, box,
 +                      bBornRadii, box,
                        inputrec->fepvals, lambda, graph, &(top->excls), fr->mu_tot,
                        flags, &cycles_pme);
  
                  float cycles_tmp;
  
                  wallcycle_start(wcycle, ewcWAIT_GPU_NB_NL);
 -                nbnxn_cuda_wait_gpu(nbv->cu_nbv,
 -                                    nbv->grp[eintNonlocal].nbat,
 -                                    flags, eatNonlocal,
 -                                    enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
 -                                    fr->fshift);
 +                nbnxn_gpu_wait_for_gpu(nbv->gpu_nbv,
 +                                       nbv->grp[eintNonlocal].nbat,
 +                                       flags, eatNonlocal,
 +                                       enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
 +                                       fr->fshift);
                  cycles_tmp       = wallcycle_stop(wcycle, ewcWAIT_GPU_NB_NL);
                  cycles_wait_gpu += cycles_tmp;
                  cycles_force    += cycles_tmp;
          /* wait for local forces (or calculate in emulation mode) */
          if (bUseGPU)
          {
 +#if defined(GMX_GPU) && !defined(GMX_USE_OPENCL)
              float       cycles_tmp, cycles_wait_est;
              const float cuda_api_overhead_margin = 50000.0f; /* cycles */
  
              wallcycle_start(wcycle, ewcWAIT_GPU_NB_L);
 -            nbnxn_cuda_wait_gpu(nbv->cu_nbv,
 -                                nbv->grp[eintLocal].nbat,
 -                                flags, eatLocal,
 -                                enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
 -                                fr->fshift);
 +            nbnxn_gpu_wait_for_gpu(nbv->gpu_nbv,
 +                                   nbv->grp[eintLocal].nbat,
 +                                   flags, eatLocal,
 +                                   enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
 +                                   fr->fshift);
              cycles_tmp      = wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
  
              if (bDoForces && DOMAINDECOMP(cr))
              cycles_force    += cycles_wait_est;
              cycles_wait_gpu += cycles_wait_est;
  
 -            /* now clear the GPU outputs while we finish the step on the CPU */
 +#elif defined(GMX_GPU) && defined(GMX_USE_OPENCL)
 +
 +            wallcycle_start(wcycle, ewcWAIT_GPU_NB_L);
 +            nbnxn_gpu_wait_for_gpu(nbv->gpu_nbv,
 +                                   nbv->grp[eintLocal].nbat,
 +                                   flags, eatLocal,
 +                                   enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
 +                                   fr->fshift);
 +            cycles_wait_gpu += wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
 +#endif
  
 +            /* now clear the GPU outputs while we finish the step on the CPU */
              wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU_NB);
 -            nbnxn_cuda_clear_outputs(nbv->cu_nbv, flags);
 +            nbnxn_gpu_clear_outputs(nbv->gpu_nbv, flags);
              wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
          }
          else
          }
      }
  
 -    if (inputrec->ePull == epullUMBRELLA || inputrec->ePull == epullCONST_F)
 +    if (inputrec->bPull && pull_have_potential(inputrec->pull_work))
      {
          /* Since the COM pulling is always done mass-weighted, no forces are
           * applied to vsites and this call can be done after vsite spreading.
           */
 -        pull_potential_wrapper(fplog, bSepDVDL, cr, inputrec, box, x,
 +        pull_potential_wrapper(cr, inputrec, box, x,
                                 f, vir_force, mdatoms, enerd, lambda, t,
                                 wcycle);
      }
          /* In case of node-splitting, the PP nodes receive the long-range
           * forces, virial and energy from the PME nodes here.
           */
 -        pme_receive_force_ener(fplog, bSepDVDL, cr, wcycle, enerd, fr);
 +        pme_receive_force_ener(cr, wcycle, enerd, fr);
      }
  
      if (bDoForces)
@@@ -1525,15 -1606,20 +1519,15 @@@ void do_force_cutsGROUP(FILE *fplog, t_
      int        cg0, cg1, i, j;
      int        start, homenr;
      double     mu[2*DIM];
 -    gmx_bool   bSepDVDL, bStateChanged, bNS, bFillGrid, bCalcCGCM, bBS;
 -    gmx_bool   bDoLongRangeNS, bDoForces, bDoPotential, bSepLRF;
 +    gmx_bool   bStateChanged, bNS, bFillGrid, bCalcCGCM;
 +    gmx_bool   bDoLongRangeNS, bDoForces, bSepLRF;
      gmx_bool   bDoAdressWF;
 -    matrix     boxs;
 -    rvec       vzero, box_diag;
 -    real       e, v, dvdlambda[efptNR];
      t_pbc      pbc;
      float      cycles_pme, cycles_force;
  
      start  = 0;
      homenr = mdatoms->homenr;
  
 -    bSepDVDL = (fr->bSepDVDL && do_per_step(step, inputrec->nstlog));
 -
      clear_mat(vir_force);
  
      cg0 = 0;
      bFillGrid      = (bNS && bStateChanged);
      bCalcCGCM      = (bFillGrid && !DOMAINDECOMP(cr));
      bDoForces      = (flags & GMX_FORCE_FORCES);
 -    bDoPotential   = (flags & GMX_FORCE_ENERGY);
      bSepLRF        = ((inputrec->nstcalclr > 1) && bDoForces &&
                        (flags & GMX_FORCE_SEPLRF) && (flags & GMX_FORCE_DO_LR));
  
  #ifdef GMX_MPI
      if (!(cr->duty & DUTY_PME))
      {
 +        gmx_bool bBS;
 +        matrix   boxs;
 +
          /* Send particle coordinates to the pme nodes.
           * Since this is only implemented for domain decomposition
           * and domain decomposition does not use the graph,
      }
  
      /* Start the force cycle counter.
 -     * This counter is stopped in do_forcelow_level.
 +     * This counter is stopped after do_force_lowlevel.
       * No parallel communication should occur while this counter is running,
       * since that will interfere with the dynamic load balancing.
       */
  
          clear_rvec(fr->vir_diag_posres);
      }
 -    if (inputrec->ePull == epullCONSTRAINT)
 +    if (inputrec->bPull && pull_have_constraint(inputrec->pull_work))
      {
 -        clear_pull_forces(inputrec->pull);
 +        clear_pull_forces(inputrec->pull_work);
      }
  
      /* update QMMMrec, if necessary */
          update_QMMMrec(cr, fr, x, mdatoms, box, top);
      }
  
 -    if ((flags & GMX_FORCE_BONDED) && top->idef.il[F_POSRES].nr > 0)
 -    {
 -        posres_wrapper(fplog, flags, bSepDVDL, inputrec, nrnb, top, box, x,
 -                       enerd, lambda, fr);
 -    }
 -
 -    if ((flags & GMX_FORCE_BONDED) && top->idef.il[F_FBPOSRES].nr > 0)
 -    {
 -        fbposres_wrapper(inputrec, nrnb, top, box, x, enerd, fr);
 -    }
 -
      /* Compute the bonded and non-bonded energies and optionally forces */
 -    do_force_lowlevel(fplog, step, fr, inputrec, &(top->idef),
 +    do_force_lowlevel(fr, inputrec, &(top->idef),
                        cr, nrnb, wcycle, mdatoms,
                        x, hist, f, bSepLRF ? fr->f_twin : f, enerd, fcd, top, fr->born,
 -                      &(top->atomtypes), bBornRadii, box,
 +                      bBornRadii, box,
                        inputrec->fepvals, lambda,
                        graph, &(top->excls), fr->mu_tot,
                        flags,
          }
      }
  
 -    if (inputrec->ePull == epullUMBRELLA || inputrec->ePull == epullCONST_F)
 +    if (inputrec->bPull && pull_have_potential(inputrec->pull_work))
      {
 -        pull_potential_wrapper(fplog, bSepDVDL, cr, inputrec, box, x,
 +        pull_potential_wrapper(cr, inputrec, box, x,
                                 f, vir_force, mdatoms, enerd, lambda, t,
                                 wcycle);
      }
          /* In case of node-splitting, the PP nodes receive the long-range
           * forces, virial and energy from the PME nodes here.
           */
 -        pme_receive_force_ener(fplog, bSepDVDL, cr, wcycle, enerd, fr);
 +        pme_receive_force_ener(cr, wcycle, enerd, fr);
      }
  
      if (bDoForces)
@@@ -2062,21 -2157,24 +2056,21 @@@ void do_constrain_first(FILE *fplog, gm
  
      /* constrain the current position */
      constrain(NULL, TRUE, FALSE, constr, &(top->idef),
 -              ir, NULL, cr, step, 0, 1.0, md,
 +              ir, cr, step, 0, 1.0, md,
                state->x, state->x, NULL,
                fr->bMolPBC, state->box,
                state->lambda[efptBONDED], &dvdl_dum,
 -              NULL, NULL, nrnb, econqCoord,
 -              ir->epc == epcMTTK, state->veta, state->veta);
 +              NULL, NULL, nrnb, econqCoord);
      if (EI_VV(ir->eI))
      {
          /* constrain the inital velocity, and save it */
          /* also may be useful if we need the ekin from the halfstep for velocity verlet */
 -        /* might not yet treat veta correctly */
          constrain(NULL, TRUE, FALSE, constr, &(top->idef),
 -                  ir, NULL, cr, step, 0, 1.0, md,
 +                  ir, cr, step, 0, 1.0, md,
                    state->x, state->v, state->v,
                    fr->bMolPBC, state->box,
                    state->lambda[efptBONDED], &dvdl_dum,
 -                  NULL, NULL, nrnb, econqVeloc,
 -                  ir->epc == epcMTTK, state->veta, state->veta);
 +                  NULL, NULL, nrnb, econqVeloc);
      }
      /* constrain the inital velocities at t-dt/2 */
      if (EI_STATE_VELOCITY(ir->eI) && ir->eI != eiVV)
          }
          dvdl_dum = 0;
          constrain(NULL, TRUE, FALSE, constr, &(top->idef),
 -                  ir, NULL, cr, step, -1, 1.0, md,
 +                  ir, cr, step, -1, 1.0, md,
                    state->x, savex, NULL,
                    fr->bMolPBC, state->box,
                    state->lambda[efptBONDED], &dvdl_dum,
 -                  state->v, NULL, nrnb, econqCoord,
 -                  ir->epc == epcMTTK, state->veta, state->veta);
 +                  state->v, NULL, nrnb, econqCoord);
  
          for (i = start; i < end; i++)
          {
@@@ -2129,8 -2228,7 +2123,8 @@@ integrate_table(real vdwtab[], real sca
      double invscale, invscale2, invscale3;
      double r, ea, eb, ec, pa, pb, pc, pd;
      double y0, f, g, h;
 -    int    ri, offset, tabfactor;
 +    int    ri, offset;
 +    double tabfactor;
  
      invscale  = 1.0/scale;
      invscale2 = invscale*invscale;
  
  void calc_enervirdiff(FILE *fplog, int eDispCorr, t_forcerec *fr)
  {
 -    double   eners[2], virs[2], enersum, virsum, y0, f, g, h;
 -    double   r0, r1, r, rc3, rc9, ea, eb, ec, pa, pb, pc, pd;
 -    double   invscale, invscale2, invscale3;
 -    int      ri0, ri1, ri, i, offstart, offset;
 -    real     scale, *vdwtab, tabfactor, tmp;
 +    double   eners[2], virs[2], enersum, virsum;
 +    double   r0, rc3, rc9;
 +    int      ri0, ri1, i;
 +    real     scale, *vdwtab;
  
      fr->enershiftsix    = 0;
      fr->enershifttwelve = 0;
              vdwtab = fr->nblists[0].table_vdw.data;
  
              /* Round the cut-offs to exact table values for precision */
 -            ri0  = floor(fr->rvdw_switch*scale);
 -            ri1  = ceil(fr->rvdw*scale);
 +            ri0  = static_cast<int>(floor(fr->rvdw_switch*scale));
 +            ri1  = static_cast<int>(ceil(fr->rvdw*scale));
  
              /* The code below has some support for handling force-switching, i.e.
               * when the force (instead of potential) is switched over a limited
              ri0  = (fr->vdw_modifier == eintmodPOTSHIFT) ? ri1 : ri0;
  
              r0   = ri0/scale;
 -            r1   = ri1/scale;
              rc3  = r0*r0*r0;
              rc9  = rc3*rc3*rc3;
  
      }
  }
  
 -void calc_dispcorr(FILE *fplog, t_inputrec *ir, t_forcerec *fr,
 -                   gmx_int64_t step, int natoms,
 +void calc_dispcorr(t_inputrec *ir, t_forcerec *fr,
 +                   int natoms,
                     matrix box, real lambda, tensor pres, tensor virial,
                     real *prescorr, real *enercorr, real *dvdlcorr)
  {
              }
          }
  
 -        if (fr->bSepDVDL && do_per_step(step, ir->nstlog))
 -        {
 -            gmx_print_sepdvdl(fplog, "Dispersion correction", *enercorr, dvdlambda);
 -        }
          if (fr->efep != efepNO)
          {
              *dvdlcorr += dvdlambda;
@@@ -2570,12 -2674,13 +2564,12 @@@ void finish_run(FILE *fplog, t_commrec 
                  t_inputrec *inputrec,
                  t_nrnb nrnb[], gmx_wallcycle_t wcycle,
                  gmx_walltime_accounting_t walltime_accounting,
 -                wallclock_gpu_t *gputimes,
 +                nonbonded_verlet_t *nbv,
                  gmx_bool bWriteStat)
  {
 -    int     i, j;
      t_nrnb *nrnb_tot = NULL;
 -    real    delta_t;
 -    double  nbfs, mflop;
 +    double  delta_t  = 0;
 +    double  nbfs     = 0, mflop = 0;
      double  elapsed_time,
              elapsed_time_over_all_ranks,
              elapsed_time_over_all_threads,
  
      if (SIMMASTER(cr))
      {
 +        struct gmx_wallclock_gpu_t* gputimes = use_GPU(nbv) ? nbnxn_gpu_get_timings(nbv->gpu_nbv) : NULL;
 +
          wallcycle_print(fplog, cr->nnodes, cr->npmenodes,
                          elapsed_time_over_all_ranks,
                          wcycle, gputimes);
          {
              delta_t = inputrec->delta_t;
          }
 -        else
 -        {
 -            delta_t = 0;
 -        }
  
          if (fplog)
          {
@@@ -2745,7 -2852,8 +2739,7 @@@ void init_md(FILE *fplog
               gmx_bool *bSimAnn, t_vcm **vcm, unsigned long Flags,
               gmx_wallcycle_t wcycle)
  {
 -    int  i, j, n;
 -    real tmpt, mod;
 +    int  i;
  
      /* Initial values */
      *t = *t0       = ir->init_t;
              please_cite(fplog, "Goga2012");
          }
      }
 -
 +    if ((ir->et[XX].n > 0) || (ir->et[YY].n > 0) || (ir->et[ZZ].n > 0))
 +    {
 +        please_cite(fplog, "Caleman2008a");
 +    }
      init_nrnb(nrnb);
  
      if (nfile != -1)