added Verlet scheme and NxN non-bonded functionality
[alexxy/gromacs.git] / src / mdlib / sim_util.c
index e06f3d88ce411a6dcdc26d2978d30fe31491b061..92486e389b13ed068fee22f2d446adf332b36e60 100644 (file)
@@ -63,6 +63,7 @@
 #include "nrnb.h"
 #include "mshift.h"
 #include "mdrun.h"
+#include "sim_util.h"
 #include "update.h"
 #include "physics.h"
 #include "main.h"
 #include "partdec.h"
 #include "gmx_wallcycle.h"
 #include "genborn.h"
+#include "nbnxn_search.h"
+#include "nbnxn_kernels/nbnxn_kernel_ref.h"
+#include "nbnxn_kernels/nbnxn_kernel_x86_simd128.h"
+#include "nbnxn_kernels/nbnxn_kernel_x86_simd256.h"
+#include "nbnxn_kernels/nbnxn_kernel_gpu_ref.h"
 
 #ifdef GMX_LIB_MPI
 #include <mpi.h>
 #include "adress.h"
 #include "qmmm.h"
 
+#include "nbnxn_cuda_data_mgmt.h"
+#include "nbnxn_cuda/nbnxn_cuda.h"
+
 #if 0
 typedef struct gmx_timeprint {
 
@@ -149,13 +158,10 @@ void print_time(FILE *out,gmx_runtime_t *runtime,gmx_large_int_t step,
     fprintf(out,"step %s",gmx_step_str(step,buf));
     if ((step >= ir->nstlist))
     {
-        if ((ir->nstlist == 0) || ((step % ir->nstlist) == 0))
-        {
-            /* We have done a full cycle let's update time_per_step */
-            runtime->last = gmx_gettime();
-            dt = difftime(runtime->last,runtime->real);
-            runtime->time_per_step = dt/(step - ir->init_step + 1);
-        }
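+        /* Recompute the running average time per step at every report */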
+        runtime->last = gmx_gettime();
+        dt = difftime(runtime->last,runtime->real);
+        runtime->time_per_step = dt/(step - ir->init_step + 1);
+
         dt = (ir->nsteps + ir->init_step - step)*runtime->time_per_step;
 
         if (ir->nsteps >= 0)
@@ -343,70 +349,1046 @@ static void calc_f_el(FILE *fp,int  start,int homenr,
             for(i=start; (i<start+homenr); i++)
                 f[i][m] += charge[i]*Ext[m];
         }
-        else
+        else
+        {
+            Ext[m] = 0;
+        }
+    }
+    if (fp != NULL)
+    {
+        fprintf(fp,"%10g  %10g  %10g  %10g #FIELD\n",t,
+                Ext[XX]/FIELDFAC,Ext[YY]/FIELDFAC,Ext[ZZ]/FIELDFAC);
+    }
+}
+
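+/* Computes the short-range virial contributions: shift forces,
+ * local-atom forces, position restraints and walls. The total virial
+ * is summed later in global_stat.
+ */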
+static void calc_virial(FILE *fplog,int start,int homenr,rvec x[],rvec f[],
+                       tensor vir_part,t_graph *graph,matrix box,
+                       t_nrnb *nrnb,const t_forcerec *fr,int ePBC)
+{
+  int i;
+
+  /* The short-range virial from surrounding boxes */
+  clear_mat(vir_part);
+  calc_vir(fplog,SHIFTS,fr->shift_vec,fr->fshift,vir_part,ePBC==epbcSCREW,box);
+  inc_nrnb(nrnb,eNR_VIRIAL,SHIFTS);
+
+  /* Calculate partial virial, for local atoms only, based on short range.
+   * Total virial is computed in global_stat, called from do_md
+   */
+  f_calc_vir(fplog,start,start+homenr,x,f,vir_part,graph,box);
+  inc_nrnb(nrnb,eNR_VIRIAL,homenr);
+
+  /* Add position restraint contribution */
+  for(i=0; i<DIM; i++) {
+    vir_part[i][i] += fr->vir_diag_posres[i];
+  }
+
+  /* Add wall contribution */
+  for(i=0; i<DIM; i++) {
+    vir_part[i][ZZ] += fr->vir_wall_z[i];
+  }
+
+  if (debug)
+    pr_rvecs(debug,0,"vir_part",vir_part,DIM);
+}
+
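+/* Computes the position-restraint potential and forces, accumulating
+ * the energy, dV/dlambda and, when requested, the foreign-lambda
+ * energy differences into enerd.
+ */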
+static void posres_wrapper(FILE *fplog,
+                           int flags,
+                           gmx_bool bSepDVDL,
+                           t_inputrec *ir,
+                           t_nrnb *nrnb,
+                           gmx_localtop_t *top,
+                           matrix box,rvec x[],
+                           rvec f[],
+                           gmx_enerdata_t *enerd,
+                           real *lambda,
+                           t_forcerec *fr)
+{
+    t_pbc pbc;
+    real  v,dvdl;
+    int   i;
+
+    /* Position restraints always require full pbc */
+    set_pbc(&pbc,ir->ePBC,box);
+    dvdl = 0;
+    v = posres(top->idef.il[F_POSRES].nr,top->idef.il[F_POSRES].iatoms,
+               top->idef.iparams_posres,
+               (const rvec*)x,fr->f_novirsum,fr->vir_diag_posres,
+               ir->ePBC==epbcNONE ? NULL : &pbc,
+               lambda[efptRESTRAINT],&dvdl,
+               fr->rc_scaling,fr->ePBC,fr->posres_com,fr->posres_comB);
+    if (bSepDVDL)
+    {
+        fprintf(fplog,sepdvdlformat,
+                interaction_function[F_POSRES].longname,v,dvdl);
+    }
+    enerd->term[F_POSRES] += v;
+    /* If just the force constant changes, the FEP term is linear,
+     * but if the reference position changes, it is not.
+     */
+    enerd->dvdl_nonlin[efptRESTRAINT] += dvdl;
+    inc_nrnb(nrnb,eNR_POSRES,top->idef.il[F_POSRES].nr/2);
+
+    if ((ir->fepvals->n_lambda > 0) && (flags & GMX_FORCE_DHDL))
+    {
+        for(i=0; i<enerd->n_lambda; i++)
+        {
+            real dvdl_dum,lambda_dum;
+
+            lambda_dum = (i==0 ? lambda[efptRESTRAINT] : ir->fepvals->all_lambda[efptRESTRAINT][i-1]);
+            v = posres(top->idef.il[F_POSRES].nr,top->idef.il[F_POSRES].iatoms,
+                       top->idef.iparams_posres,
+                       (const rvec*)x,NULL,NULL,
+                       ir->ePBC==epbcNONE ? NULL : &pbc,lambda_dum,&dvdl,
+                       fr->rc_scaling,fr->ePBC,fr->posres_com,fr->posres_comB);
+            enerd->enerpart_lambda[i] += v;
+        }
+    }
+}
+
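+/* Computes the center-of-mass pull potential and forces and
+ * accumulates the energy and dV/dlambda into enerd.
+ */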
+static void pull_potential_wrapper(FILE *fplog,
+                                   gmx_bool bSepDVDL,
+                                   t_commrec *cr,
+                                   t_inputrec *ir,
+                                   matrix box,rvec x[],
+                                   rvec f[],
+                                   tensor vir_force,
+                                   t_mdatoms *mdatoms,
+                                   gmx_enerdata_t *enerd,
+                                   real *lambda,
+                                   double t)
+{
+    t_pbc  pbc;
+    real   dvdl;
+
+    /* Calculate the center of mass forces, this requires communication,
+     * which is why pull_potential is called close to other communication.
+     * The virial contribution is calculated directly,
+     * which is why we call pull_potential after calc_virial.
+     */
+    set_pbc(&pbc,ir->ePBC,box);
+    dvdl = 0; 
+    enerd->term[F_COM_PULL] +=
+        pull_potential(ir->ePull,ir->pull,mdatoms,&pbc,
+                       cr,t,lambda[efptRESTRAINT],x,f,vir_force,&dvdl);
+    if (bSepDVDL)
+    {
+        fprintf(fplog,sepdvdlformat,"Com pull",enerd->term[F_COM_PULL],dvdl);
+    }
+    enerd->dvdl_lin[efptRESTRAINT] += dvdl;
+}
+
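+/* With separate PME nodes: receives the long-range forces, virial
+ * and energy from the PME nodes, accumulates them and books the
+ * cycle counts used for PP/PME load balancing.
+ */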
+static void pme_receive_force_ener(FILE *fplog,
+                                   gmx_bool bSepDVDL,
+                                   t_commrec *cr,
+                                   gmx_wallcycle_t wcycle,
+                                   gmx_enerdata_t *enerd,
+                                   t_forcerec *fr)
+{
+    real   e,dvdl;
+    float  cycles_ppdpme,cycles_seppme;
+
+    cycles_ppdpme = wallcycle_stop(wcycle,ewcPPDURINGPME);
+    dd_cycles_add(cr->dd,cycles_ppdpme,ddCyclPPduringPME);
+
+    /* In case of node-splitting, the PP nodes receive the long-range 
+     * forces, virial and energy from the PME nodes here.
+     */    
+    wallcycle_start(wcycle,ewcPP_PMEWAITRECVF);
+    dvdl = 0;
+    gmx_pme_receive_f(cr,fr->f_novirsum,fr->vir_el_recip,&e,&dvdl,
+                      &cycles_seppme);
+    if (bSepDVDL)
+    {
+        fprintf(fplog,sepdvdlformat,"PME mesh",e,dvdl);
+    }
+    enerd->term[F_COUL_RECIP] += e;
+    enerd->dvdl_lin[efptCOUL] += dvdl;
+    if (wcycle)
+    {
+        dd_cycles_add(cr->dd,cycles_seppme,ddCyclPME);
+    }
+    wallcycle_stop(wcycle,ewcPP_PMEWAITRECVF);
+}
+
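+/* Prints position and force norm for every local atom whose force
+ * exceeds pforce, and for forces that are NaN.
+ */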
+static void print_large_forces(FILE *fp,t_mdatoms *md,t_commrec *cr,
+                              gmx_large_int_t step,real pforce,rvec *x,rvec *f)
+{
+  int  i;
+  real pf2,fn2;
+  char buf[STEPSTRSIZE];
+
+  pf2 = sqr(pforce);
+  for(i=md->start; i<md->start+md->homenr; i++) {
+    fn2 = norm2(f[i]);
+    /* We also catch NAN, if the compiler does not optimize this away. */
+    if (fn2 >= pf2 || fn2 != fn2) {
+      fprintf(fp,"step %s  atom %6d  x %8.3f %8.3f %8.3f  force %12.5e\n",
+             gmx_step_str(step,buf),
+             ddglatnr(cr->dd,i),x[i][XX],x[i][YY],x[i][ZZ],sqrt(fn2));
+    }
+  }
+}
+
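+/* Post-processing of the forces: spreads the forces computed without
+ * virial summation (e.g. the PME mesh forces) from virtual sites,
+ * sums them into f and, with GMX_FORCE_VIRIAL, adds the mesh
+ * contribution to the virial; optionally prints large forces.
+ */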
+static void post_process_forces(FILE *fplog,
+                                t_commrec *cr,
+                                gmx_large_int_t step,
+                                t_nrnb *nrnb,gmx_wallcycle_t wcycle,
+                                gmx_localtop_t *top,
+                                matrix box,rvec x[],
+                                rvec f[],
+                                tensor vir_force,
+                                t_mdatoms *mdatoms,
+                                t_graph *graph,
+                                t_forcerec *fr,gmx_vsite_t *vsite,
+                                int flags)
+{
+    if (fr->bF_NoVirSum)
+    {
+        if (vsite)
+        {
+            /* Spread the mesh force on virtual sites to the other particles... 
+             * This is parallelized. MPI communication is performed
+             * if the constructing atoms aren't local.
+             */
+            wallcycle_start(wcycle,ewcVSITESPREAD);
+            spread_vsite_f(fplog,vsite,x,fr->f_novirsum,NULL,
+                           (flags & GMX_FORCE_VIRIAL),fr->vir_el_recip,
+                           nrnb,
+                           &top->idef,fr->ePBC,fr->bMolPBC,graph,box,cr);
+            wallcycle_stop(wcycle,ewcVSITESPREAD);
+        }
+        if (flags & GMX_FORCE_VIRIAL)
+        {
+            /* Now add the forces, this is local */
+            if (fr->bDomDec)
+            {
+                sum_forces(0,fr->f_novirsum_n,f,fr->f_novirsum);
+            }
+            else
+            {
+                sum_forces(mdatoms->start,mdatoms->start+mdatoms->homenr,
+                           f,fr->f_novirsum);
+            }
+            if (EEL_FULL(fr->eeltype))
+            {
+                /* Add the mesh contribution to the virial */
+                m_add(vir_force,fr->vir_el_recip,vir_force);
+            }
+            if (debug)
+            {
+                pr_rvecs(debug,0,"vir_force",vir_force,DIM);
+            }
+        }
+    }
+    
+    if (fr->print_force >= 0)
+    {
+        print_large_forces(stderr,mdatoms,cr,step,fr->print_force,x,f);
+    }
+}
+
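+/* Executes or launches the Verlet-scheme non-bonded kernel selected
+ * by nbvg->kernel_type: plain-C 4x4, x86 SIMD 4xN (128/256-bit),
+ * CUDA 8x8x8 or its plain-C reference version, and books the
+ * corresponding cycle and flop counts.
+ */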
+static void do_nb_verlet(t_forcerec *fr,
+                         interaction_const_t *ic,
+                         gmx_enerdata_t *enerd,
+                         int flags, int ilocality,
+                         int clearF,
+                         t_nrnb *nrnb,
+                         gmx_wallcycle_t wcycle)
+{
+    int     sh_e;
+    nonbonded_verlet_group_t  *nbvg;
+
+    if (!(flags & GMX_FORCE_NONBONDED))
+    {
+        /* skip non-bonded calculation */
+        return;
+    }
+
+    nbvg = &fr->nbv->grp[ilocality];
+
+    if (fr->cutoff_scheme != ecutsVERLET)
+    {
+        gmx_incons("Invalid cut-off scheme passed!");
+    }
+
+    /* CUDA kernel launch overhead is already timed separately,
+     * so here we only time the CPU kernels.
+     */
+    if (nbvg->kernel_type != nbk8x8x8_CUDA)
+    {
+        wallcycle_sub_start(wcycle, ewcsNONBONDED);
+    }
+    switch (nbvg->kernel_type)
+    {
+        case nbk4x4_PlainC:
+            nbnxn_kernel_ref(&nbvg->nbl_lists,
+                             nbvg->nbat, ic,
+                             fr->shift_vec,
+                             flags,
+                             clearF,
+                             fr->fshift[0],
+                             enerd->grpp.ener[egCOULSR],
+                             fr->bBHAM ?
+                             enerd->grpp.ener[egBHAMSR] :
+                             enerd->grpp.ener[egLJSR]);
+            break;
+        
+        case nbk4xN_X86_SIMD128:
+            nbnxn_kernel_x86_simd128(&nbvg->nbl_lists,
+                                     nbvg->nbat, ic,
+                                     fr->shift_vec,
+                                     flags,
+                                     clearF,
+                                     fr->fshift[0],
+                                     enerd->grpp.ener[egCOULSR],
+                                     fr->bBHAM ?
+                                     enerd->grpp.ener[egBHAMSR] :
+                                     enerd->grpp.ener[egLJSR]);
+            break;
+        case nbk4xN_X86_SIMD256:
+            nbnxn_kernel_x86_simd256(&nbvg->nbl_lists,
+                                     nbvg->nbat, ic,
+                                     fr->shift_vec,
+                                     flags,
+                                     clearF,
+                                     fr->fshift[0],
+                                     enerd->grpp.ener[egCOULSR],
+                                     fr->bBHAM ?
+                                     enerd->grpp.ener[egBHAMSR] :
+                                     enerd->grpp.ener[egLJSR]);
+            break;
+
+        case nbk8x8x8_CUDA:
+            nbnxn_cuda_launch_kernel(fr->nbv->cu_nbv, nbvg->nbat, flags, ilocality);
+            break;
+
+        case nbk8x8x8_PlainC:
+            nbnxn_kernel_gpu_ref(nbvg->nbl_lists.nbl[0],
+                                 nbvg->nbat, ic,
+                                 fr->shift_vec,
+                                 flags,
+                                 clearF,
+                                 nbvg->nbat->out[0].f,
+                                 fr->fshift[0],
+                                 enerd->grpp.ener[egCOULSR],
+                                 fr->bBHAM ?
+                                 enerd->grpp.ener[egBHAMSR] :
+                                 enerd->grpp.ener[egLJSR]);
+            break;
+
+        default:
+            gmx_incons("Invalid nonbonded kernel type passed!");
+
+    }
+    if (nbvg->kernel_type != nbk8x8x8_CUDA)
+    {
+        wallcycle_sub_stop(wcycle, ewcsNONBONDED);
+    }
+
+    /* In the eNR_??? flop counters the nbnxn F+E kernel entry is always the F-only entry + 1 */
+    sh_e = ((flags & GMX_FORCE_ENERGY) ? 1 : 0);
+    inc_nrnb(nrnb,
+             ((EEL_RF(ic->eeltype) || ic->eeltype == eelCUT) ?
+              eNR_NBNXN_LJ_RF : eNR_NBNXN_LJ_TAB) + sh_e,
+             nbvg->nbl_lists.natpair_ljq);
+    inc_nrnb(nrnb,eNR_NBNXN_LJ+sh_e,nbvg->nbl_lists.natpair_lj);
+    inc_nrnb(nrnb,
+             ((EEL_RF(ic->eeltype) || ic->eeltype == eelCUT) ?
+              eNR_NBNXN_RF : eNR_NBNXN_TAB)+sh_e,
+             nbvg->nbl_lists.natpair_q);
+}
+
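+/* do_force() implementation for the Verlet cut-off scheme: grids the
+ * atoms and builds the local/non-local nbnxn pair lists on search
+ * steps, runs or launches the non-bonded kernels (optionally on a
+ * GPU, overlapping with bonded and PME work on the CPU) and reduces
+ * forces, virial and energies.
+ */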
+void do_force_cutsVERLET(FILE *fplog,t_commrec *cr,
+              t_inputrec *inputrec,
+              gmx_large_int_t step,t_nrnb *nrnb,gmx_wallcycle_t wcycle,
+              gmx_localtop_t *top,
+              gmx_mtop_t *mtop,
+              gmx_groups_t *groups,
+              matrix box,rvec x[],history_t *hist,
+              rvec f[],
+              tensor vir_force,
+              t_mdatoms *mdatoms,
+              gmx_enerdata_t *enerd,t_fcdata *fcd,
+              real *lambda,t_graph *graph,
+              t_forcerec *fr, interaction_const_t *ic,
+              gmx_vsite_t *vsite,rvec mu_tot,
+              double t,FILE *field,gmx_edsam_t ed,
+              gmx_bool bBornRadii,
+              int flags)
+{
+    int     cg0,cg1,i,j;
+    int     start,homenr;
+    int     nb_kernel_type;
+    double  mu[2*DIM];
+    gmx_bool   bSepDVDL,bStateChanged,bNS,bFillGrid,bCalcCGCM,bBS;
+    gmx_bool   bDoLongRange,bDoForces,bSepLRF,bUseGPU,bUseOrEmulGPU;
+    gmx_bool   bDiffKernels=FALSE;
+    matrix  boxs;
+    rvec    vzero,box_diag;
+    float  cycles_pme,cycles_force;
+    nonbonded_verlet_t *nbv;
+
+    cycles_force = 0;
+    nbv = fr->nbv;
+    nb_kernel_type = fr->nbv->grp[0].kernel_type;
+
+    start  = mdatoms->start;
+    homenr = mdatoms->homenr;
+
+    bSepDVDL = (fr->bSepDVDL && do_per_step(step,inputrec->nstlog));
+
+    clear_mat(vir_force);
+
+    cg0 = 0;
+    if (DOMAINDECOMP(cr))
+    {
+        cg1 = cr->dd->ncg_tot;
+    }
+    else
+    {
+        cg1 = top->cgs.nr;
+    }
+    if (fr->n_tpi > 0)
+    {
+        cg1--;
+    }
+
+    bStateChanged = (flags & GMX_FORCE_STATECHANGED);
+    bNS           = (flags & GMX_FORCE_NS) && (fr->bAllvsAll==FALSE); 
+    bFillGrid     = (bNS && bStateChanged);
+    bCalcCGCM     = (bFillGrid && !DOMAINDECOMP(cr));
+    bDoLongRange  = (fr->bTwinRange && bNS && (flags & GMX_FORCE_DOLR));
+    bDoForces     = (flags & GMX_FORCE_FORCES);
+    bSepLRF       = (bDoLongRange && bDoForces && (flags & GMX_FORCE_SEPLRF));
+    bUseGPU       = fr->nbv->bUseGPU;
+    bUseOrEmulGPU = bUseGPU || (nbv->grp[0].kernel_type == nbk8x8x8_PlainC);
+
+    if (bStateChanged)
+    {
+        update_forcerec(fplog,fr,box);
+
+        if (NEED_MUTOT(*inputrec))
+        {
+            /* Calculate total (local) dipole moment in a temporary common array.
+             * This makes it possible to sum them over nodes faster.
+             */
+            calc_mu(start,homenr,
+                    x,mdatoms->chargeA,mdatoms->chargeB,mdatoms->nChargePerturbed,
+                    mu,mu+DIM);
+        }
+    }
+
+    if (fr->ePBC != epbcNONE) { 
+        /* Compute shift vectors every step,
+         * because of pressure coupling or box deformation!
+         */
+        if ((flags & GMX_FORCE_DYNAMICBOX) && bStateChanged)
+            calc_shifts(box,fr->shift_vec);
+
+        if (bCalcCGCM) { 
+            put_atoms_in_box_omp(fr->ePBC,box,homenr,x);
+            inc_nrnb(nrnb,eNR_SHIFTX,homenr);
+        } 
+        else if (EI_ENERGY_MINIMIZATION(inputrec->eI) && graph) {
+            unshift_self(graph,box,x);
+        }
+    } 
+
+    nbnxn_atomdata_copy_shiftvec(flags & GMX_FORCE_DYNAMICBOX,
+                                  fr->shift_vec,nbv->grp[0].nbat);
+
+#ifdef GMX_MPI
+    if (!(cr->duty & DUTY_PME)) {
+        /* Send particle coordinates to the pme nodes.
+         * Since this is only implemented for domain decomposition
+         * and domain decomposition does not use the graph,
+         * we do not need to worry about shifting.
+         */    
+
+        wallcycle_start(wcycle,ewcPP_PMESENDX);
+        GMX_MPE_LOG(ev_send_coordinates_start);
+
+        bBS = (inputrec->nwall == 2);
+        if (bBS) {
+            copy_mat(box,boxs);
+            svmul(inputrec->wall_ewald_zfac,boxs[ZZ],boxs[ZZ]);
+        }
+
+        gmx_pme_send_x(cr,bBS ? boxs : box,x,
+                       mdatoms->nChargePerturbed,lambda[efptCOUL],
+                       (flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY)),step);
+
+        GMX_MPE_LOG(ev_send_coordinates_finish);
+        wallcycle_stop(wcycle,ewcPP_PMESENDX);
+    }
+#endif /* GMX_MPI */
+
+    /* do gridding for pair search */
+    if (bNS)
+    {
+        if (graph && bStateChanged)
+        {
+            /* Calculate intramolecular shift vectors to make molecules whole */
+            mk_mshift(fplog,graph,fr->ePBC,box,x);
+        }
+
+        clear_rvec(vzero);
+        box_diag[XX] = box[XX][XX];
+        box_diag[YY] = box[YY][YY];
+        box_diag[ZZ] = box[ZZ][ZZ];
+
+        wallcycle_start(wcycle,ewcNS);
+        if (!fr->bDomDec)
+        {
+            wallcycle_sub_start(wcycle,ewcsNBS_GRID_LOCAL);
+            nbnxn_put_on_grid(nbv->nbs,fr->ePBC,box,
+                              0,vzero,box_diag,
+                              0,mdatoms->homenr,-1,fr->cginfo,x,
+                              0,NULL,
+                              nbv->grp[eintLocal].kernel_type,
+                              nbv->grp[eintLocal].nbat);
+            wallcycle_sub_stop(wcycle,ewcsNBS_GRID_LOCAL);
+        }
+        else
+        {
+            wallcycle_sub_start(wcycle,ewcsNBS_GRID_NONLOCAL);
+            nbnxn_put_on_grid_nonlocal(nbv->nbs,domdec_zones(cr->dd),
+                                       fr->cginfo,x,
+                                       nbv->grp[eintNonlocal].kernel_type,
+                                       nbv->grp[eintNonlocal].nbat);
+            wallcycle_sub_stop(wcycle,ewcsNBS_GRID_NONLOCAL);
+        }
+
+        if (nbv->ngrp == 1 ||
+            nbv->grp[eintNonlocal].nbat == nbv->grp[eintLocal].nbat)
+        {
+            nbnxn_atomdata_set(nbv->grp[eintLocal].nbat,eatAll,
+                                nbv->nbs,mdatoms,fr->cginfo);
+        }
+        else
+        {
+            nbnxn_atomdata_set(nbv->grp[eintLocal].nbat,eatLocal,
+                                nbv->nbs,mdatoms,fr->cginfo);
+            nbnxn_atomdata_set(nbv->grp[eintNonlocal].nbat,eatAll,
+                                nbv->nbs,mdatoms,fr->cginfo);
+        }
+        wallcycle_stop(wcycle, ewcNS);
+    }
+
+    /* initialize the GPU atom data and copy shift vector */
+    if (bUseGPU)
+    {
+        if (bNS)
+        {
+            wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU_NB);
+            nbnxn_cuda_init_atomdata(nbv->cu_nbv, nbv->grp[eintLocal].nbat);
+            wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
+        }
+
+        wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU_NB);
+        nbnxn_cuda_upload_shiftvec(nbv->cu_nbv, nbv->grp[eintLocal].nbat);
+        wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
+    }
+
+    /* do local pair search */
+    if (bNS)
+    {
+        wallcycle_start_nocount(wcycle,ewcNS);
+        wallcycle_sub_start(wcycle,ewcsNBS_SEARCH_LOCAL);
+        nbnxn_make_pairlist(nbv->nbs,nbv->grp[eintLocal].nbat,
+                            &top->excls,
+                            ic->rlist,
+                            nbv->min_ci_balanced,
+                            &nbv->grp[eintLocal].nbl_lists,
+                            eintLocal,
+                            nbv->grp[eintLocal].kernel_type,
+                            nrnb);
+        wallcycle_sub_stop(wcycle,ewcsNBS_SEARCH_LOCAL);
+
+        if (bUseGPU)
+        {
+            /* initialize local pair-list on the GPU */
+            nbnxn_cuda_init_pairlist(nbv->cu_nbv,
+                                     nbv->grp[eintLocal].nbl_lists.nbl[0],
+                                     eintLocal);
+        }
+        wallcycle_stop(wcycle, ewcNS);
+    }
+    else
+    {
+        wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
+        wallcycle_sub_start(wcycle, ewcsNB_X_BUF_OPS);
+        nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs,eatLocal,FALSE,x,
+                                        nbv->grp[eintLocal].nbat);
+        wallcycle_sub_stop(wcycle, ewcsNB_X_BUF_OPS);
+        wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
+    }
+
+    if (bUseGPU)
+    {
+        wallcycle_start(wcycle,ewcLAUNCH_GPU_NB);
+        /* launch local nonbonded F on GPU */
+        do_nb_verlet(fr, ic, enerd, flags, eintLocal, enbvClearFNo,
+                     nrnb, wcycle);
+        wallcycle_stop(wcycle,ewcLAUNCH_GPU_NB);
+    }
+
+    /* Communicate coordinates and sum the dipole if necessary;
+     * also do the non-local pair search.
+     */
+    if (DOMAINDECOMP(cr))
+    {
+        bDiffKernels = (nbv->grp[eintNonlocal].kernel_type !=
+                        nbv->grp[eintLocal].kernel_type);
+
+        if (bDiffKernels)
+        {
+            /* With GPU+CPU non-bonded calculations we need to copy
+             * the local coordinates to the non-local nbat struct
+             * (in CPU format) as the non-local kernel call also
+             * calculates the local - non-local interactions.
+             */
+            wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
+            wallcycle_sub_start(wcycle, ewcsNB_X_BUF_OPS);
+            nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs,eatLocal,TRUE,x,
+                                             nbv->grp[eintNonlocal].nbat);
+            wallcycle_sub_stop(wcycle, ewcsNB_X_BUF_OPS);
+            wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
+        }
+
+        if (bNS)
+        {
+            wallcycle_start_nocount(wcycle,ewcNS);
+            wallcycle_sub_start(wcycle,ewcsNBS_SEARCH_NONLOCAL);
+
+            if (bDiffKernels)
+            {
+                nbnxn_grid_add_simple(nbv->nbs,nbv->grp[eintNonlocal].nbat);
+            }
+
+            nbnxn_make_pairlist(nbv->nbs,nbv->grp[eintNonlocal].nbat,
+                                &top->excls,
+                                ic->rlist,
+                                nbv->min_ci_balanced,
+                                &nbv->grp[eintNonlocal].nbl_lists,
+                                eintNonlocal,
+                                nbv->grp[eintNonlocal].kernel_type,
+                                nrnb);
+
+            wallcycle_sub_stop(wcycle,ewcsNBS_SEARCH_NONLOCAL);
+
+            if (nbv->grp[eintNonlocal].kernel_type == nbk8x8x8_CUDA)
+            {
+                /* initialize non-local pair-list on the GPU */
+                nbnxn_cuda_init_pairlist(nbv->cu_nbv,
+                                         nbv->grp[eintNonlocal].nbl_lists.nbl[0],
+                                         eintNonlocal);
+            }
+            wallcycle_stop(wcycle,ewcNS);
+        } 
+        else
+        {
+            wallcycle_start(wcycle,ewcMOVEX);
+            dd_move_x(cr->dd,box,x);
+
+            /* When we don't need the total dipole we sum it in global_stat */
+            if (bStateChanged && NEED_MUTOT(*inputrec))
+            {
+                gmx_sumd(2*DIM,mu,cr);
+            }
+            wallcycle_stop(wcycle,ewcMOVEX);
+
+            wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
+            wallcycle_sub_start(wcycle, ewcsNB_X_BUF_OPS);
+            nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs,eatNonlocal,FALSE,x,
+                                            nbv->grp[eintNonlocal].nbat);
+            wallcycle_sub_stop(wcycle, ewcsNB_X_BUF_OPS);
+            cycles_force += wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
+        }
+
+        if (bUseGPU && !bDiffKernels)
+        { 
+            wallcycle_start(wcycle,ewcLAUNCH_GPU_NB);
+            /* launch non-local nonbonded F on GPU */
+            do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFNo,
+                         nrnb, wcycle);
+            cycles_force += wallcycle_stop(wcycle,ewcLAUNCH_GPU_NB);
+        }
+    }
+
+    if (bUseGPU)
+    {
+        /* launch D2H copy-back F */
+        wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU_NB);
+        if (DOMAINDECOMP(cr) && !bDiffKernels)
+        {
+            nbnxn_cuda_launch_cpyback(nbv->cu_nbv, nbv->grp[eintNonlocal].nbat,
+                                      flags, eatNonlocal);
+        }
+        nbnxn_cuda_launch_cpyback(nbv->cu_nbv, nbv->grp[eintLocal].nbat,
+                                  flags, eatLocal);
+        cycles_force += wallcycle_stop(wcycle,ewcLAUNCH_GPU_NB);
+    }
+
+    if (bStateChanged && NEED_MUTOT(*inputrec))
+    {
+        if (PAR(cr))
+        {
+            gmx_sumd(2*DIM,mu,cr);
+        } 
+
+        for(i=0; i<2; i++)
+        {
+            for(j=0;j<DIM;j++)
+            {
+                fr->mu_tot[i][j] = mu[i*DIM + j];
+            }
+        }
+    }
+    if (fr->efep == efepNO)
+    {
+        copy_rvec(fr->mu_tot[0],mu_tot);
+    }
+    else
+    {
+        for(j=0; j<DIM; j++)
+        {
+            mu_tot[j] =
+                (1.0 - lambda[efptCOUL])*fr->mu_tot[0][j] +
+                lambda[efptCOUL]*fr->mu_tot[1][j];
+        }
+    }
+
+    /* Reset energies */
+    reset_enerdata(&(inputrec->opts),fr,bNS,enerd,MASTER(cr));
+    clear_rvecs(SHIFTS,fr->fshift);
+
+    if (DOMAINDECOMP(cr))
+    {
+        if (!(cr->duty & DUTY_PME))
+        {
+            wallcycle_start(wcycle,ewcPPDURINGPME);
+            dd_force_flop_start(cr->dd,nrnb);
+        }
+    }
+    
+    /* Start the force cycle counter.
+     * This counter is stopped in do_forcelow_level.
+     * No parallel communication should occur while this counter is running,
+     * since that will interfere with the dynamic load balancing.
+     */
+    wallcycle_start(wcycle,ewcFORCE);
+    if (bDoForces)
+    {
+        /* Reset forces for which the virial is calculated separately:
+         * PME/Ewald forces if necessary */
+        if (fr->bF_NoVirSum) 
+        {
+            if (flags & GMX_FORCE_VIRIAL)
+            {
+                fr->f_novirsum = fr->f_novirsum_alloc;
+                GMX_BARRIER(cr->mpi_comm_mygroup);
+                if (fr->bDomDec)
+                {
+                    clear_rvecs(fr->f_novirsum_n,fr->f_novirsum);
+                }
+                else
+                {
+                    clear_rvecs(homenr,fr->f_novirsum+start);
+                }
+                GMX_BARRIER(cr->mpi_comm_mygroup);
+            }
+            else
+            {
+                /* We are not calculating the pressure so we do not need
+                 * a separate array for forces that do not contribute
+                 * to the pressure.
+                 */
+                fr->f_novirsum = f;
+            }
+        }
+
+        if (bSepLRF)
+        {
+            /* Add the long range forces to the short range forces */
+            for(i=0; i<fr->natoms_force_constr; i++)
+            {
+                copy_rvec(fr->f_twin[i],f[i]);
+            }
+        }
+        else if (!(fr->bTwinRange && bNS))
+        {
+            /* Clear the short-range forces */
+            clear_rvecs(fr->natoms_force_constr,f);
+        }
+
+        clear_rvec(fr->vir_diag_posres);
+
+        GMX_BARRIER(cr->mpi_comm_mygroup);
+    }
+    if (inputrec->ePull == epullCONSTRAINT)
+    {
+        clear_pull_forces(inputrec->pull);
+    }
+
+    /* update QMMMrec, if necessary */
+    if(fr->bQMMM)
+    {
+        update_QMMMrec(cr,fr,x,mdatoms,box,top);
+    }
+
+    if ((flags & GMX_FORCE_BONDED) && top->idef.il[F_POSRES].nr > 0)
+    {
+        posres_wrapper(fplog,flags,bSepDVDL,inputrec,nrnb,top,box,x,
+                       f,enerd,lambda,fr);
+    }
+
+    /* Compute the bonded and non-bonded energies and optionally forces.
+     * When the non-bondeds run on a GPU (or its emulation), clear the
+     * non-bonded flag so that do_force_lowlevel skips them.
+     */
+    do_force_lowlevel(fplog,step,fr,inputrec,&(top->idef),
+                      cr,nrnb,wcycle,mdatoms,&(inputrec->opts),
+                      x,hist,f,enerd,fcd,mtop,top,fr->born,
+                      &(top->atomtypes),bBornRadii,box,
+                      inputrec->fepvals,lambda,graph,&(top->excls),fr->mu_tot,
+                      ((nb_kernel_type == nbk8x8x8_CUDA || nb_kernel_type == nbk8x8x8_PlainC) 
+                        ? flags&~GMX_FORCE_NONBONDED : flags),
+                      &cycles_pme);
+
+    if (!bUseOrEmulGPU)
+    {
+        /* Maybe we should move this into do_force_lowlevel */
+        do_nb_verlet(fr, ic, enerd, flags, eintLocal, enbvClearFYes,
+                     nrnb, wcycle);
+    }
+        
+
+    if (!bUseOrEmulGPU || bDiffKernels)
+    {
+        int aloc;
+
+        if (DOMAINDECOMP(cr))
+        {
+            do_nb_verlet(fr, ic, enerd, flags, eintNonlocal,
+                         bDiffKernels ? enbvClearFYes : enbvClearFNo,
+                         nrnb, wcycle);
+        }
+
+        if (!bUseOrEmulGPU)
+        {
+            aloc = eintLocal;
+        }
+        else
+        {
+            aloc = eintNonlocal;
+        }
+
+        /* Add all the non-bonded forces to the normal force array.
+         * With domain decomposition this can be split into a local and
+         * a non-local part to overlap communication with calculation.
+         */
+        cycles_force += wallcycle_stop(wcycle,ewcFORCE);
+        wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
+        wallcycle_sub_start(wcycle, ewcsNB_F_BUF_OPS);
+        nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs,eatAll,nbv->grp[aloc].nbat,f);
+        wallcycle_sub_stop(wcycle, ewcsNB_F_BUF_OPS);
+        cycles_force += wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
+        wallcycle_start_nocount(wcycle,ewcFORCE);
+
+        /* if there are multiple fshift output buffers reduce them */
+        if ((flags & GMX_FORCE_VIRIAL) &&
+            nbv->grp[aloc].nbl_lists.nnbl > 1)
+        {
+            nbnxn_atomdata_add_nbat_fshift_to_fshift(nbv->grp[aloc].nbat,
+                                                      fr->fshift);
+        }
+    }
+    
+    cycles_force += wallcycle_stop(wcycle,ewcFORCE);
+    GMX_BARRIER(cr->mpi_comm_mygroup);
+    
+    if (ed)
+    {
+        do_flood(fplog,cr,x,f,ed,box,step,bNS);
+    }
+
+    if (bUseOrEmulGPU && !bDiffKernels)
+    {
+        /* wait for non-local forces (or calculate in emulation mode) */
+        if (DOMAINDECOMP(cr))
+        {
+            if (bUseGPU)
+            {
+                wallcycle_start(wcycle,ewcWAIT_GPU_NB_NL);
+                nbnxn_cuda_wait_gpu(nbv->cu_nbv,
+                                    nbv->grp[eintNonlocal].nbat,
+                                    flags, eatNonlocal,
+                                    enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
+                                    fr->fshift);
+                cycles_force += wallcycle_stop(wcycle,ewcWAIT_GPU_NB_NL);
+            }
+            else
+            {
+                wallcycle_start_nocount(wcycle,ewcFORCE);
+                do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFYes,
+                             nrnb, wcycle);
+                cycles_force += wallcycle_stop(wcycle,ewcFORCE);
+            }            
+            wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
+            wallcycle_sub_start(wcycle, ewcsNB_F_BUF_OPS);
+            /* skip the reduction if there was no non-local work to do */
+            if (nbv->grp[eintNonlocal].nbl_lists.nbl[0]->nsci > 0)
+            {
+                nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs,eatNonlocal,
+                                               nbv->grp[eintNonlocal].nbat,f);
+            }
+            wallcycle_sub_stop(wcycle, ewcsNB_F_BUF_OPS);
+            cycles_force += wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
+        }
+    }
+
+    if (bDoForces)
+    {
+        /* Communicate the forces */
+        if (PAR(cr))
+        {
+            wallcycle_start(wcycle,ewcMOVEF);
+            if (DOMAINDECOMP(cr))
+            {
+                dd_move_f(cr->dd,f,fr->fshift);
+                /* Do we need to communicate the separate force array
+                 * for terms that do not contribute to the single sum virial?
+                 * Position restraints and electric fields do not introduce
+                 * inter-cg forces, only full electrostatics methods do.
+                 * When we do not calculate the virial, fr->f_novirsum = f,
+                 * so we have already communicated these forces.
+                 */
+                if (EEL_FULL(fr->eeltype) && cr->dd->n_intercg_excl &&
+                    (flags & GMX_FORCE_VIRIAL))
+                {
+                    dd_move_f(cr->dd,fr->f_novirsum,NULL);
+                }
+                if (bSepLRF)
+                {
+                    /* We should not update the shift forces here,
+                     * since f_twin is already included in f.
+                     */
+                    dd_move_f(cr->dd,fr->f_twin,NULL);
+                }
+            }
+            wallcycle_stop(wcycle,ewcMOVEF);
+        }
+    }
+    if (bUseOrEmulGPU)
+    {
+        /* wait for local forces (or calculate in emulation mode) */
+        if (bUseGPU)
+        {
+            wallcycle_start(wcycle,ewcWAIT_GPU_NB_L);
+            nbnxn_cuda_wait_gpu(nbv->cu_nbv,
+                                nbv->grp[eintLocal].nbat,
+                                flags, eatLocal,
+                                enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
+                                fr->fshift);
+            wallcycle_stop(wcycle,ewcWAIT_GPU_NB_L);
+
+            /* now clear the GPU outputs while we finish the step on the CPU */
+            nbnxn_cuda_clear_outputs(nbv->cu_nbv, flags);
+        }
+        else
+        {            
+            wallcycle_start_nocount(wcycle,ewcFORCE);
+            do_nb_verlet(fr, ic, enerd, flags, eintLocal,
+                         DOMAINDECOMP(cr) ? enbvClearFNo : enbvClearFYes,
+                         nrnb, wcycle);
+            wallcycle_stop(wcycle,ewcFORCE);
+        }
+        wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
+        wallcycle_sub_start(wcycle, ewcsNB_F_BUF_OPS);
+        if (nbv->grp[eintLocal].nbl_lists.nbl[0]->nsci > 0)
+        {
+            /* skip the reduction if there was no local work to do */
+            nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs,eatLocal,
+                                           nbv->grp[eintLocal].nbat,f);
+        }
+        wallcycle_sub_stop(wcycle, ewcsNB_F_BUF_OPS);
+        wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
+    }
+    
+    if (DOMAINDECOMP(cr))
+    {
+        dd_force_flop_stop(cr->dd,nrnb);
+        if (wcycle)
+        {
+            dd_cycles_add(cr->dd,cycles_force-cycles_pme,ddCyclF);
+        }
+    }
+
+    if (bDoForces)
+    {
+        if (IR_ELEC_FIELD(*inputrec))
+        {
+            /* Compute forces due to electric field */
+            calc_f_el(MASTER(cr) ? field : NULL,
+                      start,homenr,mdatoms->chargeA,x,fr->f_novirsum,
+                      inputrec->ex,inputrec->et,t);
+        }
+
+        /* If we have NoVirSum forces, but we do not calculate the virial,
+         * we sum fr->f_novirsum=f later.
+         */
+        if (vsite && !(fr->bF_NoVirSum && !(flags & GMX_FORCE_VIRIAL)))
+        {
+            wallcycle_start(wcycle,ewcVSITESPREAD);
+            spread_vsite_f(fplog,vsite,x,f,fr->fshift,FALSE,NULL,nrnb,
+                           &top->idef,fr->ePBC,fr->bMolPBC,graph,box,cr);
+            wallcycle_stop(wcycle,ewcVSITESPREAD);
+
+            if (bSepLRF)
+            {
+                wallcycle_start(wcycle,ewcVSITESPREAD);
+                spread_vsite_f(fplog,vsite,x,fr->f_twin,NULL,FALSE,NULL,
+                               nrnb,
+                               &top->idef,fr->ePBC,fr->bMolPBC,graph,box,cr);
+                wallcycle_stop(wcycle,ewcVSITESPREAD);
+            }
+        }
+
+        if (flags & GMX_FORCE_VIRIAL)
         {
-            Ext[m] = 0;
+            /* Calculation of the virial must be done after vsites! */
+            calc_virial(fplog,mdatoms->start,mdatoms->homenr,x,f,
+                        vir_force,graph,box,nrnb,fr,inputrec->ePBC);
         }
     }
-    if (fp != NULL)
+
+    if (inputrec->ePull == epullUMBRELLA || inputrec->ePull == epullCONST_F)
     {
-        fprintf(fp,"%10g  %10g  %10g  %10g #FIELD\n",t,
-                Ext[XX]/FIELDFAC,Ext[YY]/FIELDFAC,Ext[ZZ]/FIELDFAC);
+        pull_potential_wrapper(fplog,bSepDVDL,cr,inputrec,box,x,
+                               f,vir_force,mdatoms,enerd,lambda,t);
     }
-}
-
-static void calc_virial(FILE *fplog,int start,int homenr,rvec x[],rvec f[],
-                       tensor vir_part,t_graph *graph,matrix box,
-                       t_nrnb *nrnb,const t_forcerec *fr,int ePBC)
-{
-  int i,j;
-  tensor virtest;
-
-  /* The short-range virial from surrounding boxes */
-  clear_mat(vir_part);
-  calc_vir(fplog,SHIFTS,fr->shift_vec,fr->fshift,vir_part,ePBC==epbcSCREW,box);
-  inc_nrnb(nrnb,eNR_VIRIAL,SHIFTS);
-
-  /* Calculate partial virial, for local atoms only, based on short range.
-   * Total virial is computed in global_stat, called from do_md
-   */
-  f_calc_vir(fplog,start,start+homenr,x,f,vir_part,graph,box);
-  inc_nrnb(nrnb,eNR_VIRIAL,homenr);
-
-  /* Add position restraint contribution */
-  for(i=0; i<DIM; i++) {
-    vir_part[i][i] += fr->vir_diag_posres[i];
-  }
-
-  /* Add wall contribution */
-  for(i=0; i<DIM; i++) {
-    vir_part[i][ZZ] += fr->vir_wall_z[i];
-  }
-
-  if (debug)
-    pr_rvecs(debug,0,"vir_part",vir_part,DIM);
-}
 
-static void print_large_forces(FILE *fp,t_mdatoms *md,t_commrec *cr,
-                              gmx_large_int_t step,real pforce,rvec *x,rvec *f)
-{
-  int  i;
-  real pf2,fn2;
-  char buf[STEPSTRSIZE];
+    if (PAR(cr) && !(cr->duty & DUTY_PME))
+    {
+        /* In case of node-splitting, the PP nodes receive the long-range 
+         * forces, virial and energy from the PME nodes here.
+         */    
+        pme_receive_force_ener(fplog,bSepDVDL,cr,wcycle,enerd,fr);
+    }
 
-  pf2 = sqr(pforce);
-  for(i=md->start; i<md->start+md->homenr; i++) {
-    fn2 = norm2(f[i]);
-    /* We also catch NAN, if the compiler does not optimize this away. */
-    if (fn2 >= pf2 || fn2 != fn2) {
-      fprintf(fp,"step %s  atom %6d  x %8.3f %8.3f %8.3f  force %12.5e\n",
-             gmx_step_str(step,buf),
-             ddglatnr(cr->dd,i),x[i][XX],x[i][YY],x[i][ZZ],sqrt(fn2));
+    if (bDoForces)
+    {
+        post_process_forces(fplog,cr,step,nrnb,wcycle,
+                            top,box,x,f,vir_force,mdatoms,graph,fr,vsite,
+                            flags);
     }
-  }
+    
+    /* Sum the potential energy terms from group contributions */
+    sum_epot(&(inputrec->opts),enerd);
 }
 
-void do_force(FILE *fplog,t_commrec *cr,
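+/* do_force() implementation for the group cut-off scheme, i.e. the
+ * original neighbor-list based do_force() path.
+ */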
+void do_force_cutsGROUP(FILE *fplog,t_commrec *cr,
               t_inputrec *inputrec,
               gmx_large_int_t step,t_nrnb *nrnb,gmx_wallcycle_t wcycle,
               gmx_localtop_t *top,
@@ -430,10 +1412,10 @@ void do_force(FILE *fplog,t_commrec *cr,
     gmx_bool   bDoLongRange,bDoForces,bSepLRF;
     gmx_bool   bDoAdressWF;
     matrix boxs;
     real   e,v,dvdlambda[efptNR];
-    real   dvdl_dum,lambda_dum;
     t_pbc  pbc;
-    float  cycles_ppdpme,cycles_pme,cycles_seppme,cycles_force;
+    float  cycles_pme,cycles_force;
 
     start  = mdatoms->start;
     homenr = mdatoms->homenr;
@@ -477,68 +1459,71 @@ void do_force(FILE *fplog,t_commrec *cr,
     {
         update_forcerec(fplog,fr,box);
 
-        /* Calculate total (local) dipole moment in a temporary common array.
-         * This makes it possible to sum them over nodes faster.
-         */
-        calc_mu(start,homenr,
-                x,mdatoms->chargeA,mdatoms->chargeB,mdatoms->nChargePerturbed,
-                mu,mu+DIM);
+        if (NEED_MUTOT(*inputrec))
+        {
+            /* Calculate total (local) dipole moment in a temporary common array.
+             * This makes it possible to sum them over nodes faster.
+             */
+            calc_mu(start,homenr,
+                    x,mdatoms->chargeA,mdatoms->chargeB,mdatoms->nChargePerturbed,
+                    mu,mu+DIM);
+        }
     }
 
-  if (fr->ePBC != epbcNONE) {
-    /* Compute shift vectors every step,
-     * because of pressure coupling or box deformation!
-     */
-    if ((flags & GMX_FORCE_DYNAMICBOX) && bStateChanged)
-      calc_shifts(box,fr->shift_vec);
-
-    if (bCalcCGCM) {
-      put_charge_groups_in_box(fplog,cg0,cg1,fr->ePBC,box,
-                              &(top->cgs),x,fr->cg_cm);
-      inc_nrnb(nrnb,eNR_CGCM,homenr);
-      inc_nrnb(nrnb,eNR_RESETX,cg1-cg0);
-    }
-    else if (EI_ENERGY_MINIMIZATION(inputrec->eI) && graph) {
-      unshift_self(graph,box,x);
+    if (fr->ePBC != epbcNONE) { 
+        /* Compute shift vectors every step,
+         * because of pressure coupling or box deformation!
+         */
+        if ((flags & GMX_FORCE_DYNAMICBOX) && bStateChanged)
+            calc_shifts(box,fr->shift_vec);
+
+        if (bCalcCGCM) { 
+            put_charge_groups_in_box(fplog,cg0,cg1,fr->ePBC,box,
+                    &(top->cgs),x,fr->cg_cm);
+            inc_nrnb(nrnb,eNR_CGCM,homenr);
+            inc_nrnb(nrnb,eNR_RESETX,cg1-cg0);
+        } 
+        else if (EI_ENERGY_MINIMIZATION(inputrec->eI) && graph) {
+            unshift_self(graph,box,x);
+        }
+    } 
+    else if (bCalcCGCM) {
+        calc_cgcm(fplog,cg0,cg1,&(top->cgs),x,fr->cg_cm);
+        inc_nrnb(nrnb,eNR_CGCM,homenr);
     }
-  }
-  else if (bCalcCGCM) {
-    calc_cgcm(fplog,cg0,cg1,&(top->cgs),x,fr->cg_cm);
-    inc_nrnb(nrnb,eNR_CGCM,homenr);
-  }
 
-  if (bCalcCGCM) {
-    if (PAR(cr)) {
-      move_cgcm(fplog,cr,fr->cg_cm);
+    if (bCalcCGCM) {
+        if (PAR(cr)) {
+            move_cgcm(fplog,cr,fr->cg_cm);
+        }
+        if (gmx_debug_at)
+            pr_rvecs(debug,0,"cgcm",fr->cg_cm,top->cgs.nr);
     }
-    if (gmx_debug_at)
-      pr_rvecs(debug,0,"cgcm",fr->cg_cm,top->cgs.nr);
-  }
 
 #ifdef GMX_MPI
-  if (!(cr->duty & DUTY_PME)) {
-    /* Send particle coordinates to the pme nodes.
-     * Since this is only implemented for domain decomposition
-     * and domain decomposition does not use the graph,
-     * we do not need to worry about shifting.
-     */
+    if (!(cr->duty & DUTY_PME)) {
+        /* Send particle coordinates to the pme nodes.
+         * Since this is only implemented for domain decomposition
+         * and domain decomposition does not use the graph,
+         * we do not need to worry about shifting.
+         */    
+
+        wallcycle_start(wcycle,ewcPP_PMESENDX);
+        GMX_MPE_LOG(ev_send_coordinates_start);
+
+        bBS = (inputrec->nwall == 2);
+        if (bBS) {
+            copy_mat(box,boxs);
+            svmul(inputrec->wall_ewald_zfac,boxs[ZZ],boxs[ZZ]);
+        }
 
-    wallcycle_start(wcycle,ewcPP_PMESENDX);
-    GMX_MPE_LOG(ev_send_coordinates_start);
+        gmx_pme_send_x(cr,bBS ? boxs : box,x,
+                       mdatoms->nChargePerturbed,lambda[efptCOUL],
+                       (flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY)),step);
 
-    bBS = (inputrec->nwall == 2);
-    if (bBS) {
-      copy_mat(box,boxs);
-      svmul(inputrec->wall_ewald_zfac,boxs[ZZ],boxs[ZZ]);
+        GMX_MPE_LOG(ev_send_coordinates_finish);
+        wallcycle_stop(wcycle,ewcPP_PMESENDX);
     }
-
-    gmx_pme_send_x(cr,bBS ? boxs : box,x,
-                   mdatoms->nChargePerturbed,lambda[efptCOUL],
-                   ( flags & GMX_FORCE_VIRIAL),step);
-
-    GMX_MPE_LOG(ev_send_coordinates_finish);
-    wallcycle_stop(wcycle,ewcPP_PMESENDX);
-  }
 #endif /* GMX_MPI */
 
     /* Communicate coordinates and sum dipole if necessary */
@@ -553,60 +1538,63 @@ void do_force(FILE *fplog,t_commrec *cr,
         {
             move_x(fplog,cr,GMX_LEFT,GMX_RIGHT,x,nrnb);
         }
-        /* When we don't need the total dipole we sum it in global_stat */
-        if (bStateChanged && NEED_MUTOT(*inputrec))
+        wallcycle_stop(wcycle,ewcMOVEX);
+    }
+
+    /* update adress weight beforehand */
+    if(bStateChanged && bDoAdressWF)
+    {
+        /* need pbc for adress weight calculation with pbc_dx */
+        set_pbc(&pbc,inputrec->ePBC,box);
+        if(fr->adress_site == eAdressSITEcog)
         {
-            gmx_sumd(2*DIM,mu,cr);
+            update_adress_weights_cog(top->idef.iparams,top->idef.il,x,fr,mdatoms,
+                                      inputrec->ePBC==epbcNONE ? NULL : &pbc);
+        }
+        else if (fr->adress_site == eAdressSITEcom)
+        {
+            update_adress_weights_com(fplog,cg0,cg1,&(top->cgs),x,fr,mdatoms,
+                                      inputrec->ePBC==epbcNONE ? NULL : &pbc);
+        }
+        else if (fr->adress_site == eAdressSITEatomatom){
+            update_adress_weights_atom_per_atom(cg0,cg1,&(top->cgs),x,fr,mdatoms,
+                                                inputrec->ePBC==epbcNONE ? NULL : &pbc);
+        }
+        else
+        {
+            update_adress_weights_atom(cg0,cg1,&(top->cgs),x,fr,mdatoms,
+                                       inputrec->ePBC==epbcNONE ? NULL : &pbc);
         }
-        wallcycle_stop(wcycle,ewcMOVEX);
     }
-    if (bStateChanged)
+
+    if (NEED_MUTOT(*inputrec))
     {
 
-        /* update adress weight beforehand */
-        if(bDoAdressWF)
+        if (bStateChanged)
         {
-            /* need pbc for adress weight calculation with pbc_dx */
-            set_pbc(&pbc,inputrec->ePBC,box);
-            if(fr->adress_site == eAdressSITEcog)
-            {
-                update_adress_weights_cog(top->idef.iparams,top->idef.il,x,fr,mdatoms,
-                                          inputrec->ePBC==epbcNONE ? NULL : &pbc);
-            }
-            else if (fr->adress_site == eAdressSITEcom)
+            if (PAR(cr))
             {
-                update_adress_weights_com(fplog,cg0,cg1,&(top->cgs),x,fr,mdatoms,
-                                          inputrec->ePBC==epbcNONE ? NULL : &pbc);
-            }
-            else if (fr->adress_site == eAdressSITEatomatom){
-                update_adress_weights_atom_per_atom(cg0,cg1,&(top->cgs),x,fr,mdatoms,
-                                          inputrec->ePBC==epbcNONE ? NULL : &pbc);
+                gmx_sumd(2*DIM,mu,cr);
             }
-            else
+            for(i=0; i<2; i++)
             {
-                update_adress_weights_atom(cg0,cg1,&(top->cgs),x,fr,mdatoms,
-                                           inputrec->ePBC==epbcNONE ? NULL : &pbc);
+                for(j=0;j<DIM;j++)
+                {
+                    fr->mu_tot[i][j] = mu[i*DIM + j];
+                }
             }
         }
-
-        for(i=0; i<2; i++)
+        if (fr->efep == efepNO)
         {
-            for(j=0;j<DIM;j++)
-            {
-                fr->mu_tot[i][j] = mu[i*DIM + j];
-            }
+            copy_rvec(fr->mu_tot[0],mu_tot);
         }
-    }
-    if (fr->efep == efepNO)
-    {
-        copy_rvec(fr->mu_tot[0],mu_tot);
-    }
-    else
-    {
-        for(j=0; j<DIM; j++)
+        else
         {
-            mu_tot[j] =
-                (1.0 - lambda[efptCOUL])*fr->mu_tot[0][j] + lambda[efptCOUL]*fr->mu_tot[1][j];
+            for(j=0; j<DIM; j++)
+            {
+                mu_tot[j] =
+                    (1.0 - lambda[efptCOUL])*fr->mu_tot[0][j] + lambda[efptCOUL]*fr->mu_tot[1][j];
+            }
         }
     }
 
@@ -680,7 +1668,7 @@ void do_force(FILE *fplog,t_commrec *cr,
      * since that will interfere with the dynamic load balancing.
      */
     wallcycle_start(wcycle,ewcFORCE);
-
+    
     if (bDoForces)
     {
         /* Reset forces for which the virial is calculated separately:
@@ -742,54 +1730,19 @@ void do_force(FILE *fplog,t_commrec *cr,
 
     if ((flags & GMX_FORCE_BONDED) && top->idef.il[F_POSRES].nr > 0)
     {
-        /* Position restraints always require full pbc. Check if we already did it for Adress */
-        if(!(bStateChanged && bDoAdressWF))
-        {
-            set_pbc(&pbc,inputrec->ePBC,box);
-        }
-        v = posres(top->idef.il[F_POSRES].nr,top->idef.il[F_POSRES].iatoms,
-                   top->idef.iparams_posres,
-                   (const rvec*)x,fr->f_novirsum,fr->vir_diag_posres,
-                   inputrec->ePBC==epbcNONE ? NULL : &pbc,lambda[efptRESTRAINT],&(dvdlambda[efptRESTRAINT]),
-                   fr->rc_scaling,fr->ePBC,fr->posres_com,fr->posres_comB);
-        if (bSepDVDL)
-        {
-            fprintf(fplog,sepdvdlformat,
-                    interaction_function[F_POSRES].longname,v,dvdlambda);
-        }
-        enerd->term[F_POSRES] += v;
-        /* This linear lambda dependence assumption is only correct
-         * when only k depends on lambda,
-         * not when the reference position depends on lambda.
-         * grompp checks for this.  (verify this is still the case?)
-         */
-        enerd->dvdl_nonlin[efptRESTRAINT] += dvdlambda[efptRESTRAINT]; /* if just the force constant changes, this is linear,
-                                                                          but we can't be sure w/o additional checking that is
-                                                                          hard to do at this level of code. Otherwise,
-                                                                          the dvdl is not differentiable */
-        inc_nrnb(nrnb,eNR_POSRES,top->idef.il[F_POSRES].nr/2);
-        if ((inputrec->fepvals->n_lambda > 0) && (flags & GMX_FORCE_DHDL))
-        {
-            for(i=0; i<enerd->n_lambda; i++)
-            {
-                lambda_dum = (i==0 ? lambda[efptRESTRAINT] : inputrec->fepvals->all_lambda[efptRESTRAINT][i-1]);
-                v = posres(top->idef.il[F_POSRES].nr,top->idef.il[F_POSRES].iatoms,
-                           top->idef.iparams_posres,
-                           (const rvec*)x,NULL,NULL,
-                           inputrec->ePBC==epbcNONE ? NULL : &pbc,lambda_dum,&dvdl_dum,
-                           fr->rc_scaling,fr->ePBC,fr->posres_com,fr->posres_comB);
-                enerd->enerpart_lambda[i] += v;
-            }
-        }
-   }
+        posres_wrapper(fplog,flags,bSepDVDL,inputrec,nrnb,top,box,x,
+                       f,enerd,lambda,fr);
+    }
 
     /* Compute the bonded and non-bonded energies and optionally forces */
     do_force_lowlevel(fplog,step,fr,inputrec,&(top->idef),
                       cr,nrnb,wcycle,mdatoms,&(inputrec->opts),
                       x,hist,f,enerd,fcd,mtop,top,fr->born,
                       &(top->atomtypes),bBornRadii,box,
-                      inputrec->fepvals,lambda,graph,&(top->excls),fr->mu_tot,
-                      flags,&cycles_pme);
+                      inputrec->fepvals,lambda,
+                      graph,&(top->excls),fr->mu_tot,
+                      flags,
+                      &cycles_pme);
 
     cycles_force = wallcycle_stop(wcycle,ewcFORCE);
     GMX_BARRIER(cr->mpi_comm_mygroup);
@@ -891,24 +1844,10 @@ void do_force(FILE *fplog,t_commrec *cr,
         }
     }
 
-    enerd->term[F_COM_PULL] = 0;
     if (inputrec->ePull == epullUMBRELLA || inputrec->ePull == epullCONST_F)
     {
-        /* Calculate the center of mass forces, this requires communication,
-         * which is why pull_potential is called close to other communication.
-         * The virial contribution is calculated directly,
-         * which is why we call pull_potential after calc_virial.
-         */
-        set_pbc(&pbc,inputrec->ePBC,box);
-        dvdlambda[efptRESTRAINT] = 0;
-        enerd->term[F_COM_PULL] +=
-            pull_potential(inputrec->ePull,inputrec->pull,mdatoms,&pbc,
-                           cr,t,lambda[efptRESTRAINT],x,f,vir_force,&(dvdlambda[efptRESTRAINT]));
-        if (bSepDVDL)
-        {
-            fprintf(fplog,sepdvdlformat,"Com pull",enerd->term[F_COM_PULL],dvdlambda[efptRESTRAINT]);
-        }
-        enerd->dvdl_lin[efptRESTRAINT] += dvdlambda[efptRESTRAINT];
+        pull_potential_wrapper(fplog,bSepDVDL,cr,inputrec,box,x,
+                               f,vir_force,mdatoms,enerd,lambda,t);
     }
 
     /* Add the forces from enforced rotation potentials (if any) */
@@ -921,76 +1860,86 @@ void do_force(FILE *fplog,t_commrec *cr,
 
     if (PAR(cr) && !(cr->duty & DUTY_PME))
     {
-        cycles_ppdpme = wallcycle_stop(wcycle,ewcPPDURINGPME);
-        dd_cycles_add(cr->dd,cycles_ppdpme,ddCyclPPduringPME);
-
-        /* In case of node-splitting, the PP nodes receive the long-range
+        /* In case of node-splitting, the PP nodes receive the long-range 
          * forces, virial and energy from the PME nodes here.
          */
-        wallcycle_start(wcycle,ewcPP_PMEWAITRECVF);
-        dvdlambda[efptCOUL] = 0;
-        gmx_pme_receive_f(cr,fr->f_novirsum,fr->vir_el_recip,&e,&dvdlambda[efptCOUL],
-                          &cycles_seppme);
-        if (bSepDVDL)
-        {
-            fprintf(fplog,sepdvdlformat,"PME mesh",e,dvdlambda[efptCOUL]);
-        }
-        enerd->term[F_COUL_RECIP] += e;
-        enerd->dvdl_lin[efptCOUL] += dvdlambda[efptCOUL];
-        if (wcycle)
-        {
-            dd_cycles_add(cr->dd,cycles_seppme,ddCyclPME);
-        }
-        wallcycle_stop(wcycle,ewcPP_PMEWAITRECVF);
+        pme_receive_force_ener(fplog,bSepDVDL,cr,wcycle,enerd,fr);
     }
 
-    if (bDoForces && fr->bF_NoVirSum)
+    if (bDoForces)
     {
-        if (vsite)
-        {
-            /* Spread the mesh force on virtual sites to the other particles...
-             * This is parallellized. MPI communication is performed
-             * if the constructing atoms aren't local.
-             */
-            wallcycle_start(wcycle,ewcVSITESPREAD);
-            spread_vsite_f(fplog,vsite,x,fr->f_novirsum,NULL,
-                           (flags & GMX_FORCE_VIRIAL),fr->vir_el_recip,
-                           nrnb,
-                           &top->idef,fr->ePBC,fr->bMolPBC,graph,box,cr);
-            wallcycle_stop(wcycle,ewcVSITESPREAD);
-        }
-        if (flags & GMX_FORCE_VIRIAL)
-        {
-            /* Now add the forces, this is local */
-            if (fr->bDomDec)
-            {
-                sum_forces(0,fr->f_novirsum_n,f,fr->f_novirsum);
-            }
-            else
-            {
-                sum_forces(start,start+homenr,f,fr->f_novirsum);
-            }
-            if (EEL_FULL(fr->eeltype))
-            {
-                /* Add the mesh contribution to the virial */
-                m_add(vir_force,fr->vir_el_recip,vir_force);
-            }
-            if (debug)
-            {
-                pr_rvecs(debug,0,"vir_force",vir_force,DIM);
-            }
-        }
+        post_process_forces(fplog,cr,step,nrnb,wcycle,
+                            top,box,x,f,vir_force,mdatoms,graph,fr,vsite,
+                            flags);
     }
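
The post_process_forces() wrapper presumably absorbs both the fr->bF_NoVirSum
test dropped from the condition above and the print_large_forces() call
removed further below; a sketch under those assumptions, with mdatoms->start
and mdatoms->homenr standing in for the old start/homenr locals:

static void post_process_forces(FILE *fplog, t_commrec *cr,
                                gmx_large_int_t step,
                                t_nrnb *nrnb, gmx_wallcycle_t wcycle,
                                gmx_localtop_t *top,
                                matrix box, rvec x[], rvec f[],
                                tensor vir_force, t_mdatoms *mdatoms,
                                t_graph *graph,
                                t_forcerec *fr, gmx_vsite_t *vsite,
                                int flags)
{
    if (fr->bF_NoVirSum)
    {
        if (vsite)
        {
            /* Spread the mesh force on virtual sites to the other particles.
             * This is parallelized; MPI communication is performed
             * if the constructing atoms aren't local. */
            wallcycle_start(wcycle, ewcVSITESPREAD);
            spread_vsite_f(fplog, vsite, x, fr->f_novirsum, NULL,
                           (flags & GMX_FORCE_VIRIAL), fr->vir_el_recip,
                           nrnb, &top->idef, fr->ePBC, fr->bMolPBC,
                           graph, box, cr);
            wallcycle_stop(wcycle, ewcVSITESPREAD);
        }
        if (flags & GMX_FORCE_VIRIAL)
        {
            /* Now add the forces; this is local */
            if (fr->bDomDec)
            {
                sum_forces(0, fr->f_novirsum_n, f, fr->f_novirsum);
            }
            else
            {
                sum_forces(mdatoms->start, mdatoms->start + mdatoms->homenr,
                           f, fr->f_novirsum);
            }
            if (EEL_FULL(fr->eeltype))
            {
                /* Add the mesh contribution to the virial */
                m_add(vir_force, fr->vir_el_recip, vir_force);
            }
        }
    }

    if (fr->print_force >= 0)
    {
        print_large_forces(stderr, mdatoms, cr, step, fr->print_force, x, f);
    }
}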
 
     /* Sum the potential energy terms from group contributions */
     sum_epot(&(inputrec->opts),enerd);
+}
+
+void do_force(FILE *fplog,t_commrec *cr,
+              t_inputrec *inputrec,
+              gmx_large_int_t step,t_nrnb *nrnb,gmx_wallcycle_t wcycle,
+              gmx_localtop_t *top,
+              gmx_mtop_t *mtop,
+              gmx_groups_t *groups,
+              matrix box,rvec x[],history_t *hist,
+              rvec f[],
+              tensor vir_force,
+              t_mdatoms *mdatoms,
+              gmx_enerdata_t *enerd,t_fcdata *fcd,
+              real *lambda,t_graph *graph,
+              t_forcerec *fr,
+              gmx_vsite_t *vsite,rvec mu_tot,
+              double t,FILE *field,gmx_edsam_t ed,
+              gmx_bool bBornRadii,
+              int flags)
+{
+    /* clear the non-bonded force flag if non-bonded interactions are disabled */
+    if (!fr->bNonbonded)
+    {
+        flags &= ~GMX_FORCE_NONBONDED;
+    }
 
-    if (fr->print_force >= 0 && bDoForces)
+    switch (inputrec->cutoff_scheme)
     {
-        print_large_forces(stderr,mdatoms,cr,step,fr->print_force,x,f);
+        case ecutsVERLET:
+            do_force_cutsVERLET(fplog, cr, inputrec,
+                                step, nrnb, wcycle,
+                                top, mtop,
+                                groups,
+                                box, x, hist,
+                                f, vir_force,
+                                mdatoms,
+                                enerd, fcd,
+                                lambda, graph,
+                                fr, fr->ic,
+                                vsite, mu_tot,
+                                t, field, ed,
+                                bBornRadii,
+                                flags);
+            break;
+        case ecutsGROUP:
+             do_force_cutsGROUP(fplog, cr, inputrec,
+                                step, nrnb, wcycle,
+                                top, mtop,
+                                groups,
+                                box, x, hist,
+                                f, vir_force,
+                                mdatoms,
+                                enerd, fcd,
+                                lambda, graph,
+                                fr, vsite, mu_tot,
+                                t, field, ed,
+                                bBornRadii,
+                                flags);
+            break;
+        default:
+            gmx_incons("Invalid cut-off scheme passed!");
     }
 }
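
With this split, do_force() reduces to the dispatch above on
inputrec->cutoff_scheme, so callers are unaffected by which non-bonded path
runs. The scheme is selected in the .mdp input, for example:

    cutoff-scheme = Verlet   ; new NxN pair-list kernels (CPU SIMD or GPU)
    cutoff-scheme = group    ; original charge-group kernels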
 
+
 void do_constrain_first(FILE *fplog,gmx_constr_t constr,
                         t_inputrec *ir,t_mdatoms *md,
                         t_state *state,rvec *f,
@@ -1025,8 +1974,10 @@ void do_constrain_first(FILE *fplog,gmx_constr_t constr,
     constrain(NULL,TRUE,FALSE,constr,&(top->idef),
               ir,NULL,cr,step,0,md,
               state->x,state->x,NULL,
-              state->box,state->lambda[efptBONDED],&dvdl_dum,
-              NULL,NULL,nrnb,econqCoord,ir->epc==epcMTTK,state->veta,state->veta);
+              fr->bMolPBC,state->box,
+              state->lambda[efptBONDED],&dvdl_dum,
+              NULL,NULL,nrnb,econqCoord,
+              ir->epc==epcMTTK,state->veta,state->veta);
     if (EI_VV(ir->eI))
     {
         /* constrain the initial velocity, and save it */
@@ -1035,8 +1986,10 @@ void do_constrain_first(FILE *fplog,gmx_constr_t constr,
         constrain(NULL,TRUE,FALSE,constr,&(top->idef),
                   ir,NULL,cr,step,0,md,
                   state->x,state->v,state->v,
-                  state->box,state->lambda[efptBONDED],&dvdl_dum,
-                  NULL,NULL,nrnb,econqVeloc,ir->epc==epcMTTK,state->veta,state->veta);
+                  fr->bMolPBC,state->box,
+                  state->lambda[efptBONDED],&dvdl_dum,
+                  NULL,NULL,nrnb,econqVeloc,
+                  ir->epc==epcMTTK,state->veta,state->veta);
     }
     /* constrain the initial velocities at t-dt/2 */
     if (EI_STATE_VELOCITY(ir->eI) && ir->eI!=eiVV)
@@ -1064,9 +2017,11 @@ void do_constrain_first(FILE *fplog,gmx_constr_t constr,
         constrain(NULL,TRUE,FALSE,constr,&(top->idef),
                   ir,NULL,cr,step,-1,md,
                   state->x,savex,NULL,
-                  state->box,state->lambda[efptBONDED],&dvdl_dum,
-                  state->v,NULL,nrnb,econqCoord,ir->epc==epcMTTK,state->veta,state->veta);
-
+                  fr->bMolPBC,state->box,
+                  state->lambda[efptBONDED],&dvdl_dum,
+                  state->v,NULL,nrnb,econqCoord,
+                  ir->epc==epcMTTK,state->veta,state->veta);
+
         for(i=start; i<end; i++) {
             for(m=0; m<DIM; m++) {
                 /* Re-reverse the velocities */
@@ -1190,8 +2145,15 @@ void calc_enervirdiff(FILE *fplog,int eDispCorr,t_forcerec *fr)
                "WARNING: using dispersion correction with user tables\n");
       rc3  = fr->rvdw*fr->rvdw*fr->rvdw;
       rc9  = rc3*rc3*rc3;
+      /* Contribution beyond the cut-off */
       eners[0] += -4.0*M_PI/(3.0*rc3);
       eners[1] +=  4.0*M_PI/(9.0*rc9);
+      if (fr->cutoff_scheme == ecutsVERLET && fr->ic->sh_invrc6 != 0) {
+          /* Contribution within the cut-off */
+          eners[0] += -4.0*M_PI/(3.0*rc3);
+          eners[1] +=  4.0*M_PI/(3.0*rc9);
+      }
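
A quick check of the constants above (assuming g(r) = 1 and unit C6/C12):
beyond the cut-off,

    Int_rc^inf 4*pi*r^2 * (-r^-6 ) dr = -4*pi/(3*rc^3)
    Int_rc^inf 4*pi*r^2 * ( r^-12) dr =  4*pi/(9*rc^9)

while within the cut-off a potential-shift kernel evaluates V(r) - V(rc), so
the correction adds back V(rc) integrated over the cut-off sphere:

    (-rc^-6 ) * (4*pi/3)*rc^3 = -4*pi/(3*rc^3)
    ( rc^-12) * (4*pi/3)*rc^3 =  4*pi/(3*rc^9)

A constant shift leaves the force unchanged, which is why virs[] gains no
within-cut-off term.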
+      /* Contribution beyond the cut-off */
       virs[0]  +=  8.0*M_PI/rc3;
       virs[1]  += -16.0*M_PI/(3.0*rc9);
     } else {
@@ -1399,37 +2361,54 @@ void finish_run(FILE *fplog,t_commrec *cr,const char *confout,
                 t_inputrec *inputrec,
                 t_nrnb nrnb[],gmx_wallcycle_t wcycle,
                 gmx_runtime_t *runtime,
+                wallclock_gpu_t *gputimes,
+                int omp_nth_pp,
                 gmx_bool bWriteStat)
 {
-  int    i,j;
-  t_nrnb *nrnb_tot=NULL;
-  real   delta_t;
-  double nbfs,mflop;
-  double cycles[ewcNR];
+    int    i,j;
+    t_nrnb *nrnb_tot=NULL;
+    real   delta_t;
+    double nbfs,mflop;
 
-  wallcycle_sum(cr,wcycle,cycles);
+    wallcycle_sum(cr,wcycle);
 
-  if (cr->nnodes > 1) {
-    if (SIMMASTER(cr))
-      snew(nrnb_tot,1);
+    if (cr->nnodes > 1)
+    {
+        snew(nrnb_tot,1);
 #ifdef GMX_MPI
-    MPI_Reduce(nrnb->n,nrnb_tot->n,eNRNB,MPI_DOUBLE,MPI_SUM,
-               MASTERRANK(cr),cr->mpi_comm_mysim);
+        MPI_Allreduce(nrnb->n,nrnb_tot->n,eNRNB,MPI_DOUBLE,MPI_SUM,
+                      cr->mpi_comm_mysim);
 #endif
-  } else {
-    nrnb_tot = nrnb;
-  }
+    }
+    else
+    {
+        nrnb_tot = nrnb;
+    }
 
-  if (SIMMASTER(cr)) {
-    print_flop(fplog,nrnb_tot,&nbfs,&mflop);
-    if (cr->nnodes > 1) {
-      sfree(nrnb_tot);
+#if defined(GMX_MPI) && !defined(GMX_THREAD_MPI)
+    if (cr->nnodes > 1)
+    {
+        /* reduce proctime over all MPI processes in the current simulation */
+        double sum;
+        MPI_Allreduce(&runtime->proctime,&sum,1,MPI_DOUBLE,MPI_SUM,
+                      cr->mpi_comm_mysim);
+        runtime->proctime = sum;
     }
-  }
+#endif
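
Note that both reductions above use MPI_Allreduce where the old code used
MPI_Reduce to the master: the summed counters end up on every rank, which is
consistent with nrnb_tot now being allocated and freed on all ranks rather
than only under SIMMASTER.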
 
-  if ((cr->duty & DUTY_PP) && DOMAINDECOMP(cr)) {
-    print_dd_statistics(cr,inputrec,fplog);
-  }
+    if (SIMMASTER(cr))
+    {
+        print_flop(fplog,nrnb_tot,&nbfs,&mflop);
+    }
+    if (cr->nnodes > 1)
+    {
+        sfree(nrnb_tot);
+    }
+
+    if ((cr->duty & DUTY_PP) && DOMAINDECOMP(cr))
+    {
+        print_dd_statistics(cr,inputrec,fplog);
+    }
 
 #ifdef GMX_MPI
     if (PARTDECOMP(cr))
@@ -1458,44 +2437,35 @@ void finish_run(FILE *fplog,t_commrec *cr,const char *confout,
     }
 #endif
 
-  if (SIMMASTER(cr)) {
-    wallcycle_print(fplog,cr->nnodes,cr->npmenodes,runtime->realtime,
-                    wcycle,cycles);
-
-    if (EI_DYNAMICS(inputrec->eI)) {
-      delta_t = inputrec->delta_t;
-    } else {
-      delta_t = 0;
-    }
+    if (SIMMASTER(cr))
+    {
+        wallcycle_print(fplog,cr->nnodes,cr->npmenodes,runtime->realtime,
+                        wcycle,gputimes);
 
-    if (fplog) {
-        print_perf(fplog,runtime->proctime,runtime->realtime,
-                   cr->nnodes-cr->npmenodes,
-                   runtime->nsteps_done,delta_t,nbfs,mflop);
-    }
-    if (bWriteStat) {
-        print_perf(stderr,runtime->proctime,runtime->realtime,
-                   cr->nnodes-cr->npmenodes,
-                   runtime->nsteps_done,delta_t,nbfs,mflop);
-    }
+        if (EI_DYNAMICS(inputrec->eI))
+        {
+            delta_t = inputrec->delta_t;
+        }
+        else
+        {
+            delta_t = 0;
+        }
 
-    /*
-    runtime=inputrec->nsteps*inputrec->delta_t;
-    if (bWriteStat) {
-      if (cr->nnodes == 1)
-       fprintf(stderr,"\n\n");
-      print_perf(stderr,nodetime,realtime,runtime,&ntot,
-                cr->nnodes-cr->npmenodes,FALSE);
+        if (fplog)
+        {
+            print_perf(fplog,runtime->proctime,runtime->realtime,
+                       cr->nnodes-cr->npmenodes,
+                       runtime->nsteps_done,delta_t,nbfs,mflop,
+                       omp_nth_pp);
+        }
+        if (bWriteStat)
+        {
+            print_perf(stderr,runtime->proctime,runtime->realtime,
+                       cr->nnodes-cr->npmenodes,
+                       runtime->nsteps_done,delta_t,nbfs,mflop,
+                       omp_nth_pp);
+        }
     }
-    wallcycle_print(fplog,cr->nnodes,cr->npmenodes,realtime,wcycle,cycles);
-    print_perf(fplog,nodetime,realtime,runtime,&ntot,cr->nnodes-cr->npmenodes,
-              TRUE);
-    if (PARTDECOMP(cr))
-      pr_load(fplog,cr,nrnb_all);
-    if (cr->nnodes > 1)
-      sfree(nrnb_all);
-    */
-  }
 }
 
 extern void initialize_lambdas(FILE *fplog,t_inputrec *ir,int *fep_state,real *lambda,double *lam0)
@@ -1641,6 +2611,3 @@ void init_md(FILE *fplog,
     debug_gmx();
 }
 
-
-
-