Merge branch release-5-0 into release-5-1

author Mark Abraham <mark.j.abraham@gmail.com>

Fri, 11 Mar 2016 00:37:02 +0000 (01:37 +0100)

committer Mark Abraham <mark.j.abraham@gmail.com>

Fri, 11 Mar 2016 00:37:55 +0000 (01:37 +0100)
author Mark Abraham <mark.j.abraham@gmail.com>
Fri, 11 Mar 2016 00:37:02 +0000 (01:37 +0100)
committer Mark Abraham <mark.j.abraham@gmail.com>
Fri, 11 Mar 2016 00:37:55 +0000 (01:37 +0100)
diff --combined src/gromacs/mdlib/sim_util.cpp

index b2d98a11a017e75e3b4021aab4b9b7b85371b075,13b8fc7a9a81dc5e5b800c4231fd68ffa7756040..ac13d64b9392aef451aa29cd444b2331a266b1af
--- 1/src/gromacs/mdlib/sim_util.cpp
--- 2/src/gromacs/mdlib/sim_util.c
+++ b/src/gromacs/mdlib/sim_util.cpp
@@@ -3,7 -3,7 +3,7 @@@
    *
    * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
    * Copyright (c) 2001-2004, The GROMACS development team.
-  * Copyright (c) 2013,2014,2015, by the GROMACS development team, led by
+  * Copyright (c) 2013,2014,2015,2016, by the GROMACS development team, led by
    * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
    * and including many others, as listed in the AUTHORS file in the
    * top-level source directory and at http://www.gromacs.org.
@@@ -34,69 -34,70 +34,69 @@@
    * To help us fund GROMACS development, we humbly ask that you cite
    * the research papers on the package. Check out http://www.gromacs.org.
    */
- -#ifdef HAVE_CONFIG_H
- -#include <config.h>
- -#endif
+ +#include "gmxpre.h"
+ +
+ +#include "gromacs/legacyheaders/sim_util.h"
+ +
+ +#include "config.h"
   
   #include <assert.h>
   #include <math.h>
   #include <stdio.h>
   #include <string.h>
- -#ifdef HAVE_SYS_TIME_H
- -#include <sys/time.h>
- -#endif
- -
- -#include "typedefs.h"
- -#include "gromacs/utility/cstringutil.h"
- -#include "gromacs/utility/smalloc.h"
- -#include "names.h"
- -#include "txtdump.h"
- -#include "pbc.h"
- -#include "chargegroup.h"
- -#include "vec.h"
- -#include "nrnb.h"
- -#include "mshift.h"
- -#include "mdrun.h"
- -#include "sim_util.h"
- -#include "update.h"
- -#include "physics.h"
- -#include "main.h"
- -#include "mdatoms.h"
- -#include "force.h"
- -#include "bondf.h"
- -#include "pme.h"
- -#include "disre.h"
- -#include "orires.h"
- -#include "network.h"
- -#include "calcmu.h"
- -#include "constr.h"
- -#include "xvgr.h"
- -#include "copyrite.h"
- -#include "domdec.h"
- -#include "genborn.h"
- -#include "nbnxn_atomdata.h"
- -#include "nbnxn_search.h"
- -#include "nbnxn_kernels/nbnxn_kernel_ref.h"
- -#include "nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn.h"
- -#include "nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn.h"
- -#include "nbnxn_kernels/nbnxn_kernel_gpu_ref.h"
- -#include "nonbonded.h"
- -#include "../gmxlib/nonbonded/nb_kernel.h"
- -#include "../gmxlib/nonbonded/nb_free_energy.h"
   
- -#include "gromacs/timing/wallcycle.h"
- -#include "gromacs/timing/walltime_accounting.h"
- -#include "gromacs/utility/gmxmpi.h"
+ +#include "gromacs/domdec/domdec.h"
   #include "gromacs/essentialdynamics/edsam.h"
+ +#include "gromacs/ewald/pme.h"
+ +#include "gromacs/gmxlib/nonbonded/nb_free_energy.h"
+ +#include "gromacs/gmxlib/nonbonded/nb_kernel.h"
+ +#include "gromacs/imd/imd.h"
+ +#include "gromacs/legacyheaders/calcmu.h"
+ +#include "gromacs/legacyheaders/chargegroup.h"
+ +#include "gromacs/legacyheaders/constr.h"
+ +#include "gromacs/legacyheaders/copyrite.h"
+ +#include "gromacs/legacyheaders/disre.h"
+ +#include "gromacs/legacyheaders/force.h"
+ +#include "gromacs/legacyheaders/genborn.h"
+ +#include "gromacs/legacyheaders/gmx_omp_nthreads.h"
+ +#include "gromacs/legacyheaders/mdatoms.h"
+ +#include "gromacs/legacyheaders/mdrun.h"
+ +#include "gromacs/legacyheaders/names.h"
+ +#include "gromacs/legacyheaders/network.h"
+ +#include "gromacs/legacyheaders/nonbonded.h"
+ +#include "gromacs/legacyheaders/nrnb.h"
+ +#include "gromacs/legacyheaders/orires.h"
+ +#include "gromacs/legacyheaders/qmmm.h"
+ +#include "gromacs/legacyheaders/txtdump.h"
+ +#include "gromacs/legacyheaders/typedefs.h"
+ +#include "gromacs/legacyheaders/update.h"
+ +#include "gromacs/legacyheaders/types/commrec.h"
+ +#include "gromacs/listed-forces/bonded.h"
+ +#include "gromacs/math/units.h"
+ +#include "gromacs/math/vec.h"
+ +#include "gromacs/mdlib/nb_verlet.h"
+ +#include "gromacs/mdlib/nbnxn_atomdata.h"
+ +#include "gromacs/mdlib/nbnxn_gpu_data_mgmt.h"
+ +#include "gromacs/mdlib/nbnxn_search.h"
+ +#include "gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_gpu_ref.h"
+ +#include "gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_ref.h"
+ +#include "gromacs/mdlib/nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn.h"
+ +#include "gromacs/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn.h"
+ +#include "gromacs/pbcutil/ishift.h"
+ +#include "gromacs/pbcutil/mshift.h"
+ +#include "gromacs/pbcutil/pbc.h"
   #include "gromacs/pulling/pull.h"
   #include "gromacs/pulling/pull_rotation.h"
- -#include "gromacs/imd/imd.h"
- -#include "adress.h"
- -#include "qmmm.h"
- -
- -#include "gmx_omp_nthreads.h"
+ +#include "gromacs/timing/gpu_timing.h"
+ +#include "gromacs/timing/wallcycle.h"
+ +#include "gromacs/timing/walltime_accounting.h"
+ +#include "gromacs/utility/cstringutil.h"
+ +#include "gromacs/utility/gmxmpi.h"
+ +#include "gromacs/utility/smalloc.h"
+ +#include "gromacs/utility/sysinfo.h"
   
- -#include "nbnxn_cuda_data_mgmt.h"
- -#include "nbnxn_cuda/nbnxn_cuda.h"
+ +#include "adress.h"
+ +#include "nbnxn_gpu.h"
   
   void print_time(FILE                     *out,
                   gmx_walltime_accounting_t walltime_accounting,
@@@ -212,9 -213,11 +212,9 @@@ static void sum_forces(int start, int e
    * force is kJ mol^-1 nm^-1 = e * kJ mol^-1 nm^-1 / e
    *
    * Et[] contains the parameters for the time dependent
- - * part of the field (not yet used).
+ + * part of the field.
    * Ex[] contains the parameters for
- - * the spatial dependent part of the field. You can have cool periodic
- - * fields in principle, but only a constant field is supported
- - * now.
+ + * the spatial dependent part of the field.
    * The function should return the energy due to the electric field
    * (if any) but for now returns 0.
    *
@@@ -225,7 -228,7 +225,7 @@@
    * For neutral systems with many charged molecules the error is small.
    * But for systems with a net charge or a few charged molecules
    * the error can be significant when the field is high.
- - * Solution: implement a self-consitent electric field into PME.
+ + * Solution: implement a self-consistent electric field into PME.
    */
   static void calc_f_el(FILE *fp, int  start, int homenr,
                         real charge[], rvec f[],
@@@ -278,7 -281,8 +278,7 @@@ static void calc_virial(int start, int 
                           tensor vir_part, t_graph *graph, matrix box,
                           t_nrnb *nrnb, const t_forcerec *fr, int ePBC)
   {
- -    int    i, j;
- -    tensor virtest;
+ +    int    i;
   
       /* The short-range virial from surrounding boxes */
       clear_mat(vir_part);
@@@ -309,7 -313,82 +309,7 @@@
       }
   }
   
- -static void posres_wrapper(FILE *fplog,
- -                           int flags,
- -                           gmx_bool bSepDVDL,
- -                           t_inputrec *ir,
- -                           t_nrnb *nrnb,
- -                           gmx_localtop_t *top,
- -                           matrix box, rvec x[],
- -                           gmx_enerdata_t *enerd,
- -                           real *lambda,
- -                           t_forcerec *fr)
- -{
- -    t_pbc pbc;
- -    real  v, dvdl;
- -    int   i;
- -
- -    /* Position restraints always require full pbc */
- -    set_pbc(&pbc, ir->ePBC, box);
- -    dvdl = 0;
- -    v    = posres(top->idef.il[F_POSRES].nr, top->idef.il[F_POSRES].iatoms,
- -                  top->idef.iparams_posres,
- -                  (const rvec*)x, fr->f_novirsum, fr->vir_diag_posres,
- -                  ir->ePBC == epbcNONE ? NULL : &pbc,
- -                  lambda[efptRESTRAINT], &dvdl,
- -                  fr->rc_scaling, fr->ePBC, fr->posres_com, fr->posres_comB);
- -    if (bSepDVDL)
- -    {
- -        gmx_print_sepdvdl(fplog, interaction_function[F_POSRES].longname, v, dvdl);
- -    }
- -    enerd->term[F_POSRES] += v;
- -    /* If just the force constant changes, the FEP term is linear,
- -     * but if k changes, it is not.
- -     */
- -    enerd->dvdl_nonlin[efptRESTRAINT] += dvdl;
- -    inc_nrnb(nrnb, eNR_POSRES, top->idef.il[F_POSRES].nr/2);
- -
- -    if ((ir->fepvals->n_lambda > 0) && (flags & GMX_FORCE_DHDL))
- -    {
- -        for (i = 0; i < enerd->n_lambda; i++)
- -        {
- -            real dvdl_dum, lambda_dum;
- -
- -            lambda_dum = (i == 0 ? lambda[efptRESTRAINT] : ir->fepvals->all_lambda[efptRESTRAINT][i-1]);
- -            v          = posres(top->idef.il[F_POSRES].nr, top->idef.il[F_POSRES].iatoms,
- -                                top->idef.iparams_posres,
- -                                (const rvec*)x, NULL, NULL,
- -                                ir->ePBC == epbcNONE ? NULL : &pbc, lambda_dum, &dvdl,
- -                                fr->rc_scaling, fr->ePBC, fr->posres_com, fr->posres_comB);
- -            enerd->enerpart_lambda[i] += v;
- -        }
- -    }
- -}
- -
- -static void fbposres_wrapper(t_inputrec *ir,
- -                             t_nrnb *nrnb,
- -                             gmx_localtop_t *top,
- -                             matrix box, rvec x[],
- -                             gmx_enerdata_t *enerd,
- -                             t_forcerec *fr)
- -{
- -    t_pbc pbc;
- -    real  v;
- -
- -    /* Flat-bottomed position restraints always require full pbc */
- -    set_pbc(&pbc, ir->ePBC, box);
- -    v = fbposres(top->idef.il[F_FBPOSRES].nr, top->idef.il[F_FBPOSRES].iatoms,
- -                 top->idef.iparams_fbposres,
- -                 (const rvec*)x, fr->f_novirsum, fr->vir_diag_posres,
- -                 ir->ePBC == epbcNONE ? NULL : &pbc,
- -                 fr->rc_scaling, fr->ePBC, fr->posres_com);
- -    enerd->term[F_FBPOSRES] += v;
- -    inc_nrnb(nrnb, eNR_FBPOSRES, top->idef.il[F_FBPOSRES].nr/2);
- -}
- -
- -static void pull_potential_wrapper(FILE *fplog,
- -                                   gmx_bool bSepDVDL,
- -                                   t_commrec *cr,
+ +static void pull_potential_wrapper(t_commrec *cr,
                                      t_inputrec *ir,
                                      matrix box, rvec x[],
                                      rvec f[],
@@@ -332,18 -411,24 +332,18 @@@
       set_pbc(&pbc, ir->ePBC, box);
       dvdl                     = 0;
       enerd->term[F_COM_PULL] +=
- -        pull_potential(ir->ePull, ir->pull, mdatoms, &pbc,
+ +        pull_potential(ir->pull_work, mdatoms, &pbc,
                          cr, t, lambda[efptRESTRAINT], x, f, vir_force, &dvdl);
- -    if (bSepDVDL)
- -    {
- -        gmx_print_sepdvdl(fplog, "Com pull", enerd->term[F_COM_PULL], dvdl);
- -    }
       enerd->dvdl_lin[efptRESTRAINT] += dvdl;
       wallcycle_stop(wcycle, ewcPULLPOT);
   }
   
- -static void pme_receive_force_ener(FILE           *fplog,
- -                                   gmx_bool        bSepDVDL,
- -                                   t_commrec      *cr,
+ +static void pme_receive_force_ener(t_commrec      *cr,
                                      gmx_wallcycle_t wcycle,
                                      gmx_enerdata_t *enerd,
                                      t_forcerec     *fr)
   {
- -    real   e_q, e_lj, v, dvdl_q, dvdl_lj;
+ +    real   e_q, e_lj, dvdl_q, dvdl_lj;
       float  cycles_ppdpme, cycles_seppme;
   
       cycles_ppdpme = wallcycle_stop(wcycle, ewcPPDURINGPME);
@@@ -358,6 -443,11 +358,6 @@@
       gmx_pme_receive_f(cr, fr->f_novirsum, fr->vir_el_recip, &e_q,
                         fr->vir_lj_recip, &e_lj, &dvdl_q, &dvdl_lj,
                         &cycles_seppme);
- -    if (bSepDVDL)
- -    {
- -        gmx_print_sepdvdl(fplog, "Electrostatic PME mesh", e_q, dvdl_q);
- -        gmx_print_sepdvdl(fplog, "Lennard-Jones PME mesh", e_lj, dvdl_lj);
- -    }
       enerd->term[F_COUL_RECIP] += e_q;
       enerd->term[F_LJ_RECIP]   += e_lj;
       enerd->dvdl_lin[efptCOUL] += dvdl_q;
@@@ -461,9 -551,10 +461,9 @@@ static void do_nb_verlet(t_forcerec *fr
                            t_nrnb *nrnb,
                            gmx_wallcycle_t wcycle)
   {
- -    int                        nnbl, kernel_type, enr_nbnxn_kernel_ljc, enr_nbnxn_kernel_lj;
- -    char                      *env;
+ +    int                        enr_nbnxn_kernel_ljc, enr_nbnxn_kernel_lj;
       nonbonded_verlet_group_t  *nbvg;
- -    gmx_bool                   bCUDA;
+ +    gmx_bool                   bUsingGpuKernels;
   
       if (!(flags & GMX_FORCE_NONBONDED))
       {
@@@ -473,15 -564,15 +473,15 @@@
   
       nbvg = &fr->nbv->grp[ilocality];
   
- -    /* CUDA kernel launch overhead is already timed separately */
+ +    /* GPU kernel launch overhead is already timed separately */
       if (fr->cutoff_scheme != ecutsVERLET)
       {
           gmx_incons("Invalid cut-off scheme passed!");
       }
   
- -    bCUDA = (nbvg->kernel_type == nbnxnk8x8x8_CUDA);
+ +    bUsingGpuKernels = (nbvg->kernel_type == nbnxnk8x8x8_GPU);
   
- -    if (!bCUDA)
+ +    if (!bUsingGpuKernels)
       {
           wallcycle_sub_start(wcycle, ewcsNONBONDED);
       }
@@@ -527,8 -618,8 +527,8 @@@
                                      enerd->grpp.ener[egLJSR]);
               break;
   
- -        case nbnxnk8x8x8_CUDA:
- -            nbnxn_cuda_launch_kernel(fr->nbv->cu_nbv, nbvg->nbat, flags, ilocality);
+ +        case nbnxnk8x8x8_GPU:
+ +            nbnxn_gpu_launch_kernel(fr->nbv->gpu_nbv, nbvg->nbat, flags, ilocality);
               break;
   
           case nbnxnk8x8x8_PlainC:
@@@ -549,7 -640,7 +549,7 @@@
               gmx_incons("Invalid nonbonded kernel type passed!");
   
       }
- -    if (!bCUDA)
+ +    if (!bUsingGpuKernels)
       {
           wallcycle_sub_stop(wcycle, ewcsNONBONDED);
       }
@@@ -558,8 -649,8 +558,8 @@@
       {
           enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_RF;
       }
- -    else if ((!bCUDA && nbvg->ewald_excl == ewaldexclAnalytical) ||
- -             (bCUDA && nbnxn_cuda_is_kernel_ewald_analytical(fr->nbv->cu_nbv)))
+ +    else if ((!bUsingGpuKernels && nbvg->ewald_excl == ewaldexclAnalytical) ||
+ +             (bUsingGpuKernels && nbnxn_gpu_is_kernel_ewald_analytical(fr->nbv->gpu_nbv)))
       {
           enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_EWALD;
       }
@@@ -711,11 -802,6 +711,11 @@@ static void do_nb_verlet_fep(nbnxn_pair
       wallcycle_sub_stop(wcycle, ewcsNONBONDED);
   }
   
+ +gmx_bool use_GPU(const nonbonded_verlet_t *nbv)
+ +{
+ +    return nbv != NULL && nbv->bUseGPU;
+ +}
+ +
   void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
                            t_inputrec *inputrec,
                            gmx_int64_t step, t_nrnb *nrnb, gmx_wallcycle_t wcycle,
@@@ -733,25 -819,32 +733,25 @@@
                            gmx_bool bBornRadii,
                            int flags)
   {
- -    int                 cg0, cg1, i, j;
+ +    int                 cg1, i, j;
       int                 start, homenr;
- -    int                 nb_kernel_type;
       double              mu[2*DIM];
- -    gmx_bool            bSepDVDL, bStateChanged, bNS, bFillGrid, bCalcCGCM, bBS;
+ +    gmx_bool            bStateChanged, bNS, bFillGrid, bCalcCGCM;
       gmx_bool            bDoLongRange, bDoForces, bSepLRF, bUseGPU, bUseOrEmulGPU;
       gmx_bool            bDiffKernels = FALSE;
- -    matrix              boxs;
       rvec                vzero, box_diag;
- -    real                e, v, dvdl;
       float               cycles_pme, cycles_force, cycles_wait_gpu;
       nonbonded_verlet_t *nbv;
   
       cycles_force    = 0;
       cycles_wait_gpu = 0;
       nbv             = fr->nbv;
- -    nb_kernel_type  = fr->nbv->grp[0].kernel_type;
   
       start  = 0;
       homenr = mdatoms->homenr;
   
- -    bSepDVDL = (fr->bSepDVDL && do_per_step(step, inputrec->nstlog));
- -
       clear_mat(vir_force);
   
- -    cg0 = 0;
       if (DOMAINDECOMP(cr))
       {
           cg1 = cr->dd->ncg_tot;
@@@ -817,9 -910,6 +817,9 @@@
   #ifdef GMX_MPI
       if (!(cr->duty & DUTY_PME))
       {
+ +        gmx_bool bBS;
+ +        matrix   boxs;
+ +
           /* Send particle coordinates to the pme nodes.
            * Since this is only implemented for domain decomposition
            * and domain decomposition does not use the graph,
@@@ -914,12 -1004,12 +914,12 @@@
           if (bNS)
           {
               wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU_NB);
- -            nbnxn_cuda_init_atomdata(nbv->cu_nbv, nbv->grp[eintLocal].nbat);
+ +            nbnxn_gpu_init_atomdata(nbv->gpu_nbv, nbv->grp[eintLocal].nbat);
               wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
           }
   
           wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU_NB);
- -        nbnxn_cuda_upload_shiftvec(nbv->cu_nbv, nbv->grp[eintLocal].nbat);
+ +        nbnxn_gpu_upload_shiftvec(nbv->gpu_nbv, nbv->grp[eintLocal].nbat);
           wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
       }
   
@@@ -941,9 -1031,9 +941,9 @@@
           if (bUseGPU)
           {
               /* initialize local pair-list on the GPU */
- -            nbnxn_cuda_init_pairlist(nbv->cu_nbv,
- -                                     nbv->grp[eintLocal].nbl_lists.nbl[0],
- -                                     eintLocal);
+ +            nbnxn_gpu_init_pairlist(nbv->gpu_nbv,
+ +                                    nbv->grp[eintLocal].nbl_lists.nbl[0],
+ +                                    eintLocal);
           }
           wallcycle_stop(wcycle, ewcNS);
       }
@@@ -1009,12 -1099,12 +1009,12 @@@
   
               wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_NONLOCAL);
   
- -            if (nbv->grp[eintNonlocal].kernel_type == nbnxnk8x8x8_CUDA)
+ +            if (nbv->grp[eintNonlocal].kernel_type == nbnxnk8x8x8_GPU)
               {
                   /* initialize non-local pair-list on the GPU */
- -                nbnxn_cuda_init_pairlist(nbv->cu_nbv,
- -                                         nbv->grp[eintNonlocal].nbl_lists.nbl[0],
- -                                         eintNonlocal);
+ +                nbnxn_gpu_init_pairlist(nbv->gpu_nbv,
+ +                                        nbv->grp[eintNonlocal].nbl_lists.nbl[0],
+ +                                        eintNonlocal);
               }
               wallcycle_stop(wcycle, ewcNS);
           }
@@@ -1022,12 -1112,6 +1022,6 @@@
           {
               wallcycle_start(wcycle, ewcMOVEX);
               dd_move_x(cr->dd, box, x);
- 
-             /* When we don't need the total dipole we sum it in global_stat */
-             if (bStateChanged && NEED_MUTOT(*inputrec))
-             {
-                 gmx_sumd(2*DIM, mu, cr);
-             }
               wallcycle_stop(wcycle, ewcMOVEX);
   
               wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
@@@ -1054,11 -1138,11 +1048,11 @@@
           wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU_NB);
           if (DOMAINDECOMP(cr) && !bDiffKernels)
           {
- -            nbnxn_cuda_launch_cpyback(nbv->cu_nbv, nbv->grp[eintNonlocal].nbat,
- -                                      flags, eatNonlocal);
+ +            nbnxn_gpu_launch_cpyback(nbv->gpu_nbv, nbv->grp[eintNonlocal].nbat,
+ +                                     flags, eatNonlocal);
           }
- -        nbnxn_cuda_launch_cpyback(nbv->cu_nbv, nbv->grp[eintLocal].nbat,
- -                                  flags, eatLocal);
+ +        nbnxn_gpu_launch_cpyback(nbv->gpu_nbv, nbv->grp[eintLocal].nbat,
+ +                                 flags, eatLocal);
           cycles_force += wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
       }
   
@@@ -1112,7 -1196,7 +1106,7 @@@
       }
   
       /* Start the force cycle counter.
- -     * This counter is stopped in do_forcelow_level.
+ +     * This counter is stopped after do_force_lowlevel.
        * No parallel communication should occur while this counter is running,
        * since that will interfere with the dynamic load balancing.
        */
@@@ -1155,17 -1239,16 +1149,17 @@@
           clear_rvec(fr->vir_diag_posres);
       }
   
- -    if (inputrec->ePull == epullCONSTRAINT)
+ +    if (inputrec->bPull && pull_have_constraint(inputrec->pull_work))
       {
- -        clear_pull_forces(inputrec->pull);
+ +        clear_pull_forces(inputrec->pull_work);
       }
   
       /* We calculate the non-bonded forces, when done on the CPU, here.
- -     * We do this before calling do_force_lowlevel, as in there bondeds
- -     * forces are calculated before PME, which does communication.
- -     * With this order, non-bonded and bonded force calculation imbalance
- -     * can be balanced out by the domain decomposition load balancing.
+ +     * We do this before calling do_force_lowlevel, because in that
+ +     * function, the listed forces are calculated before PME, which
+ +     * does communication.  With this order, non-bonded and listed
+ +     * force calculation imbalance can be balanced out by the domain
+ +     * decomposition load balancing.
        */
   
       if (!bUseOrEmulGPU)
@@@ -1219,7 -1302,7 +1213,7 @@@
           }
   
           /* Add all the non-bonded force to the normal force array.
- -         * This can be split into a local a non-local part when overlapping
+ +         * This can be split into a local and a non-local part when overlapping
            * communication with calculation with domain decomposition.
            */
           cycles_force += wallcycle_stop(wcycle, ewcFORCE);
@@@ -1234,8 -1317,6 +1228,8 @@@
           if ((flags & GMX_FORCE_VIRIAL) &&
               nbv->grp[aloc].nbl_lists.nnbl > 1)
           {
+ +            /* This is not in a subcounter because it takes a
+ +               negligible and constant-sized amount of time */
               nbnxn_atomdata_add_nbat_fshift_to_fshift(nbv->grp[aloc].nbat,
                                                        fr->fshift);
           }
@@@ -1247,11 -1328,22 +1241,11 @@@
           update_QMMMrec(cr, fr, x, mdatoms, box, top);
       }
   
- -    if ((flags & GMX_FORCE_BONDED) && top->idef.il[F_POSRES].nr > 0)
- -    {
- -        posres_wrapper(fplog, flags, bSepDVDL, inputrec, nrnb, top, box, x,
- -                       enerd, lambda, fr);
- -    }
- -
- -    if ((flags & GMX_FORCE_BONDED) && top->idef.il[F_FBPOSRES].nr > 0)
- -    {
- -        fbposres_wrapper(inputrec, nrnb, top, box, x, enerd, fr);
- -    }
- -
       /* Compute the bonded and non-bonded energies and optionally forces */
- -    do_force_lowlevel(fplog, step, fr, inputrec, &(top->idef),
+ +    do_force_lowlevel(fr, inputrec, &(top->idef),
                         cr, nrnb, wcycle, mdatoms,
                         x, hist, f, bSepLRF ? fr->f_twin : f, enerd, fcd, top, fr->born,
- -                      &(top->atomtypes), bBornRadii, box,
+ +                      bBornRadii, box,
                         inputrec->fepvals, lambda, graph, &(top->excls), fr->mu_tot,
                         flags, &cycles_pme);
   
@@@ -1284,11 -1376,11 +1278,11 @@@
                   float cycles_tmp;
   
                   wallcycle_start(wcycle, ewcWAIT_GPU_NB_NL);
- -                nbnxn_cuda_wait_gpu(nbv->cu_nbv,
- -                                    nbv->grp[eintNonlocal].nbat,
- -                                    flags, eatNonlocal,
- -                                    enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
- -                                    fr->fshift);
+ +                nbnxn_gpu_wait_for_gpu(nbv->gpu_nbv,
+ +                                       nbv->grp[eintNonlocal].nbat,
+ +                                       flags, eatNonlocal,
+ +                                       enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
+ +                                       fr->fshift);
                   cycles_tmp       = wallcycle_stop(wcycle, ewcWAIT_GPU_NB_NL);
                   cycles_wait_gpu += cycles_tmp;
                   cycles_force    += cycles_tmp;
@@@ -1343,16 -1435,15 +1337,16 @@@
           /* wait for local forces (or calculate in emulation mode) */
           if (bUseGPU)
           {
+ +#if defined(GMX_GPU) && !defined(GMX_USE_OPENCL)
               float       cycles_tmp, cycles_wait_est;
               const float cuda_api_overhead_margin = 50000.0f; /* cycles */
   
               wallcycle_start(wcycle, ewcWAIT_GPU_NB_L);
- -            nbnxn_cuda_wait_gpu(nbv->cu_nbv,
- -                                nbv->grp[eintLocal].nbat,
- -                                flags, eatLocal,
- -                                enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
- -                                fr->fshift);
+ +            nbnxn_gpu_wait_for_gpu(nbv->gpu_nbv,
+ +                                   nbv->grp[eintLocal].nbat,
+ +                                   flags, eatLocal,
+ +                                   enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
+ +                                   fr->fshift);
               cycles_tmp      = wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
   
               if (bDoForces && DOMAINDECOMP(cr))
@@@ -1383,20 -1474,10 +1377,20 @@@
               cycles_force    += cycles_wait_est;
               cycles_wait_gpu += cycles_wait_est;
   
- -            /* now clear the GPU outputs while we finish the step on the CPU */
+ +#elif defined(GMX_GPU) && defined(GMX_USE_OPENCL)
+ +
+ +            wallcycle_start(wcycle, ewcWAIT_GPU_NB_L);
+ +            nbnxn_gpu_wait_for_gpu(nbv->gpu_nbv,
+ +                                   nbv->grp[eintLocal].nbat,
+ +                                   flags, eatLocal,
+ +                                   enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
+ +                                   fr->fshift);
+ +            cycles_wait_gpu += wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
+ +#endif
   
+ +            /* now clear the GPU outputs while we finish the step on the CPU */
               wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU_NB);
- -            nbnxn_cuda_clear_outputs(nbv->cu_nbv, flags);
+ +            nbnxn_gpu_clear_outputs(nbv->gpu_nbv, flags);
               wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
           }
           else
@@@ -1466,12 -1547,12 +1460,12 @@@
           }
       }
   
- -    if (inputrec->ePull == epullUMBRELLA || inputrec->ePull == epullCONST_F)
+ +    if (inputrec->bPull && pull_have_potential(inputrec->pull_work))
       {
           /* Since the COM pulling is always done mass-weighted, no forces are
            * applied to vsites and this call can be done after vsite spreading.
            */
- -        pull_potential_wrapper(fplog, bSepDVDL, cr, inputrec, box, x,
+ +        pull_potential_wrapper(cr, inputrec, box, x,
                                  f, vir_force, mdatoms, enerd, lambda, t,
                                  wcycle);
       }
@@@ -1492,7 -1573,7 +1486,7 @@@
           /* In case of node-splitting, the PP nodes receive the long-range
            * forces, virial and energy from the PME nodes here.
            */
- -        pme_receive_force_ener(fplog, bSepDVDL, cr, wcycle, enerd, fr);
+ +        pme_receive_force_ener(cr, wcycle, enerd, fr);
       }
   
       if (bDoForces)
@@@ -1525,15 -1606,20 +1519,15 @@@ void do_force_cutsGROUP(FILE *fplog, t_
       int        cg0, cg1, i, j;
       int        start, homenr;
       double     mu[2*DIM];
- -    gmx_bool   bSepDVDL, bStateChanged, bNS, bFillGrid, bCalcCGCM, bBS;
- -    gmx_bool   bDoLongRangeNS, bDoForces, bDoPotential, bSepLRF;
+ +    gmx_bool   bStateChanged, bNS, bFillGrid, bCalcCGCM;
+ +    gmx_bool   bDoLongRangeNS, bDoForces, bSepLRF;
       gmx_bool   bDoAdressWF;
- -    matrix     boxs;
- -    rvec       vzero, box_diag;
- -    real       e, v, dvdlambda[efptNR];
       t_pbc      pbc;
       float      cycles_pme, cycles_force;
   
       start  = 0;
       homenr = mdatoms->homenr;
   
- -    bSepDVDL = (fr->bSepDVDL && do_per_step(step, inputrec->nstlog));
- -
       clear_mat(vir_force);
   
       cg0 = 0;
@@@ -1558,6 -1644,7 +1552,6 @@@
       bFillGrid      = (bNS && bStateChanged);
       bCalcCGCM      = (bFillGrid && !DOMAINDECOMP(cr));
       bDoForces      = (flags & GMX_FORCE_FORCES);
- -    bDoPotential   = (flags & GMX_FORCE_ENERGY);
       bSepLRF        = ((inputrec->nstcalclr > 1) && bDoForces &&
                         (flags & GMX_FORCE_SEPLRF) && (flags & GMX_FORCE_DO_LR));
   
@@@ -1615,9 -1702,6 +1609,9 @@@
   #ifdef GMX_MPI
       if (!(cr->duty & DUTY_PME))
       {
+ +        gmx_bool bBS;
+ +        matrix   boxs;
+ +
           /* Send particle coordinates to the pme nodes.
            * Since this is only implemented for domain decomposition
            * and domain decomposition does not use the graph,
@@@ -1766,7 -1850,7 +1760,7 @@@
       }
   
       /* Start the force cycle counter.
- -     * This counter is stopped in do_forcelow_level.
+ +     * This counter is stopped after do_force_lowlevel.
        * No parallel communication should occur while this counter is running,
        * since that will interfere with the dynamic load balancing.
        */
@@@ -1809,9 -1893,9 +1803,9 @@@
   
           clear_rvec(fr->vir_diag_posres);
       }
- -    if (inputrec->ePull == epullCONSTRAINT)
+ +    if (inputrec->bPull && pull_have_constraint(inputrec->pull_work))
       {
- -        clear_pull_forces(inputrec->pull);
+ +        clear_pull_forces(inputrec->pull_work);
       }
   
       /* update QMMMrec, if necessary */
@@@ -1820,11 -1904,22 +1814,11 @@@
           update_QMMMrec(cr, fr, x, mdatoms, box, top);
       }
   
- -    if ((flags & GMX_FORCE_BONDED) && top->idef.il[F_POSRES].nr > 0)
- -    {
- -        posres_wrapper(fplog, flags, bSepDVDL, inputrec, nrnb, top, box, x,
- -                       enerd, lambda, fr);
- -    }
- -
- -    if ((flags & GMX_FORCE_BONDED) && top->idef.il[F_FBPOSRES].nr > 0)
- -    {
- -        fbposres_wrapper(inputrec, nrnb, top, box, x, enerd, fr);
- -    }
- -
       /* Compute the bonded and non-bonded energies and optionally forces */
- -    do_force_lowlevel(fplog, step, fr, inputrec, &(top->idef),
+ +    do_force_lowlevel(fr, inputrec, &(top->idef),
                         cr, nrnb, wcycle, mdatoms,
                         x, hist, f, bSepLRF ? fr->f_twin : f, enerd, fcd, top, fr->born,
- -                      &(top->atomtypes), bBornRadii, box,
+ +                      bBornRadii, box,
                         inputrec->fepvals, lambda,
                         graph, &(top->excls), fr->mu_tot,
                         flags,
@@@ -1930,9 -2025,9 +1924,9 @@@
           }
       }
   
- -    if (inputrec->ePull == epullUMBRELLA || inputrec->ePull == epullCONST_F)
+ +    if (inputrec->bPull && pull_have_potential(inputrec->pull_work))
       {
- -        pull_potential_wrapper(fplog, bSepDVDL, cr, inputrec, box, x,
+ +        pull_potential_wrapper(cr, inputrec, box, x,
                                  f, vir_force, mdatoms, enerd, lambda, t,
                                  wcycle);
       }
@@@ -1953,7 -2048,7 +1947,7 @@@
           /* In case of node-splitting, the PP nodes receive the long-range
            * forces, virial and energy from the PME nodes here.
            */
- -        pme_receive_force_ener(fplog, bSepDVDL, cr, wcycle, enerd, fr);
+ +        pme_receive_force_ener(cr, wcycle, enerd, fr);
       }
   
       if (bDoForces)
@@@ -2062,21 -2157,24 +2056,21 @@@ void do_constrain_first(FILE *fplog, gm
   
       /* constrain the current position */
       constrain(NULL, TRUE, FALSE, constr, &(top->idef),
- -              ir, NULL, cr, step, 0, 1.0, md,
+ +              ir, cr, step, 0, 1.0, md,
                 state->x, state->x, NULL,
                 fr->bMolPBC, state->box,
                 state->lambda[efptBONDED], &dvdl_dum,
- -              NULL, NULL, nrnb, econqCoord,
- -              ir->epc == epcMTTK, state->veta, state->veta);
+ +              NULL, NULL, nrnb, econqCoord);
       if (EI_VV(ir->eI))
       {
           /* constrain the inital velocity, and save it */
           /* also may be useful if we need the ekin from the halfstep for velocity verlet */
- -        /* might not yet treat veta correctly */
           constrain(NULL, TRUE, FALSE, constr, &(top->idef),
- -                  ir, NULL, cr, step, 0, 1.0, md,
+ +                  ir, cr, step, 0, 1.0, md,
                     state->x, state->v, state->v,
                     fr->bMolPBC, state->box,
                     state->lambda[efptBONDED], &dvdl_dum,
- -                  NULL, NULL, nrnb, econqVeloc,
- -                  ir->epc == epcMTTK, state->veta, state->veta);
+ +                  NULL, NULL, nrnb, econqVeloc);
       }
       /* constrain the inital velocities at t-dt/2 */
       if (EI_STATE_VELOCITY(ir->eI) && ir->eI != eiVV)
@@@ -2102,11 -2200,12 +2096,11 @@@
           }
           dvdl_dum = 0;
           constrain(NULL, TRUE, FALSE, constr, &(top->idef),
- -                  ir, NULL, cr, step, -1, 1.0, md,
+ +                  ir, cr, step, -1, 1.0, md,
                     state->x, savex, NULL,
                     fr->bMolPBC, state->box,
                     state->lambda[efptBONDED], &dvdl_dum,
- -                  state->v, NULL, nrnb, econqCoord,
- -                  ir->epc == epcMTTK, state->veta, state->veta);
+ +                  state->v, NULL, nrnb, econqCoord);
   
           for (i = start; i < end; i++)
           {
@@@ -2129,8 -2228,7 +2123,8 @@@ integrate_table(real vdwtab[], real sca
       double invscale, invscale2, invscale3;
       double r, ea, eb, ec, pa, pb, pc, pd;
       double y0, f, g, h;
- -    int    ri, offset, tabfactor;
+ +    int    ri, offset;
+ +    double tabfactor;
   
       invscale  = 1.0/scale;
       invscale2 = invscale*invscale;
@@@ -2191,10 -2289,11 +2185,10 @@@
   
   void calc_enervirdiff(FILE *fplog, int eDispCorr, t_forcerec *fr)
   {
- -    double   eners[2], virs[2], enersum, virsum, y0, f, g, h;
- -    double   r0, r1, r, rc3, rc9, ea, eb, ec, pa, pb, pc, pd;
- -    double   invscale, invscale2, invscale3;
- -    int      ri0, ri1, ri, i, offstart, offset;
- -    real     scale, *vdwtab, tabfactor, tmp;
+ +    double   eners[2], virs[2], enersum, virsum;
+ +    double   r0, rc3, rc9;
+ +    int      ri0, ri1, i;
+ +    real     scale, *vdwtab;
   
       fr->enershiftsix    = 0;
       fr->enershifttwelve = 0;
@@@ -2229,8 -2328,8 +2223,8 @@@
               vdwtab = fr->nblists[0].table_vdw.data;
   
               /* Round the cut-offs to exact table values for precision */
- -            ri0  = floor(fr->rvdw_switch*scale);
- -            ri1  = ceil(fr->rvdw*scale);
+ +            ri0  = static_cast<int>(floor(fr->rvdw_switch*scale));
+ +            ri1  = static_cast<int>(ceil(fr->rvdw*scale));
   
               /* The code below has some support for handling force-switching, i.e.
                * when the force (instead of potential) is switched over a limited
@@@ -2251,6 -2350,7 +2245,6 @@@
               ri0  = (fr->vdw_modifier == eintmodPOTSHIFT) ? ri1 : ri0;
   
               r0   = ri0/scale;
- -            r1   = ri1/scale;
               rc3  = r0*r0*r0;
               rc9  = rc3*rc3*rc3;
   
@@@ -2364,8 -2464,8 +2358,8 @@@
       }
   }
   
- -void calc_dispcorr(FILE *fplog, t_inputrec *ir, t_forcerec *fr,
- -                   gmx_int64_t step, int natoms,
+ +void calc_dispcorr(t_inputrec *ir, t_forcerec *fr,
+ +                   int natoms,
                      matrix box, real lambda, tensor pres, tensor virial,
                      real *prescorr, real *enercorr, real *dvdlcorr)
   {
@@@ -2466,6 -2566,10 +2460,6 @@@
               }
           }
   
- -        if (fr->bSepDVDL && do_per_step(step, ir->nstlog))
- -        {
- -            gmx_print_sepdvdl(fplog, "Dispersion correction", *enercorr, dvdlambda);
- -        }
           if (fr->efep != efepNO)
           {
               *dvdlcorr += dvdlambda;
@@@ -2570,12 -2674,13 +2564,12 @@@ void finish_run(FILE *fplog, t_commrec 
                   t_inputrec *inputrec,
                   t_nrnb nrnb[], gmx_wallcycle_t wcycle,
                   gmx_walltime_accounting_t walltime_accounting,
- -                wallclock_gpu_t *gputimes,
+ +                nonbonded_verlet_t *nbv,
                   gmx_bool bWriteStat)
   {
- -    int     i, j;
       t_nrnb *nrnb_tot = NULL;
- -    real    delta_t;
- -    double  nbfs, mflop;
+ +    double  delta_t  = 0;
+ +    double  nbfs     = 0, mflop = 0;
       double  elapsed_time,
               elapsed_time_over_all_ranks,
               elapsed_time_over_all_threads,
@@@ -2633,8 -2738,6 +2627,8 @@@
   
       if (SIMMASTER(cr))
       {
+ +        struct gmx_wallclock_gpu_t* gputimes = use_GPU(nbv) ? nbnxn_gpu_get_timings(nbv->gpu_nbv) : NULL;
+ +
           wallcycle_print(fplog, cr->nnodes, cr->npmenodes,
                           elapsed_time_over_all_ranks,
                           wcycle, gputimes);
@@@ -2643,6 -2746,10 +2637,6 @@@
           {
               delta_t = inputrec->delta_t;
           }
- -        else
- -        {
- -            delta_t = 0;
- -        }
   
           if (fplog)
           {
@@@ -2745,7 -2852,8 +2739,7 @@@ void init_md(FILE *fplog
                gmx_bool *bSimAnn, t_vcm **vcm, unsigned long Flags,
                gmx_wallcycle_t wcycle)
   {
- -    int  i, j, n;
- -    real tmpt, mod;
+ +    int  i;
   
       /* Initial values */
       *t = *t0       = ir->init_t;
@@@ -2793,10 -2901,7 +2787,10 @@@
               please_cite(fplog, "Goga2012");
           }
       }
- -
+ +    if ((ir->et[XX].n > 0) || (ir->et[YY].n > 0) || (ir->et[ZZ].n > 0))
+ +    {
+ +        please_cite(fplog, "Caleman2008a");
+ +    }
       init_nrnb(nrnb);
   
       if (nfile != -1)
author	Mark Abraham <mark.j.abraham@gmail.com>
	Fri, 11 Mar 2016 00:37:02 +0000 (01:37 +0100)
committer	Mark Abraham <mark.j.abraham@gmail.com>
	Fri, 11 Mar 2016 00:37:55 +0000 (01:37 +0100)