#include "nrnb.h"
#include "mshift.h"
#include "mdrun.h"
+#include "sim_util.h"
#include "update.h"
#include "physics.h"
#include "main.h"
#include "partdec.h"
#include "gmx_wallcycle.h"
#include "genborn.h"
+#include "nbnxn_search.h"
+#include "nbnxn_kernels/nbnxn_kernel_ref.h"
+#include "nbnxn_kernels/nbnxn_kernel_x86_simd128.h"
+#include "nbnxn_kernels/nbnxn_kernel_x86_simd256.h"
+#include "nbnxn_kernels/nbnxn_kernel_gpu_ref.h"
#ifdef GMX_LIB_MPI
#include <mpi.h>
#include "adress.h"
#include "qmmm.h"
+#include "nbnxn_cuda_data_mgmt.h"
+#include "nbnxn_cuda/nbnxn_cuda.h"
+
#if 0
typedef struct gmx_timeprint {
fprintf(out,"step %s",gmx_step_str(step,buf));
if ((step >= ir->nstlist))
{
- if ((ir->nstlist == 0) || ((step % ir->nstlist) == 0))
- {
- /* We have done a full cycle let's update time_per_step */
- runtime->last = gmx_gettime();
- dt = difftime(runtime->last,runtime->real);
- runtime->time_per_step = dt/(step - ir->init_step + 1);
- }
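+ /* Update the wall time per step from the elapsed run time */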
+ runtime->last = gmx_gettime();
+ dt = difftime(runtime->last,runtime->real);
+ runtime->time_per_step = dt/(step - ir->init_step + 1);
+
dt = (ir->nsteps + ir->init_step - step)*runtime->time_per_step;
if (ir->nsteps >= 0)
for(i=start; (i<start+homenr); i++)
f[i][m] += charge[i]*Ext[m];
}
- else
+ else
+ {
+ Ext[m] = 0;
+ }
+ }
+ if (fp != NULL)
+ {
+ fprintf(fp,"%10g %10g %10g %10g #FIELD\n",t,
+ Ext[XX]/FIELDFAC,Ext[YY]/FIELDFAC,Ext[ZZ]/FIELDFAC);
+ }
+}
+
+static void calc_virial(FILE *fplog,int start,int homenr,rvec x[],rvec f[],
+ tensor vir_part,t_graph *graph,matrix box,
+ t_nrnb *nrnb,const t_forcerec *fr,int ePBC)
+{
+ int i,j;
+ tensor virtest;
+
+ /* The short-range virial from surrounding boxes */
+ clear_mat(vir_part);
+ calc_vir(fplog,SHIFTS,fr->shift_vec,fr->fshift,vir_part,ePBC==epbcSCREW,box);
+ inc_nrnb(nrnb,eNR_VIRIAL,SHIFTS);
+
+ /* Calculate partial virial, for local atoms only, based on short range.
+ * Total virial is computed in global_stat, called from do_md
+ */
+ f_calc_vir(fplog,start,start+homenr,x,f,vir_part,graph,box);
+ inc_nrnb(nrnb,eNR_VIRIAL,homenr);
+
+ /* Add position restraint contribution */
+ for(i=0; i<DIM; i++) {
+ vir_part[i][i] += fr->vir_diag_posres[i];
+ }
+
+ /* Add wall contribution */
+ for(i=0; i<DIM; i++) {
+ vir_part[i][ZZ] += fr->vir_wall_z[i];
+ }
+
+ if (debug)
+ pr_rvecs(debug,0,"vir_part",vir_part,DIM);
+}
+
+static void posres_wrapper(FILE *fplog,
+ int flags,
+ gmx_bool bSepDVDL,
+ t_inputrec *ir,
+ t_nrnb *nrnb,
+ gmx_localtop_t *top,
+ matrix box,rvec x[],
+ rvec f[],
+ gmx_enerdata_t *enerd,
+ real *lambda,
+ t_forcerec *fr)
+{
+ t_pbc pbc;
+ real v,dvdl;
+ int i;
+
+ /* Position restraints always require full pbc */
+ set_pbc(&pbc,ir->ePBC,box);
+ dvdl = 0;
+ v = posres(top->idef.il[F_POSRES].nr,top->idef.il[F_POSRES].iatoms,
+ top->idef.iparams_posres,
+ (const rvec*)x,fr->f_novirsum,fr->vir_diag_posres,
+ ir->ePBC==epbcNONE ? NULL : &pbc,
+ lambda[efptRESTRAINT],&dvdl,
+ fr->rc_scaling,fr->ePBC,fr->posres_com,fr->posres_comB);
+ if (bSepDVDL)
+ {
+ fprintf(fplog,sepdvdlformat,
+ interaction_function[F_POSRES].longname,v,dvdl);
+ }
+ enerd->term[F_POSRES] += v;
+ /* If just the force constant k changes, the FEP term is linear,
+ * but if the reference position changes, it is not.
+ */
+ enerd->dvdl_nonlin[efptRESTRAINT] += dvdl;
+ inc_nrnb(nrnb,eNR_POSRES,top->idef.il[F_POSRES].nr/2);
+
+ if ((ir->fepvals->n_lambda > 0) && (flags & GMX_FORCE_DHDL))
+ {
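+ /* Loop over the foreign lambda values and accumulate the position
+ * restraint energies needed for dH/dl and BAR output.
+ */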
+ for(i=0; i<enerd->n_lambda; i++)
+ {
+ real dvdl_dum,lambda_dum;
+
+ lambda_dum = (i==0 ? lambda[efptRESTRAINT] : ir->fepvals->all_lambda[efptRESTRAINT][i-1]);
+ v = posres(top->idef.il[F_POSRES].nr,top->idef.il[F_POSRES].iatoms,
+ top->idef.iparams_posres,
+ (const rvec*)x,NULL,NULL,
+ ir->ePBC==epbcNONE ? NULL : &pbc,lambda_dum,&dvdl,
+ fr->rc_scaling,fr->ePBC,fr->posres_com,fr->posres_comB);
+ enerd->enerpart_lambda[i] += v;
+ }
+ }
+}
+
+static void pull_potential_wrapper(FILE *fplog,
+ gmx_bool bSepDVDL,
+ t_commrec *cr,
+ t_inputrec *ir,
+ matrix box,rvec x[],
+ rvec f[],
+ tensor vir_force,
+ t_mdatoms *mdatoms,
+ gmx_enerdata_t *enerd,
+ real *lambda,
+ double t)
+{
+ t_pbc pbc;
+ real dvdl;
+
+ /* Calculate the center of mass forces, this requires communication,
+ * which is why pull_potential is called close to other communication.
+ * The virial contribution is calculated directly,
+ * which is why we call pull_potential after calc_virial.
+ */
+ set_pbc(&pbc,ir->ePBC,box);
+ dvdl = 0;
+ enerd->term[F_COM_PULL] +=
+ pull_potential(ir->ePull,ir->pull,mdatoms,&pbc,
+ cr,t,lambda[efptRESTRAINT],x,f,vir_force,&dvdl);
+ if (bSepDVDL)
+ {
+ fprintf(fplog,sepdvdlformat,"Com pull",enerd->term[F_COM_PULL],dvdl);
+ }
+ enerd->dvdl_lin[efptRESTRAINT] += dvdl;
+}
+
+static void pme_receive_force_ener(FILE *fplog,
+ gmx_bool bSepDVDL,
+ t_commrec *cr,
+ gmx_wallcycle_t wcycle,
+ gmx_enerdata_t *enerd,
+ t_forcerec *fr)
+{
+ real e,v,dvdl;
+ float cycles_ppdpme,cycles_seppme;
+
+ cycles_ppdpme = wallcycle_stop(wcycle,ewcPPDURINGPME);
+ dd_cycles_add(cr->dd,cycles_ppdpme,ddCyclPPduringPME);
+
+ /* In case of node-splitting, the PP nodes receive the long-range
+ * forces, virial and energy from the PME nodes here.
+ */
+ wallcycle_start(wcycle,ewcPP_PMEWAITRECVF);
+ dvdl = 0;
+ gmx_pme_receive_f(cr,fr->f_novirsum,fr->vir_el_recip,&e,&dvdl,
+ &cycles_seppme);
+ if (bSepDVDL)
+ {
+ fprintf(fplog,sepdvdlformat,"PME mesh",e,dvdl);
+ }
+ enerd->term[F_COUL_RECIP] += e;
+ enerd->dvdl_lin[efptCOUL] += dvdl;
+ if (wcycle)
+ {
+ dd_cycles_add(cr->dd,cycles_seppme,ddCyclPME);
+ }
+ wallcycle_stop(wcycle,ewcPP_PMEWAITRECVF);
+}
+
+static void print_large_forces(FILE *fp,t_mdatoms *md,t_commrec *cr,
+ gmx_large_int_t step,real pforce,rvec *x,rvec *f)
+{
+ int i;
+ real pf2,fn2;
+ char buf[STEPSTRSIZE];
+
+ pf2 = sqr(pforce);
+ for(i=md->start; i<md->start+md->homenr; i++) {
+ fn2 = norm2(f[i]);
+ /* We also catch NAN, if the compiler does not optimize this away. */
+ if (fn2 >= pf2 || fn2 != fn2) {
+ fprintf(fp,"step %s atom %6d x %8.3f %8.3f %8.3f force %12.5e\n",
+ gmx_step_str(step,buf),
+ ddglatnr(cr->dd,i),x[i][XX],x[i][YY],x[i][ZZ],sqrt(fn2));
+ }
+ }
+}
+
+static void post_process_forces(FILE *fplog,
+ t_commrec *cr,
+ gmx_large_int_t step,
+ t_nrnb *nrnb,gmx_wallcycle_t wcycle,
+ gmx_localtop_t *top,
+ matrix box,rvec x[],
+ rvec f[],
+ tensor vir_force,
+ t_mdatoms *mdatoms,
+ t_graph *graph,
+ t_forcerec *fr,gmx_vsite_t *vsite,
+ int flags)
+{
+ if (fr->bF_NoVirSum)
+ {
+ if (vsite)
+ {
+ /* Spread the mesh force on virtual sites to the other particles...
+ * This is parallelized. MPI communication is performed
+ * if the constructing atoms aren't local.
+ */
+ wallcycle_start(wcycle,ewcVSITESPREAD);
+ spread_vsite_f(fplog,vsite,x,fr->f_novirsum,NULL,
+ (flags & GMX_FORCE_VIRIAL),fr->vir_el_recip,
+ nrnb,
+ &top->idef,fr->ePBC,fr->bMolPBC,graph,box,cr);
+ wallcycle_stop(wcycle,ewcVSITESPREAD);
+ }
+ if (flags & GMX_FORCE_VIRIAL)
+ {
+ /* Now add the forces, this is local */
+ if (fr->bDomDec)
+ {
+ sum_forces(0,fr->f_novirsum_n,f,fr->f_novirsum);
+ }
+ else
+ {
+ sum_forces(mdatoms->start,mdatoms->start+mdatoms->homenr,
+ f,fr->f_novirsum);
+ }
+ if (EEL_FULL(fr->eeltype))
+ {
+ /* Add the mesh contribution to the virial */
+ m_add(vir_force,fr->vir_el_recip,vir_force);
+ }
+ if (debug)
+ {
+ pr_rvecs(debug,0,"vir_force",vir_force,DIM);
+ }
+ }
+ }
+
+ if (fr->print_force >= 0)
+ {
+ print_large_forces(stderr,mdatoms,cr,step,fr->print_force,x,f);
+ }
+}
+
+static void do_nb_verlet(t_forcerec *fr,
+ interaction_const_t *ic,
+ gmx_enerdata_t *enerd,
+ int flags, int ilocality,
+ int clearF,
+ t_nrnb *nrnb,
+ gmx_wallcycle_t wcycle)
+{
+ int sh_e;
+ nonbonded_verlet_group_t *nbvg;
+
+ if (!(flags & GMX_FORCE_NONBONDED))
+ {
+ /* skip non-bonded calculation */
+ return;
+ }
+
+ nbvg = &fr->nbv->grp[ilocality];
+
+ if (fr->cutoff_scheme != ecutsVERLET)
+ {
+ gmx_incons("Invalid cut-off scheme passed!");
+ }
+
+ /* CUDA kernel launch overhead is timed separately on the GPU side,
+ * so the CPU non-bonded cycle counter is only used for CPU kernels.
+ */
+ if (nbvg->kernel_type != nbk8x8x8_CUDA)
+ {
+ wallcycle_sub_start(wcycle, ewcsNONBONDED);
+ }
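+ /* Dispatch the non-bonded kernel matching this group's pair-list layout */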
+ switch (nbvg->kernel_type)
+ {
+ case nbk4x4_PlainC:
+ nbnxn_kernel_ref(&nbvg->nbl_lists,
+ nbvg->nbat, ic,
+ fr->shift_vec,
+ flags,
+ clearF,
+ fr->fshift[0],
+ enerd->grpp.ener[egCOULSR],
+ fr->bBHAM ?
+ enerd->grpp.ener[egBHAMSR] :
+ enerd->grpp.ener[egLJSR]);
+ break;
+
+ case nbk4xN_X86_SIMD128:
+ nbnxn_kernel_x86_simd128(&nbvg->nbl_lists,
+ nbvg->nbat, ic,
+ fr->shift_vec,
+ flags,
+ clearF,
+ fr->fshift[0],
+ enerd->grpp.ener[egCOULSR],
+ fr->bBHAM ?
+ enerd->grpp.ener[egBHAMSR] :
+ enerd->grpp.ener[egLJSR]);
+ break;
+ case nbk4xN_X86_SIMD256:
+ nbnxn_kernel_x86_simd256(&nbvg->nbl_lists,
+ nbvg->nbat, ic,
+ fr->shift_vec,
+ flags,
+ clearF,
+ fr->fshift[0],
+ enerd->grpp.ener[egCOULSR],
+ fr->bBHAM ?
+ enerd->grpp.ener[egBHAMSR] :
+ enerd->grpp.ener[egLJSR]);
+ break;
+
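+ /* The CUDA kernel is launched asynchronously here; its output is
+ * collected later in nbnxn_cuda_wait_gpu().
+ */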
+ case nbk8x8x8_CUDA:
+ nbnxn_cuda_launch_kernel(fr->nbv->cu_nbv, nbvg->nbat, flags, ilocality);
+ break;
+
+ case nbk8x8x8_PlainC:
+ nbnxn_kernel_gpu_ref(nbvg->nbl_lists.nbl[0],
+ nbvg->nbat, ic,
+ fr->shift_vec,
+ flags,
+ clearF,
+ nbvg->nbat->out[0].f,
+ fr->fshift[0],
+ enerd->grpp.ener[egCOULSR],
+ fr->bBHAM ?
+ enerd->grpp.ener[egBHAMSR] :
+ enerd->grpp.ener[egLJSR]);
+ break;
+
+ default:
+ gmx_incons("Invalid nonbonded kernel type passed!");
+
+ }
+ if (nbvg->kernel_type != nbk8x8x8_CUDA)
+ {
+ wallcycle_sub_stop(wcycle, ewcsNONBONDED);
+ }
+
+ /* In eNR_??? the nbnxn F+E kernels are always the F kernel + 1 */
+ sh_e = ((flags & GMX_FORCE_ENERGY) ? 1 : 0);
+ inc_nrnb(nrnb,
+ ((EEL_RF(ic->eeltype) || ic->eeltype == eelCUT) ?
+ eNR_NBNXN_LJ_RF : eNR_NBNXN_LJ_TAB) + sh_e,
+ nbvg->nbl_lists.natpair_ljq);
+ inc_nrnb(nrnb,eNR_NBNXN_LJ+sh_e,nbvg->nbl_lists.natpair_lj);
+ inc_nrnb(nrnb,
+ ((EEL_RF(ic->eeltype) || ic->eeltype == eelCUT) ?
+ eNR_NBNXN_RF : eNR_NBNXN_TAB)+sh_e,
+ nbvg->nbl_lists.natpair_q);
+}
+
+void do_force_cutsVERLET(FILE *fplog,t_commrec *cr,
+ t_inputrec *inputrec,
+ gmx_large_int_t step,t_nrnb *nrnb,gmx_wallcycle_t wcycle,
+ gmx_localtop_t *top,
+ gmx_mtop_t *mtop,
+ gmx_groups_t *groups,
+ matrix box,rvec x[],history_t *hist,
+ rvec f[],
+ tensor vir_force,
+ t_mdatoms *mdatoms,
+ gmx_enerdata_t *enerd,t_fcdata *fcd,
+ real *lambda,t_graph *graph,
+ t_forcerec *fr, interaction_const_t *ic,
+ gmx_vsite_t *vsite,rvec mu_tot,
+ double t,FILE *field,gmx_edsam_t ed,
+ gmx_bool bBornRadii,
+ int flags)
+{
+ int cg0,cg1,i,j;
+ int start,homenr;
+ int nb_kernel_type;
+ double mu[2*DIM];
+ gmx_bool bSepDVDL,bStateChanged,bNS,bFillGrid,bCalcCGCM,bBS;
+ gmx_bool bDoLongRange,bDoForces,bSepLRF,bUseGPU,bUseOrEmulGPU;
+ gmx_bool bDiffKernels=FALSE;
+ matrix boxs;
+ rvec vzero,box_diag;
+ real e,v,dvdl;
+ float cycles_pme,cycles_force;
+ nonbonded_verlet_t *nbv;
+
+ cycles_force = 0;
+ nbv = fr->nbv;
+ nb_kernel_type = fr->nbv->grp[0].kernel_type;
+
+ start = mdatoms->start;
+ homenr = mdatoms->homenr;
+
+ bSepDVDL = (fr->bSepDVDL && do_per_step(step,inputrec->nstlog));
+
+ clear_mat(vir_force);
+
+ cg0 = 0;
+ if (DOMAINDECOMP(cr))
+ {
+ cg1 = cr->dd->ncg_tot;
+ }
+ else
+ {
+ cg1 = top->cgs.nr;
+ }
+ if (fr->n_tpi > 0)
+ {
+ cg1--;
+ }
+
+ bStateChanged = (flags & GMX_FORCE_STATECHANGED);
+ bNS = (flags & GMX_FORCE_NS) && (fr->bAllvsAll==FALSE);
+ bFillGrid = (bNS && bStateChanged);
+ bCalcCGCM = (bFillGrid && !DOMAINDECOMP(cr));
+ bDoLongRange = (fr->bTwinRange && bNS && (flags & GMX_FORCE_DOLR));
+ bDoForces = (flags & GMX_FORCE_FORCES);
+ bSepLRF = (bDoLongRange && bDoForces && (flags & GMX_FORCE_SEPLRF));
+ bUseGPU = fr->nbv->bUseGPU;
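+ /* nbk8x8x8_PlainC is the plain-C reference of the GPU kernel layout,
+ * i.e. GPU emulation on the CPU.
+ */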
+ bUseOrEmulGPU = bUseGPU || (nbv->grp[0].kernel_type == nbk8x8x8_PlainC);
+
+ if (bStateChanged)
+ {
+ update_forcerec(fplog,fr,box);
+
+ if (NEED_MUTOT(*inputrec))
+ {
+ /* Calculate total (local) dipole moment in a temporary common array.
+ * This makes it possible to sum them over nodes faster.
+ */
+ calc_mu(start,homenr,
+ x,mdatoms->chargeA,mdatoms->chargeB,mdatoms->nChargePerturbed,
+ mu,mu+DIM);
+ }
+ }
+
+ if (fr->ePBC != epbcNONE) {
+ /* Compute shift vectors every step,
+ * because of pressure coupling or box deformation!
+ */
+ if ((flags & GMX_FORCE_DYNAMICBOX) && bStateChanged)
+ calc_shifts(box,fr->shift_vec);
+
+ if (bCalcCGCM) {
+ put_atoms_in_box_omp(fr->ePBC,box,homenr,x);
+ inc_nrnb(nrnb,eNR_SHIFTX,homenr);
+ }
+ else if (EI_ENERGY_MINIMIZATION(inputrec->eI) && graph) {
+ unshift_self(graph,box,x);
+ }
+ }
+
+ nbnxn_atomdata_copy_shiftvec(flags & GMX_FORCE_DYNAMICBOX,
+ fr->shift_vec,nbv->grp[0].nbat);
+
+#ifdef GMX_MPI
+ if (!(cr->duty & DUTY_PME)) {
+ /* Send particle coordinates to the pme nodes.
+ * Since this is only implemented for domain decomposition
+ * and domain decomposition does not use the graph,
+ * we do not need to worry about shifting.
+ */
+
+ wallcycle_start(wcycle,ewcPP_PMESENDX);
+ GMX_MPE_LOG(ev_send_coordinates_start);
+
+ bBS = (inputrec->nwall == 2);
+ if (bBS) {
+ copy_mat(box,boxs);
+ svmul(inputrec->wall_ewald_zfac,boxs[ZZ],boxs[ZZ]);
+ }
+
+ gmx_pme_send_x(cr,bBS ? boxs : box,x,
+ mdatoms->nChargePerturbed,lambda[efptCOUL],
+ (flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY)),step);
+
+ GMX_MPE_LOG(ev_send_coordinates_finish);
+ wallcycle_stop(wcycle,ewcPP_PMESENDX);
+ }
+#endif /* GMX_MPI */
+
+ /* do gridding for pair search */
+ if (bNS)
+ {
+ if (graph && bStateChanged)
+ {
+ /* Calculate intramolecular shift vectors to make molecules whole */
+ mk_mshift(fplog,graph,fr->ePBC,box,x);
+ }
+
+ clear_rvec(vzero);
+ box_diag[XX] = box[XX][XX];
+ box_diag[YY] = box[YY][YY];
+ box_diag[ZZ] = box[ZZ][ZZ];
+
+ wallcycle_start(wcycle,ewcNS);
+ if (!fr->bDomDec)
+ {
+ wallcycle_sub_start(wcycle,ewcsNBS_GRID_LOCAL);
+ nbnxn_put_on_grid(nbv->nbs,fr->ePBC,box,
+ 0,vzero,box_diag,
+ 0,mdatoms->homenr,-1,fr->cginfo,x,
+ 0,NULL,
+ nbv->grp[eintLocal].kernel_type,
+ nbv->grp[eintLocal].nbat);
+ wallcycle_sub_stop(wcycle,ewcsNBS_GRID_LOCAL);
+ }
+ else
+ {
+ wallcycle_sub_start(wcycle,ewcsNBS_GRID_NONLOCAL);
+ nbnxn_put_on_grid_nonlocal(nbv->nbs,domdec_zones(cr->dd),
+ fr->cginfo,x,
+ nbv->grp[eintNonlocal].kernel_type,
+ nbv->grp[eintNonlocal].nbat);
+ wallcycle_sub_stop(wcycle,ewcsNBS_GRID_NONLOCAL);
+ }
+
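+ /* Copy the per-atom parameters from mdatoms into the nbnxn atom-data structs */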
+ if (nbv->ngrp == 1 ||
+ nbv->grp[eintNonlocal].nbat == nbv->grp[eintLocal].nbat)
+ {
+ nbnxn_atomdata_set(nbv->grp[eintLocal].nbat,eatAll,
+ nbv->nbs,mdatoms,fr->cginfo);
+ }
+ else
+ {
+ nbnxn_atomdata_set(nbv->grp[eintLocal].nbat,eatLocal,
+ nbv->nbs,mdatoms,fr->cginfo);
+ nbnxn_atomdata_set(nbv->grp[eintNonlocal].nbat,eatAll,
+ nbv->nbs,mdatoms,fr->cginfo);
+ }
+ wallcycle_stop(wcycle, ewcNS);
+ }
+
+ /* initialize the GPU atom data and copy shift vector */
+ if (bUseGPU)
+ {
+ if (bNS)
+ {
+ wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU_NB);
+ nbnxn_cuda_init_atomdata(nbv->cu_nbv, nbv->grp[eintLocal].nbat);
+ wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
+ }
+
+ wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU_NB);
+ nbnxn_cuda_upload_shiftvec(nbv->cu_nbv, nbv->grp[eintLocal].nbat);
+ wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
+ }
+
+ /* do local pair search */
+ if (bNS)
+ {
+ wallcycle_start_nocount(wcycle,ewcNS);
+ wallcycle_sub_start(wcycle,ewcsNBS_SEARCH_LOCAL);
+ nbnxn_make_pairlist(nbv->nbs,nbv->grp[eintLocal].nbat,
+ &top->excls,
+ ic->rlist,
+ nbv->min_ci_balanced,
+ &nbv->grp[eintLocal].nbl_lists,
+ eintLocal,
+ nbv->grp[eintLocal].kernel_type,
+ nrnb);
+ wallcycle_sub_stop(wcycle,ewcsNBS_SEARCH_LOCAL);
+
+ if (bUseGPU)
+ {
+ /* initialize local pair-list on the GPU */
+ nbnxn_cuda_init_pairlist(nbv->cu_nbv,
+ nbv->grp[eintLocal].nbl_lists.nbl[0],
+ eintLocal);
+ }
+ wallcycle_stop(wcycle, ewcNS);
+ }
+ else
+ {
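+ /* On non-search steps only the coordinates need to be copied
+ * into the nbnxn layout.
+ */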
+ wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
+ wallcycle_sub_start(wcycle, ewcsNB_X_BUF_OPS);
+ nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs,eatLocal,FALSE,x,
+ nbv->grp[eintLocal].nbat);
+ wallcycle_sub_stop(wcycle, ewcsNB_X_BUF_OPS);
+ wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
+ }
+
+ if (bUseGPU)
+ {
+ wallcycle_start(wcycle,ewcLAUNCH_GPU_NB);
+ /* launch local nonbonded F on GPU */
+ do_nb_verlet(fr, ic, enerd, flags, eintLocal, enbvClearFNo,
+ nrnb, wcycle);
+ wallcycle_stop(wcycle,ewcLAUNCH_GPU_NB);
+ }
+
+ /* Communicate coordinates and sum dipole if necessary +
+ do non-local pair search */
+ if (DOMAINDECOMP(cr))
+ {
+ bDiffKernels = (nbv->grp[eintNonlocal].kernel_type !=
+ nbv->grp[eintLocal].kernel_type);
+
+ if (bDiffKernels)
+ {
+ /* With GPU+CPU non-bonded calculations we need to copy
+ * the local coordinates to the non-local nbat struct
+ * (in CPU format) as the non-local kernel call also
+ * calculates the local - non-local interactions.
+ */
+ wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
+ wallcycle_sub_start(wcycle, ewcsNB_X_BUF_OPS);
+ nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs,eatLocal,TRUE,x,
+ nbv->grp[eintNonlocal].nbat);
+ wallcycle_sub_stop(wcycle, ewcsNB_X_BUF_OPS);
+ wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
+ }
+
+ if (bNS)
+ {
+ wallcycle_start_nocount(wcycle,ewcNS);
+ wallcycle_sub_start(wcycle,ewcsNBS_SEARCH_NONLOCAL);
+
+ if (bDiffKernels)
+ {
+ nbnxn_grid_add_simple(nbv->nbs,nbv->grp[eintNonlocal].nbat);
+ }
+
+ nbnxn_make_pairlist(nbv->nbs,nbv->grp[eintNonlocal].nbat,
+ &top->excls,
+ ic->rlist,
+ nbv->min_ci_balanced,
+ &nbv->grp[eintNonlocal].nbl_lists,
+ eintNonlocal,
+ nbv->grp[eintNonlocal].kernel_type,
+ nrnb);
+
+ wallcycle_sub_stop(wcycle,ewcsNBS_SEARCH_NONLOCAL);
+
+ if (nbv->grp[eintNonlocal].kernel_type == nbk8x8x8_CUDA)
+ {
+ /* initialize non-local pair-list on the GPU */
+ nbnxn_cuda_init_pairlist(nbv->cu_nbv,
+ nbv->grp[eintNonlocal].nbl_lists.nbl[0],
+ eintNonlocal);
+ }
+ wallcycle_stop(wcycle,ewcNS);
+ }
+ else
+ {
+ wallcycle_start(wcycle,ewcMOVEX);
+ dd_move_x(cr->dd,box,x);
+
+ /* Sum the dipole here if we need it now; otherwise it is summed in global_stat */
+ if (bStateChanged && NEED_MUTOT(*inputrec))
+ {
+ gmx_sumd(2*DIM,mu,cr);
+ }
+ wallcycle_stop(wcycle,ewcMOVEX);
+
+ wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
+ wallcycle_sub_start(wcycle, ewcsNB_X_BUF_OPS);
+ nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs,eatNonlocal,FALSE,x,
+ nbv->grp[eintNonlocal].nbat);
+ wallcycle_sub_stop(wcycle, ewcsNB_X_BUF_OPS);
+ cycles_force += wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
+ }
+
+ if (bUseGPU && !bDiffKernels)
+ {
+ wallcycle_start(wcycle,ewcLAUNCH_GPU_NB);
+ /* launch non-local nonbonded F on GPU */
+ do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFNo,
+ nrnb, wcycle);
+ cycles_force += wallcycle_stop(wcycle,ewcLAUNCH_GPU_NB);
+ }
+ }
+
+ if (bUseGPU)
+ {
+ /* launch D2H copy-back F */
+ wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU_NB);
+ if (DOMAINDECOMP(cr) && !bDiffKernels)
+ {
+ nbnxn_cuda_launch_cpyback(nbv->cu_nbv, nbv->grp[eintNonlocal].nbat,
+ flags, eatNonlocal);
+ }
+ nbnxn_cuda_launch_cpyback(nbv->cu_nbv, nbv->grp[eintLocal].nbat,
+ flags, eatLocal);
+ cycles_force += wallcycle_stop(wcycle,ewcLAUNCH_GPU_NB);
+ }
+
+ if (bStateChanged && NEED_MUTOT(*inputrec))
+ {
+ if (PAR(cr))
+ {
+ gmx_sumd(2*DIM,mu,cr);
+ }
+
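+ /* mu holds the state A and state B dipoles back to back;
+ * unpack them into fr->mu_tot.
+ */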
+ for(i=0; i<2; i++)
+ {
+ for(j=0;j<DIM;j++)
+ {
+ fr->mu_tot[i][j] = mu[i*DIM + j];
+ }
+ }
+ }
+ if (fr->efep == efepNO)
+ {
+ copy_rvec(fr->mu_tot[0],mu_tot);
+ }
+ else
+ {
+ for(j=0; j<DIM; j++)
+ {
+ mu_tot[j] =
+ (1.0 - lambda[efptCOUL])*fr->mu_tot[0][j] +
+ lambda[efptCOUL]*fr->mu_tot[1][j];
+ }
+ }
+
+ /* Reset energies */
+ reset_enerdata(&(inputrec->opts),fr,bNS,enerd,MASTER(cr));
+ clear_rvecs(SHIFTS,fr->fshift);
+
+ if (DOMAINDECOMP(cr))
+ {
+ if (!(cr->duty & DUTY_PME))
+ {
+ wallcycle_start(wcycle,ewcPPDURINGPME);
+ dd_force_flop_start(cr->dd,nrnb);
+ }
+ }
+
+ /* Start the force cycle counter.
+ * This counter is stopped in do_forcelow_level.
+ * No parallel communication should occur while this counter is running,
+ * since that will interfere with the dynamic load balancing.
+ */
+ wallcycle_start(wcycle,ewcFORCE);
+ if (bDoForces)
+ {
+ /* Reset forces for which the virial is calculated separately:
+ * PME/Ewald forces if necessary */
+ if (fr->bF_NoVirSum)
+ {
+ if (flags & GMX_FORCE_VIRIAL)
+ {
+ fr->f_novirsum = fr->f_novirsum_alloc;
+ GMX_BARRIER(cr->mpi_comm_mygroup);
+ if (fr->bDomDec)
+ {
+ clear_rvecs(fr->f_novirsum_n,fr->f_novirsum);
+ }
+ else
+ {
+ clear_rvecs(homenr,fr->f_novirsum+start);
+ }
+ GMX_BARRIER(cr->mpi_comm_mygroup);
+ }
+ else
+ {
+ /* We are not calculating the pressure so we do not need
+ * a separate array for forces that do not contribute
+ * to the pressure.
+ */
+ fr->f_novirsum = f;
+ }
+ }
+
+ if (bSepLRF)
+ {
+ /* Add the long range forces to the short range forces */
+ for(i=0; i<fr->natoms_force_constr; i++)
+ {
+ copy_rvec(fr->f_twin[i],f[i]);
+ }
+ }
+ else if (!(fr->bTwinRange && bNS))
+ {
+ /* Clear the short-range forces */
+ clear_rvecs(fr->natoms_force_constr,f);
+ }
+
+ clear_rvec(fr->vir_diag_posres);
+
+ GMX_BARRIER(cr->mpi_comm_mygroup);
+ }
+ if (inputrec->ePull == epullCONSTRAINT)
+ {
+ clear_pull_forces(inputrec->pull);
+ }
+
+ /* update QMMMrec, if necessary */
+ if(fr->bQMMM)
+ {
+ update_QMMMrec(cr,fr,x,mdatoms,box,top);
+ }
+
+ if ((flags & GMX_FORCE_BONDED) && top->idef.il[F_POSRES].nr > 0)
+ {
+ posres_wrapper(fplog,flags,bSepDVDL,inputrec,nrnb,top,box,x,
+ f,enerd,lambda,fr);
+ }
+
+ /* Compute the bonded and non-bonded energies and optionally forces */
+ /* when using GPU or GPU-emulation kernels, mask out the non-bonded flag here */
+ do_force_lowlevel(fplog,step,fr,inputrec,&(top->idef),
+ cr,nrnb,wcycle,mdatoms,&(inputrec->opts),
+ x,hist,f,enerd,fcd,mtop,top,fr->born,
+ &(top->atomtypes),bBornRadii,box,
+ inputrec->fepvals,lambda,graph,&(top->excls),fr->mu_tot,
+ ((nb_kernel_type == nbk8x8x8_CUDA || nb_kernel_type == nbk8x8x8_PlainC)
+ ? flags&~GMX_FORCE_NONBONDED : flags),
+ &cycles_pme);
+
+ if (!bUseOrEmulGPU)
+ {
+ /* Maybe we should move this into do_force_lowlevel */
+ do_nb_verlet(fr, ic, enerd, flags, eintLocal, enbvClearFYes,
+ nrnb, wcycle);
+ }
+
+ if (!bUseOrEmulGPU || bDiffKernels)
+ {
+ int aloc;
+
+ if (DOMAINDECOMP(cr))
+ {
+ do_nb_verlet(fr, ic, enerd, flags, eintNonlocal,
+ bDiffKernels ? enbvClearFYes : enbvClearFNo,
+ nrnb, wcycle);
+ }
+
+ if (!bUseOrEmulGPU)
+ {
+ aloc = eintLocal;
+ }
+ else
+ {
+ aloc = eintNonlocal;
+ }
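+ /* aloc selects the nbat buffer holding the CPU force output:
+ * without a GPU this is the local buffer (shared with the non-local
+ * group when the kernels match), with different kernels it is the
+ * non-local one.
+ */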
+
+ /* Add all the non-bonded forces to the normal force array.
+ * This can be split into a local and a non-local part when overlapping
+ * communication with calculation with domain decomposition.
+ */
+ cycles_force += wallcycle_stop(wcycle,ewcFORCE);
+ wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
+ wallcycle_sub_start(wcycle, ewcsNB_F_BUF_OPS);
+ nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs,eatAll,nbv->grp[aloc].nbat,f);
+ wallcycle_sub_stop(wcycle, ewcsNB_F_BUF_OPS);
+ cycles_force += wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
+ wallcycle_start_nocount(wcycle,ewcFORCE);
+
+ /* if there are multiple fshift output buffers reduce them */
+ if ((flags & GMX_FORCE_VIRIAL) &&
+ nbv->grp[aloc].nbl_lists.nnbl > 1)
+ {
+ nbnxn_atomdata_add_nbat_fshift_to_fshift(nbv->grp[aloc].nbat,
+ fr->fshift);
+ }
+ }
+
+ cycles_force += wallcycle_stop(wcycle,ewcFORCE);
+ GMX_BARRIER(cr->mpi_comm_mygroup);
+
+ if (ed)
+ {
+ do_flood(fplog,cr,x,f,ed,box,step,bNS);
+ }
+
+ if (bUseOrEmulGPU && !bDiffKernels)
+ {
+ /* wait for non-local forces (or calculate in emulation mode) */
+ if (DOMAINDECOMP(cr))
+ {
+ if (bUseGPU)
+ {
+ wallcycle_start(wcycle,ewcWAIT_GPU_NB_NL);
+ nbnxn_cuda_wait_gpu(nbv->cu_nbv,
+ nbv->grp[eintNonlocal].nbat,
+ flags, eatNonlocal,
+ enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
+ fr->fshift);
+ cycles_force += wallcycle_stop(wcycle,ewcWAIT_GPU_NB_NL);
+ }
+ else
+ {
+ wallcycle_start_nocount(wcycle,ewcFORCE);
+ do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFYes,
+ nrnb, wcycle);
+ cycles_force += wallcycle_stop(wcycle,ewcFORCE);
+ }
+ wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
+ wallcycle_sub_start(wcycle, ewcsNB_F_BUF_OPS);
+ /* skip the reduction if there was no non-local work to do */
+ if (nbv->grp[eintNonlocal].nbl_lists.nbl[0]->nsci > 0)
+ {
+ nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs,eatNonlocal,
+ nbv->grp[eintNonlocal].nbat,f);
+ }
+ wallcycle_sub_stop(wcycle, ewcsNB_F_BUF_OPS);
+ cycles_force += wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
+ }
+ }
+
+ if (bDoForces)
+ {
+ /* Communicate the forces */
+ if (PAR(cr))
+ {
+ wallcycle_start(wcycle,ewcMOVEF);
+ if (DOMAINDECOMP(cr))
+ {
+ dd_move_f(cr->dd,f,fr->fshift);
+ /* Do we need to communicate the separate force array
+ * for terms that do not contribute to the single sum virial?
+ * Position restraints and electric fields do not introduce
+ * inter-cg forces, only full electrostatics methods do.
+ * When we do not calculate the virial, fr->f_novirsum = f,
+ * so we have already communicated these forces.
+ */
+ if (EEL_FULL(fr->eeltype) && cr->dd->n_intercg_excl &&
+ (flags & GMX_FORCE_VIRIAL))
+ {
+ dd_move_f(cr->dd,fr->f_novirsum,NULL);
+ }
+ if (bSepLRF)
+ {
+ /* We should not update the shift forces here,
+ * since f_twin is already included in f.
+ */
+ dd_move_f(cr->dd,fr->f_twin,NULL);
+ }
+ }
+ wallcycle_stop(wcycle,ewcMOVEF);
+ }
+ }
+
+ if (bUseOrEmulGPU)
+ {
+ /* wait for local forces (or calculate in emulation mode) */
+ if (bUseGPU)
+ {
+ wallcycle_start(wcycle,ewcWAIT_GPU_NB_L);
+ nbnxn_cuda_wait_gpu(nbv->cu_nbv,
+ nbv->grp[eintLocal].nbat,
+ flags, eatLocal,
+ enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
+ fr->fshift);
+ wallcycle_stop(wcycle,ewcWAIT_GPU_NB_L);
+
+ /* now clear the GPU outputs while we finish the step on the CPU */
+ nbnxn_cuda_clear_outputs(nbv->cu_nbv, flags);
+ }
+ else
+ {
+ wallcycle_start_nocount(wcycle,ewcFORCE);
+ do_nb_verlet(fr, ic, enerd, flags, eintLocal,
+ DOMAINDECOMP(cr) ? enbvClearFNo : enbvClearFYes,
+ nrnb, wcycle);
+ wallcycle_stop(wcycle,ewcFORCE);
+ }
+ wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
+ wallcycle_sub_start(wcycle, ewcsNB_F_BUF_OPS);
+ /* skip the reduction if there was no local work to do */
+ if (nbv->grp[eintLocal].nbl_lists.nbl[0]->nsci > 0)
+ {
+ nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs,eatLocal,
+ nbv->grp[eintLocal].nbat,f);
+ }
+ wallcycle_sub_stop(wcycle, ewcsNB_F_BUF_OPS);
+ wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
+ }
+
+ if (DOMAINDECOMP(cr))
+ {
+ dd_force_flop_stop(cr->dd,nrnb);
+ if (wcycle)
+ {
+ dd_cycles_add(cr->dd,cycles_force-cycles_pme,ddCyclF);
+ }
+ }
+
+ if (bDoForces)
+ {
+ if (IR_ELEC_FIELD(*inputrec))
+ {
+ /* Compute forces due to electric field */
+ calc_f_el(MASTER(cr) ? field : NULL,
+ start,homenr,mdatoms->chargeA,x,fr->f_novirsum,
+ inputrec->ex,inputrec->et,t);
+ }
+
+ /* If we have NoVirSum forces, but we do not calculate the virial,
+ * we sum fr->f_novirsum = f later.
+ */
+ if (vsite && !(fr->bF_NoVirSum && !(flags & GMX_FORCE_VIRIAL)))
+ {
+ wallcycle_start(wcycle,ewcVSITESPREAD);
+ spread_vsite_f(fplog,vsite,x,f,fr->fshift,FALSE,NULL,nrnb,
+ &top->idef,fr->ePBC,fr->bMolPBC,graph,box,cr);
+ wallcycle_stop(wcycle,ewcVSITESPREAD);
+
+ if (bSepLRF)
+ {
+ wallcycle_start(wcycle,ewcVSITESPREAD);
+ spread_vsite_f(fplog,vsite,x,fr->f_twin,NULL,FALSE,NULL,
+ nrnb,
+ &top->idef,fr->ePBC,fr->bMolPBC,graph,box,cr);
+ wallcycle_stop(wcycle,ewcVSITESPREAD);
+ }
+ }
+
+ if (flags & GMX_FORCE_VIRIAL)
{
- Ext[m] = 0;
+ /* Calculation of the virial must be done after vsites! */
+ calc_virial(fplog,mdatoms->start,mdatoms->homenr,x,f,
+ vir_force,graph,box,nrnb,fr,inputrec->ePBC);
}
}
- if (fp != NULL)
+
+ if (inputrec->ePull == epullUMBRELLA || inputrec->ePull == epullCONST_F)
{
- fprintf(fp,"%10g %10g %10g %10g #FIELD\n",t,
- Ext[XX]/FIELDFAC,Ext[YY]/FIELDFAC,Ext[ZZ]/FIELDFAC);
+ pull_potential_wrapper(fplog,bSepDVDL,cr,inputrec,box,x,
+ f,vir_force,mdatoms,enerd,lambda,t);
}
-}
-
-static void calc_virial(FILE *fplog,int start,int homenr,rvec x[],rvec f[],
- tensor vir_part,t_graph *graph,matrix box,
- t_nrnb *nrnb,const t_forcerec *fr,int ePBC)
-{
- int i,j;
- tensor virtest;
-
- /* The short-range virial from surrounding boxes */
- clear_mat(vir_part);
- calc_vir(fplog,SHIFTS,fr->shift_vec,fr->fshift,vir_part,ePBC==epbcSCREW,box);
- inc_nrnb(nrnb,eNR_VIRIAL,SHIFTS);
-
- /* Calculate partial virial, for local atoms only, based on short range.
- * Total virial is computed in global_stat, called from do_md
- */
- f_calc_vir(fplog,start,start+homenr,x,f,vir_part,graph,box);
- inc_nrnb(nrnb,eNR_VIRIAL,homenr);
-
- /* Add position restraint contribution */
- for(i=0; i<DIM; i++) {
- vir_part[i][i] += fr->vir_diag_posres[i];
- }
-
- /* Add wall contribution */
- for(i=0; i<DIM; i++) {
- vir_part[i][ZZ] += fr->vir_wall_z[i];
- }
-
- if (debug)
- pr_rvecs(debug,0,"vir_part",vir_part,DIM);
-}
-static void print_large_forces(FILE *fp,t_mdatoms *md,t_commrec *cr,
- gmx_large_int_t step,real pforce,rvec *x,rvec *f)
-{
- int i;
- real pf2,fn2;
- char buf[STEPSTRSIZE];
+ if (PAR(cr) && !(cr->duty & DUTY_PME))
+ {
+ /* In case of node-splitting, the PP nodes receive the long-range
+ * forces, virial and energy from the PME nodes here.
+ */
+ pme_receive_force_ener(fplog,bSepDVDL,cr,wcycle,enerd,fr);
+ }
- pf2 = sqr(pforce);
- for(i=md->start; i<md->start+md->homenr; i++) {
- fn2 = norm2(f[i]);
- /* We also catch NAN, if the compiler does not optimize this away. */
- if (fn2 >= pf2 || fn2 != fn2) {
- fprintf(fp,"step %s atom %6d x %8.3f %8.3f %8.3f force %12.5e\n",
- gmx_step_str(step,buf),
- ddglatnr(cr->dd,i),x[i][XX],x[i][YY],x[i][ZZ],sqrt(fn2));
+ if (bDoForces)
+ {
+ post_process_forces(fplog,cr,step,nrnb,wcycle,
+ top,box,x,f,vir_force,mdatoms,graph,fr,vsite,
+ flags);
}
- }
+
+ /* Sum the potential energy terms from group contributions */
+ sum_epot(&(inputrec->opts),enerd);
}
-void do_force(FILE *fplog,t_commrec *cr,
+void do_force_cutsGROUP(FILE *fplog,t_commrec *cr,
t_inputrec *inputrec,
gmx_large_int_t step,t_nrnb *nrnb,gmx_wallcycle_t wcycle,
gmx_localtop_t *top,
gmx_bool bDoLongRange,bDoForces,bSepLRF;
gmx_bool bDoAdressWF;
matrix boxs;
+ rvec vzero,box_diag;
real e,v,dvdlambda[efptNR];
- real dvdl_dum,lambda_dum;
t_pbc pbc;
- float cycles_ppdpme,cycles_pme,cycles_seppme,cycles_force;
+ float cycles_pme,cycles_force;
start = mdatoms->start;
homenr = mdatoms->homenr;
{
update_forcerec(fplog,fr,box);
- /* Calculate total (local) dipole moment in a temporary common array.
- * This makes it possible to sum them over nodes faster.
- */
- calc_mu(start,homenr,
- x,mdatoms->chargeA,mdatoms->chargeB,mdatoms->nChargePerturbed,
- mu,mu+DIM);
+ if (NEED_MUTOT(*inputrec))
+ {
+ /* Calculate total (local) dipole moment in a temporary common array.
+ * This makes it possible to sum them over nodes faster.
+ */
+ calc_mu(start,homenr,
+ x,mdatoms->chargeA,mdatoms->chargeB,mdatoms->nChargePerturbed,
+ mu,mu+DIM);
+ }
}
- if (fr->ePBC != epbcNONE) {
- /* Compute shift vectors every step,
- * because of pressure coupling or box deformation!
- */
- if ((flags & GMX_FORCE_DYNAMICBOX) && bStateChanged)
- calc_shifts(box,fr->shift_vec);
-
- if (bCalcCGCM) {
- put_charge_groups_in_box(fplog,cg0,cg1,fr->ePBC,box,
- &(top->cgs),x,fr->cg_cm);
- inc_nrnb(nrnb,eNR_CGCM,homenr);
- inc_nrnb(nrnb,eNR_RESETX,cg1-cg0);
- }
- else if (EI_ENERGY_MINIMIZATION(inputrec->eI) && graph) {
- unshift_self(graph,box,x);
+ if (fr->ePBC != epbcNONE) {
+ /* Compute shift vectors every step,
+ * because of pressure coupling or box deformation!
+ */
+ if ((flags & GMX_FORCE_DYNAMICBOX) && bStateChanged)
+ calc_shifts(box,fr->shift_vec);
+
+ if (bCalcCGCM) {
+ put_charge_groups_in_box(fplog,cg0,cg1,fr->ePBC,box,
+ &(top->cgs),x,fr->cg_cm);
+ inc_nrnb(nrnb,eNR_CGCM,homenr);
+ inc_nrnb(nrnb,eNR_RESETX,cg1-cg0);
+ }
+ else if (EI_ENERGY_MINIMIZATION(inputrec->eI) && graph) {
+ unshift_self(graph,box,x);
+ }
+ }
+ else if (bCalcCGCM) {
+ calc_cgcm(fplog,cg0,cg1,&(top->cgs),x,fr->cg_cm);
+ inc_nrnb(nrnb,eNR_CGCM,homenr);
}
- }
- else if (bCalcCGCM) {
- calc_cgcm(fplog,cg0,cg1,&(top->cgs),x,fr->cg_cm);
- inc_nrnb(nrnb,eNR_CGCM,homenr);
- }
- if (bCalcCGCM) {
- if (PAR(cr)) {
- move_cgcm(fplog,cr,fr->cg_cm);
+ if (bCalcCGCM) {
+ if (PAR(cr)) {
+ move_cgcm(fplog,cr,fr->cg_cm);
+ }
+ if (gmx_debug_at)
+ pr_rvecs(debug,0,"cgcm",fr->cg_cm,top->cgs.nr);
}
- if (gmx_debug_at)
- pr_rvecs(debug,0,"cgcm",fr->cg_cm,top->cgs.nr);
- }
#ifdef GMX_MPI
- if (!(cr->duty & DUTY_PME)) {
- /* Send particle coordinates to the pme nodes.
- * Since this is only implemented for domain decomposition
- * and domain decomposition does not use the graph,
- * we do not need to worry about shifting.
- */
+ if (!(cr->duty & DUTY_PME)) {
+ /* Send particle coordinates to the pme nodes.
+ * Since this is only implemented for domain decomposition
+ * and domain decomposition does not use the graph,
+ * we do not need to worry about shifting.
+ */
+
+ wallcycle_start(wcycle,ewcPP_PMESENDX);
+ GMX_MPE_LOG(ev_send_coordinates_start);
+
+ bBS = (inputrec->nwall == 2);
+ if (bBS) {
+ copy_mat(box,boxs);
+ svmul(inputrec->wall_ewald_zfac,boxs[ZZ],boxs[ZZ]);
+ }
- wallcycle_start(wcycle,ewcPP_PMESENDX);
- GMX_MPE_LOG(ev_send_coordinates_start);
+ gmx_pme_send_x(cr,bBS ? boxs : box,x,
+ mdatoms->nChargePerturbed,lambda[efptCOUL],
+ (flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY)),step);
- bBS = (inputrec->nwall == 2);
- if (bBS) {
- copy_mat(box,boxs);
- svmul(inputrec->wall_ewald_zfac,boxs[ZZ],boxs[ZZ]);
+ GMX_MPE_LOG(ev_send_coordinates_finish);
+ wallcycle_stop(wcycle,ewcPP_PMESENDX);
}
-
- gmx_pme_send_x(cr,bBS ? boxs : box,x,
- mdatoms->nChargePerturbed,lambda[efptCOUL],
- ( flags & GMX_FORCE_VIRIAL),step);
-
- GMX_MPE_LOG(ev_send_coordinates_finish);
- wallcycle_stop(wcycle,ewcPP_PMESENDX);
- }
#endif /* GMX_MPI */
/* Communicate coordinates and sum dipole if necessary */
{
move_x(fplog,cr,GMX_LEFT,GMX_RIGHT,x,nrnb);
}
- /* When we don't need the total dipole we sum it in global_stat */
- if (bStateChanged && NEED_MUTOT(*inputrec))
+ wallcycle_stop(wcycle,ewcMOVEX);
+ }
+
+ /* update adress weight beforehand */
+ if(bStateChanged && bDoAdressWF)
+ {
+ /* need pbc for adress weight calculation with pbc_dx */
+ set_pbc(&pbc,inputrec->ePBC,box);
+ if(fr->adress_site == eAdressSITEcog)
{
- gmx_sumd(2*DIM,mu,cr);
+ update_adress_weights_cog(top->idef.iparams,top->idef.il,x,fr,mdatoms,
+ inputrec->ePBC==epbcNONE ? NULL : &pbc);
+ }
+ else if (fr->adress_site == eAdressSITEcom)
+ {
+ update_adress_weights_com(fplog,cg0,cg1,&(top->cgs),x,fr,mdatoms,
+ inputrec->ePBC==epbcNONE ? NULL : &pbc);
+ }
+ else if (fr->adress_site == eAdressSITEatomatom){
+ update_adress_weights_atom_per_atom(cg0,cg1,&(top->cgs),x,fr,mdatoms,
+ inputrec->ePBC==epbcNONE ? NULL : &pbc);
+ }
+ else
+ {
+ update_adress_weights_atom(cg0,cg1,&(top->cgs),x,fr,mdatoms,
+ inputrec->ePBC==epbcNONE ? NULL : &pbc);
}
- wallcycle_stop(wcycle,ewcMOVEX);
}
- if (bStateChanged)
+
+ if (NEED_MUTOT(*inputrec))
{
- /* update adress weight beforehand */
- if(bDoAdressWF)
+ if (bStateChanged)
{
- /* need pbc for adress weight calculation with pbc_dx */
- set_pbc(&pbc,inputrec->ePBC,box);
- if(fr->adress_site == eAdressSITEcog)
- {
- update_adress_weights_cog(top->idef.iparams,top->idef.il,x,fr,mdatoms,
- inputrec->ePBC==epbcNONE ? NULL : &pbc);
- }
- else if (fr->adress_site == eAdressSITEcom)
+ if (PAR(cr))
{
- update_adress_weights_com(fplog,cg0,cg1,&(top->cgs),x,fr,mdatoms,
- inputrec->ePBC==epbcNONE ? NULL : &pbc);
- }
- else if (fr->adress_site == eAdressSITEatomatom){
- update_adress_weights_atom_per_atom(cg0,cg1,&(top->cgs),x,fr,mdatoms,
- inputrec->ePBC==epbcNONE ? NULL : &pbc);
+ gmx_sumd(2*DIM,mu,cr);
}
- else
+ for(i=0; i<2; i++)
{
- update_adress_weights_atom(cg0,cg1,&(top->cgs),x,fr,mdatoms,
- inputrec->ePBC==epbcNONE ? NULL : &pbc);
+ for(j=0;j<DIM;j++)
+ {
+ fr->mu_tot[i][j] = mu[i*DIM + j];
+ }
}
}
-
- for(i=0; i<2; i++)
+ if (fr->efep == efepNO)
{
- for(j=0;j<DIM;j++)
- {
- fr->mu_tot[i][j] = mu[i*DIM + j];
- }
+ copy_rvec(fr->mu_tot[0],mu_tot);
}
- }
- if (fr->efep == efepNO)
- {
- copy_rvec(fr->mu_tot[0],mu_tot);
- }
- else
- {
- for(j=0; j<DIM; j++)
+ else
{
- mu_tot[j] =
- (1.0 - lambda[efptCOUL])*fr->mu_tot[0][j] + lambda[efptCOUL]*fr->mu_tot[1][j];
+ for(j=0; j<DIM; j++)
+ {
+ mu_tot[j] =
+ (1.0 - lambda[efptCOUL])*fr->mu_tot[0][j] + lambda[efptCOUL]*fr->mu_tot[1][j];
+ }
}
}
* since that will interfere with the dynamic load balancing.
*/
wallcycle_start(wcycle,ewcFORCE);
-
+
if (bDoForces)
{
/* Reset forces for which the virial is calculated separately:
if ((flags & GMX_FORCE_BONDED) && top->idef.il[F_POSRES].nr > 0)
{
- /* Position restraints always require full pbc. Check if we already did it for Adress */
- if(!(bStateChanged && bDoAdressWF))
- {
- set_pbc(&pbc,inputrec->ePBC,box);
- }
- v = posres(top->idef.il[F_POSRES].nr,top->idef.il[F_POSRES].iatoms,
- top->idef.iparams_posres,
- (const rvec*)x,fr->f_novirsum,fr->vir_diag_posres,
- inputrec->ePBC==epbcNONE ? NULL : &pbc,lambda[efptRESTRAINT],&(dvdlambda[efptRESTRAINT]),
- fr->rc_scaling,fr->ePBC,fr->posres_com,fr->posres_comB);
- if (bSepDVDL)
- {
- fprintf(fplog,sepdvdlformat,
- interaction_function[F_POSRES].longname,v,dvdlambda);
- }
- enerd->term[F_POSRES] += v;
- /* This linear lambda dependence assumption is only correct
- * when only k depends on lambda,
- * not when the reference position depends on lambda.
- * grompp checks for this. (verify this is still the case?)
- */
- enerd->dvdl_nonlin[efptRESTRAINT] += dvdlambda[efptRESTRAINT]; /* if just the force constant changes, this is linear,
- but we can't be sure w/o additional checking that is
- hard to do at this level of code. Otherwise,
- the dvdl is not differentiable */
- inc_nrnb(nrnb,eNR_POSRES,top->idef.il[F_POSRES].nr/2);
- if ((inputrec->fepvals->n_lambda > 0) && (flags & GMX_FORCE_DHDL))
- {
- for(i=0; i<enerd->n_lambda; i++)
- {
- lambda_dum = (i==0 ? lambda[efptRESTRAINT] : inputrec->fepvals->all_lambda[efptRESTRAINT][i-1]);
- v = posres(top->idef.il[F_POSRES].nr,top->idef.il[F_POSRES].iatoms,
- top->idef.iparams_posres,
- (const rvec*)x,NULL,NULL,
- inputrec->ePBC==epbcNONE ? NULL : &pbc,lambda_dum,&dvdl_dum,
- fr->rc_scaling,fr->ePBC,fr->posres_com,fr->posres_comB);
- enerd->enerpart_lambda[i] += v;
- }
- }
- }
+ posres_wrapper(fplog,flags,bSepDVDL,inputrec,nrnb,top,box,x,
+ f,enerd,lambda,fr);
+ }
/* Compute the bonded and non-bonded energies and optionally forces */
do_force_lowlevel(fplog,step,fr,inputrec,&(top->idef),
cr,nrnb,wcycle,mdatoms,&(inputrec->opts),
x,hist,f,enerd,fcd,mtop,top,fr->born,
&(top->atomtypes),bBornRadii,box,
- inputrec->fepvals,lambda,graph,&(top->excls),fr->mu_tot,
- flags,&cycles_pme);
+ inputrec->fepvals,lambda,
+ graph,&(top->excls),fr->mu_tot,
+ flags,
+ &cycles_pme);
cycles_force = wallcycle_stop(wcycle,ewcFORCE);
GMX_BARRIER(cr->mpi_comm_mygroup);
}
}
- enerd->term[F_COM_PULL] = 0;
if (inputrec->ePull == epullUMBRELLA || inputrec->ePull == epullCONST_F)
{
- /* Calculate the center of mass forces, this requires communication,
- * which is why pull_potential is called close to other communication.
- * The virial contribution is calculated directly,
- * which is why we call pull_potential after calc_virial.
- */
- set_pbc(&pbc,inputrec->ePBC,box);
- dvdlambda[efptRESTRAINT] = 0;
- enerd->term[F_COM_PULL] +=
- pull_potential(inputrec->ePull,inputrec->pull,mdatoms,&pbc,
- cr,t,lambda[efptRESTRAINT],x,f,vir_force,&(dvdlambda[efptRESTRAINT]));
- if (bSepDVDL)
- {
- fprintf(fplog,sepdvdlformat,"Com pull",enerd->term[F_COM_PULL],dvdlambda[efptRESTRAINT]);
- }
- enerd->dvdl_lin[efptRESTRAINT] += dvdlambda[efptRESTRAINT];
+ pull_potential_wrapper(fplog,bSepDVDL,cr,inputrec,box,x,
+ f,vir_force,mdatoms,enerd,lambda,t);
}
/* Add the forces from enforced rotation potentials (if any) */
if (PAR(cr) && !(cr->duty & DUTY_PME))
{
- cycles_ppdpme = wallcycle_stop(wcycle,ewcPPDURINGPME);
- dd_cycles_add(cr->dd,cycles_ppdpme,ddCyclPPduringPME);
-
- /* In case of node-splitting, the PP nodes receive the long-range
+ /* In case of node-splitting, the PP nodes receive the long-range
* forces, virial and energy from the PME nodes here.
*/
- wallcycle_start(wcycle,ewcPP_PMEWAITRECVF);
- dvdlambda[efptCOUL] = 0;
- gmx_pme_receive_f(cr,fr->f_novirsum,fr->vir_el_recip,&e,&dvdlambda[efptCOUL],
- &cycles_seppme);
- if (bSepDVDL)
- {
- fprintf(fplog,sepdvdlformat,"PME mesh",e,dvdlambda[efptCOUL]);
- }
- enerd->term[F_COUL_RECIP] += e;
- enerd->dvdl_lin[efptCOUL] += dvdlambda[efptCOUL];
- if (wcycle)
- {
- dd_cycles_add(cr->dd,cycles_seppme,ddCyclPME);
- }
- wallcycle_stop(wcycle,ewcPP_PMEWAITRECVF);
+ pme_receive_force_ener(fplog,bSepDVDL,cr,wcycle,enerd,fr);
}
- if (bDoForces && fr->bF_NoVirSum)
+ if (bDoForces)
{
- if (vsite)
- {
- /* Spread the mesh force on virtual sites to the other particles...
- * This is parallellized. MPI communication is performed
- * if the constructing atoms aren't local.
- */
- wallcycle_start(wcycle,ewcVSITESPREAD);
- spread_vsite_f(fplog,vsite,x,fr->f_novirsum,NULL,
- (flags & GMX_FORCE_VIRIAL),fr->vir_el_recip,
- nrnb,
- &top->idef,fr->ePBC,fr->bMolPBC,graph,box,cr);
- wallcycle_stop(wcycle,ewcVSITESPREAD);
- }
- if (flags & GMX_FORCE_VIRIAL)
- {
- /* Now add the forces, this is local */
- if (fr->bDomDec)
- {
- sum_forces(0,fr->f_novirsum_n,f,fr->f_novirsum);
- }
- else
- {
- sum_forces(start,start+homenr,f,fr->f_novirsum);
- }
- if (EEL_FULL(fr->eeltype))
- {
- /* Add the mesh contribution to the virial */
- m_add(vir_force,fr->vir_el_recip,vir_force);
- }
- if (debug)
- {
- pr_rvecs(debug,0,"vir_force",vir_force,DIM);
- }
- }
+ post_process_forces(fplog,cr,step,nrnb,wcycle,
+ top,box,x,f,vir_force,mdatoms,graph,fr,vsite,
+ flags);
}
/* Sum the potential energy terms from group contributions */
sum_epot(&(inputrec->opts),enerd);
+}
+
+void do_force(FILE *fplog,t_commrec *cr,
+ t_inputrec *inputrec,
+ gmx_large_int_t step,t_nrnb *nrnb,gmx_wallcycle_t wcycle,
+ gmx_localtop_t *top,
+ gmx_mtop_t *mtop,
+ gmx_groups_t *groups,
+ matrix box,rvec x[],history_t *hist,
+ rvec f[],
+ tensor vir_force,
+ t_mdatoms *mdatoms,
+ gmx_enerdata_t *enerd,t_fcdata *fcd,
+ real *lambda,t_graph *graph,
+ t_forcerec *fr,
+ gmx_vsite_t *vsite,rvec mu_tot,
+ double t,FILE *field,gmx_edsam_t ed,
+ gmx_bool bBornRadii,
+ int flags)
+{
+ /* modify force flag if not doing nonbonded */
+ if (!fr->bNonbonded)
+ {
+ flags &= ~GMX_FORCE_NONBONDED;
+ }
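+ /* Dispatch to the force routine for the active cut-off scheme */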
- if (fr->print_force >= 0 && bDoForces)
+ switch (inputrec->cutoff_scheme)
{
- print_large_forces(stderr,mdatoms,cr,step,fr->print_force,x,f);
+ case ecutsVERLET:
+ do_force_cutsVERLET(fplog, cr, inputrec,
+ step, nrnb, wcycle,
+ top, mtop,
+ groups,
+ box, x, hist,
+ f, vir_force,
+ mdatoms,
+ enerd, fcd,
+ lambda, graph,
+ fr, fr->ic,
+ vsite, mu_tot,
+ t, field, ed,
+ bBornRadii,
+ flags);
+ break;
+ case ecutsGROUP:
+ do_force_cutsGROUP(fplog, cr, inputrec,
+ step, nrnb, wcycle,
+ top, mtop,
+ groups,
+ box, x, hist,
+ f, vir_force,
+ mdatoms,
+ enerd, fcd,
+ lambda, graph,
+ fr, vsite, mu_tot,
+ t, field, ed,
+ bBornRadii,
+ flags);
+ break;
+ default:
+ gmx_incons("Invalid cut-off scheme passed!");
}
}
+
void do_constrain_first(FILE *fplog,gmx_constr_t constr,
t_inputrec *ir,t_mdatoms *md,
t_state *state,rvec *f,
constrain(NULL,TRUE,FALSE,constr,&(top->idef),
ir,NULL,cr,step,0,md,
state->x,state->x,NULL,
- state->box,state->lambda[efptBONDED],&dvdl_dum,
- NULL,NULL,nrnb,econqCoord,ir->epc==epcMTTK,state->veta,state->veta);
+ fr->bMolPBC,state->box,
+ state->lambda[efptBONDED],&dvdl_dum,
+ NULL,NULL,nrnb,econqCoord,
+ ir->epc==epcMTTK,state->veta,state->veta);
if (EI_VV(ir->eI))
{
/* constrain the inital velocity, and save it */
constrain(NULL,TRUE,FALSE,constr,&(top->idef),
ir,NULL,cr,step,0,md,
state->x,state->v,state->v,
- state->box,state->lambda[efptBONDED],&dvdl_dum,
- NULL,NULL,nrnb,econqVeloc,ir->epc==epcMTTK,state->veta,state->veta);
+ fr->bMolPBC,state->box,
+ state->lambda[efptBONDED],&dvdl_dum,
+ NULL,NULL,nrnb,econqVeloc,
+ ir->epc==epcMTTK,state->veta,state->veta);
}
/* constrain the inital velocities at t-dt/2 */
if (EI_STATE_VELOCITY(ir->eI) && ir->eI!=eiVV)
constrain(NULL,TRUE,FALSE,constr,&(top->idef),
ir,NULL,cr,step,-1,md,
state->x,savex,NULL,
- state->box,state->lambda[efptBONDED],&dvdl_dum,
- state->v,NULL,nrnb,econqCoord,ir->epc==epcMTTK,state->veta,state->veta);
-
+ fr->bMolPBC,state->box,
+ state->lambda[efptBONDED],&dvdl_dum,
+ state->v,NULL,nrnb,econqCoord,
+ ir->epc==epcMTTK,state->veta,state->veta);
+
for(i=start; i<end; i++) {
for(m=0; m<DIM; m++) {
/* Re-reverse the velocities */
"WARNING: using dispersion correction with user tables\n");
rc3 = fr->rvdw*fr->rvdw*fr->rvdw;
rc9 = rc3*rc3*rc3;
+ /* Contribution beyond the cut-off */
eners[0] += -4.0*M_PI/(3.0*rc3);
eners[1] += 4.0*M_PI/(9.0*rc9);
+ if (fr->cutoff_scheme == ecutsVERLET && fr->ic->sh_invrc6 != 0) {
+ /* Contribution within the cut-off */
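+ /* (the constant potential shift integrated over the sphere r < rc) */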
+ eners[0] += -4.0*M_PI/(3.0*rc3);
+ eners[1] += 4.0*M_PI/(3.0*rc9);
+ }
+ /* Contribution beyond the cut-off */
virs[0] += 8.0*M_PI/rc3;
virs[1] += -16.0*M_PI/(3.0*rc9);
} else {
t_inputrec *inputrec,
t_nrnb nrnb[],gmx_wallcycle_t wcycle,
gmx_runtime_t *runtime,
+ wallclock_gpu_t *gputimes,
+ int omp_nth_pp,
gmx_bool bWriteStat)
{
- int i,j;
- t_nrnb *nrnb_tot=NULL;
- real delta_t;
- double nbfs,mflop;
- double cycles[ewcNR];
+ int i,j;
+ t_nrnb *nrnb_tot=NULL;
+ real delta_t;
+ double nbfs,mflop;
- wallcycle_sum(cr,wcycle,cycles);
+ wallcycle_sum(cr,wcycle);
- if (cr->nnodes > 1) {
- if (SIMMASTER(cr))
- snew(nrnb_tot,1);
+ if (cr->nnodes > 1)
+ {
+ snew(nrnb_tot,1);
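+ /* Sum the flop counts over all ranks; with MPI_Allreduce every
+ * rank obtains the totals.
+ */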
#ifdef GMX_MPI
- MPI_Reduce(nrnb->n,nrnb_tot->n,eNRNB,MPI_DOUBLE,MPI_SUM,
- MASTERRANK(cr),cr->mpi_comm_mysim);
+ MPI_Allreduce(nrnb->n,nrnb_tot->n,eNRNB,MPI_DOUBLE,MPI_SUM,
+ cr->mpi_comm_mysim);
#endif
- } else {
- nrnb_tot = nrnb;
- }
+ }
+ else
+ {
+ nrnb_tot = nrnb;
+ }
- if (SIMMASTER(cr)) {
- print_flop(fplog,nrnb_tot,&nbfs,&mflop);
- if (cr->nnodes > 1) {
- sfree(nrnb_tot);
+#if defined(GMX_MPI) && !defined(GMX_THREAD_MPI)
+ if (cr->nnodes > 1)
+ {
+ /* reduce nodetime over all MPI processes in the current simulation */
+ double sum;
+ MPI_Allreduce(&runtime->proctime,&sum,1,MPI_DOUBLE,MPI_SUM,
+ cr->mpi_comm_mysim);
+ runtime->proctime = sum;
}
- }
+#endif
- if ((cr->duty & DUTY_PP) && DOMAINDECOMP(cr)) {
- print_dd_statistics(cr,inputrec,fplog);
- }
+ if (SIMMASTER(cr))
+ {
+ print_flop(fplog,nrnb_tot,&nbfs,&mflop);
+ }
+ if (cr->nnodes > 1)
+ {
+ sfree(nrnb_tot);
+ }
+
+ if ((cr->duty & DUTY_PP) && DOMAINDECOMP(cr))
+ {
+ print_dd_statistics(cr,inputrec,fplog);
+ }
#ifdef GMX_MPI
if (PARTDECOMP(cr))
}
#endif
- if (SIMMASTER(cr)) {
- wallcycle_print(fplog,cr->nnodes,cr->npmenodes,runtime->realtime,
- wcycle,cycles);
-
- if (EI_DYNAMICS(inputrec->eI)) {
- delta_t = inputrec->delta_t;
- } else {
- delta_t = 0;
- }
+ if (SIMMASTER(cr))
+ {
+ wallcycle_print(fplog,cr->nnodes,cr->npmenodes,runtime->realtime,
+ wcycle,gputimes);
- if (fplog) {
- print_perf(fplog,runtime->proctime,runtime->realtime,
- cr->nnodes-cr->npmenodes,
- runtime->nsteps_done,delta_t,nbfs,mflop);
- }
- if (bWriteStat) {
- print_perf(stderr,runtime->proctime,runtime->realtime,
- cr->nnodes-cr->npmenodes,
- runtime->nsteps_done,delta_t,nbfs,mflop);
- }
+ if (EI_DYNAMICS(inputrec->eI))
+ {
+ delta_t = inputrec->delta_t;
+ }
+ else
+ {
+ delta_t = 0;
+ }
- /*
- runtime=inputrec->nsteps*inputrec->delta_t;
- if (bWriteStat) {
- if (cr->nnodes == 1)
- fprintf(stderr,"\n\n");
- print_perf(stderr,nodetime,realtime,runtime,&ntot,
- cr->nnodes-cr->npmenodes,FALSE);
+ if (fplog)
+ {
+ print_perf(fplog,runtime->proctime,runtime->realtime,
+ cr->nnodes-cr->npmenodes,
+ runtime->nsteps_done,delta_t,nbfs,mflop,
+ omp_nth_pp);
+ }
+ if (bWriteStat)
+ {
+ print_perf(stderr,runtime->proctime,runtime->realtime,
+ cr->nnodes-cr->npmenodes,
+ runtime->nsteps_done,delta_t,nbfs,mflop,
+ omp_nth_pp);
+ }
}
- wallcycle_print(fplog,cr->nnodes,cr->npmenodes,realtime,wcycle,cycles);
- print_perf(fplog,nodetime,realtime,runtime,&ntot,cr->nnodes-cr->npmenodes,
- TRUE);
- if (PARTDECOMP(cr))
- pr_load(fplog,cr,nrnb_all);
- if (cr->nnodes > 1)
- sfree(nrnb_all);
- */
- }
}
extern void initialize_lambdas(FILE *fplog,t_inputrec *ir,int *fep_state,real *lambda,double *lam0)
debug_gmx();
}
-
-
-