Merge branch 'release-4-6'
[alexxy/gromacs.git] / src / gromacs / mdlib / sim_util.c
1 /* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
2  *
3  *
4  *                This source code is part of
5  *
6  *                 G   R   O   M   A   C   S
7  *
8  *          GROningen MAchine for Chemical Simulations
9  *
10  *                        VERSION 3.2.0
11  * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
12  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
13  * Copyright (c) 2001-2004, The GROMACS development team,
14  * check out http://www.gromacs.org for more information.
15
16  * This program is free software; you can redistribute it and/or
17  * modify it under the terms of the GNU General Public License
18  * as published by the Free Software Foundation; either version 2
19  * of the License, or (at your option) any later version.
20  *
21  * If you want to redistribute modifications, please consider that
22  * scientific software is very special. Version control is crucial -
23  * bugs must be traceable. We will be happy to consider code for
24  * inclusion in the official distribution, but derived work must not
25  * be called official GROMACS. Details are found in the README & COPYING
26  * files - if they are missing, get the official version at www.gromacs.org.
27  *
28  * To help us fund GROMACS development, we humbly ask that you cite
29  * the papers on the package - you can find them in the top README file.
30  *
31  * For more info, check our website at http://www.gromacs.org
32  *
33  * And Hey:
34  * GROwing Monsters And Cloning Shrimps
35  */
36 #ifdef HAVE_CONFIG_H
37 #include <config.h>
38 #endif
39
40 #include <stdio.h>
41 #ifdef HAVE_SYS_TIME_H
42 #include <sys/time.h>
43 #endif
44 #include <math.h>
45 #include "typedefs.h"
46 #include "string2.h"
47 #include "smalloc.h"
48 #include "names.h"
49 #include "mvdata.h"
50 #include "txtdump.h"
51 #include "pbc.h"
52 #include "chargegroup.h"
53 #include "vec.h"
54 #include "nrnb.h"
55 #include "mshift.h"
56 #include "mdrun.h"
57 #include "sim_util.h"
58 #include "update.h"
59 #include "physics.h"
60 #include "main.h"
61 #include "mdatoms.h"
62 #include "force.h"
63 #include "bondf.h"
64 #include "pme.h"
65 #include "disre.h"
66 #include "orires.h"
67 #include "network.h"
68 #include "calcmu.h"
69 #include "constr.h"
70 #include "xvgr.h"
71 #include "copyrite.h"
72 #include "pull_rotation.h"
73 #include "gmx_random.h"
74 #include "domdec.h"
75 #include "partdec.h"
76 #include "genborn.h"
77 #include "nbnxn_atomdata.h"
78 #include "nbnxn_search.h"
79 #include "nbnxn_kernels/nbnxn_kernel_ref.h"
80 #include "nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn.h"
81 #include "nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn.h"
82 #include "nbnxn_kernels/nbnxn_kernel_gpu_ref.h"
83
84 #include "gromacs/timing/wallcycle.h"
85 #include "gromacs/timing/walltime_accounting.h"
86 #include "gromacs/utility/gmxmpi.h"
87
88 #include "adress.h"
89 #include "qmmm.h"
90
91 #include "nbnxn_cuda_data_mgmt.h"
92 #include "nbnxn_cuda/nbnxn_cuda.h"
93
94 void print_time(FILE                     *out,
95                 gmx_walltime_accounting_t walltime_accounting,
96                 gmx_large_int_t           step,
97                 t_inputrec               *ir,
98                 t_commrec gmx_unused     *cr)
99 {
100     time_t finish;
101     char   timebuf[STRLEN];
102     double dt, elapsed_seconds, time_per_step;
103     char   buf[48];
104
105 #ifndef GMX_THREAD_MPI
106     if (!PAR(cr))
107 #endif
108     {
109         fprintf(out, "\r");
110     }
111     fprintf(out, "step %s", gmx_step_str(step, buf));
112     if ((step >= ir->nstlist))
113     {
114         double seconds_since_epoch = gmx_gettime();
115         elapsed_seconds = seconds_since_epoch - walltime_accounting_get_start_time_stamp(walltime_accounting);
116         time_per_step   = elapsed_seconds/(step - ir->init_step + 1);
117         dt              = (ir->nsteps + ir->init_step - step) * time_per_step;
118
119         if (ir->nsteps >= 0)
120         {
121             if (dt >= 300)
122             {
123                 finish = (time_t) (seconds_since_epoch + dt);
124                 gmx_ctime_r(&finish, timebuf, STRLEN);
125                 sprintf(buf, "%s", timebuf);
126                 buf[strlen(buf)-1] = '\0';
127                 fprintf(out, ", will finish %s", buf);
128             }
129             else
130             {
131                 fprintf(out, ", remaining wall clock time: %5d s          ", (int)dt);
132             }
133         }
134         else
135         {
136             fprintf(out, " performance: %.1f ns/day    ",
137                     ir->delta_t/1000*24*60*60/time_per_step);
138         }
139     }
140 #ifndef GMX_THREAD_MPI
141     if (PAR(cr))
142     {
143         fprintf(out, "\n");
144     }
145 #endif
146
147     fflush(out);
148 }
149
150 void print_date_and_time(FILE *fplog, int nodeid, const char *title,
151                          const gmx_walltime_accounting_t walltime_accounting)
152 {
153     int    i;
154     char   timebuf[STRLEN];
155     char   time_string[STRLEN];
156     time_t tmptime;
157
158     if (fplog)
159     {
160         if (walltime_accounting != NULL)
161         {
162             tmptime = (time_t) walltime_accounting_get_start_time_stamp(walltime_accounting);
163             gmx_ctime_r(&tmptime, timebuf, STRLEN);
164         }
165         else
166         {
167             tmptime = (time_t) gmx_gettime();
168             gmx_ctime_r(&tmptime, timebuf, STRLEN);
169         }
170         for (i = 0; timebuf[i] >= ' '; i++)
171         {
172             time_string[i] = timebuf[i];
173         }
174         time_string[i] = '\0';
175
176         fprintf(fplog, "%s on node %d %s\n", title, nodeid, time_string);
177     }
178 }
179
180 static void sum_forces(int start, int end, rvec f[], rvec flr[])
181 {
182     int i;
183
184     if (gmx_debug_at)
185     {
186         pr_rvecs(debug, 0, "fsr", f+start, end-start);
187         pr_rvecs(debug, 0, "flr", flr+start, end-start);
188     }
189     for (i = start; (i < end); i++)
190     {
191         rvec_inc(f[i], flr[i]);
192     }
193 }
194
195 /*
196  * calc_f_el calculates forces due to an electric field.
197  *
198  * force is kJ mol^-1 nm^-1 = e * kJ mol^-1 nm^-1 / e
199  *
200  * Et[] contains the parameters for the time dependent
201  * part of the field (not yet used).
202  * Ex[] contains the parameters for
203  * the spatial dependent part of the field. You can have cool periodic
204  * fields in principle, but only a constant field is supported
205  * now.
206  * The function should return the energy due to the electric field
207  * (if any) but for now returns 0.
208  *
209  * WARNING:
210  * There can be problems with the virial.
211  * Since the field is not self-consistent this is unavoidable.
212  * For neutral molecules the virial is correct within this approximation.
213  * For neutral systems with many charged molecules the error is small.
214  * But for systems with a net charge or a few charged molecules
215  * the error can be significant when the field is high.
216  * Solution: implement a self-consitent electric field into PME.
217  */
218 static void calc_f_el(FILE *fp, int  start, int homenr,
219                       real charge[], rvec f[],
220                       t_cosines Ex[], t_cosines Et[], double t)
221 {
222     rvec Ext;
223     real t0;
224     int  i, m;
225
226     for (m = 0; (m < DIM); m++)
227     {
228         if (Et[m].n > 0)
229         {
230             if (Et[m].n == 3)
231             {
232                 t0     = Et[m].a[1];
233                 Ext[m] = cos(Et[m].a[0]*(t-t0))*exp(-sqr(t-t0)/(2.0*sqr(Et[m].a[2])));
234             }
235             else
236             {
237                 Ext[m] = cos(Et[m].a[0]*t);
238             }
239         }
240         else
241         {
242             Ext[m] = 1.0;
243         }
244         if (Ex[m].n > 0)
245         {
246             /* Convert the field strength from V/nm to MD-units */
247             Ext[m] *= Ex[m].a[0]*FIELDFAC;
248             for (i = start; (i < start+homenr); i++)
249             {
250                 f[i][m] += charge[i]*Ext[m];
251             }
252         }
253         else
254         {
255             Ext[m] = 0;
256         }
257     }
258     if (fp != NULL)
259     {
260         fprintf(fp, "%10g  %10g  %10g  %10g #FIELD\n", t,
261                 Ext[XX]/FIELDFAC, Ext[YY]/FIELDFAC, Ext[ZZ]/FIELDFAC);
262     }
263 }
264
265 static void calc_virial(int start, int homenr, rvec x[], rvec f[],
266                         tensor vir_part, t_graph *graph, matrix box,
267                         t_nrnb *nrnb, const t_forcerec *fr, int ePBC)
268 {
269     int    i, j;
270     tensor virtest;
271
272     /* The short-range virial from surrounding boxes */
273     clear_mat(vir_part);
274     calc_vir(SHIFTS, fr->shift_vec, fr->fshift, vir_part, ePBC == epbcSCREW, box);
275     inc_nrnb(nrnb, eNR_VIRIAL, SHIFTS);
276
277     /* Calculate partial virial, for local atoms only, based on short range.
278      * Total virial is computed in global_stat, called from do_md
279      */
280     f_calc_vir(start, start+homenr, x, f, vir_part, graph, box);
281     inc_nrnb(nrnb, eNR_VIRIAL, homenr);
282
283     /* Add position restraint contribution */
284     for (i = 0; i < DIM; i++)
285     {
286         vir_part[i][i] += fr->vir_diag_posres[i];
287     }
288
289     /* Add wall contribution */
290     for (i = 0; i < DIM; i++)
291     {
292         vir_part[i][ZZ] += fr->vir_wall_z[i];
293     }
294
295     if (debug)
296     {
297         pr_rvecs(debug, 0, "vir_part", vir_part, DIM);
298     }
299 }
300
301 static void posres_wrapper(FILE *fplog,
302                            int flags,
303                            gmx_bool bSepDVDL,
304                            t_inputrec *ir,
305                            t_nrnb *nrnb,
306                            gmx_localtop_t *top,
307                            matrix box, rvec x[],
308                            gmx_enerdata_t *enerd,
309                            real *lambda,
310                            t_forcerec *fr)
311 {
312     t_pbc pbc;
313     real  v, dvdl;
314     int   i;
315
316     /* Position restraints always require full pbc */
317     set_pbc(&pbc, ir->ePBC, box);
318     dvdl = 0;
319     v    = posres(top->idef.il[F_POSRES].nr, top->idef.il[F_POSRES].iatoms,
320                   top->idef.iparams_posres,
321                   (const rvec*)x, fr->f_novirsum, fr->vir_diag_posres,
322                   ir->ePBC == epbcNONE ? NULL : &pbc,
323                   lambda[efptRESTRAINT], &dvdl,
324                   fr->rc_scaling, fr->ePBC, fr->posres_com, fr->posres_comB);
325     if (bSepDVDL)
326     {
327         gmx_print_sepdvdl(fplog, interaction_function[F_POSRES].longname, v, dvdl);
328     }
329     enerd->term[F_POSRES] += v;
330     /* If just the force constant changes, the FEP term is linear,
331      * but if k changes, it is not.
332      */
333     enerd->dvdl_nonlin[efptRESTRAINT] += dvdl;
334     inc_nrnb(nrnb, eNR_POSRES, top->idef.il[F_POSRES].nr/2);
335
336     if ((ir->fepvals->n_lambda > 0) && (flags & GMX_FORCE_DHDL))
337     {
338         for (i = 0; i < enerd->n_lambda; i++)
339         {
340             real dvdl_dum, lambda_dum;
341
342             lambda_dum = (i == 0 ? lambda[efptRESTRAINT] : ir->fepvals->all_lambda[efptRESTRAINT][i-1]);
343             v          = posres(top->idef.il[F_POSRES].nr, top->idef.il[F_POSRES].iatoms,
344                                 top->idef.iparams_posres,
345                                 (const rvec*)x, NULL, NULL,
346                                 ir->ePBC == epbcNONE ? NULL : &pbc, lambda_dum, &dvdl,
347                                 fr->rc_scaling, fr->ePBC, fr->posres_com, fr->posres_comB);
348             enerd->enerpart_lambda[i] += v;
349         }
350     }
351 }
352
353 static void pull_potential_wrapper(FILE *fplog,
354                                    gmx_bool bSepDVDL,
355                                    t_commrec *cr,
356                                    t_inputrec *ir,
357                                    matrix box, rvec x[],
358                                    rvec f[],
359                                    tensor vir_force,
360                                    t_mdatoms *mdatoms,
361                                    gmx_enerdata_t *enerd,
362                                    real *lambda,
363                                    double t)
364 {
365     t_pbc  pbc;
366     real   dvdl;
367
368     /* Calculate the center of mass forces, this requires communication,
369      * which is why pull_potential is called close to other communication.
370      * The virial contribution is calculated directly,
371      * which is why we call pull_potential after calc_virial.
372      */
373     set_pbc(&pbc, ir->ePBC, box);
374     dvdl                     = 0;
375     enerd->term[F_COM_PULL] +=
376         pull_potential(ir->ePull, ir->pull, mdatoms, &pbc,
377                        cr, t, lambda[efptRESTRAINT], x, f, vir_force, &dvdl);
378     if (bSepDVDL)
379     {
380         gmx_print_sepdvdl(fplog, "Com pull", enerd->term[F_COM_PULL], dvdl);
381     }
382     enerd->dvdl_lin[efptRESTRAINT] += dvdl;
383 }
384
385 static void pme_receive_force_ener(FILE           *fplog,
386                                    gmx_bool        bSepDVDL,
387                                    t_commrec      *cr,
388                                    gmx_wallcycle_t wcycle,
389                                    gmx_enerdata_t *enerd,
390                                    t_forcerec     *fr)
391 {
392     real   e, v, dvdl;
393     float  cycles_ppdpme, cycles_seppme;
394
395     cycles_ppdpme = wallcycle_stop(wcycle, ewcPPDURINGPME);
396     dd_cycles_add(cr->dd, cycles_ppdpme, ddCyclPPduringPME);
397
398     /* In case of node-splitting, the PP nodes receive the long-range
399      * forces, virial and energy from the PME nodes here.
400      */
401     wallcycle_start(wcycle, ewcPP_PMEWAITRECVF);
402     dvdl = 0;
403     gmx_pme_receive_f(cr, fr->f_novirsum, fr->vir_el_recip, &e, &dvdl,
404                       &cycles_seppme);
405     if (bSepDVDL)
406     {
407         gmx_print_sepdvdl(fplog, "PME mesh", e, dvdl);
408     }
409     enerd->term[F_COUL_RECIP] += e;
410     enerd->dvdl_lin[efptCOUL] += dvdl;
411     if (wcycle)
412     {
413         dd_cycles_add(cr->dd, cycles_seppme, ddCyclPME);
414     }
415     wallcycle_stop(wcycle, ewcPP_PMEWAITRECVF);
416 }
417
418 static void print_large_forces(FILE *fp, t_mdatoms *md, t_commrec *cr,
419                                gmx_large_int_t step, real pforce, rvec *x, rvec *f)
420 {
421     int  i;
422     real pf2, fn2;
423     char buf[STEPSTRSIZE];
424
425     pf2 = sqr(pforce);
426     for (i = md->start; i < md->start+md->homenr; i++)
427     {
428         fn2 = norm2(f[i]);
429         /* We also catch NAN, if the compiler does not optimize this away. */
430         if (fn2 >= pf2 || fn2 != fn2)
431         {
432             fprintf(fp, "step %s  atom %6d  x %8.3f %8.3f %8.3f  force %12.5e\n",
433                     gmx_step_str(step, buf),
434                     ddglatnr(cr->dd, i), x[i][XX], x[i][YY], x[i][ZZ], sqrt(fn2));
435         }
436     }
437 }
438
439 static void post_process_forces(t_commrec *cr,
440                                 gmx_large_int_t step,
441                                 t_nrnb *nrnb, gmx_wallcycle_t wcycle,
442                                 gmx_localtop_t *top,
443                                 matrix box, rvec x[],
444                                 rvec f[],
445                                 tensor vir_force,
446                                 t_mdatoms *mdatoms,
447                                 t_graph *graph,
448                                 t_forcerec *fr, gmx_vsite_t *vsite,
449                                 int flags)
450 {
451     if (fr->bF_NoVirSum)
452     {
453         if (vsite)
454         {
455             /* Spread the mesh force on virtual sites to the other particles...
456              * This is parallellized. MPI communication is performed
457              * if the constructing atoms aren't local.
458              */
459             wallcycle_start(wcycle, ewcVSITESPREAD);
460             spread_vsite_f(vsite, x, fr->f_novirsum, NULL,
461                            (flags & GMX_FORCE_VIRIAL), fr->vir_el_recip,
462                            nrnb,
463                            &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr);
464             wallcycle_stop(wcycle, ewcVSITESPREAD);
465         }
466         if (flags & GMX_FORCE_VIRIAL)
467         {
468             /* Now add the forces, this is local */
469             if (fr->bDomDec)
470             {
471                 sum_forces(0, fr->f_novirsum_n, f, fr->f_novirsum);
472             }
473             else
474             {
475                 sum_forces(mdatoms->start, mdatoms->start+mdatoms->homenr,
476                            f, fr->f_novirsum);
477             }
478             if (EEL_FULL(fr->eeltype))
479             {
480                 /* Add the mesh contribution to the virial */
481                 m_add(vir_force, fr->vir_el_recip, vir_force);
482             }
483             if (debug)
484             {
485                 pr_rvecs(debug, 0, "vir_force", vir_force, DIM);
486             }
487         }
488     }
489
490     if (fr->print_force >= 0)
491     {
492         print_large_forces(stderr, mdatoms, cr, step, fr->print_force, x, f);
493     }
494 }
495
496 static void do_nb_verlet(t_forcerec *fr,
497                          interaction_const_t *ic,
498                          gmx_enerdata_t *enerd,
499                          int flags, int ilocality,
500                          int clearF,
501                          t_nrnb *nrnb,
502                          gmx_wallcycle_t wcycle)
503 {
504     int                        nnbl, kernel_type, enr_nbnxn_kernel_ljc, enr_nbnxn_kernel_lj;
505     char                      *env;
506     nonbonded_verlet_group_t  *nbvg;
507     gmx_bool                   bCUDA;
508
509     if (!(flags & GMX_FORCE_NONBONDED))
510     {
511         /* skip non-bonded calculation */
512         return;
513     }
514
515     nbvg = &fr->nbv->grp[ilocality];
516
517     /* CUDA kernel launch overhead is already timed separately */
518     if (fr->cutoff_scheme != ecutsVERLET)
519     {
520         gmx_incons("Invalid cut-off scheme passed!");
521     }
522
523     bCUDA = (nbvg->kernel_type == nbnxnk8x8x8_CUDA);
524
525     if (!bCUDA)
526     {
527         wallcycle_sub_start(wcycle, ewcsNONBONDED);
528     }
529     switch (nbvg->kernel_type)
530     {
531         case nbnxnk4x4_PlainC:
532             nbnxn_kernel_ref(&nbvg->nbl_lists,
533                              nbvg->nbat, ic,
534                              fr->shift_vec,
535                              flags,
536                              clearF,
537                              fr->fshift[0],
538                              enerd->grpp.ener[egCOULSR],
539                              fr->bBHAM ?
540                              enerd->grpp.ener[egBHAMSR] :
541                              enerd->grpp.ener[egLJSR]);
542             break;
543
544         case nbnxnk4xN_SIMD_4xN:
545             nbnxn_kernel_simd_4xn(&nbvg->nbl_lists,
546                                   nbvg->nbat, ic,
547                                   nbvg->ewald_excl,
548                                   fr->shift_vec,
549                                   flags,
550                                   clearF,
551                                   fr->fshift[0],
552                                   enerd->grpp.ener[egCOULSR],
553                                   fr->bBHAM ?
554                                   enerd->grpp.ener[egBHAMSR] :
555                                   enerd->grpp.ener[egLJSR]);
556             break;
557         case nbnxnk4xN_SIMD_2xNN:
558             nbnxn_kernel_simd_2xnn(&nbvg->nbl_lists,
559                                    nbvg->nbat, ic,
560                                    nbvg->ewald_excl,
561                                    fr->shift_vec,
562                                    flags,
563                                    clearF,
564                                    fr->fshift[0],
565                                    enerd->grpp.ener[egCOULSR],
566                                    fr->bBHAM ?
567                                    enerd->grpp.ener[egBHAMSR] :
568                                    enerd->grpp.ener[egLJSR]);
569             break;
570
571         case nbnxnk8x8x8_CUDA:
572             nbnxn_cuda_launch_kernel(fr->nbv->cu_nbv, nbvg->nbat, flags, ilocality);
573             break;
574
575         case nbnxnk8x8x8_PlainC:
576             nbnxn_kernel_gpu_ref(nbvg->nbl_lists.nbl[0],
577                                  nbvg->nbat, ic,
578                                  fr->shift_vec,
579                                  flags,
580                                  clearF,
581                                  nbvg->nbat->out[0].f,
582                                  fr->fshift[0],
583                                  enerd->grpp.ener[egCOULSR],
584                                  fr->bBHAM ?
585                                  enerd->grpp.ener[egBHAMSR] :
586                                  enerd->grpp.ener[egLJSR]);
587             break;
588
589         default:
590             gmx_incons("Invalid nonbonded kernel type passed!");
591
592     }
593     if (!bCUDA)
594     {
595         wallcycle_sub_stop(wcycle, ewcsNONBONDED);
596     }
597
598     if (EEL_RF(ic->eeltype) || ic->eeltype == eelCUT)
599     {
600         enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_RF;
601     }
602     else if ((!bCUDA && nbvg->ewald_excl == ewaldexclAnalytical) ||
603              (bCUDA && nbnxn_cuda_is_kernel_ewald_analytical(fr->nbv->cu_nbv)))
604     {
605         enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_EWALD;
606     }
607     else
608     {
609         enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_TAB;
610     }
611     enr_nbnxn_kernel_lj = eNR_NBNXN_LJ;
612     if (flags & GMX_FORCE_ENERGY)
613     {
614         /* In eNR_??? the nbnxn F+E kernels are always the F kernel + 1 */
615         enr_nbnxn_kernel_ljc += 1;
616         enr_nbnxn_kernel_lj  += 1;
617     }
618
619     inc_nrnb(nrnb, enr_nbnxn_kernel_ljc,
620              nbvg->nbl_lists.natpair_ljq);
621     inc_nrnb(nrnb, enr_nbnxn_kernel_lj,
622              nbvg->nbl_lists.natpair_lj);
623     inc_nrnb(nrnb, enr_nbnxn_kernel_ljc-eNR_NBNXN_LJ_RF+eNR_NBNXN_RF,
624              nbvg->nbl_lists.natpair_q);
625 }
626
627 void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
628                          t_inputrec *inputrec,
629                          gmx_large_int_t step, t_nrnb *nrnb, gmx_wallcycle_t wcycle,
630                          gmx_localtop_t *top,
631                          gmx_groups_t gmx_unused *groups,
632                          matrix box, rvec x[], history_t *hist,
633                          rvec f[],
634                          tensor vir_force,
635                          t_mdatoms *mdatoms,
636                          gmx_enerdata_t *enerd, t_fcdata *fcd,
637                          real *lambda, t_graph *graph,
638                          t_forcerec *fr, interaction_const_t *ic,
639                          gmx_vsite_t *vsite, rvec mu_tot,
640                          double t, FILE *field, gmx_edsam_t ed,
641                          gmx_bool bBornRadii,
642                          int flags)
643 {
644     int                 cg0, cg1, i, j;
645     int                 start, homenr;
646     int                 nb_kernel_type;
647     double              mu[2*DIM];
648     gmx_bool            bSepDVDL, bStateChanged, bNS, bFillGrid, bCalcCGCM, bBS;
649     gmx_bool            bDoLongRange, bDoForces, bSepLRF, bUseGPU, bUseOrEmulGPU;
650     gmx_bool            bDiffKernels = FALSE;
651     matrix              boxs;
652     rvec                vzero, box_diag;
653     real                e, v, dvdl;
654     float               cycles_pme, cycles_force, cycles_wait_gpu;
655     nonbonded_verlet_t *nbv;
656
657     cycles_force    = 0;
658     cycles_wait_gpu = 0;
659     nbv             = fr->nbv;
660     nb_kernel_type  = fr->nbv->grp[0].kernel_type;
661
662     start  = mdatoms->start;
663     homenr = mdatoms->homenr;
664
665     bSepDVDL = (fr->bSepDVDL && do_per_step(step, inputrec->nstlog));
666
667     clear_mat(vir_force);
668
669     cg0 = 0;
670     if (DOMAINDECOMP(cr))
671     {
672         cg1 = cr->dd->ncg_tot;
673     }
674     else
675     {
676         cg1 = top->cgs.nr;
677     }
678     if (fr->n_tpi > 0)
679     {
680         cg1--;
681     }
682
683     bStateChanged = (flags & GMX_FORCE_STATECHANGED);
684     bNS           = (flags & GMX_FORCE_NS) && (fr->bAllvsAll == FALSE);
685     bFillGrid     = (bNS && bStateChanged);
686     bCalcCGCM     = (bFillGrid && !DOMAINDECOMP(cr));
687     bDoLongRange  = (fr->bTwinRange && bNS && (flags & GMX_FORCE_DO_LR));
688     bDoForces     = (flags & GMX_FORCE_FORCES);
689     bSepLRF       = (bDoLongRange && bDoForces && (flags & GMX_FORCE_SEPLRF));
690     bUseGPU       = fr->nbv->bUseGPU;
691     bUseOrEmulGPU = bUseGPU || (nbv->grp[0].kernel_type == nbnxnk8x8x8_PlainC);
692
693     if (bStateChanged)
694     {
695         update_forcerec(fr, box);
696
697         if (NEED_MUTOT(*inputrec))
698         {
699             /* Calculate total (local) dipole moment in a temporary common array.
700              * This makes it possible to sum them over nodes faster.
701              */
702             calc_mu(start, homenr,
703                     x, mdatoms->chargeA, mdatoms->chargeB, mdatoms->nChargePerturbed,
704                     mu, mu+DIM);
705         }
706     }
707
708     if (fr->ePBC != epbcNONE)
709     {
710         /* Compute shift vectors every step,
711          * because of pressure coupling or box deformation!
712          */
713         if ((flags & GMX_FORCE_DYNAMICBOX) && bStateChanged)
714         {
715             calc_shifts(box, fr->shift_vec);
716         }
717
718         if (bCalcCGCM)
719         {
720             put_atoms_in_box_omp(fr->ePBC, box, homenr, x);
721             inc_nrnb(nrnb, eNR_SHIFTX, homenr);
722         }
723         else if (EI_ENERGY_MINIMIZATION(inputrec->eI) && graph)
724         {
725             unshift_self(graph, box, x);
726         }
727     }
728
729     nbnxn_atomdata_copy_shiftvec(flags & GMX_FORCE_DYNAMICBOX,
730                                  fr->shift_vec, nbv->grp[0].nbat);
731
732 #ifdef GMX_MPI
733     if (!(cr->duty & DUTY_PME))
734     {
735         /* Send particle coordinates to the pme nodes.
736          * Since this is only implemented for domain decomposition
737          * and domain decomposition does not use the graph,
738          * we do not need to worry about shifting.
739          */
740
741         wallcycle_start(wcycle, ewcPP_PMESENDX);
742
743         bBS = (inputrec->nwall == 2);
744         if (bBS)
745         {
746             copy_mat(box, boxs);
747             svmul(inputrec->wall_ewald_zfac, boxs[ZZ], boxs[ZZ]);
748         }
749
750         gmx_pme_send_x(cr, bBS ? boxs : box, x,
751                        mdatoms->nChargePerturbed, lambda[efptCOUL],
752                        (flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY)), step);
753
754         wallcycle_stop(wcycle, ewcPP_PMESENDX);
755     }
756 #endif /* GMX_MPI */
757
758     /* do gridding for pair search */
759     if (bNS)
760     {
761         if (graph && bStateChanged)
762         {
763             /* Calculate intramolecular shift vectors to make molecules whole */
764             mk_mshift(fplog, graph, fr->ePBC, box, x);
765         }
766
767         clear_rvec(vzero);
768         box_diag[XX] = box[XX][XX];
769         box_diag[YY] = box[YY][YY];
770         box_diag[ZZ] = box[ZZ][ZZ];
771
772         wallcycle_start(wcycle, ewcNS);
773         if (!fr->bDomDec)
774         {
775             wallcycle_sub_start(wcycle, ewcsNBS_GRID_LOCAL);
776             nbnxn_put_on_grid(nbv->nbs, fr->ePBC, box,
777                               0, vzero, box_diag,
778                               0, mdatoms->homenr, -1, fr->cginfo, x,
779                               0, NULL,
780                               nbv->grp[eintLocal].kernel_type,
781                               nbv->grp[eintLocal].nbat);
782             wallcycle_sub_stop(wcycle, ewcsNBS_GRID_LOCAL);
783         }
784         else
785         {
786             wallcycle_sub_start(wcycle, ewcsNBS_GRID_NONLOCAL);
787             nbnxn_put_on_grid_nonlocal(nbv->nbs, domdec_zones(cr->dd),
788                                        fr->cginfo, x,
789                                        nbv->grp[eintNonlocal].kernel_type,
790                                        nbv->grp[eintNonlocal].nbat);
791             wallcycle_sub_stop(wcycle, ewcsNBS_GRID_NONLOCAL);
792         }
793
794         if (nbv->ngrp == 1 ||
795             nbv->grp[eintNonlocal].nbat == nbv->grp[eintLocal].nbat)
796         {
797             nbnxn_atomdata_set(nbv->grp[eintLocal].nbat, eatAll,
798                                nbv->nbs, mdatoms, fr->cginfo);
799         }
800         else
801         {
802             nbnxn_atomdata_set(nbv->grp[eintLocal].nbat, eatLocal,
803                                nbv->nbs, mdatoms, fr->cginfo);
804             nbnxn_atomdata_set(nbv->grp[eintNonlocal].nbat, eatAll,
805                                nbv->nbs, mdatoms, fr->cginfo);
806         }
807         wallcycle_stop(wcycle, ewcNS);
808     }
809
810     /* initialize the GPU atom data and copy shift vector */
811     if (bUseGPU)
812     {
813         if (bNS)
814         {
815             wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU_NB);
816             nbnxn_cuda_init_atomdata(nbv->cu_nbv, nbv->grp[eintLocal].nbat);
817             wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
818         }
819
820         wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU_NB);
821         nbnxn_cuda_upload_shiftvec(nbv->cu_nbv, nbv->grp[eintLocal].nbat);
822         wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
823     }
824
825     /* do local pair search */
826     if (bNS)
827     {
828         wallcycle_start_nocount(wcycle, ewcNS);
829         wallcycle_sub_start(wcycle, ewcsNBS_SEARCH_LOCAL);
830         nbnxn_make_pairlist(nbv->nbs, nbv->grp[eintLocal].nbat,
831                             &top->excls,
832                             ic->rlist,
833                             nbv->min_ci_balanced,
834                             &nbv->grp[eintLocal].nbl_lists,
835                             eintLocal,
836                             nbv->grp[eintLocal].kernel_type,
837                             nrnb);
838         wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_LOCAL);
839
840         if (bUseGPU)
841         {
842             /* initialize local pair-list on the GPU */
843             nbnxn_cuda_init_pairlist(nbv->cu_nbv,
844                                      nbv->grp[eintLocal].nbl_lists.nbl[0],
845                                      eintLocal);
846         }
847         wallcycle_stop(wcycle, ewcNS);
848     }
849     else
850     {
851         wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
852         wallcycle_sub_start(wcycle, ewcsNB_X_BUF_OPS);
853         nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs, eatLocal, FALSE, x,
854                                         nbv->grp[eintLocal].nbat);
855         wallcycle_sub_stop(wcycle, ewcsNB_X_BUF_OPS);
856         wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
857     }
858
859     if (bUseGPU)
860     {
861         wallcycle_start(wcycle, ewcLAUNCH_GPU_NB);
862         /* launch local nonbonded F on GPU */
863         do_nb_verlet(fr, ic, enerd, flags, eintLocal, enbvClearFNo,
864                      nrnb, wcycle);
865         wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
866     }
867
868     /* Communicate coordinates and sum dipole if necessary +
869        do non-local pair search */
870     if (DOMAINDECOMP(cr))
871     {
872         bDiffKernels = (nbv->grp[eintNonlocal].kernel_type !=
873                         nbv->grp[eintLocal].kernel_type);
874
875         if (bDiffKernels)
876         {
877             /* With GPU+CPU non-bonded calculations we need to copy
878              * the local coordinates to the non-local nbat struct
879              * (in CPU format) as the non-local kernel call also
880              * calculates the local - non-local interactions.
881              */
882             wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
883             wallcycle_sub_start(wcycle, ewcsNB_X_BUF_OPS);
884             nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs, eatLocal, TRUE, x,
885                                             nbv->grp[eintNonlocal].nbat);
886             wallcycle_sub_stop(wcycle, ewcsNB_X_BUF_OPS);
887             wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
888         }
889
890         if (bNS)
891         {
892             wallcycle_start_nocount(wcycle, ewcNS);
893             wallcycle_sub_start(wcycle, ewcsNBS_SEARCH_NONLOCAL);
894
895             if (bDiffKernels)
896             {
897                 nbnxn_grid_add_simple(nbv->nbs, nbv->grp[eintNonlocal].nbat);
898             }
899
900             nbnxn_make_pairlist(nbv->nbs, nbv->grp[eintNonlocal].nbat,
901                                 &top->excls,
902                                 ic->rlist,
903                                 nbv->min_ci_balanced,
904                                 &nbv->grp[eintNonlocal].nbl_lists,
905                                 eintNonlocal,
906                                 nbv->grp[eintNonlocal].kernel_type,
907                                 nrnb);
908
909             wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_NONLOCAL);
910
911             if (nbv->grp[eintNonlocal].kernel_type == nbnxnk8x8x8_CUDA)
912             {
913                 /* initialize non-local pair-list on the GPU */
914                 nbnxn_cuda_init_pairlist(nbv->cu_nbv,
915                                          nbv->grp[eintNonlocal].nbl_lists.nbl[0],
916                                          eintNonlocal);
917             }
918             wallcycle_stop(wcycle, ewcNS);
919         }
920         else
921         {
922             wallcycle_start(wcycle, ewcMOVEX);
923             dd_move_x(cr->dd, box, x);
924
925             /* When we don't need the total dipole we sum it in global_stat */
926             if (bStateChanged && NEED_MUTOT(*inputrec))
927             {
928                 gmx_sumd(2*DIM, mu, cr);
929             }
930             wallcycle_stop(wcycle, ewcMOVEX);
931
932             wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
933             wallcycle_sub_start(wcycle, ewcsNB_X_BUF_OPS);
934             nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs, eatNonlocal, FALSE, x,
935                                             nbv->grp[eintNonlocal].nbat);
936             wallcycle_sub_stop(wcycle, ewcsNB_X_BUF_OPS);
937             cycles_force += wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
938         }
939
940         if (bUseGPU && !bDiffKernels)
941         {
942             wallcycle_start(wcycle, ewcLAUNCH_GPU_NB);
943             /* launch non-local nonbonded F on GPU */
944             do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFNo,
945                          nrnb, wcycle);
946             cycles_force += wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
947         }
948     }
949
950     if (bUseGPU)
951     {
952         /* launch D2H copy-back F */
953         wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU_NB);
954         if (DOMAINDECOMP(cr) && !bDiffKernels)
955         {
956             nbnxn_cuda_launch_cpyback(nbv->cu_nbv, nbv->grp[eintNonlocal].nbat,
957                                       flags, eatNonlocal);
958         }
959         nbnxn_cuda_launch_cpyback(nbv->cu_nbv, nbv->grp[eintLocal].nbat,
960                                   flags, eatLocal);
961         cycles_force += wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
962     }
963
964     if (bStateChanged && NEED_MUTOT(*inputrec))
965     {
966         if (PAR(cr))
967         {
968             gmx_sumd(2*DIM, mu, cr);
969         }
970
971         for (i = 0; i < 2; i++)
972         {
973             for (j = 0; j < DIM; j++)
974             {
975                 fr->mu_tot[i][j] = mu[i*DIM + j];
976             }
977         }
978     }
979     if (fr->efep == efepNO)
980     {
981         copy_rvec(fr->mu_tot[0], mu_tot);
982     }
983     else
984     {
985         for (j = 0; j < DIM; j++)
986         {
987             mu_tot[j] =
988                 (1.0 - lambda[efptCOUL])*fr->mu_tot[0][j] +
989                 lambda[efptCOUL]*fr->mu_tot[1][j];
990         }
991     }
992
993     /* Reset energies */
994     reset_enerdata(fr, bNS, enerd, MASTER(cr));
995     clear_rvecs(SHIFTS, fr->fshift);
996
997     if (DOMAINDECOMP(cr))
998     {
999         if (!(cr->duty & DUTY_PME))
1000         {
1001             wallcycle_start(wcycle, ewcPPDURINGPME);
1002             dd_force_flop_start(cr->dd, nrnb);
1003         }
1004     }
1005
1006     if (inputrec->bRot)
1007     {
1008         /* Enforced rotation has its own cycle counter that starts after the collective
1009          * coordinates have been communicated. It is added to ddCyclF to allow
1010          * for proper load-balancing */
1011         wallcycle_start(wcycle, ewcROT);
1012         do_rotation(cr, inputrec, box, x, t, step, wcycle, bNS);
1013         wallcycle_stop(wcycle, ewcROT);
1014     }
1015
1016     /* Start the force cycle counter.
1017      * This counter is stopped in do_forcelow_level.
1018      * No parallel communication should occur while this counter is running,
1019      * since that will interfere with the dynamic load balancing.
1020      */
1021     wallcycle_start(wcycle, ewcFORCE);
1022     if (bDoForces)
1023     {
1024         /* Reset forces for which the virial is calculated separately:
1025          * PME/Ewald forces if necessary */
1026         if (fr->bF_NoVirSum)
1027         {
1028             if (flags & GMX_FORCE_VIRIAL)
1029             {
1030                 fr->f_novirsum = fr->f_novirsum_alloc;
1031                 if (fr->bDomDec)
1032                 {
1033                     clear_rvecs(fr->f_novirsum_n, fr->f_novirsum);
1034                 }
1035                 else
1036                 {
1037                     clear_rvecs(homenr, fr->f_novirsum+start);
1038                 }
1039             }
1040             else
1041             {
1042                 /* We are not calculating the pressure so we do not need
1043                  * a separate array for forces that do not contribute
1044                  * to the pressure.
1045                  */
1046                 fr->f_novirsum = f;
1047             }
1048         }
1049
1050         /* Clear the short- and long-range forces */
1051         clear_rvecs(fr->natoms_force_constr, f);
1052         if (bSepLRF && do_per_step(step, inputrec->nstcalclr))
1053         {
1054             clear_rvecs(fr->natoms_force_constr, fr->f_twin);
1055         }
1056
1057         clear_rvec(fr->vir_diag_posres);
1058     }
1059
1060     if (inputrec->ePull == epullCONSTRAINT)
1061     {
1062         clear_pull_forces(inputrec->pull);
1063     }
1064
1065     /* We calculate the non-bonded forces, when done on the CPU, here.
1066      * We do this before calling do_force_lowlevel, as in there bondeds
1067      * forces are calculated before PME, which does communication.
1068      * With this order, non-bonded and bonded force calculation imbalance
1069      * can be balanced out by the domain decomposition load balancing.
1070      */
1071
1072     if (!bUseOrEmulGPU)
1073     {
1074         /* Maybe we should move this into do_force_lowlevel */
1075         do_nb_verlet(fr, ic, enerd, flags, eintLocal, enbvClearFYes,
1076                      nrnb, wcycle);
1077     }
1078
1079     if (!bUseOrEmulGPU || bDiffKernels)
1080     {
1081         int aloc;
1082
1083         if (DOMAINDECOMP(cr))
1084         {
1085             do_nb_verlet(fr, ic, enerd, flags, eintNonlocal,
1086                          bDiffKernels ? enbvClearFYes : enbvClearFNo,
1087                          nrnb, wcycle);
1088         }
1089
1090         if (!bUseOrEmulGPU)
1091         {
1092             aloc = eintLocal;
1093         }
1094         else
1095         {
1096             aloc = eintNonlocal;
1097         }
1098
1099         /* Add all the non-bonded force to the normal force array.
1100          * This can be split into a local a non-local part when overlapping
1101          * communication with calculation with domain decomposition.
1102          */
1103         cycles_force += wallcycle_stop(wcycle, ewcFORCE);
1104         wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
1105         wallcycle_sub_start(wcycle, ewcsNB_F_BUF_OPS);
1106         nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs, eatAll, nbv->grp[aloc].nbat, f);
1107         wallcycle_sub_stop(wcycle, ewcsNB_F_BUF_OPS);
1108         cycles_force += wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
1109         wallcycle_start_nocount(wcycle, ewcFORCE);
1110
1111         /* if there are multiple fshift output buffers reduce them */
1112         if ((flags & GMX_FORCE_VIRIAL) &&
1113             nbv->grp[aloc].nbl_lists.nnbl > 1)
1114         {
1115             nbnxn_atomdata_add_nbat_fshift_to_fshift(nbv->grp[aloc].nbat,
1116                                                      fr->fshift);
1117         }
1118     }
1119
1120     /* update QMMMrec, if necessary */
1121     if (fr->bQMMM)
1122     {
1123         update_QMMMrec(cr, fr, x, mdatoms, box, top);
1124     }
1125
1126     if ((flags & GMX_FORCE_BONDED) && top->idef.il[F_POSRES].nr > 0)
1127     {
1128         posres_wrapper(fplog, flags, bSepDVDL, inputrec, nrnb, top, box, x,
1129                        enerd, lambda, fr);
1130     }
1131
1132     /* Compute the bonded and non-bonded energies and optionally forces */
1133     do_force_lowlevel(fplog, step, fr, inputrec, &(top->idef),
1134                       cr, nrnb, wcycle, mdatoms,
1135                       x, hist, f, bSepLRF ? fr->f_twin : f, enerd, fcd, top, fr->born,
1136                       &(top->atomtypes), bBornRadii, box,
1137                       inputrec->fepvals, lambda, graph, &(top->excls), fr->mu_tot,
1138                       flags, &cycles_pme);
1139
1140     if (bSepLRF)
1141     {
1142         if (do_per_step(step, inputrec->nstcalclr))
1143         {
1144             /* Add the long range forces to the short range forces */
1145             for (i = 0; i < fr->natoms_force_constr; i++)
1146             {
1147                 rvec_add(fr->f_twin[i], f[i], f[i]);
1148             }
1149         }
1150     }
1151
1152     cycles_force += wallcycle_stop(wcycle, ewcFORCE);
1153
1154     if (ed)
1155     {
1156         do_flood(cr, inputrec, x, f, ed, box, step, bNS);
1157     }
1158
1159     if (bUseOrEmulGPU && !bDiffKernels)
1160     {
1161         /* wait for non-local forces (or calculate in emulation mode) */
1162         if (DOMAINDECOMP(cr))
1163         {
1164             if (bUseGPU)
1165             {
1166                 float cycles_tmp;
1167
1168                 wallcycle_start(wcycle, ewcWAIT_GPU_NB_NL);
1169                 nbnxn_cuda_wait_gpu(nbv->cu_nbv,
1170                                     nbv->grp[eintNonlocal].nbat,
1171                                     flags, eatNonlocal,
1172                                     enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
1173                                     fr->fshift);
1174                 cycles_tmp       = wallcycle_stop(wcycle, ewcWAIT_GPU_NB_NL);
1175                 cycles_wait_gpu += cycles_tmp;
1176                 cycles_force    += cycles_tmp;
1177             }
1178             else
1179             {
1180                 wallcycle_start_nocount(wcycle, ewcFORCE);
1181                 do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFYes,
1182                              nrnb, wcycle);
1183                 cycles_force += wallcycle_stop(wcycle, ewcFORCE);
1184             }
1185             wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
1186             wallcycle_sub_start(wcycle, ewcsNB_F_BUF_OPS);
1187             /* skip the reduction if there was no non-local work to do */
1188             if (nbv->grp[eintLocal].nbl_lists.nbl[0]->nsci > 0)
1189             {
1190                 nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs, eatNonlocal,
1191                                                nbv->grp[eintNonlocal].nbat, f);
1192             }
1193             wallcycle_sub_stop(wcycle, ewcsNB_F_BUF_OPS);
1194             cycles_force += wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
1195         }
1196     }
1197
1198     if (bDoForces)
1199     {
1200         /* Communicate the forces */
1201         if (PAR(cr))
1202         {
1203             wallcycle_start(wcycle, ewcMOVEF);
1204             if (DOMAINDECOMP(cr))
1205             {
1206                 dd_move_f(cr->dd, f, fr->fshift);
1207                 /* Do we need to communicate the separate force array
1208                  * for terms that do not contribute to the single sum virial?
1209                  * Position restraints and electric fields do not introduce
1210                  * inter-cg forces, only full electrostatics methods do.
1211                  * When we do not calculate the virial, fr->f_novirsum = f,
1212                  * so we have already communicated these forces.
1213                  */
1214                 if (EEL_FULL(fr->eeltype) && cr->dd->n_intercg_excl &&
1215                     (flags & GMX_FORCE_VIRIAL))
1216                 {
1217                     dd_move_f(cr->dd, fr->f_novirsum, NULL);
1218                 }
1219                 if (bSepLRF)
1220                 {
1221                     /* We should not update the shift forces here,
1222                      * since f_twin is already included in f.
1223                      */
1224                     dd_move_f(cr->dd, fr->f_twin, NULL);
1225                 }
1226             }
1227             wallcycle_stop(wcycle, ewcMOVEF);
1228         }
1229     }
1230
1231     if (bUseOrEmulGPU)
1232     {
1233         /* wait for local forces (or calculate in emulation mode) */
1234         if (bUseGPU)
1235         {
1236             wallcycle_start(wcycle, ewcWAIT_GPU_NB_L);
1237             nbnxn_cuda_wait_gpu(nbv->cu_nbv,
1238                                 nbv->grp[eintLocal].nbat,
1239                                 flags, eatLocal,
1240                                 enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
1241                                 fr->fshift);
1242             cycles_wait_gpu += wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
1243
1244             /* now clear the GPU outputs while we finish the step on the CPU */
1245
1246             wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU_NB);
1247             nbnxn_cuda_clear_outputs(nbv->cu_nbv, flags);
1248             wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
1249         }
1250         else
1251         {
1252             wallcycle_start_nocount(wcycle, ewcFORCE);
1253             do_nb_verlet(fr, ic, enerd, flags, eintLocal,
1254                          DOMAINDECOMP(cr) ? enbvClearFNo : enbvClearFYes,
1255                          nrnb, wcycle);
1256             wallcycle_stop(wcycle, ewcFORCE);
1257         }
1258         wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
1259         wallcycle_sub_start(wcycle, ewcsNB_F_BUF_OPS);
1260         if (nbv->grp[eintLocal].nbl_lists.nbl[0]->nsci > 0)
1261         {
1262             /* skip the reduction if there was no non-local work to do */
1263             nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs, eatLocal,
1264                                            nbv->grp[eintLocal].nbat, f);
1265         }
1266         wallcycle_sub_stop(wcycle, ewcsNB_F_BUF_OPS);
1267         wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
1268     }
1269
1270     if (DOMAINDECOMP(cr))
1271     {
1272         dd_force_flop_stop(cr->dd, nrnb);
1273         if (wcycle)
1274         {
1275             dd_cycles_add(cr->dd, cycles_force-cycles_pme, ddCyclF);
1276             if (bUseGPU)
1277             {
1278                 dd_cycles_add(cr->dd, cycles_wait_gpu, ddCyclWaitGPU);
1279             }
1280         }
1281     }
1282
1283     if (bDoForces)
1284     {
1285         if (IR_ELEC_FIELD(*inputrec))
1286         {
1287             /* Compute forces due to electric field */
1288             calc_f_el(MASTER(cr) ? field : NULL,
1289                       start, homenr, mdatoms->chargeA, fr->f_novirsum,
1290                       inputrec->ex, inputrec->et, t);
1291         }
1292
1293         /* If we have NoVirSum forces, but we do not calculate the virial,
1294          * we sum fr->f_novirum=f later.
1295          */
1296         if (vsite && !(fr->bF_NoVirSum && !(flags & GMX_FORCE_VIRIAL)))
1297         {
1298             wallcycle_start(wcycle, ewcVSITESPREAD);
1299             spread_vsite_f(vsite, x, f, fr->fshift, FALSE, NULL, nrnb,
1300                            &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr);
1301             wallcycle_stop(wcycle, ewcVSITESPREAD);
1302
1303             if (bSepLRF)
1304             {
1305                 wallcycle_start(wcycle, ewcVSITESPREAD);
1306                 spread_vsite_f(vsite, x, fr->f_twin, NULL, FALSE, NULL,
1307                                nrnb,
1308                                &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr);
1309                 wallcycle_stop(wcycle, ewcVSITESPREAD);
1310             }
1311         }
1312
1313         if (flags & GMX_FORCE_VIRIAL)
1314         {
1315             /* Calculation of the virial must be done after vsites! */
1316             calc_virial(mdatoms->start, mdatoms->homenr, x, f,
1317                         vir_force, graph, box, nrnb, fr, inputrec->ePBC);
1318         }
1319     }
1320
1321     if (inputrec->ePull == epullUMBRELLA || inputrec->ePull == epullCONST_F)
1322     {
1323         pull_potential_wrapper(fplog, bSepDVDL, cr, inputrec, box, x,
1324                                f, vir_force, mdatoms, enerd, lambda, t);
1325     }
1326
1327     /* Add the forces from enforced rotation potentials (if any) */
1328     if (inputrec->bRot)
1329     {
1330         wallcycle_start(wcycle, ewcROTadd);
1331         enerd->term[F_COM_PULL] += add_rot_forces(inputrec->rot, f, cr, step, t);
1332         wallcycle_stop(wcycle, ewcROTadd);
1333     }
1334
1335     if (PAR(cr) && !(cr->duty & DUTY_PME))
1336     {
1337         /* In case of node-splitting, the PP nodes receive the long-range
1338          * forces, virial and energy from the PME nodes here.
1339          */
1340         pme_receive_force_ener(fplog, bSepDVDL, cr, wcycle, enerd, fr);
1341     }
1342
1343     if (bDoForces)
1344     {
1345         post_process_forces(cr, step, nrnb, wcycle,
1346                             top, box, x, f, vir_force, mdatoms, graph, fr, vsite,
1347                             flags);
1348     }
1349
1350     /* Sum the potential energy terms from group contributions */
1351     sum_epot(&(enerd->grpp), enerd->term);
1352 }
1353
1354 void do_force_cutsGROUP(FILE *fplog, t_commrec *cr,
1355                         t_inputrec *inputrec,
1356                         gmx_large_int_t step, t_nrnb *nrnb, gmx_wallcycle_t wcycle,
1357                         gmx_localtop_t *top,
1358                         gmx_groups_t *groups,
1359                         matrix box, rvec x[], history_t *hist,
1360                         rvec f[],
1361                         tensor vir_force,
1362                         t_mdatoms *mdatoms,
1363                         gmx_enerdata_t *enerd, t_fcdata *fcd,
1364                         real *lambda, t_graph *graph,
1365                         t_forcerec *fr, gmx_vsite_t *vsite, rvec mu_tot,
1366                         double t, FILE *field, gmx_edsam_t ed,
1367                         gmx_bool bBornRadii,
1368                         int flags)
1369 {
1370     int        cg0, cg1, i, j;
1371     int        start, homenr;
1372     double     mu[2*DIM];
1373     gmx_bool   bSepDVDL, bStateChanged, bNS, bFillGrid, bCalcCGCM, bBS;
1374     gmx_bool   bDoLongRangeNS, bDoForces, bDoPotential, bSepLRF;
1375     gmx_bool   bDoAdressWF;
1376     matrix     boxs;
1377     rvec       vzero, box_diag;
1378     real       e, v, dvdlambda[efptNR];
1379     t_pbc      pbc;
1380     float      cycles_pme, cycles_force;
1381
1382     start  = mdatoms->start;
1383     homenr = mdatoms->homenr;
1384
1385     bSepDVDL = (fr->bSepDVDL && do_per_step(step, inputrec->nstlog));
1386
1387     clear_mat(vir_force);
1388
1389     if (PARTDECOMP(cr))
1390     {
1391         pd_cg_range(cr, &cg0, &cg1);
1392     }
1393     else
1394     {
1395         cg0 = 0;
1396         if (DOMAINDECOMP(cr))
1397         {
1398             cg1 = cr->dd->ncg_tot;
1399         }
1400         else
1401         {
1402             cg1 = top->cgs.nr;
1403         }
1404         if (fr->n_tpi > 0)
1405         {
1406             cg1--;
1407         }
1408     }
1409
1410     bStateChanged  = (flags & GMX_FORCE_STATECHANGED);
1411     bNS            = (flags & GMX_FORCE_NS) && (fr->bAllvsAll == FALSE);
1412     /* Should we update the long-range neighborlists at this step? */
1413     bDoLongRangeNS = fr->bTwinRange && bNS;
1414     /* Should we perform the long-range nonbonded evaluation inside the neighborsearching? */
1415     bFillGrid      = (bNS && bStateChanged);
1416     bCalcCGCM      = (bFillGrid && !DOMAINDECOMP(cr));
1417     bDoForces      = (flags & GMX_FORCE_FORCES);
1418     bDoPotential   = (flags & GMX_FORCE_ENERGY);
1419     bSepLRF        = ((inputrec->nstcalclr > 1) && bDoForces &&
1420                       (flags & GMX_FORCE_SEPLRF) && (flags & GMX_FORCE_DO_LR));
1421
1422     /* should probably move this to the forcerec since it doesn't change */
1423     bDoAdressWF   = ((fr->adress_type != eAdressOff));
1424
1425     if (bStateChanged)
1426     {
1427         update_forcerec(fr, box);
1428
1429         if (NEED_MUTOT(*inputrec))
1430         {
1431             /* Calculate total (local) dipole moment in a temporary common array.
1432              * This makes it possible to sum them over nodes faster.
1433              */
1434             calc_mu(start, homenr,
1435                     x, mdatoms->chargeA, mdatoms->chargeB, mdatoms->nChargePerturbed,
1436                     mu, mu+DIM);
1437         }
1438     }
1439
1440     if (fr->ePBC != epbcNONE)
1441     {
1442         /* Compute shift vectors every step,
1443          * because of pressure coupling or box deformation!
1444          */
1445         if ((flags & GMX_FORCE_DYNAMICBOX) && bStateChanged)
1446         {
1447             calc_shifts(box, fr->shift_vec);
1448         }
1449
1450         if (bCalcCGCM)
1451         {
1452             put_charge_groups_in_box(fplog, cg0, cg1, fr->ePBC, box,
1453                                      &(top->cgs), x, fr->cg_cm);
1454             inc_nrnb(nrnb, eNR_CGCM, homenr);
1455             inc_nrnb(nrnb, eNR_RESETX, cg1-cg0);
1456         }
1457         else if (EI_ENERGY_MINIMIZATION(inputrec->eI) && graph)
1458         {
1459             unshift_self(graph, box, x);
1460         }
1461     }
1462     else if (bCalcCGCM)
1463     {
1464         calc_cgcm(fplog, cg0, cg1, &(top->cgs), x, fr->cg_cm);
1465         inc_nrnb(nrnb, eNR_CGCM, homenr);
1466     }
1467
1468     if (bCalcCGCM)
1469     {
1470         if (PAR(cr))
1471         {
1472             move_cgcm(fplog, cr, fr->cg_cm);
1473         }
1474         if (gmx_debug_at)
1475         {
1476             pr_rvecs(debug, 0, "cgcm", fr->cg_cm, top->cgs.nr);
1477         }
1478     }
1479
1480 #ifdef GMX_MPI
1481     if (!(cr->duty & DUTY_PME))
1482     {
1483         /* Send particle coordinates to the pme nodes.
1484          * Since this is only implemented for domain decomposition
1485          * and domain decomposition does not use the graph,
1486          * we do not need to worry about shifting.
1487          */
1488
1489         wallcycle_start(wcycle, ewcPP_PMESENDX);
1490
1491         bBS = (inputrec->nwall == 2);
1492         if (bBS)
1493         {
1494             copy_mat(box, boxs);
1495             svmul(inputrec->wall_ewald_zfac, boxs[ZZ], boxs[ZZ]);
1496         }
1497
1498         gmx_pme_send_x(cr, bBS ? boxs : box, x,
1499                        mdatoms->nChargePerturbed, lambda[efptCOUL],
1500                        (flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY)), step);
1501
1502         wallcycle_stop(wcycle, ewcPP_PMESENDX);
1503     }
1504 #endif /* GMX_MPI */
1505
1506     /* Communicate coordinates and sum dipole if necessary */
1507     if (PAR(cr))
1508     {
1509         wallcycle_start(wcycle, ewcMOVEX);
1510         if (DOMAINDECOMP(cr))
1511         {
1512             dd_move_x(cr->dd, box, x);
1513         }
1514         else
1515         {
1516             move_x(cr, x, nrnb);
1517         }
1518         wallcycle_stop(wcycle, ewcMOVEX);
1519     }
1520
1521     /* Update the AdResS weighting function beforehand */
1522     if (bStateChanged && bDoAdressWF)
1523     {
1524         /* We need PBC for the AdResS weight calculation with pbc_dx */
1525         set_pbc(&pbc, inputrec->ePBC, box);
1526         if (fr->adress_site == eAdressSITEcog)
1527         {
1528             update_adress_weights_cog(top->idef.iparams, top->idef.il, x, fr, mdatoms,
1529                                       inputrec->ePBC == epbcNONE ? NULL : &pbc);
1530         }
1531         else if (fr->adress_site == eAdressSITEcom)
1532         {
1533             update_adress_weights_com(fplog, cg0, cg1, &(top->cgs), x, fr, mdatoms,
1534                                       inputrec->ePBC == epbcNONE ? NULL : &pbc);
1535         }
1536         else if (fr->adress_site == eAdressSITEatomatom)
1537         {
1538             update_adress_weights_atom_per_atom(cg0, cg1, &(top->cgs), x, fr, mdatoms,
1539                                                 inputrec->ePBC == epbcNONE ? NULL : &pbc);
1540         }
1541         else
1542         {
1543             update_adress_weights_atom(cg0, cg1, &(top->cgs), x, fr, mdatoms,
1544                                        inputrec->ePBC == epbcNONE ? NULL : &pbc);
1545         }
1546     }
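         /* The AdResS weight w is 1 in the explicit (atomistic) region, 0 in
          * the coarse-grained region and interpolates smoothly in the hybrid
          * zone; the mapping point it is evaluated at (COG, COM, or per atom)
          * is the choice made in the branches above.
          */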
1547
1548     if (NEED_MUTOT(*inputrec))
1549     {
1550
1551         if (bStateChanged)
1552         {
1553             if (PAR(cr))
1554             {
1555                 gmx_sumd(2*DIM, mu, cr);
1556             }
1557             for (i = 0; i < 2; i++)
1558             {
1559                 for (j = 0; j < DIM; j++)
1560                 {
1561                     fr->mu_tot[i][j] = mu[i*DIM + j];
1562                 }
1563             }
1564         }
1565         if (fr->efep == efepNO)
1566         {
1567             copy_rvec(fr->mu_tot[0], mu_tot);
1568         }
1569         else
1570         {
1571             for (j = 0; j < DIM; j++)
1572             {
1573                 mu_tot[j] =
1574                     (1.0 - lambda[efptCOUL])*fr->mu_tot[0][j] + lambda[efptCOUL]*fr->mu_tot[1][j];
1575             }
1576         }
1577     }
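         /* For free-energy runs the total dipole is interpolated linearly in
          * the coupling parameter: mu_tot = (1 - lambda)*mu_A + lambda*mu_B,
          * consistent with the linear interpolation of the charges themselves.
          */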
1578
1579     /* Reset energies */
1580     reset_enerdata(fr, bNS, enerd, MASTER(cr));
1581     clear_rvecs(SHIFTS, fr->fshift);
1582
1583     if (bNS)
1584     {
1585         wallcycle_start(wcycle, ewcNS);
1586
1587         if (graph && bStateChanged)
1588         {
1589             /* Calculate intramolecular shift vectors to make molecules whole */
1590             mk_mshift(fplog, graph, fr->ePBC, box, x);
1591         }
1592
1593         /* Do the actual neighbor searching */
1594         ns(fplog, fr, box,
1595            groups, top, mdatoms,
1596            cr, nrnb, bFillGrid,
1597            bDoLongRangeNS);
1598
1599         wallcycle_stop(wcycle, ewcNS);
1600     }
1601
1602     if (inputrec->implicit_solvent && bNS)
1603     {
1604         make_gb_nblist(cr, inputrec->gb_algorithm,
1605                        x, box, fr, &top->idef, graph, fr->born);
1606     }
1607
1608     if (DOMAINDECOMP(cr))
1609     {
1610         if (!(cr->duty & DUTY_PME))
1611         {
1612             wallcycle_start(wcycle, ewcPPDURINGPME);
1613             dd_force_flop_start(cr->dd, nrnb);
1614         }
1615     }
1616
1617     if (inputrec->bRot)
1618     {
1619         /* Enforced rotation has its own cycle counter that starts after the collective
1620          * coordinates have been communicated. It is added to ddCyclF to allow
1621          * for proper load-balancing */
1622         wallcycle_start(wcycle, ewcROT);
1623         do_rotation(cr, inputrec, box, x, t, step, wcycle, bNS);
1624         wallcycle_stop(wcycle, ewcROT);
1625     }
1626
1627     /* Start the force cycle counter.
1628      * This counter is stopped in do_force_lowlevel.
1629      * No parallel communication should occur while this counter is running,
1630      * since that will interfere with the dynamic load balancing.
1631      */
1632     wallcycle_start(wcycle, ewcFORCE);
1633
1634     if (bDoForces)
1635     {
1636         /* Reset forces for which the virial is calculated separately:
1637          * PME/Ewald forces if necessary */
1638         if (fr->bF_NoVirSum)
1639         {
1640             if (flags & GMX_FORCE_VIRIAL)
1641             {
1642                 fr->f_novirsum = fr->f_novirsum_alloc;
1643                 if (fr->bDomDec)
1644                 {
1645                     clear_rvecs(fr->f_novirsum_n, fr->f_novirsum);
1646                 }
1647                 else
1648                 {
1649                     clear_rvecs(homenr, fr->f_novirsum+start);
1650                 }
1651             }
1652             else
1653             {
1654                 /* We are not calculating the pressure so we do not need
1655                  * a separate array for forces that do not contribute
1656                  * to the pressure.
1657                  */
1658                 fr->f_novirsum = f;
1659             }
1660         }
1661
1662         /* Clear the short- and long-range forces */
1663         clear_rvecs(fr->natoms_force_constr, f);
1664         if (bSepLRF && do_per_step(step, inputrec->nstcalclr))
1665         {
1666             clear_rvecs(fr->natoms_force_constr, fr->f_twin);
1667         }
1668
1669         clear_rvec(fr->vir_diag_posres);
1670     }
1671     if (inputrec->ePull == epullCONSTRAINT)
1672     {
1673         clear_pull_forces(inputrec->pull);
1674     }
1675
1676     /* update QMMMrec, if necessary */
1677     if (fr->bQMMM)
1678     {
1679         update_QMMMrec(cr, fr, x, mdatoms, box, top);
1680     }
1681
1682     if ((flags & GMX_FORCE_BONDED) && top->idef.il[F_POSRES].nr > 0)
1683     {
1684         posres_wrapper(fplog, flags, bSepDVDL, inputrec, nrnb, top, box, x,
1685                        enerd, lambda, fr);
1686     }
1687
1688     if ((flags & GMX_FORCE_BONDED) && top->idef.il[F_FBPOSRES].nr > 0)
1689     {
1690         /* Flat-bottomed position restraints always require full pbc */
1691         if (!(bStateChanged && bDoAdressWF))
1692         {
1693             set_pbc(&pbc, inputrec->ePBC, box);
1694         }
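             /* For reference: in the simplest (spherical) geometry the
              * flat-bottomed restraint is V(d) = 0.5*k*(d - r_fb)^2 for
              * d = |x - x_ref| > r_fb and zero inside the flat bottom.
              */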
1695         v = fbposres(top->idef.il[F_FBPOSRES].nr, top->idef.il[F_FBPOSRES].iatoms,
1696                      top->idef.iparams_fbposres,
1697                      (const rvec*)x, fr->f_novirsum, fr->vir_diag_posres,
1698                      inputrec->ePBC == epbcNONE ? NULL : &pbc,
1699                      fr->rc_scaling, fr->ePBC, fr->posres_com);
1700         enerd->term[F_FBPOSRES] += v;
1701         inc_nrnb(nrnb, eNR_FBPOSRES, top->idef.il[F_FBPOSRES].nr/2);
1702     }
1703
1704     /* Compute the bonded and non-bonded energies and optionally forces */
1705     do_force_lowlevel(fplog, step, fr, inputrec, &(top->idef),
1706                       cr, nrnb, wcycle, mdatoms,
1707                       x, hist, f, bSepLRF ? fr->f_twin : f, enerd, fcd, top, fr->born,
1708                       &(top->atomtypes), bBornRadii, box,
1709                       inputrec->fepvals, lambda,
1710                       graph, &(top->excls), fr->mu_tot,
1711                       flags,
1712                       &cycles_pme);
1713
1714     if (bSepLRF)
1715     {
1716         if (do_per_step(step, inputrec->nstcalclr))
1717         {
1718             /* Add the long-range forces to the short-range forces */
1719             for (i = 0; i < fr->natoms_force_constr; i++)
1720             {
1721                 rvec_add(fr->f_twin[i], f[i], f[i]);
1722             }
1723         }
1724     }
1725
1726     cycles_force = wallcycle_stop(wcycle, ewcFORCE);
1727
1728     if (ed)
1729     {
1730         do_flood(cr, inputrec, x, f, ed, box, step, bNS);
1731     }
1732
1733     if (DOMAINDECOMP(cr))
1734     {
1735         dd_force_flop_stop(cr->dd, nrnb);
1736         if (wcycle)
1737         {
1738             dd_cycles_add(cr->dd, cycles_force-cycles_pme, ddCyclF);
1739         }
1740     }
1741
1742     if (bDoForces)
1743     {
1744         if (IR_ELEC_FIELD(*inputrec))
1745         {
1746             /* Compute forces due to electric field */
1747             calc_f_el(MASTER(cr) ? field : NULL,
1748                       start, homenr, mdatoms->chargeA, fr->f_novirsum,
1749                       inputrec->ex, inputrec->et, t);
1750         }
1751
1752         if (bDoAdressWF && fr->adress_icor == eAdressICThermoForce)
1753         {
1754             /* Compute thermodynamic force in hybrid AdResS region */
1755             adress_thermo_force(start, homenr, &(top->cgs), x, fr->f_novirsum, fr, mdatoms,
1756                                 inputrec->ePBC == epbcNONE ? NULL : &pbc);
1757         }
1758
1759         /* Communicate the forces */
1760         if (PAR(cr))
1761         {
1762             wallcycle_start(wcycle, ewcMOVEF);
1763             if (DOMAINDECOMP(cr))
1764             {
1765                 dd_move_f(cr->dd, f, fr->fshift);
1766                 /* Do we need to communicate the separate force array
1767                  * for terms that do not contribute to the single sum virial?
1768                  * Position restraints and electric fields do not introduce
1769                  * inter-cg forces, only full electrostatics methods do.
1770                  * When we do not calculate the virial, fr->f_novirsum = f,
1771                  * so we have already communicated these forces.
1772                  */
1773                 if (EEL_FULL(fr->eeltype) && cr->dd->n_intercg_excl &&
1774                     (flags & GMX_FORCE_VIRIAL))
1775                 {
1776                     dd_move_f(cr->dd, fr->f_novirsum, NULL);
1777                 }
1778                 if (bSepLRF)
1779                 {
1780                     /* We should not update the shift forces here,
1781                      * since f_twin is already included in f.
1782                      */
1783                     dd_move_f(cr->dd, fr->f_twin, NULL);
1784                 }
1785             }
1786             else
1787             {
1788                 pd_move_f(cr, f, nrnb);
1789                 if (bSepLRF)
1790                 {
1791                     pd_move_f(cr, fr->f_twin, nrnb);
1792                 }
1793             }
1794             wallcycle_stop(wcycle, ewcMOVEF);
1795         }
1796
1797         /* If we have NoVirSum forces, but we do not calculate the virial,
1798          * we sum fr->f_novirsum = f later.
1799          */
1800         if (vsite && !(fr->bF_NoVirSum && !(flags & GMX_FORCE_VIRIAL)))
1801         {
1802             wallcycle_start(wcycle, ewcVSITESPREAD);
1803             spread_vsite_f(vsite, x, f, fr->fshift, FALSE, NULL, nrnb,
1804                            &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr);
1805             wallcycle_stop(wcycle, ewcVSITESPREAD);
1806
1807             if (bSepLRF)
1808             {
1809                 wallcycle_start(wcycle, ewcVSITESPREAD);
1810                 spread_vsite_f(vsite, x, fr->f_twin, NULL, FALSE, NULL,
1811                                nrnb,
1812                                &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr);
1813                 wallcycle_stop(wcycle, ewcVSITESPREAD);
1814             }
1815         }
1816
1817         if (flags & GMX_FORCE_VIRIAL)
1818         {
1819             /* Calculation of the virial must be done after vsites! */
1820             calc_virial(mdatoms->start, mdatoms->homenr, x, f,
1821                         vir_force, graph, box, nrnb, fr, inputrec->ePBC);
1822         }
1823     }
1824
1825     if (inputrec->ePull == epullUMBRELLA || inputrec->ePull == epullCONST_F)
1826     {
1827         pull_potential_wrapper(fplog, bSepDVDL, cr, inputrec, box, x,
1828                                f, vir_force, mdatoms, enerd, lambda, t);
1829     }
1830
1831     /* Add the forces from enforced rotation potentials (if any) */
1832     if (inputrec->bRot)
1833     {
1834         wallcycle_start(wcycle, ewcROTadd);
1835         enerd->term[F_COM_PULL] += add_rot_forces(inputrec->rot, f, cr, step, t);
1836         wallcycle_stop(wcycle, ewcROTadd);
1837     }
1838
1839     if (PAR(cr) && !(cr->duty & DUTY_PME))
1840     {
1841         /* In case of node-splitting, the PP nodes receive the long-range
1842          * forces, virial and energy from the PME nodes here.
1843          */
1844         pme_receive_force_ener(fplog, bSepDVDL, cr, wcycle, enerd, fr);
1845     }
1846
1847     if (bDoForces)
1848     {
1849         post_process_forces(cr, step, nrnb, wcycle,
1850                             top, box, x, f, vir_force, mdatoms, graph, fr, vsite,
1851                             flags);
1852     }
1853
1854     /* Sum the potential energy terms from group contributions */
1855     sum_epot(&(enerd->grpp), enerd->term);
1856 }
1857
1858 void do_force(FILE *fplog, t_commrec *cr,
1859               t_inputrec *inputrec,
1860               gmx_large_int_t step, t_nrnb *nrnb, gmx_wallcycle_t wcycle,
1861               gmx_localtop_t *top,
1862               gmx_groups_t *groups,
1863               matrix box, rvec x[], history_t *hist,
1864               rvec f[],
1865               tensor vir_force,
1866               t_mdatoms *mdatoms,
1867               gmx_enerdata_t *enerd, t_fcdata *fcd,
1868               real *lambda, t_graph *graph,
1869               t_forcerec *fr,
1870               gmx_vsite_t *vsite, rvec mu_tot,
1871               double t, FILE *field, gmx_edsam_t ed,
1872               gmx_bool bBornRadii,
1873               int flags)
1874 {
1875     /* modify force flag if not doing nonbonded */
1876     if (!fr->bNonbonded)
1877     {
1878         flags &= ~GMX_FORCE_NONBONDED;
1879     }
1880
1881     switch (inputrec->cutoff_scheme)
1882     {
1883         case ecutsVERLET:
1884             do_force_cutsVERLET(fplog, cr, inputrec,
1885                                 step, nrnb, wcycle,
1886                                 top,
1887                                 groups,
1888                                 box, x, hist,
1889                                 f, vir_force,
1890                                 mdatoms,
1891                                 enerd, fcd,
1892                                 lambda, graph,
1893                                 fr, fr->ic,
1894                                 vsite, mu_tot,
1895                                 t, field, ed,
1896                                 bBornRadii,
1897                                 flags);
1898             break;
1899         case ecutsGROUP:
1900             do_force_cutsGROUP(fplog, cr, inputrec,
1901                                step, nrnb, wcycle,
1902                                top,
1903                                groups,
1904                                box, x, hist,
1905                                f, vir_force,
1906                                mdatoms,
1907                                enerd, fcd,
1908                                lambda, graph,
1909                                fr, vsite, mu_tot,
1910                                t, field, ed,
1911                                bBornRadii,
1912                                flags);
1913             break;
1914         default:
1915             gmx_incons("Invalid cut-off scheme passed!");
1916     }
1917 }
1918
1919
1920 void do_constrain_first(FILE *fplog, gmx_constr_t constr,
1921                         t_inputrec *ir, t_mdatoms *md,
1922                         t_state *state, t_commrec *cr, t_nrnb *nrnb,
1923                         t_forcerec *fr, gmx_localtop_t *top)
1924 {
1925     int             i, m, start, end;
1926     gmx_large_int_t step;
1927     real            dt = ir->delta_t;
1928     real            dvdl_dum;
1929     rvec           *savex;
1930
1931     snew(savex, state->natoms);
1932
1933     start = md->start;
1934     end   = md->homenr + start;
1935
1936     if (debug)
1937     {
1938         fprintf(debug, "vcm: start=%d, homenr=%d, end=%d\n",
1939                 start, md->homenr, end);
1940     }
1941     /* Do a first constraint pass to reset the particles... */
1942     step = ir->init_step;
1943     if (fplog)
1944     {
1945         char buf[STEPSTRSIZE];
1946         fprintf(fplog, "\nConstraining the starting coordinates (step %s)\n",
1947                 gmx_step_str(step, buf));
1948     }
1949     dvdl_dum = 0;
1950
1951     /* constrain the current position */
1952     constrain(NULL, TRUE, FALSE, constr, &(top->idef),
1953               ir, NULL, cr, step, 0, md,
1954               state->x, state->x, NULL,
1955               fr->bMolPBC, state->box,
1956               state->lambda[efptBONDED], &dvdl_dum,
1957               NULL, NULL, nrnb, econqCoord,
1958               ir->epc == epcMTTK, state->veta, state->veta);
1959     if (EI_VV(ir->eI))
1960     {
1961         /* constrain the initial velocity, and save it */
1962         /* this may also be useful if we need the ekin from the half step for velocity verlet */
1963         /* might not yet treat veta correctly */
1964         constrain(NULL, TRUE, FALSE, constr, &(top->idef),
1965                   ir, NULL, cr, step, 0, md,
1966                   state->x, state->v, state->v,
1967                   fr->bMolPBC, state->box,
1968                   state->lambda[efptBONDED], &dvdl_dum,
1969                   NULL, NULL, nrnb, econqVeloc,
1970                   ir->epc == epcMTTK, state->veta, state->veta);
1971     }
1972     /* constrain the initial velocities at t-dt/2 */
1973     if (EI_STATE_VELOCITY(ir->eI) && ir->eI != eiVV)
1974     {
1975         for (i = start; (i < end); i++)
1976         {
1977             for (m = 0; (m < DIM); m++)
1978             {
1979                 /* Reverse the velocity */
1980                 state->v[i][m] = -state->v[i][m];
1981                 /* Store the position at t-dt in buf */
1982                 savex[i][m] = state->x[i][m] + dt*state->v[i][m];
1983             }
1984         }
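             /* To first order x(-dt) = x(0) - dt*v(0); since v was just
              * negated, savex = x + dt*v is exactly this estimate of the
              * positions at t0-dt, which is then constrained below.
              */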
1985         /* Shake the positions at t=-dt with the positions at t=0
1986          * as reference coordinates.
1987          */
1988         if (fplog)
1989         {
1990             char buf[STEPSTRSIZE];
1991             fprintf(fplog, "\nConstraining the coordinates at t0-dt (step %s)\n",
1992                     gmx_step_str(step, buf));
1993         }
1994         dvdl_dum = 0;
1995         constrain(NULL, TRUE, FALSE, constr, &(top->idef),
1996                   ir, NULL, cr, step, -1, md,
1997                   state->x, savex, NULL,
1998                   fr->bMolPBC, state->box,
1999                   state->lambda[efptBONDED], &dvdl_dum,
2000                   state->v, NULL, nrnb, econqCoord,
2001                   ir->epc == epcMTTK, state->veta, state->veta);
2002
2003         for (i = start; i < end; i++)
2004         {
2005             for (m = 0; m < DIM; m++)
2006             {
2007                 /* Re-reverse the velocities */
2008                 state->v[i][m] = -state->v[i][m];
2009             }
2010         }
2011     }
2012     sfree(savex);
2013 }
2014
2015 void calc_enervirdiff(FILE *fplog, int eDispCorr, t_forcerec *fr)
2016 {
2017     double eners[2], virs[2], enersum, virsum, y0, f, g, h;
2018     double r0, r1, r, rc3, rc9, ea, eb, ec, pa, pb, pc, pd;
2019     double invscale, invscale2, invscale3;
2020     int    ri0, ri1, ri, i, offstart, offset;
2021     real   scale, *vdwtab, tabfactor, tmp;
2022
2023     fr->enershiftsix    = 0;
2024     fr->enershifttwelve = 0;
2025     fr->enerdiffsix     = 0;
2026     fr->enerdifftwelve  = 0;
2027     fr->virdiffsix      = 0;
2028     fr->virdifftwelve   = 0;
2029
2030     if (eDispCorr != edispcNO)
2031     {
2032         for (i = 0; i < 2; i++)
2033         {
2034             eners[i] = 0;
2035             virs[i]  = 0;
2036         }
2037         if ((fr->vdwtype == evdwSWITCH) || (fr->vdwtype == evdwSHIFT))
2038         {
2039             if (fr->rvdw_switch == 0)
2040             {
2041                 gmx_fatal(FARGS,
2042                           "With dispersion correction rvdw-switch cannot be zero "
2043                           "for vdw-type = %s", evdw_names[fr->vdwtype]);
2044             }
2045
2046             scale  = fr->nblists[0].table_elec_vdw.scale;
2047             vdwtab = fr->nblists[0].table_vdw.data;
2048
2049             /* Round the cut-offs to exact table values for precision */
2050             ri0  = floor(fr->rvdw_switch*scale);
2051             ri1  = ceil(fr->rvdw*scale);
2052             r0   = ri0/scale;
2053             r1   = ri1/scale;
2054             rc3  = r0*r0*r0;
2055             rc9  = rc3*rc3*rc3;
2056
2057             if (fr->vdwtype == evdwSHIFT)
2058             {
2059                 /* Determine the constant energy shift below rvdw_switch.
2060                  * The table carries a scale factor, since it was scaled down to compensate
2061                  * for the c6/c12 parameters being scaled up by the derivative factors, which saves flops in the analytical kernels.
2062                  */
2063                 fr->enershiftsix    = (real)(-1.0/(rc3*rc3)) - 6.0*vdwtab[8*ri0];
2064                 fr->enershifttwelve = (real)( 1.0/(rc9*rc3)) - 12.0*vdwtab[8*ri0 + 4];
2065             }
2066             /* Add the constant part from 0 to rvdw_switch.
2067              * This integration from 0 to rvdw_switch overcounts the number
2068              * of interactions by 1, as it also counts the self interaction.
2069              * We will correct for this later.
2070              */
2071             eners[0] += 4.0*M_PI*fr->enershiftsix*rc3/3.0;
2072             eners[1] += 4.0*M_PI*fr->enershifttwelve*rc3/3.0;
2073
2074             invscale  = 1.0/(scale);
2075             invscale2 = invscale*invscale;
2076             invscale3 = invscale*invscale2;
2077
2078             /* following summation derived from cubic spline definition,
2079                Numerical Recipes in C, second edition, p. 113-116.  Exact
2080                for the cubic spline.  We first calculate the negative of
2081                the energy from rvdw to rvdw_switch, assuming that g(r)=1,
2082                and then add the more standard, abrupt cutoff correction to
2083                that result, yielding the long-range correction for a
2084                switched function.  We perform both the pressure and energy
2085                loops at the same time for simplicity, as the computational
2086                cost is low. */
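                 /* Sketch of the integration: around table point ri the spline
                  * is V(eps) = y0 + f*eps + g*eps^2 + h*eps^3 with eps in [0,1)
                  * and r = (ri + eps)/scale, so r^2 dr = (ea*eps^2 + eb*eps + ec) deps.
                  * Integrating eps^n times this term by term produces the
                  * ea/3 + eb/2 + ec style factors accumulated in the loop below
                  * (and analogously pa..pd for the virial).
                  */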
2087
2088             for (i = 0; i < 2; i++)
2089             {
2090                 enersum = 0.0; virsum = 0.0;
2091                 if (i == 0)
2092                 {
2093                     offstart = 0;
2094                     /* Since the dispersion table has been scaled down a factor 6.0 and the repulsion
2095                      * a factor 12.0 to compensate for the c6/c12 parameters inside nbfp[] being scaled
2096                      * up (to save flops in kernels), we need to correct for this.
2097                      */
2098                     tabfactor = 6.0;
2099                 }
2100                 else
2101                 {
2102                     offstart  = 4;
2103                     tabfactor = 12.0;
2104                 }
2105                 for (ri = ri0; ri < ri1; ri++)
2106                 {
2107                     r  = ri*invscale;
2108                     ea = invscale3;
2109                     eb = 2.0*invscale2*r;
2110                     ec = invscale*r*r;
2111
2112                     pa = invscale3;
2113                     pb = 3.0*invscale2*r;
2114                     pc = 3.0*invscale*r*r;
2115                     pd = r*r*r;
2116
2117                     /* this "8" is from the packing in the vdwtab array - perhaps should be #define'ed? */
2118                     offset = 8*ri + offstart;
2119                     y0     = vdwtab[offset];
2120                     f      = vdwtab[offset+1];
2121                     g      = vdwtab[offset+2];
2122                     h      = vdwtab[offset+3];
2123
2124                     enersum += y0*(ea/3 + eb/2 + ec) + f*(ea/4 + eb/3 + ec/2) + g*(ea/5 + eb/4 + ec/3) + h*(ea/6 + eb/5 + ec/4);
2125                     virsum  += f*(pa/4 + pb/3 + pc/2 + pd) + 2*g*(pa/5 + pb/4 + pc/3 + pd/2) + 3*h*(pa/6 + pb/5 + pc/4 + pd/3);
2126                 }
2127
2128                 enersum  *= 4.0*M_PI*tabfactor;
2129                 virsum   *= 4.0*M_PI*tabfactor;
2130                 eners[i] -= enersum;
2131                 virs[i]  -= virsum;
2132             }
2133
2134             /* now add the correction for rvdw_switch to infinity */
2135             eners[0] += -4.0*M_PI/(3.0*rc3);
2136             eners[1] +=  4.0*M_PI/(9.0*rc9);
2137             virs[0]  +=  8.0*M_PI/rc3;
2138             virs[1]  += -16.0*M_PI/(3.0*rc9);
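                 /* For reference, the analytic tail integrals used above are
                  *   int_rc^inf 4*pi*r^2 * (-1/r^6)  dr = -4*pi/(3*rc^3)
                  *   int_rc^inf 4*pi*r^2 * ( 1/r^12) dr =  4*pi/(9*rc^9)
                  * with the virial terms following from r*dV/dr under the
                  * GROMACS sign conventions.
                  */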
2139         }
2140         else if ((fr->vdwtype == evdwCUT) || (fr->vdwtype == evdwUSER))
2141         {
2142             if (fr->vdwtype == evdwUSER && fplog)
2143             {
2144                 fprintf(fplog,
2145                         "WARNING: using dispersion correction with user tables\n");
2146             }
2147             rc3  = fr->rvdw*fr->rvdw*fr->rvdw;
2148             rc9  = rc3*rc3*rc3;
2149             /* Contribution beyond the cut-off */
2150             eners[0] += -4.0*M_PI/(3.0*rc3);
2151             eners[1] +=  4.0*M_PI/(9.0*rc9);
2152             if (fr->vdw_modifier == eintmodPOTSHIFT)
2153             {
2154                 /* Contribution within the cut-off */
2155                 eners[0] += -4.0*M_PI/(3.0*rc3);
2156                 eners[1] +=  4.0*M_PI/(3.0*rc9);
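                     /* The shifted potential V(r) - V(rc) misses V(rc) inside
                      * the cut-off, so the correction adds
                      * int_0^rc 4*pi*r^2 * V(rc) dr, i.e. -4*pi/(3*rc^3) for
                      * dispersion and +4*pi/(3*rc^9) for repulsion.
                      */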
2157             }
2158             /* Contribution beyond the cut-off */
2159             virs[0]  +=  8.0*M_PI/rc3;
2160             virs[1]  += -16.0*M_PI/(3.0*rc9);
2161         }
2162         else
2163         {
2164             gmx_fatal(FARGS,
2165                       "Dispersion correction is not implemented for vdw-type = %s",
2166                       evdw_names[fr->vdwtype]);
2167         }
2168         fr->enerdiffsix    = eners[0];
2169         fr->enerdifftwelve = eners[1];
2170         /* The 0.5 is due to the Gromacs definition of the virial */
2171         fr->virdiffsix     = 0.5*virs[0];
2172         fr->virdifftwelve  = 0.5*virs[1];
2173     }
2174 }
2175
2176 void calc_dispcorr(FILE *fplog, t_inputrec *ir, t_forcerec *fr,
2177                    gmx_large_int_t step, int natoms,
2178                    matrix box, real lambda, tensor pres, tensor virial,
2179                    real *prescorr, real *enercorr, real *dvdlcorr)
2180 {
2181     gmx_bool bCorrAll, bCorrPres;
2182     real     dvdlambda, invvol, dens, ninter, avcsix, avctwelve, enerdiff, svir = 0, spres = 0;
2183     int      m;
2184
2185     *prescorr = 0;
2186     *enercorr = 0;
2187     *dvdlcorr = 0;
2188
2189     clear_mat(virial);
2190     clear_mat(pres);
2191
2192     if (ir->eDispCorr != edispcNO)
2193     {
2194         bCorrAll  = (ir->eDispCorr == edispcAllEner ||
2195                      ir->eDispCorr == edispcAllEnerPres);
2196         bCorrPres = (ir->eDispCorr == edispcEnerPres ||
2197                      ir->eDispCorr == edispcAllEnerPres);
2198
2199         invvol = 1/det(box);
2200         if (fr->n_tpi)
2201         {
2202             /* Only correct for the interactions with the inserted molecule */
2203             dens   = (natoms - fr->n_tpi)*invvol;
2204             ninter = fr->n_tpi;
2205         }
2206         else
2207         {
2208             dens   = natoms*invvol;
2209             ninter = 0.5*natoms;
2210         }
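             /* With N homogeneous particles there are N*(N-1)/2 ~ N^2/2 pairs,
              * so ninter = N/2 together with dens = N/V yields the familiar
              * N^2/(2V) prefactor of the long-range correction.
              */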
2211
2212         if (ir->efep == efepNO)
2213         {
2214             avcsix    = fr->avcsix[0];
2215             avctwelve = fr->avctwelve[0];
2216         }
2217         else
2218         {
2219             avcsix    = (1 - lambda)*fr->avcsix[0]    + lambda*fr->avcsix[1];
2220             avctwelve = (1 - lambda)*fr->avctwelve[0] + lambda*fr->avctwelve[1];
2221         }
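             /* The average C6/C12 are interpolated linearly in lambda,
              * <C6>(lambda) = (1 - lambda)*<C6>_A + lambda*<C6>_B, so
              * d<C6>/dlambda = <C6>_B - <C6>_A, which is what enters the
              * dvdlambda terms accumulated below.
              */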
2222
2223         enerdiff   = ninter*(dens*fr->enerdiffsix - fr->enershiftsix);
2224         *enercorr += avcsix*enerdiff;
2225         dvdlambda  = 0.0;
2226         if (ir->efep != efepNO)
2227         {
2228             dvdlambda += (fr->avcsix[1] - fr->avcsix[0])*enerdiff;
2229         }
2230         if (bCorrAll)
2231         {
2232             enerdiff   = ninter*(dens*fr->enerdifftwelve - fr->enershifttwelve);
2233             *enercorr += avctwelve*enerdiff;
2234             if (fr->efep != efepNO)
2235             {
2236                 dvdlambda += (fr->avctwelve[1] - fr->avctwelve[0])*enerdiff;
2237             }
2238         }
2239
2240         if (bCorrPres)
2241         {
2242             svir = ninter*dens*avcsix*fr->virdiffsix/3.0;
2243             if (ir->eDispCorr == edispcAllEnerPres)
2244             {
2245                 svir += ninter*dens*avctwelve*fr->virdifftwelve/3.0;
2246             }
2247             /* The factor 2 is because of the Gromacs virial definition */
2248             spres = -2.0*invvol*svir*PRESFAC;
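                 /* In GROMACS conventions P = (2/V)*(Ekin - Xi), so the virial
                  * correction svir on each diagonal element gives
                  * spres = -2*svir*PRESFAC/V, with PRESFAC converting the
                  * internal energy density units to bar.
                  */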
2249
2250             for (m = 0; m < DIM; m++)
2251             {
2252                 virial[m][m] += svir;
2253                 pres[m][m]   += spres;
2254             }
2255             *prescorr += spres;
2256         }
2257
2258         /* Can't currently control when it prints; for now, just print when debugging */
2259         if (debug)
2260         {
2261             if (bCorrAll)
2262             {
2263                 fprintf(debug, "Long Range LJ corr.: <C6> %10.4e, <C12> %10.4e\n",
2264                         avcsix, avctwelve);
2265             }
2266             if (bCorrPres)
2267             {
2268                 fprintf(debug,
2269                         "Long Range LJ corr.: Epot %10g, Pres: %10g, Vir: %10g\n",
2270                         *enercorr, spres, svir);
2271             }
2272             else
2273             {
2274                 fprintf(debug, "Long Range LJ corr.: Epot %10g\n", *enercorr);
2275             }
2276         }
2277
2278         if (fr->bSepDVDL && do_per_step(step, ir->nstlog))
2279         {
2280             gmx_print_sepdvdl(fplog, "Dispersion correction", *enercorr, dvdlambda);
2281         }
2282         if (fr->efep != efepNO)
2283         {
2284             *dvdlcorr += dvdlambda;
2285         }
2286     }
2287 }
2288
2289 void do_pbc_first(FILE *fplog, matrix box, t_forcerec *fr,
2290                   t_graph *graph, rvec x[])
2291 {
2292     if (fplog)
2293     {
2294         fprintf(fplog, "Removing pbc first time\n");
2295     }
2296     calc_shifts(box, fr->shift_vec);
2297     if (graph)
2298     {
2299         mk_mshift(fplog, graph, fr->ePBC, box, x);
2300         if (gmx_debug_at)
2301         {
2302             p_graph(debug, "do_pbc_first 1", graph);
2303         }
2304         shift_self(graph, box, x);
2305         /* By doing an extra mk_mshift the molecules that are broken
2306          * because they were e.g. imported from another software
2307          * will be made whole again. Such are the healing powers
2308          * of GROMACS.
2309          */
2310         mk_mshift(fplog, graph, fr->ePBC, box, x);
2311         if (gmx_debug_at)
2312         {
2313             p_graph(debug, "do_pbc_first 2", graph);
2314         }
2315     }
2316     if (fplog)
2317     {
2318         fprintf(fplog, "Done rmpbc\n");
2319     }
2320 }
2321
2322 static void low_do_pbc_mtop(FILE *fplog, int ePBC, matrix box,
2323                             gmx_mtop_t *mtop, rvec x[],
2324                             gmx_bool bFirst)
2325 {
2326     t_graph        *graph;
2327     int             mb, as, mol;
2328     gmx_molblock_t *molb;
2329
2330     if (bFirst && fplog)
2331     {
2332         fprintf(fplog, "Removing pbc first time\n");
2333     }
2334
2335     snew(graph, 1);
2336     as = 0;
2337     for (mb = 0; mb < mtop->nmolblock; mb++)
2338     {
2339         molb = &mtop->molblock[mb];
2340         if (molb->natoms_mol == 1 ||
2341             (!bFirst && mtop->moltype[molb->type].cgs.nr == 1))
2342         {
2343             /* Just one atom or charge group in the molecule, no PBC required */
2344             as += molb->nmol*molb->natoms_mol;
2345         }
2346         else
2347         {
2348             /* Pass NULL instead of fplog to avoid graph prints for each molecule type */
2349             mk_graph_ilist(NULL, mtop->moltype[molb->type].ilist,
2350                            0, molb->natoms_mol, FALSE, FALSE, graph);
2351
2352             for (mol = 0; mol < molb->nmol; mol++)
2353             {
2354                 mk_mshift(fplog, graph, ePBC, box, x+as);
2355
2356                 shift_self(graph, box, x+as);
2357                 /* The molecule is whole now.
2358                  * We don't need the second mk_mshift call as in do_pbc_first,
2359                  * since we no longer need this graph.
2360                  */
2361
2362                 as += molb->natoms_mol;
2363             }
2364             done_graph(graph);
2365         }
2366     }
2367     sfree(graph);
2368 }
2369
2370 void do_pbc_first_mtop(FILE *fplog, int ePBC, matrix box,
2371                        gmx_mtop_t *mtop, rvec x[])
2372 {
2373     low_do_pbc_mtop(fplog, ePBC, box, mtop, x, TRUE);
2374 }
2375
2376 void do_pbc_mtop(FILE *fplog, int ePBC, matrix box,
2377                  gmx_mtop_t *mtop, rvec x[])
2378 {
2379     low_do_pbc_mtop(fplog, ePBC, box, mtop, x, FALSE);
2380 }
2381
2382 void finish_run(FILE *fplog, t_commrec *cr,
2383                 t_inputrec *inputrec,
2384                 t_nrnb nrnb[], gmx_wallcycle_t wcycle,
2385                 gmx_walltime_accounting_t walltime_accounting,
2386                 wallclock_gpu_t *gputimes,
2387                 gmx_bool bWriteStat)
2388 {
2389     int     i, j;
2390     t_nrnb *nrnb_tot = NULL;
2391     real    delta_t;
2392     double  nbfs, mflop;
2393     double  elapsed_time,
2394             elapsed_time_over_all_ranks,
2395             elapsed_time_over_all_threads,
2396             elapsed_time_over_all_threads_over_all_ranks;
2397     wallcycle_sum(cr, wcycle);
2398
2399     if (cr->nnodes > 1)
2400     {
2401         snew(nrnb_tot, 1);
2402 #ifdef GMX_MPI
2403         MPI_Allreduce(nrnb->n, nrnb_tot->n, eNRNB, MPI_DOUBLE, MPI_SUM,
2404                       cr->mpi_comm_mysim);
2405 #endif
2406     }
2407     else
2408     {
2409         nrnb_tot = nrnb;
2410     }
2411
2412     elapsed_time                                 = walltime_accounting_get_elapsed_time(walltime_accounting);
2413     elapsed_time_over_all_ranks                  = elapsed_time;
2414     elapsed_time_over_all_threads                = walltime_accounting_get_elapsed_time_over_all_threads(walltime_accounting);
2415     elapsed_time_over_all_threads_over_all_ranks = elapsed_time_over_all_threads;
2416 #ifdef GMX_MPI
2417     if (cr->nnodes > 1)
2418     {
2419         /* reduce elapsed_time over all MPI ranks in the current simulation */
2420         MPI_Allreduce(&elapsed_time,
2421                       &elapsed_time_over_all_ranks,
2422                       1, MPI_DOUBLE, MPI_SUM,
2423                       cr->mpi_comm_mysim);
2424         elapsed_time_over_all_ranks /= cr->nnodes;
2425         /* Reduce elapsed_time_over_all_threads over all MPI ranks in the
2426          * current simulation. */
2427         MPI_Allreduce(&elapsed_time_over_all_threads,
2428                       &elapsed_time_over_all_threads_over_all_ranks,
2429                       1, MPI_DOUBLE, MPI_SUM,
2430                       cr->mpi_comm_mysim);
2431     }
2432 #endif
2433
2434     if (SIMMASTER(cr))
2435     {
2436         print_flop(fplog, nrnb_tot, &nbfs, &mflop);
2437     }
2438     if (cr->nnodes > 1)
2439     {
2440         sfree(nrnb_tot);
2441     }
2442
2443     if ((cr->duty & DUTY_PP) && DOMAINDECOMP(cr))
2444     {
2445         print_dd_statistics(cr, inputrec, fplog);
2446     }
2447
2448 #ifdef GMX_MPI
2449     if (PARTDECOMP(cr))
2450     {
2451         if (MASTER(cr))
2452         {
2453             t_nrnb     *nrnb_all;
2454             int         s;
2455             MPI_Status  stat;
2456
2457             snew(nrnb_all, cr->nnodes);
2458             nrnb_all[0] = *nrnb;
2459             for (s = 1; s < cr->nnodes; s++)
2460             {
2461                 MPI_Recv(nrnb_all[s].n, eNRNB, MPI_DOUBLE, s, 0,
2462                          cr->mpi_comm_mysim, &stat);
2463             }
2464             pr_load(fplog, cr, nrnb_all);
2465             sfree(nrnb_all);
2466         }
2467         else
2468         {
2469             MPI_Send(nrnb->n, eNRNB, MPI_DOUBLE, MASTERRANK(cr), 0,
2470                      cr->mpi_comm_mysim);
2471         }
2472     }
2473 #endif
2474
2475     if (SIMMASTER(cr))
2476     {
2477         wallcycle_print(fplog, cr->nnodes, cr->npmenodes,
2478                         elapsed_time_over_all_ranks,
2479                         wcycle, gputimes);
2480
2481         if (EI_DYNAMICS(inputrec->eI))
2482         {
2483             delta_t = inputrec->delta_t;
2484         }
2485         else
2486         {
2487             delta_t = 0;
2488         }
2489
2490         if (fplog)
2491         {
2492             print_perf(fplog, elapsed_time_over_all_threads_over_all_ranks,
2493                        elapsed_time_over_all_ranks,
2494                        walltime_accounting_get_nsteps_done(walltime_accounting),
2495                        delta_t, nbfs, mflop);
2496         }
2497         if (bWriteStat)
2498         {
2499             print_perf(stderr, elapsed_time_over_all_threads_over_all_ranks,
2500                        elapsed_time_over_all_ranks,
2501                        walltime_accounting_get_nsteps_done(walltime_accounting),
2502                        delta_t, nbfs, mflop);
2503         }
2504     }
2505 }
2506
2507 extern void initialize_lambdas(FILE *fplog, t_inputrec *ir, int *fep_state, real *lambda, double *lam0)
2508 {
2509     /* this function works, but could probably use a logic rewrite to keep all the different
2510        types of efep straight. */
2511
2512     int       i;
2513     t_lambda *fep = ir->fepvals;
2514
2515     if ((ir->efep == efepNO) && (ir->bSimTemp == FALSE))
2516     {
2517         for (i = 0; i < efptNR; i++)
2518         {
2519             lambda[i] = 0.0;
2520             if (lam0)
2521             {
2522                 lam0[i] = 0.0;
2523             }
2524         }
2525         return;
2526     }
2527     else
2528     {
2529         *fep_state = fep->init_fep_state; /* this might overwrite the state read from
2530                                              a checkpoint; for now a kludge is in place
2531                                              to prevent this. */
2532         for (i = 0; i < efptNR; i++)
2533         {
2534             /* for now, overwrite the lambda state with init_lambda for backwards compatibility */
2535             if (fep->init_lambda >= 0) /* if it's -1, it was never initialized */
2536             {
2537                 lambda[i] = fep->init_lambda;
2538                 if (lam0)
2539                 {
2540                     lam0[i] = lambda[i];
2541                 }
2542             }
2543             else
2544             {
2545                 lambda[i] = fep->all_lambda[i][*fep_state];
2546                 if (lam0)
2547                 {
2548                     lam0[i] = lambda[i];
2549                 }
2550             }
2551         }
2552         if (ir->bSimTemp)
2553         {
2554             /* need to rescale control temperatures to match current state */
2555             for (i = 0; i < ir->opts.ngtc; i++)
2556             {
2557                 if (ir->opts.ref_t[i] > 0)
2558                 {
2559                     ir->opts.ref_t[i] = ir->simtempvals->temperatures[*fep_state];
2560                 }
2561             }
2562         }
2563     }
2564
2565     /* Write the initial lambda components to the log */
2566     if (fplog != NULL)
2567     {
2568         fprintf(fplog, "Initial vector of lambda components:[ ");
2569         for (i = 0; i < efptNR; i++)
2570         {
2571             fprintf(fplog, "%10.4f ", lambda[i]);
2572         }
2573         fprintf(fplog, "]\n");
2574     }
2575     return;
2576 }
2577
2578
2579 void init_md(FILE *fplog,
2580              t_commrec *cr, t_inputrec *ir, const output_env_t oenv,
2581              double *t, double *t0,
2582              real *lambda, int *fep_state, double *lam0,
2583              t_nrnb *nrnb, gmx_mtop_t *mtop,
2584              gmx_update_t *upd,
2585              int nfile, const t_filenm fnm[],
2586              gmx_mdoutf_t **outf, t_mdebin **mdebin,
2587              tensor force_vir, tensor shake_vir, rvec mu_tot,
2588              gmx_bool *bSimAnn, t_vcm **vcm, unsigned long Flags)
2589 {
2590     int  i, j, n;
2591     real tmpt, mod;
2592
2593     /* Initial values */
2594     *t = *t0       = ir->init_t;
2595
2596     *bSimAnn = FALSE;
2597     for (i = 0; i < ir->opts.ngtc; i++)
2598     {
2599         /* set bSimAnn if any group is being annealed */
2600         if (ir->opts.annealing[i] != eannNO)
2601         {
2602             *bSimAnn = TRUE;
2603         }
2604     }
2605     if (*bSimAnn)
2606     {
2607         update_annealing_target_temp(&(ir->opts), ir->init_t);
2608     }
2609
2610     /* Initialize lambda variables */
2611     initialize_lambdas(fplog, ir, fep_state, lambda, lam0);
2612
2613     if (upd)
2614     {
2615         *upd = init_update(ir);
2616     }
2617
2618
2619     if (vcm != NULL)
2620     {
2621         *vcm = init_vcm(fplog, &mtop->groups, ir);
2622     }
2623
2624     if (EI_DYNAMICS(ir->eI) && !(Flags & MD_APPENDFILES))
2625     {
2626         if (ir->etc == etcBERENDSEN)
2627         {
2628             please_cite(fplog, "Berendsen84a");
2629         }
2630         if (ir->etc == etcVRESCALE)
2631         {
2632             please_cite(fplog, "Bussi2007a");
2633         }
2634     }
2635
2636     init_nrnb(nrnb);
2637
2638     if (nfile != -1)
2639     {
2640         *outf = init_mdoutf(nfile, fnm, Flags, cr, ir, oenv);
2641
2642         *mdebin = init_mdebin((Flags & MD_APPENDFILES) ? NULL : (*outf)->fp_ene,
2643                               mtop, ir, (*outf)->fp_dhdl);
2644     }
2645
2646     if (ir->bAdress)
2647     {
2648         please_cite(fplog, "Fritsch12");
2649         please_cite(fplog, "Junghans10");
2650     }
2651     /* Initialize variables */
2652     clear_mat(force_vir);
2653     clear_mat(shake_vir);
2654     clear_rvec(mu_tot);
2655
2656     debug_gmx();
2657 }