src/gromacs/mdlib/sim_util.c
/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 *
 *
 *                This source code is part of
 *
 *                 G   R   O   M   A   C   S
 *
 *          GROningen MAchine for Chemical Simulations
 *
 *                        VERSION 3.2.0
 * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 * Copyright (c) 2001-2004, The GROMACS development team,
 * check out http://www.gromacs.org for more information.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * If you want to redistribute modifications, please consider that
 * scientific software is very special. Version control is crucial -
 * bugs must be traceable. We will be happy to consider code for
 * inclusion in the official distribution, but derived work must not
 * be called official GROMACS. Details are found in the README & COPYING
 * files - if they are missing, get the official version at www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the papers on the package - you can find them in the top README file.
 *
 * For more info, check our website at http://www.gromacs.org
 *
 * And Hey:
 * GROwing Monsters And Cloning Shrimps
 */
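
/* sim_util.c provides high-level force-evaluation driver routines for
 * mdrun: run-time status reporting and the per-step do_force() paths
 * for both the group and the Verlet cut-off schemes.
 */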
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif

#include <stdio.h>
#ifdef HAVE_SYS_TIME_H
#include <sys/time.h>
#endif
#include <math.h>
#include "typedefs.h"
#include "string2.h"
#include "smalloc.h"
#include "names.h"
#include "mvdata.h"
#include "txtdump.h"
#include "pbc.h"
#include "chargegroup.h"
#include "vec.h"
#include "nrnb.h"
#include "mshift.h"
#include "mdrun.h"
#include "sim_util.h"
#include "update.h"
#include "physics.h"
#include "main.h"
#include "mdatoms.h"
#include "force.h"
#include "bondf.h"
#include "pme.h"
#include "disre.h"
#include "orires.h"
#include "network.h"
#include "calcmu.h"
#include "constr.h"
#include "xvgr.h"
#include "copyrite.h"
#include "pull_rotation.h"
#include "gmx_random.h"
#include "domdec.h"
#include "partdec.h"
#include "genborn.h"
#include "nbnxn_atomdata.h"
#include "nbnxn_search.h"
#include "nbnxn_kernels/nbnxn_kernel_ref.h"
#include "nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn.h"
#include "nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn.h"
#include "nbnxn_kernels/nbnxn_kernel_gpu_ref.h"

#include "gromacs/timing/wallcycle.h"
#include "gromacs/timing/walltime_accounting.h"
#include "gromacs/utility/gmxmpi.h"

#include "adress.h"
#include "qmmm.h"

#include "nbnxn_cuda_data_mgmt.h"
#include "nbnxn_cuda/nbnxn_cuda.h"

void print_time(FILE                     *out,
                gmx_walltime_accounting_t walltime_accounting,
                gmx_large_int_t           step,
                t_inputrec               *ir,
                t_commrec gmx_unused     *cr)
{
    time_t finish;
    char   timebuf[STRLEN];
    double dt, elapsed_seconds, time_per_step;
    char   buf[48];

#ifndef GMX_THREAD_MPI
    if (!PAR(cr))
#endif
    {
        fprintf(out, "\r");
    }
    fprintf(out, "step %s", gmx_step_str(step, buf));
    if (step >= ir->nstlist)
    {
        double seconds_since_epoch = gmx_gettime();
        elapsed_seconds = seconds_since_epoch - walltime_accounting_get_start_time_stamp(walltime_accounting);
        time_per_step   = elapsed_seconds/(step - ir->init_step + 1);
        dt              = (ir->nsteps + ir->init_step - step) * time_per_step;

        if (ir->nsteps >= 0)
        {
            if (dt >= 300)
            {
                finish = (time_t) (seconds_since_epoch + dt);
                gmx_ctime_r(&finish, timebuf, STRLEN);
                sprintf(buf, "%s", timebuf);
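                /* gmx_ctime_r leaves a trailing newline; strip it below */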
                buf[strlen(buf)-1] = '\0';
                fprintf(out, ", will finish %s", buf);
            }
            else
            {
                fprintf(out, ", remaining wall clock time: %5d s          ", (int)dt);
            }
        }
        else
        {
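            /* delta_t is in ps/step and time_per_step in s/step:
             * ps/s divided by 1000 gives ns/s, times 86400 gives ns/day */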
            fprintf(out, " performance: %.1f ns/day    ",
                    ir->delta_t/1000*24*60*60/time_per_step);
        }
    }
#ifndef GMX_THREAD_MPI
    if (PAR(cr))
    {
        fprintf(out, "\n");
    }
#endif

    fflush(out);
}
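
/* Illustrative status lines produced by print_time (values made up):
 *   step 4000, will finish Fri Mar  7 21:05:12 2014
 *   step 4000, remaining wall clock time:   270 s
 */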

void print_date_and_time(FILE *fplog, int nodeid, const char *title,
                         const gmx_walltime_accounting_t walltime_accounting)
{
    int    i;
    char   timebuf[STRLEN];
    char   time_string[STRLEN];
    time_t tmptime;

    if (fplog)
    {
        if (walltime_accounting != NULL)
        {
            tmptime = (time_t) walltime_accounting_get_start_time_stamp(walltime_accounting);
            gmx_ctime_r(&tmptime, timebuf, STRLEN);
        }
        else
        {
            tmptime = (time_t) gmx_gettime();
            gmx_ctime_r(&tmptime, timebuf, STRLEN);
        }
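        /* Copy up to the first control character, which also drops the
         * trailing newline that gmx_ctime_r appends */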
        for (i = 0; timebuf[i] >= ' '; i++)
        {
            time_string[i] = timebuf[i];
        }
        time_string[i] = '\0';

        fprintf(fplog, "%s on node %d %s\n", title, nodeid, time_string);
    }
}

static void sum_forces(int start, int end, rvec f[], rvec flr[])
{
    int i;

    if (gmx_debug_at)
    {
        pr_rvecs(debug, 0, "fsr", f+start, end-start);
        pr_rvecs(debug, 0, "flr", flr+start, end-start);
    }
    for (i = start; (i < end); i++)
    {
        rvec_inc(f[i], flr[i]);
    }
}

/*
 * calc_f_el calculates forces due to an electric field.
 *
 * force is kJ mol^-1 nm^-1 = e * kJ mol^-1 nm^-1 / e
 *
 * Et[] contains the parameters for the time-dependent
 * part of the field.
 * Ex[] contains the parameters for
 * the spatially dependent part of the field. You can have cool periodic
 * fields in principle, but only a constant field is supported
 * now.
 * The function should return the energy due to the electric field
 * (if any) but for now returns 0.
 *
 * WARNING:
 * There can be problems with the virial.
 * Since the field is not self-consistent this is unavoidable.
 * For neutral molecules the virial is correct within this approximation.
 * For neutral systems with many charged molecules the error is small.
 * But for systems with a net charge or a few charged molecules
 * the error can be significant when the field is high.
 * Solution: implement a self-consistent electric field into PME.
 */
static void calc_f_el(FILE *fp, int  start, int homenr,
                      real charge[], rvec f[],
                      t_cosines Ex[], t_cosines Et[], double t)
{
    rvec Ext;
    real t0;
    int  i, m;

    for (m = 0; (m < DIM); m++)
    {
        if (Et[m].n > 0)
        {
            if (Et[m].n == 3)
            {
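                /* Gaussian-enveloped cosine pulse: as used below,
                 * a[0] is the angular frequency, a[1] the pulse peak
                 * time t0 and a[2] the pulse width sigma */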
                t0     = Et[m].a[1];
                Ext[m] = cos(Et[m].a[0]*(t-t0))*exp(-sqr(t-t0)/(2.0*sqr(Et[m].a[2])));
            }
            else
            {
                Ext[m] = cos(Et[m].a[0]*t);
            }
        }
        else
        {
            Ext[m] = 1.0;
        }
        if (Ex[m].n > 0)
        {
            /* Convert the field strength from V/nm to MD-units */
            Ext[m] *= Ex[m].a[0]*FIELDFAC;
            for (i = start; (i < start+homenr); i++)
            {
                f[i][m] += charge[i]*Ext[m];
            }
        }
        else
        {
            Ext[m] = 0;
        }
    }
    if (fp != NULL)
    {
        fprintf(fp, "%10g  %10g  %10g  %10g #FIELD\n", t,
                Ext[XX]/FIELDFAC, Ext[YY]/FIELDFAC, Ext[ZZ]/FIELDFAC);
    }
}

static void calc_virial(int start, int homenr, rvec x[], rvec f[],
                        tensor vir_part, t_graph *graph, matrix box,
                        t_nrnb *nrnb, const t_forcerec *fr, int ePBC)
{
    int    i, j;
    tensor virtest;

    /* The short-range virial from surrounding boxes */
    clear_mat(vir_part);
    calc_vir(SHIFTS, fr->shift_vec, fr->fshift, vir_part, ePBC == epbcSCREW, box);
    inc_nrnb(nrnb, eNR_VIRIAL, SHIFTS);

    /* Calculate partial virial, for local atoms only, based on short range.
     * Total virial is computed in global_stat, called from do_md
     */
    f_calc_vir(start, start+homenr, x, f, vir_part, graph, box);
    inc_nrnb(nrnb, eNR_VIRIAL, homenr);

    /* Add position restraint contribution */
    for (i = 0; i < DIM; i++)
    {
        vir_part[i][i] += fr->vir_diag_posres[i];
    }

    /* Add wall contribution */
    for (i = 0; i < DIM; i++)
    {
        vir_part[i][ZZ] += fr->vir_wall_z[i];
    }

    if (debug)
    {
        pr_rvecs(debug, 0, "vir_part", vir_part, DIM);
    }
}

static void posres_wrapper(FILE *fplog,
                           int flags,
                           gmx_bool bSepDVDL,
                           t_inputrec *ir,
                           t_nrnb *nrnb,
                           gmx_localtop_t *top,
                           matrix box, rvec x[],
                           gmx_enerdata_t *enerd,
                           real *lambda,
                           t_forcerec *fr)
{
    t_pbc pbc;
    real  v, dvdl;
    int   i;

    /* Position restraints always require full pbc */
    set_pbc(&pbc, ir->ePBC, box);
    dvdl = 0;
    v    = posres(top->idef.il[F_POSRES].nr, top->idef.il[F_POSRES].iatoms,
                  top->idef.iparams_posres,
                  (const rvec*)x, fr->f_novirsum, fr->vir_diag_posres,
                  ir->ePBC == epbcNONE ? NULL : &pbc,
                  lambda[efptRESTRAINT], &dvdl,
                  fr->rc_scaling, fr->ePBC, fr->posres_com, fr->posres_comB);
    if (bSepDVDL)
    {
        gmx_print_sepdvdl(fplog, interaction_function[F_POSRES].longname, v, dvdl);
    }
    enerd->term[F_POSRES] += v;
    /* If just the force constant changes, the FEP term is linear,
     * but if the reference position changes, it is not.
     */
    enerd->dvdl_nonlin[efptRESTRAINT] += dvdl;
    inc_nrnb(nrnb, eNR_POSRES, top->idef.il[F_POSRES].nr/2);

    if ((ir->fepvals->n_lambda > 0) && (flags & GMX_FORCE_DHDL))
    {
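        /* Also evaluate the restraint energy at all foreign lambda
         * values, so free-energy estimators have dH/dl data */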
        for (i = 0; i < enerd->n_lambda; i++)
        {
            real dvdl_dum = 0, lambda_dum;

            lambda_dum = (i == 0 ? lambda[efptRESTRAINT] : ir->fepvals->all_lambda[efptRESTRAINT][i-1]);
            v          = posres(top->idef.il[F_POSRES].nr, top->idef.il[F_POSRES].iatoms,
                                top->idef.iparams_posres,
                                (const rvec*)x, NULL, NULL,
                                ir->ePBC == epbcNONE ? NULL : &pbc, lambda_dum, &dvdl_dum,
                                fr->rc_scaling, fr->ePBC, fr->posres_com, fr->posres_comB);
            enerd->enerpart_lambda[i] += v;
        }
    }
}

static void pull_potential_wrapper(FILE *fplog,
                                   gmx_bool bSepDVDL,
                                   t_commrec *cr,
                                   t_inputrec *ir,
                                   matrix box, rvec x[],
                                   rvec f[],
                                   tensor vir_force,
                                   t_mdatoms *mdatoms,
                                   gmx_enerdata_t *enerd,
                                   real *lambda,
                                   double t)
{
    t_pbc  pbc;
    real   dvdl;

    /* Calculate the center of mass forces, this requires communication,
     * which is why pull_potential is called close to other communication.
     * The virial contribution is calculated directly,
     * which is why we call pull_potential after calc_virial.
     */
    set_pbc(&pbc, ir->ePBC, box);
    dvdl                     = 0;
    enerd->term[F_COM_PULL] +=
        pull_potential(ir->ePull, ir->pull, mdatoms, &pbc,
                       cr, t, lambda[efptRESTRAINT], x, f, vir_force, &dvdl);
    if (bSepDVDL)
    {
        gmx_print_sepdvdl(fplog, "Com pull", enerd->term[F_COM_PULL], dvdl);
    }
    enerd->dvdl_lin[efptRESTRAINT] += dvdl;
}

static void pme_receive_force_ener(FILE           *fplog,
                                   gmx_bool        bSepDVDL,
                                   t_commrec      *cr,
                                   gmx_wallcycle_t wcycle,
                                   gmx_enerdata_t *enerd,
                                   t_forcerec     *fr)
{
    real   e, v, dvdl;
    float  cycles_ppdpme, cycles_seppme;

    cycles_ppdpme = wallcycle_stop(wcycle, ewcPPDURINGPME);
    dd_cycles_add(cr->dd, cycles_ppdpme, ddCyclPPduringPME);

    /* In case of node-splitting, the PP nodes receive the long-range
     * forces, virial and energy from the PME nodes here.
     */
    wallcycle_start(wcycle, ewcPP_PMEWAITRECVF);
    dvdl = 0;
    gmx_pme_receive_f(cr, fr->f_novirsum, fr->vir_el_recip, &e, &dvdl,
                      &cycles_seppme);
    if (bSepDVDL)
    {
        gmx_print_sepdvdl(fplog, "PME mesh", e, dvdl);
    }
    enerd->term[F_COUL_RECIP] += e;
    enerd->dvdl_lin[efptCOUL] += dvdl;
    if (wcycle)
    {
        dd_cycles_add(cr->dd, cycles_seppme, ddCyclPME);
    }
    wallcycle_stop(wcycle, ewcPP_PMEWAITRECVF);
}

static void print_large_forces(FILE *fp, t_mdatoms *md, t_commrec *cr,
                               gmx_large_int_t step, real pforce, rvec *x, rvec *f)
{
    int  i;
    real pf2, fn2;
    char buf[STEPSTRSIZE];

    pf2 = sqr(pforce);
    for (i = md->start; i < md->start+md->homenr; i++)
    {
        fn2 = norm2(f[i]);
        /* We also catch NAN, if the compiler does not optimize this away. */
        if (fn2 >= pf2 || fn2 != fn2)
        {
            fprintf(fp, "step %s  atom %6d  x %8.3f %8.3f %8.3f  force %12.5e\n",
                    gmx_step_str(step, buf),
                    ddglatnr(cr->dd, i), x[i][XX], x[i][YY], x[i][ZZ], sqrt(fn2));
        }
    }
}

static void post_process_forces(t_commrec *cr,
                                gmx_large_int_t step,
                                t_nrnb *nrnb, gmx_wallcycle_t wcycle,
                                gmx_localtop_t *top,
                                matrix box, rvec x[],
                                rvec f[],
                                tensor vir_force,
                                t_mdatoms *mdatoms,
                                t_graph *graph,
                                t_forcerec *fr, gmx_vsite_t *vsite,
                                int flags)
{
    if (fr->bF_NoVirSum)
    {
        if (vsite)
        {
            /* Spread the mesh force on virtual sites to the other particles...
             * This is parallelized. MPI communication is performed
             * if the constructing atoms aren't local.
             */
            wallcycle_start(wcycle, ewcVSITESPREAD);
            spread_vsite_f(vsite, x, fr->f_novirsum, NULL,
                           (flags & GMX_FORCE_VIRIAL), fr->vir_el_recip,
                           nrnb,
                           &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr);
            wallcycle_stop(wcycle, ewcVSITESPREAD);
        }
        if (flags & GMX_FORCE_VIRIAL)
        {
            /* Now add the forces, this is local */
            if (fr->bDomDec)
            {
                sum_forces(0, fr->f_novirsum_n, f, fr->f_novirsum);
            }
            else
            {
                sum_forces(mdatoms->start, mdatoms->start+mdatoms->homenr,
                           f, fr->f_novirsum);
            }
            if (EEL_FULL(fr->eeltype))
            {
                /* Add the mesh contribution to the virial */
                m_add(vir_force, fr->vir_el_recip, vir_force);
            }
            if (debug)
            {
                pr_rvecs(debug, 0, "vir_force", vir_force, DIM);
            }
        }
    }

    if (fr->print_force >= 0)
    {
        print_large_forces(stderr, mdatoms, cr, step, fr->print_force, x, f);
    }
}

static void do_nb_verlet(t_forcerec *fr,
                         interaction_const_t *ic,
                         gmx_enerdata_t *enerd,
                         int flags, int ilocality,
                         int clearF,
                         t_nrnb *nrnb,
                         gmx_wallcycle_t wcycle)
{
    int                        nnbl, kernel_type, enr_nbnxn_kernel_ljc, enr_nbnxn_kernel_lj;
    char                      *env;
    nonbonded_verlet_group_t  *nbvg;
    gmx_bool                   bCUDA;

    if (!(flags & GMX_FORCE_NONBONDED))
    {
        /* skip non-bonded calculation */
        return;
    }

    nbvg = &fr->nbv->grp[ilocality];

    if (fr->cutoff_scheme != ecutsVERLET)
    {
        gmx_incons("Invalid cut-off scheme passed!");
    }

    bCUDA = (nbvg->kernel_type == nbnxnk8x8x8_CUDA);

    /* CUDA kernel launch overhead is timed separately, so the nonbonded
     * sub-counter is only used for CPU kernels */
    if (!bCUDA)
    {
        wallcycle_sub_start(wcycle, ewcsNONBONDED);
    }
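    /* Dispatch to the kernel flavor chosen at setup: plain-C 4x4
     * reference, 4xN or 2xNN SIMD, CUDA 8x8x8, or the plain-C
     * GPU-emulation kernel */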
    switch (nbvg->kernel_type)
    {
        case nbnxnk4x4_PlainC:
            nbnxn_kernel_ref(&nbvg->nbl_lists,
                             nbvg->nbat, ic,
                             fr->shift_vec,
                             flags,
                             clearF,
                             fr->fshift[0],
                             enerd->grpp.ener[egCOULSR],
                             fr->bBHAM ?
                             enerd->grpp.ener[egBHAMSR] :
                             enerd->grpp.ener[egLJSR]);
            break;

        case nbnxnk4xN_SIMD_4xN:
            nbnxn_kernel_simd_4xn(&nbvg->nbl_lists,
                                  nbvg->nbat, ic,
                                  nbvg->ewald_excl,
                                  fr->shift_vec,
                                  flags,
                                  clearF,
                                  fr->fshift[0],
                                  enerd->grpp.ener[egCOULSR],
                                  fr->bBHAM ?
                                  enerd->grpp.ener[egBHAMSR] :
                                  enerd->grpp.ener[egLJSR]);
            break;
        case nbnxnk4xN_SIMD_2xNN:
            nbnxn_kernel_simd_2xnn(&nbvg->nbl_lists,
                                   nbvg->nbat, ic,
                                   nbvg->ewald_excl,
                                   fr->shift_vec,
                                   flags,
                                   clearF,
                                   fr->fshift[0],
                                   enerd->grpp.ener[egCOULSR],
                                   fr->bBHAM ?
                                   enerd->grpp.ener[egBHAMSR] :
                                   enerd->grpp.ener[egLJSR]);
            break;

        case nbnxnk8x8x8_CUDA:
            nbnxn_cuda_launch_kernel(fr->nbv->cu_nbv, nbvg->nbat, flags, ilocality);
            break;

        case nbnxnk8x8x8_PlainC:
            nbnxn_kernel_gpu_ref(nbvg->nbl_lists.nbl[0],
                                 nbvg->nbat, ic,
                                 fr->shift_vec,
                                 flags,
                                 clearF,
                                 nbvg->nbat->out[0].f,
                                 fr->fshift[0],
                                 enerd->grpp.ener[egCOULSR],
                                 fr->bBHAM ?
                                 enerd->grpp.ener[egBHAMSR] :
                                 enerd->grpp.ener[egLJSR]);
            break;

        default:
            gmx_incons("Invalid nonbonded kernel type passed!");

    }
    if (!bCUDA)
    {
        wallcycle_sub_stop(wcycle, ewcsNONBONDED);
    }

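    /* Book flop counts matching the electrostatics flavor of the
     * kernel that was just run */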
    if (EEL_RF(ic->eeltype) || ic->eeltype == eelCUT)
    {
        enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_RF;
    }
    else if ((!bCUDA && nbvg->ewald_excl == ewaldexclAnalytical) ||
             (bCUDA && nbnxn_cuda_is_kernel_ewald_analytical(fr->nbv->cu_nbv)))
    {
        enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_EWALD;
    }
    else
    {
        enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_TAB;
    }
    enr_nbnxn_kernel_lj = eNR_NBNXN_LJ;
    if (flags & GMX_FORCE_ENERGY)
    {
        /* In eNR_??? the nbnxn F+E kernels are always the F kernel + 1 */
        enr_nbnxn_kernel_ljc += 1;
        enr_nbnxn_kernel_lj  += 1;
    }

    inc_nrnb(nrnb, enr_nbnxn_kernel_ljc,
             nbvg->nbl_lists.natpair_ljq);
    inc_nrnb(nrnb, enr_nbnxn_kernel_lj,
             nbvg->nbl_lists.natpair_lj);
    inc_nrnb(nrnb, enr_nbnxn_kernel_ljc-eNR_NBNXN_LJ_RF+eNR_NBNXN_RF,
             nbvg->nbl_lists.natpair_q);
}

void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
                         t_inputrec *inputrec,
                         gmx_large_int_t step, t_nrnb *nrnb, gmx_wallcycle_t wcycle,
                         gmx_localtop_t *top,
                         gmx_groups_t gmx_unused *groups,
                         matrix box, rvec x[], history_t *hist,
                         rvec f[],
                         tensor vir_force,
                         t_mdatoms *mdatoms,
                         gmx_enerdata_t *enerd, t_fcdata *fcd,
                         real *lambda, t_graph *graph,
                         t_forcerec *fr, interaction_const_t *ic,
                         gmx_vsite_t *vsite, rvec mu_tot,
                         double t, FILE *field, gmx_edsam_t ed,
                         gmx_bool bBornRadii,
                         int flags)
{
    int                 cg0, cg1, i, j;
    int                 start, homenr;
    int                 nb_kernel_type;
    double              mu[2*DIM];
    gmx_bool            bSepDVDL, bStateChanged, bNS, bFillGrid, bCalcCGCM, bBS;
    gmx_bool            bDoLongRange, bDoForces, bSepLRF, bUseGPU, bUseOrEmulGPU;
    gmx_bool            bDiffKernels = FALSE;
    matrix              boxs;
    rvec                vzero, box_diag;
    real                e, v, dvdl;
    float               cycles_pme, cycles_force;
    nonbonded_verlet_t *nbv;

    cycles_force   = 0;
    nbv            = fr->nbv;
    nb_kernel_type = fr->nbv->grp[0].kernel_type;

    start  = mdatoms->start;
    homenr = mdatoms->homenr;

    bSepDVDL = (fr->bSepDVDL && do_per_step(step, inputrec->nstlog));

    clear_mat(vir_force);

    cg0 = 0;
    if (DOMAINDECOMP(cr))
    {
        cg1 = cr->dd->ncg_tot;
    }
    else
    {
        cg1 = top->cgs.nr;
    }
    if (fr->n_tpi > 0)
    {
        cg1--;
    }

    bStateChanged = (flags & GMX_FORCE_STATECHANGED);
    bNS           = (flags & GMX_FORCE_NS) && (fr->bAllvsAll == FALSE);
    bFillGrid     = (bNS && bStateChanged);
    bCalcCGCM     = (bFillGrid && !DOMAINDECOMP(cr));
    bDoLongRange  = (fr->bTwinRange && bNS && (flags & GMX_FORCE_DO_LR));
    bDoForces     = (flags & GMX_FORCE_FORCES);
    bSepLRF       = (bDoLongRange && bDoForces && (flags & GMX_FORCE_SEPLRF));
    bUseGPU       = fr->nbv->bUseGPU;
    bUseOrEmulGPU = bUseGPU || (nbv->grp[0].kernel_type == nbnxnk8x8x8_PlainC);

    if (bStateChanged)
    {
        update_forcerec(fr, box);

        if (NEED_MUTOT(*inputrec))
        {
            /* Calculate total (local) dipole moment in a temporary common array.
             * This makes it possible to sum them over nodes faster.
             */
            calc_mu(start, homenr,
                    x, mdatoms->chargeA, mdatoms->chargeB, mdatoms->nChargePerturbed,
                    mu, mu+DIM);
        }
    }

    if (fr->ePBC != epbcNONE)
    {
        /* Compute shift vectors every step,
         * because of pressure coupling or box deformation!
         */
        if ((flags & GMX_FORCE_DYNAMICBOX) && bStateChanged)
        {
            calc_shifts(box, fr->shift_vec);
        }

        if (bCalcCGCM)
        {
            put_atoms_in_box_omp(fr->ePBC, box, homenr, x);
            inc_nrnb(nrnb, eNR_SHIFTX, homenr);
        }
        else if (EI_ENERGY_MINIMIZATION(inputrec->eI) && graph)
        {
            unshift_self(graph, box, x);
        }
    }

    nbnxn_atomdata_copy_shiftvec(flags & GMX_FORCE_DYNAMICBOX,
                                 fr->shift_vec, nbv->grp[0].nbat);

#ifdef GMX_MPI
    if (!(cr->duty & DUTY_PME))
    {
        /* Send particle coordinates to the pme nodes.
         * Since this is only implemented for domain decomposition
         * and domain decomposition does not use the graph,
         * we do not need to worry about shifting.
         */

        wallcycle_start(wcycle, ewcPP_PMESENDX);

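        /* With two walls, the Ewald sum uses a box elongated in z
         * by wall_ewald_zfac */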
        bBS = (inputrec->nwall == 2);
        if (bBS)
        {
            copy_mat(box, boxs);
            svmul(inputrec->wall_ewald_zfac, boxs[ZZ], boxs[ZZ]);
        }

        gmx_pme_send_x(cr, bBS ? boxs : box, x,
                       mdatoms->nChargePerturbed, lambda[efptCOUL],
                       (flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY)), step);

        wallcycle_stop(wcycle, ewcPP_PMESENDX);
    }
#endif /* GMX_MPI */

    /* do gridding for pair search */
    if (bNS)
    {
        if (graph && bStateChanged)
        {
            /* Calculate intramolecular shift vectors to make molecules whole */
            mk_mshift(fplog, graph, fr->ePBC, box, x);
        }

        clear_rvec(vzero);
        box_diag[XX] = box[XX][XX];
        box_diag[YY] = box[YY][YY];
        box_diag[ZZ] = box[ZZ][ZZ];

        wallcycle_start(wcycle, ewcNS);
        if (!fr->bDomDec)
        {
            wallcycle_sub_start(wcycle, ewcsNBS_GRID_LOCAL);
            nbnxn_put_on_grid(nbv->nbs, fr->ePBC, box,
                              0, vzero, box_diag,
                              0, mdatoms->homenr, -1, fr->cginfo, x,
                              0, NULL,
                              nbv->grp[eintLocal].kernel_type,
                              nbv->grp[eintLocal].nbat);
            wallcycle_sub_stop(wcycle, ewcsNBS_GRID_LOCAL);
        }
        else
        {
            wallcycle_sub_start(wcycle, ewcsNBS_GRID_NONLOCAL);
            nbnxn_put_on_grid_nonlocal(nbv->nbs, domdec_zones(cr->dd),
                                       fr->cginfo, x,
                                       nbv->grp[eintNonlocal].kernel_type,
                                       nbv->grp[eintNonlocal].nbat);
            wallcycle_sub_stop(wcycle, ewcsNBS_GRID_NONLOCAL);
        }

        if (nbv->ngrp == 1 ||
            nbv->grp[eintNonlocal].nbat == nbv->grp[eintLocal].nbat)
        {
            nbnxn_atomdata_set(nbv->grp[eintLocal].nbat, eatAll,
                               nbv->nbs, mdatoms, fr->cginfo);
        }
        else
        {
            nbnxn_atomdata_set(nbv->grp[eintLocal].nbat, eatLocal,
                               nbv->nbs, mdatoms, fr->cginfo);
            nbnxn_atomdata_set(nbv->grp[eintNonlocal].nbat, eatAll,
                               nbv->nbs, mdatoms, fr->cginfo);
        }
        wallcycle_stop(wcycle, ewcNS);
    }

    /* initialize the GPU atom data and copy shift vector */
    if (bUseGPU)
    {
        if (bNS)
        {
            wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU_NB);
            nbnxn_cuda_init_atomdata(nbv->cu_nbv, nbv->grp[eintLocal].nbat);
            wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
        }

        wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU_NB);
        nbnxn_cuda_upload_shiftvec(nbv->cu_nbv, nbv->grp[eintLocal].nbat);
        wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
    }

    /* do local pair search */
    if (bNS)
    {
        wallcycle_start_nocount(wcycle, ewcNS);
        wallcycle_sub_start(wcycle, ewcsNBS_SEARCH_LOCAL);
        nbnxn_make_pairlist(nbv->nbs, nbv->grp[eintLocal].nbat,
                            &top->excls,
                            ic->rlist,
                            nbv->min_ci_balanced,
                            &nbv->grp[eintLocal].nbl_lists,
                            eintLocal,
                            nbv->grp[eintLocal].kernel_type,
                            nrnb);
        wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_LOCAL);

        if (bUseGPU)
        {
            /* initialize local pair-list on the GPU */
            nbnxn_cuda_init_pairlist(nbv->cu_nbv,
                                     nbv->grp[eintLocal].nbl_lists.nbl[0],
                                     eintLocal);
        }
        wallcycle_stop(wcycle, ewcNS);
    }
    else
    {
        wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
        wallcycle_sub_start(wcycle, ewcsNB_X_BUF_OPS);
        nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs, eatLocal, FALSE, x,
                                        nbv->grp[eintLocal].nbat);
        wallcycle_sub_stop(wcycle, ewcsNB_X_BUF_OPS);
        wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
    }

    if (bUseGPU)
    {
        wallcycle_start(wcycle, ewcLAUNCH_GPU_NB);
        /* launch local nonbonded F on GPU */
        do_nb_verlet(fr, ic, enerd, flags, eintLocal, enbvClearFNo,
                     nrnb, wcycle);
        wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
    }

    /* Communicate coordinates and sum dipole if necessary +
       do non-local pair search */
    if (DOMAINDECOMP(cr))
    {
        bDiffKernels = (nbv->grp[eintNonlocal].kernel_type !=
                        nbv->grp[eintLocal].kernel_type);

        if (bDiffKernels)
        {
            /* With GPU+CPU non-bonded calculations we need to copy
             * the local coordinates to the non-local nbat struct
             * (in CPU format) as the non-local kernel call also
             * calculates the local - non-local interactions.
             */
            wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
            wallcycle_sub_start(wcycle, ewcsNB_X_BUF_OPS);
            nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs, eatLocal, TRUE, x,
                                            nbv->grp[eintNonlocal].nbat);
            wallcycle_sub_stop(wcycle, ewcsNB_X_BUF_OPS);
            wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
        }

        if (bNS)
        {
            wallcycle_start_nocount(wcycle, ewcNS);
            wallcycle_sub_start(wcycle, ewcsNBS_SEARCH_NONLOCAL);

            if (bDiffKernels)
            {
                nbnxn_grid_add_simple(nbv->nbs, nbv->grp[eintNonlocal].nbat);
            }

            nbnxn_make_pairlist(nbv->nbs, nbv->grp[eintNonlocal].nbat,
                                &top->excls,
                                ic->rlist,
                                nbv->min_ci_balanced,
                                &nbv->grp[eintNonlocal].nbl_lists,
                                eintNonlocal,
                                nbv->grp[eintNonlocal].kernel_type,
                                nrnb);

            wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_NONLOCAL);

            if (nbv->grp[eintNonlocal].kernel_type == nbnxnk8x8x8_CUDA)
            {
                /* initialize non-local pair-list on the GPU */
                nbnxn_cuda_init_pairlist(nbv->cu_nbv,
                                         nbv->grp[eintNonlocal].nbl_lists.nbl[0],
                                         eintNonlocal);
            }
            wallcycle_stop(wcycle, ewcNS);
        }
        else
        {
            wallcycle_start(wcycle, ewcMOVEX);
            dd_move_x(cr->dd, box, x);

            /* When we don't need the total dipole we sum it in global_stat */
            if (bStateChanged && NEED_MUTOT(*inputrec))
            {
                gmx_sumd(2*DIM, mu, cr);
            }
            wallcycle_stop(wcycle, ewcMOVEX);

            wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
            wallcycle_sub_start(wcycle, ewcsNB_X_BUF_OPS);
            nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs, eatNonlocal, FALSE, x,
                                            nbv->grp[eintNonlocal].nbat);
            wallcycle_sub_stop(wcycle, ewcsNB_X_BUF_OPS);
            cycles_force += wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
        }

        if (bUseGPU && !bDiffKernels)
        {
            wallcycle_start(wcycle, ewcLAUNCH_GPU_NB);
            /* launch non-local nonbonded F on GPU */
            do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFNo,
                         nrnb, wcycle);
            cycles_force += wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
        }
    }

    if (bUseGPU)
    {
        /* launch D2H copy-back F */
        wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU_NB);
        if (DOMAINDECOMP(cr) && !bDiffKernels)
        {
            nbnxn_cuda_launch_cpyback(nbv->cu_nbv, nbv->grp[eintNonlocal].nbat,
                                      flags, eatNonlocal);
        }
        nbnxn_cuda_launch_cpyback(nbv->cu_nbv, nbv->grp[eintLocal].nbat,
                                  flags, eatLocal);
        cycles_force += wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
    }

    if (bStateChanged && NEED_MUTOT(*inputrec))
    {
        if (PAR(cr))
        {
            gmx_sumd(2*DIM, mu, cr);
        }

        for (i = 0; i < 2; i++)
        {
            for (j = 0; j < DIM; j++)
            {
                fr->mu_tot[i][j] = mu[i*DIM + j];
            }
        }
    }
    if (fr->efep == efepNO)
    {
        copy_rvec(fr->mu_tot[0], mu_tot);
    }
    else
    {
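        /* With free-energy perturbation, interpolate the total dipole
         * between the A and B charge states */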
        for (j = 0; j < DIM; j++)
        {
            mu_tot[j] =
                (1.0 - lambda[efptCOUL])*fr->mu_tot[0][j] +
                lambda[efptCOUL]*fr->mu_tot[1][j];
        }
    }

    /* Reset energies */
    reset_enerdata(fr, bNS, enerd, MASTER(cr));
    clear_rvecs(SHIFTS, fr->fshift);

    if (DOMAINDECOMP(cr))
    {
        if (!(cr->duty & DUTY_PME))
        {
            wallcycle_start(wcycle, ewcPPDURINGPME);
            dd_force_flop_start(cr->dd, nrnb);
        }
    }

    if (inputrec->bRot)
    {
        /* Enforced rotation has its own cycle counter that starts after the collective
         * coordinates have been communicated. It is added to ddCyclF to allow
         * for proper load-balancing */
        wallcycle_start(wcycle, ewcROT);
        do_rotation(cr, inputrec, box, x, t, step, wcycle, bNS);
        wallcycle_stop(wcycle, ewcROT);
    }

    /* Start the force cycle counter.
     * This counter is stopped in do_force_lowlevel.
     * No parallel communication should occur while this counter is running,
     * since that will interfere with the dynamic load balancing.
     */
    wallcycle_start(wcycle, ewcFORCE);
    if (bDoForces)
    {
        /* Reset forces for which the virial is calculated separately:
         * PME/Ewald forces if necessary */
        if (fr->bF_NoVirSum)
        {
            if (flags & GMX_FORCE_VIRIAL)
            {
                fr->f_novirsum = fr->f_novirsum_alloc;
                if (fr->bDomDec)
                {
                    clear_rvecs(fr->f_novirsum_n, fr->f_novirsum);
                }
                else
                {
                    clear_rvecs(homenr, fr->f_novirsum+start);
                }
            }
            else
            {
                /* We are not calculating the pressure so we do not need
                 * a separate array for forces that do not contribute
                 * to the pressure.
                 */
                fr->f_novirsum = f;
            }
        }

        /* Clear the short- and long-range forces */
        clear_rvecs(fr->natoms_force_constr, f);
        if (bSepLRF && do_per_step(step, inputrec->nstcalclr))
        {
            clear_rvecs(fr->natoms_force_constr, fr->f_twin);
        }

        clear_rvec(fr->vir_diag_posres);
    }

    if (inputrec->ePull == epullCONSTRAINT)
    {
        clear_pull_forces(inputrec->pull);
    }

    /* We calculate the non-bonded forces, when done on the CPU, here.
     * We do this before calling do_force_lowlevel, since there the bonded
     * forces are calculated before PME, which does communication.
     * With this order, non-bonded and bonded force calculation imbalance
     * can be balanced out by the domain decomposition load balancing.
     */

    if (!bUseOrEmulGPU)
    {
        /* Maybe we should move this into do_force_lowlevel */
        do_nb_verlet(fr, ic, enerd, flags, eintLocal, enbvClearFYes,
                     nrnb, wcycle);
    }

    if (!bUseOrEmulGPU || bDiffKernels)
    {
        int aloc;

        if (DOMAINDECOMP(cr))
        {
            do_nb_verlet(fr, ic, enerd, flags, eintNonlocal,
                         bDiffKernels ? enbvClearFYes : enbvClearFNo,
                         nrnb, wcycle);
        }

        if (!bUseOrEmulGPU)
        {
            aloc = eintLocal;
        }
        else
        {
            aloc = eintNonlocal;
        }

        /* Add all the non-bonded forces to the normal force array.
         * This can be split into a local and a non-local part when overlapping
         * communication with calculation with domain decomposition.
         */
        cycles_force += wallcycle_stop(wcycle, ewcFORCE);
        wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
        wallcycle_sub_start(wcycle, ewcsNB_F_BUF_OPS);
        nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs, eatAll, nbv->grp[aloc].nbat, f);
        wallcycle_sub_stop(wcycle, ewcsNB_F_BUF_OPS);
        cycles_force += wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
        wallcycle_start_nocount(wcycle, ewcFORCE);

        /* if there are multiple fshift output buffers reduce them */
        if ((flags & GMX_FORCE_VIRIAL) &&
            nbv->grp[aloc].nbl_lists.nnbl > 1)
        {
            nbnxn_atomdata_add_nbat_fshift_to_fshift(nbv->grp[aloc].nbat,
                                                     fr->fshift);
        }
    }

    /* update QMMMrec, if necessary */
    if (fr->bQMMM)
    {
        update_QMMMrec(cr, fr, x, mdatoms, box, top);
    }

    if ((flags & GMX_FORCE_BONDED) && top->idef.il[F_POSRES].nr > 0)
    {
        posres_wrapper(fplog, flags, bSepDVDL, inputrec, nrnb, top, box, x,
                       enerd, lambda, fr);
    }

    /* Compute the bonded and non-bonded energies and optionally forces */
    do_force_lowlevel(fplog, step, fr, inputrec, &(top->idef),
                      cr, nrnb, wcycle, mdatoms,
                      x, hist, f, bSepLRF ? fr->f_twin : f, enerd, fcd, top, fr->born,
                      &(top->atomtypes), bBornRadii, box,
                      inputrec->fepvals, lambda, graph, &(top->excls), fr->mu_tot,
                      flags, &cycles_pme);

    if (bSepLRF)
    {
        if (do_per_step(step, inputrec->nstcalclr))
        {
            /* Add the long range forces to the short range forces */
            for (i = 0; i < fr->natoms_force_constr; i++)
            {
                rvec_add(fr->f_twin[i], f[i], f[i]);
            }
        }
    }

    cycles_force += wallcycle_stop(wcycle, ewcFORCE);

    if (ed)
    {
        do_flood(cr, inputrec, x, f, ed, box, step, bNS);
    }

    if (bUseOrEmulGPU && !bDiffKernels)
    {
        /* wait for non-local forces (or calculate in emulation mode) */
        if (DOMAINDECOMP(cr))
        {
            if (bUseGPU)
            {
                wallcycle_start(wcycle, ewcWAIT_GPU_NB_NL);
                nbnxn_cuda_wait_gpu(nbv->cu_nbv,
                                    nbv->grp[eintNonlocal].nbat,
                                    flags, eatNonlocal,
                                    enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
                                    fr->fshift);
                cycles_force += wallcycle_stop(wcycle, ewcWAIT_GPU_NB_NL);
            }
            else
            {
                wallcycle_start_nocount(wcycle, ewcFORCE);
                do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFYes,
                             nrnb, wcycle);
                cycles_force += wallcycle_stop(wcycle, ewcFORCE);
            }
            wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
            wallcycle_sub_start(wcycle, ewcsNB_F_BUF_OPS);
            /* skip the reduction if there was no non-local work to do */
            if (nbv->grp[eintNonlocal].nbl_lists.nbl[0]->nsci > 0)
            {
                nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs, eatNonlocal,
                                               nbv->grp[eintNonlocal].nbat, f);
            }
            wallcycle_sub_stop(wcycle, ewcsNB_F_BUF_OPS);
            cycles_force += wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
        }
    }

    if (bDoForces)
    {
        /* Communicate the forces */
        if (PAR(cr))
        {
            wallcycle_start(wcycle, ewcMOVEF);
            if (DOMAINDECOMP(cr))
            {
                dd_move_f(cr->dd, f, fr->fshift);
                /* Do we need to communicate the separate force array
                 * for terms that do not contribute to the single sum virial?
                 * Position restraints and electric fields do not introduce
                 * inter-cg forces, only full electrostatics methods do.
                 * When we do not calculate the virial, fr->f_novirsum = f,
                 * so we have already communicated these forces.
                 */
                if (EEL_FULL(fr->eeltype) && cr->dd->n_intercg_excl &&
                    (flags & GMX_FORCE_VIRIAL))
                {
                    dd_move_f(cr->dd, fr->f_novirsum, NULL);
                }
                if (bSepLRF)
                {
                    /* We should not update the shift forces here,
                     * since f_twin is already included in f.
                     */
                    dd_move_f(cr->dd, fr->f_twin, NULL);
                }
            }
            wallcycle_stop(wcycle, ewcMOVEF);
        }
    }

    if (bUseOrEmulGPU)
    {
        /* wait for local forces (or calculate in emulation mode) */
        if (bUseGPU)
        {
            wallcycle_start(wcycle, ewcWAIT_GPU_NB_L);
            nbnxn_cuda_wait_gpu(nbv->cu_nbv,
                                nbv->grp[eintLocal].nbat,
                                flags, eatLocal,
                                enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
                                fr->fshift);
            wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);

            /* now clear the GPU outputs while we finish the step on the CPU */

            wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU_NB);
            nbnxn_cuda_clear_outputs(nbv->cu_nbv, flags);
            wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
        }
        else
        {
            wallcycle_start_nocount(wcycle, ewcFORCE);
            do_nb_verlet(fr, ic, enerd, flags, eintLocal,
                         DOMAINDECOMP(cr) ? enbvClearFNo : enbvClearFYes,
                         nrnb, wcycle);
            wallcycle_stop(wcycle, ewcFORCE);
        }
        wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
        wallcycle_sub_start(wcycle, ewcsNB_F_BUF_OPS);
        if (nbv->grp[eintLocal].nbl_lists.nbl[0]->nsci > 0)
        {
            /* skip the reduction if there was no local work to do */
            nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs, eatLocal,
                                           nbv->grp[eintLocal].nbat, f);
        }
        wallcycle_sub_stop(wcycle, ewcsNB_F_BUF_OPS);
        wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
    }

    if (DOMAINDECOMP(cr))
    {
        dd_force_flop_stop(cr->dd, nrnb);
        if (wcycle)
        {
            dd_cycles_add(cr->dd, cycles_force-cycles_pme, ddCyclF);
        }
    }

    if (bDoForces)
    {
        if (IR_ELEC_FIELD(*inputrec))
        {
            /* Compute forces due to electric field */
            calc_f_el(MASTER(cr) ? field : NULL,
                      start, homenr, mdatoms->chargeA, fr->f_novirsum,
                      inputrec->ex, inputrec->et, t);
        }

        /* If we have NoVirSum forces, but we do not calculate the virial,
         * we sum fr->f_novirsum = f later.
         */
        if (vsite && !(fr->bF_NoVirSum && !(flags & GMX_FORCE_VIRIAL)))
        {
            wallcycle_start(wcycle, ewcVSITESPREAD);
            spread_vsite_f(vsite, x, f, fr->fshift, FALSE, NULL, nrnb,
                           &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr);
            wallcycle_stop(wcycle, ewcVSITESPREAD);

            if (bSepLRF)
            {
                wallcycle_start(wcycle, ewcVSITESPREAD);
                spread_vsite_f(vsite, x, fr->f_twin, NULL, FALSE, NULL,
                               nrnb,
                               &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr);
                wallcycle_stop(wcycle, ewcVSITESPREAD);
            }
        }

        if (flags & GMX_FORCE_VIRIAL)
        {
            /* Calculation of the virial must be done after vsites! */
            calc_virial(mdatoms->start, mdatoms->homenr, x, f,
                        vir_force, graph, box, nrnb, fr, inputrec->ePBC);
        }
    }

    if (inputrec->ePull == epullUMBRELLA || inputrec->ePull == epullCONST_F)
    {
        pull_potential_wrapper(fplog, bSepDVDL, cr, inputrec, box, x,
                               f, vir_force, mdatoms, enerd, lambda, t);
    }

    /* Add the forces from enforced rotation potentials (if any) */
    if (inputrec->bRot)
    {
        wallcycle_start(wcycle, ewcROTadd);
        enerd->term[F_COM_PULL] += add_rot_forces(inputrec->rot, f, cr, step, t);
        wallcycle_stop(wcycle, ewcROTadd);
    }

    if (PAR(cr) && !(cr->duty & DUTY_PME))
    {
        /* In case of node-splitting, the PP nodes receive the long-range
         * forces, virial and energy from the PME nodes here.
         */
        pme_receive_force_ener(fplog, bSepDVDL, cr, wcycle, enerd, fr);
    }

    if (bDoForces)
    {
        post_process_forces(cr, step, nrnb, wcycle,
                            top, box, x, f, vir_force, mdatoms, graph, fr, vsite,
                            flags);
    }

    /* Sum the potential energy terms from group contributions */
    sum_epot(&(enerd->grpp), enerd->term);
}

void do_force_cutsGROUP(FILE *fplog, t_commrec *cr,
                        t_inputrec *inputrec,
                        gmx_large_int_t step, t_nrnb *nrnb, gmx_wallcycle_t wcycle,
                        gmx_localtop_t *top,
                        gmx_groups_t *groups,
                        matrix box, rvec x[], history_t *hist,
                        rvec f[],
                        tensor vir_force,
                        t_mdatoms *mdatoms,
                        gmx_enerdata_t *enerd, t_fcdata *fcd,
                        real *lambda, t_graph *graph,
                        t_forcerec *fr, gmx_vsite_t *vsite, rvec mu_tot,
                        double t, FILE *field, gmx_edsam_t ed,
                        gmx_bool bBornRadii,
                        int flags)
{
    int        cg0, cg1, i, j;
    int        start, homenr;
    double     mu[2*DIM];
    gmx_bool   bSepDVDL, bStateChanged, bNS, bFillGrid, bCalcCGCM, bBS;
    gmx_bool   bDoLongRangeNS, bDoForces, bDoPotential, bSepLRF;
    gmx_bool   bDoAdressWF;
    matrix     boxs;
    rvec       vzero, box_diag;
    real       e, v, dvdlambda[efptNR];
    t_pbc      pbc;
    float      cycles_pme, cycles_force;

    start  = mdatoms->start;
    homenr = mdatoms->homenr;

    bSepDVDL = (fr->bSepDVDL && do_per_step(step, inputrec->nstlog));

    clear_mat(vir_force);

    if (PARTDECOMP(cr))
    {
        pd_cg_range(cr, &cg0, &cg1);
    }
    else
    {
        cg0 = 0;
        if (DOMAINDECOMP(cr))
        {
            cg1 = cr->dd->ncg_tot;
        }
        else
        {
            cg1 = top->cgs.nr;
        }
        if (fr->n_tpi > 0)
        {
            cg1--;
        }
    }

    bStateChanged  = (flags & GMX_FORCE_STATECHANGED);
    bNS            = (flags & GMX_FORCE_NS) && (fr->bAllvsAll == FALSE);
    /* Should we update the long-range neighborlists at this step? */
    bDoLongRangeNS = fr->bTwinRange && bNS;
    bFillGrid      = (bNS && bStateChanged);
    bCalcCGCM      = (bFillGrid && !DOMAINDECOMP(cr));
    bDoForces      = (flags & GMX_FORCE_FORCES);
    bDoPotential   = (flags & GMX_FORCE_ENERGY);
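    /* With separate long-range forces, the long-range contributions are
     * accumulated in fr->f_twin and added to f every nstcalclr steps */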
    bSepLRF        = ((inputrec->nstcalclr > 1) && bDoForces &&
                      (flags & GMX_FORCE_SEPLRF) && (flags & GMX_FORCE_DO_LR));
1412
1413     /* should probably move this to the forcerec since it doesn't change */
1414     bDoAdressWF   = ((fr->adress_type != eAdressOff));
1415
1416     if (bStateChanged)
1417     {
1418         update_forcerec(fr, box);
1419
1420         if (NEED_MUTOT(*inputrec))
1421         {
1422             /* Calculate total (local) dipole moment in a temporary common array.
1423              * This makes it possible to sum them over nodes faster.
1424              */
1425             calc_mu(start, homenr,
1426                     x, mdatoms->chargeA, mdatoms->chargeB, mdatoms->nChargePerturbed,
1427                     mu, mu+DIM);
1428         }
1429     }
1430
1431     if (fr->ePBC != epbcNONE)
1432     {
1433         /* Compute shift vectors every step,
1434          * because of pressure coupling or box deformation!
1435          */
1436         if ((flags & GMX_FORCE_DYNAMICBOX) && bStateChanged)
1437         {
1438             calc_shifts(box, fr->shift_vec);
1439         }
1440
1441         if (bCalcCGCM)
1442         {
1443             put_charge_groups_in_box(fplog, cg0, cg1, fr->ePBC, box,
1444                                      &(top->cgs), x, fr->cg_cm);
1445             inc_nrnb(nrnb, eNR_CGCM, homenr);
1446             inc_nrnb(nrnb, eNR_RESETX, cg1-cg0);
1447         }
1448         else if (EI_ENERGY_MINIMIZATION(inputrec->eI) && graph)
1449         {
1450             unshift_self(graph, box, x);
1451         }
1452     }
1453     else if (bCalcCGCM)
1454     {
1455         calc_cgcm(fplog, cg0, cg1, &(top->cgs), x, fr->cg_cm);
1456         inc_nrnb(nrnb, eNR_CGCM, homenr);
1457     }
1458
1459     if (bCalcCGCM)
1460     {
1461         if (PAR(cr))
1462         {
1463             move_cgcm(fplog, cr, fr->cg_cm);
1464         }
1465         if (gmx_debug_at)
1466         {
1467             pr_rvecs(debug, 0, "cgcm", fr->cg_cm, top->cgs.nr);
1468         }
1469     }
1470
1471 #ifdef GMX_MPI
1472     if (!(cr->duty & DUTY_PME))
1473     {
1474         /* Send particle coordinates to the pme nodes.
1475          * Since this is only implemented for domain decomposition
1476          * and domain decomposition does not use the graph,
1477          * we do not need to worry about shifting.
1478          */
1479
1480         wallcycle_start(wcycle, ewcPP_PMESENDX);
1481
1482         bBS = (inputrec->nwall == 2);
1483         if (bBS)
1484         {
1485             copy_mat(box, boxs);
1486             svmul(inputrec->wall_ewald_zfac, boxs[ZZ], boxs[ZZ]);
1487         }
1488
1489         gmx_pme_send_x(cr, bBS ? boxs : box, x,
1490                        mdatoms->nChargePerturbed, lambda[efptCOUL],
1491                        (flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY)), step);
1492
1493         wallcycle_stop(wcycle, ewcPP_PMESENDX);
1494     }
1495 #endif /* GMX_MPI */
1496
1497     /* Communicate coordinates and sum dipole if necessary */
1498     if (PAR(cr))
1499     {
1500         wallcycle_start(wcycle, ewcMOVEX);
1501         if (DOMAINDECOMP(cr))
1502         {
1503             dd_move_x(cr->dd, box, x);
1504         }
1505         else
1506         {
1507             move_x(cr, x, nrnb);
1508         }
1509         wallcycle_stop(wcycle, ewcMOVEX);
1510     }
1511
1512     /* Update the AdResS weights before the force calculation */
1513     if (bStateChanged && bDoAdressWF)
1514     {
1515         /* need pbc for adress weight calculation with pbc_dx */
1516         set_pbc(&pbc, inputrec->ePBC, box);
1517         if (fr->adress_site == eAdressSITEcog)
1518         {
1519             update_adress_weights_cog(top->idef.iparams, top->idef.il, x, fr, mdatoms,
1520                                       inputrec->ePBC == epbcNONE ? NULL : &pbc);
1521         }
1522         else if (fr->adress_site == eAdressSITEcom)
1523         {
1524             update_adress_weights_com(fplog, cg0, cg1, &(top->cgs), x, fr, mdatoms,
1525                                       inputrec->ePBC == epbcNONE ? NULL : &pbc);
1526         }
1527         else if (fr->adress_site == eAdressSITEatomatom)
1528         {
1529             update_adress_weights_atom_per_atom(cg0, cg1, &(top->cgs), x, fr, mdatoms,
1530                                                 inputrec->ePBC == epbcNONE ? NULL : &pbc);
1531         }
1532         else
1533         {
1534             update_adress_weights_atom(cg0, cg1, &(top->cgs), x, fr, mdatoms,
1535                                        inputrec->ePBC == epbcNONE ? NULL : &pbc);
1536         }
1537     }
1538
1539     if (NEED_MUTOT(*inputrec))
1540     {
1541
1542         if (bStateChanged)
1543         {
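                 /* Sum the 2*DIM dipole components (states A and B) over all nodes */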
1544             if (PAR(cr))
1545             {
1546                 gmx_sumd(2*DIM, mu, cr);
1547             }
1548             for (i = 0; i < 2; i++)
1549             {
1550                 for (j = 0; j < DIM; j++)
1551                 {
1552                     fr->mu_tot[i][j] = mu[i*DIM + j];
1553                 }
1554             }
1555         }
1556         if (fr->efep == efepNO)
1557         {
1558             copy_rvec(fr->mu_tot[0], mu_tot);
1559         }
1560         else
1561         {
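                 /* With free energy, interpolate the total dipole linearly
                  * in lambda: mu_tot = (1 - lambda_coul)*mu_A + lambda_coul*mu_B
                  */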
1562             for (j = 0; j < DIM; j++)
1563             {
1564                 mu_tot[j] =
1565                     (1.0 - lambda[efptCOUL])*fr->mu_tot[0][j] + lambda[efptCOUL]*fr->mu_tot[1][j];
1566             }
1567         }
1568     }
1569
1570     /* Reset energies */
1571     reset_enerdata(fr, bNS, enerd, MASTER(cr));
1572     clear_rvecs(SHIFTS, fr->fshift);
1573
1574     if (bNS)
1575     {
1576         wallcycle_start(wcycle, ewcNS);
1577
1578         if (graph && bStateChanged)
1579         {
1580             /* Calculate intramolecular shift vectors to make molecules whole */
1581             mk_mshift(fplog, graph, fr->ePBC, box, x);
1582         }
1583
1584         /* Do the actual neighbor searching */
1585         ns(fplog, fr, box,
1586            groups, top, mdatoms,
1587            cr, nrnb, bFillGrid,
1588            bDoLongRangeNS);
1589
1590         wallcycle_stop(wcycle, ewcNS);
1591     }
1592
1593     if (inputrec->implicit_solvent && bNS)
1594     {
1595         make_gb_nblist(cr, inputrec->gb_algorithm,
1596                        x, box, fr, &top->idef, graph, fr->born);
1597     }
1598
1599     if (DOMAINDECOMP(cr))
1600     {
1601         if (!(cr->duty & DUTY_PME))
1602         {
1603             wallcycle_start(wcycle, ewcPPDURINGPME);
1604             dd_force_flop_start(cr->dd, nrnb);
1605         }
1606     }
1607
1608     if (inputrec->bRot)
1609     {
1610         /* Enforced rotation has its own cycle counter that starts after the collective
1611          * coordinates have been communicated. It is added to ddCyclF to allow
1612          * for proper load-balancing */
1613         wallcycle_start(wcycle, ewcROT);
1614         do_rotation(cr, inputrec, box, x, t, step, wcycle, bNS);
1615         wallcycle_stop(wcycle, ewcROT);
1616     }
1617
1618     /* Start the force cycle counter.
1619      * This counter is stopped in do_forcelow_level.
1620      * No parallel communication should occur while this counter is running,
1621      * since that will interfere with the dynamic load balancing.
1622      */
1623     wallcycle_start(wcycle, ewcFORCE);
1624
1625     if (bDoForces)
1626     {
1627         /* Reset the forces for which the virial is calculated separately,
1628          * i.e. the PME/Ewald forces, if necessary */
1629         if (fr->bF_NoVirSum)
1630         {
1631             if (flags & GMX_FORCE_VIRIAL)
1632             {
1633                 fr->f_novirsum = fr->f_novirsum_alloc;
1634                 if (fr->bDomDec)
1635                 {
1636                     clear_rvecs(fr->f_novirsum_n, fr->f_novirsum);
1637                 }
1638                 else
1639                 {
1640                     clear_rvecs(homenr, fr->f_novirsum+start);
1641                 }
1642             }
1643             else
1644             {
1645                 /* We are not calculating the pressure so we do not need
1646                  * a separate array for forces that do not contribute
1647                  * to the pressure.
1648                  */
1649                 fr->f_novirsum = f;
1650             }
1651         }
1652
1653         /* Clear the short- and long-range forces */
1654         clear_rvecs(fr->natoms_force_constr, f);
1655         if (bSepLRF && do_per_step(step, inputrec->nstcalclr))
1656         {
1657             clear_rvecs(fr->natoms_force_constr, fr->f_twin);
1658         }
1659
1660         clear_rvec(fr->vir_diag_posres);
1661     }
1662     if (inputrec->ePull == epullCONSTRAINT)
1663     {
1664         clear_pull_forces(inputrec->pull);
1665     }
1666
1667     /* update QMMMrec, if necessary */
1668     if (fr->bQMMM)
1669     {
1670         update_QMMMrec(cr, fr, x, mdatoms, box, top);
1671     }
1672
1673     if ((flags & GMX_FORCE_BONDED) && top->idef.il[F_POSRES].nr > 0)
1674     {
1675         posres_wrapper(fplog, flags, bSepDVDL, inputrec, nrnb, top, box, x,
1676                        enerd, lambda, fr);
1677     }
1678
1679     if ((flags & GMX_FORCE_BONDED) && top->idef.il[F_FBPOSRES].nr > 0)
1680     {
1681         /* Flat-bottomed position restraints always require full pbc */
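             /* When the AdResS weights were just updated above, pbc is already set */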
1682         if (!(bStateChanged && bDoAdressWF))
1683         {
1684             set_pbc(&pbc, inputrec->ePBC, box);
1685         }
1686         v = fbposres(top->idef.il[F_FBPOSRES].nr, top->idef.il[F_FBPOSRES].iatoms,
1687                      top->idef.iparams_fbposres,
1688                      (const rvec*)x, fr->f_novirsum, fr->vir_diag_posres,
1689                      inputrec->ePBC == epbcNONE ? NULL : &pbc,
1690                      fr->rc_scaling, fr->ePBC, fr->posres_com);
1691         enerd->term[F_FBPOSRES] += v;
1692         inc_nrnb(nrnb, eNR_FBPOSRES, top->idef.il[F_FBPOSRES].nr/2);
1693     }
1694
1695     /* Compute the bonded and non-bonded energies and optionally forces */
1696     do_force_lowlevel(fplog, step, fr, inputrec, &(top->idef),
1697                       cr, nrnb, wcycle, mdatoms,
1698                       x, hist, f, bSepLRF ? fr->f_twin : f, enerd, fcd, top, fr->born,
1699                       &(top->atomtypes), bBornRadii, box,
1700                       inputrec->fepvals, lambda,
1701                       graph, &(top->excls), fr->mu_tot,
1702                       flags,
1703                       &cycles_pme);
1704
1705     if (bSepLRF)
1706     {
1707         if (do_per_step(step, inputrec->nstcalclr))
1708         {
1709             /* Add the long range forces to the short range forces */
1710             for (i = 0; i < fr->natoms_force_constr; i++)
1711             {
1712                 rvec_add(fr->f_twin[i], f[i], f[i]);
1713             }
1714         }
1715     }
1716
1717     cycles_force = wallcycle_stop(wcycle, ewcFORCE);
1718
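         /* Essential dynamics: add the flooding forces, when enabled */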
1719     if (ed)
1720     {
1721         do_flood(cr, inputrec, x, f, ed, box, step, bNS);
1722     }
1723
1724     if (DOMAINDECOMP(cr))
1725     {
1726         dd_force_flop_stop(cr->dd, nrnb);
1727         if (wcycle)
1728         {
1729             dd_cycles_add(cr->dd, cycles_force-cycles_pme, ddCyclF);
1730         }
1731     }
1732
1733     if (bDoForces)
1734     {
1735         if (IR_ELEC_FIELD(*inputrec))
1736         {
1737             /* Compute forces due to electric field */
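                 /* The field adds F_i = q_i*E(t); only the master rank
                  * passes 'field' for output, the other ranks pass NULL.
                  */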
1738             calc_f_el(MASTER(cr) ? field : NULL,
1739                       start, homenr, mdatoms->chargeA, fr->f_novirsum,
1740                       inputrec->ex, inputrec->et, t);
1741         }
1742
1743         if (bDoAdressWF && fr->adress_icor == eAdressICThermoForce)
1744         {
1745             /* Compute thermodynamic force in hybrid AdResS region */
1746             adress_thermo_force(start, homenr, &(top->cgs), x, fr->f_novirsum, fr, mdatoms,
1747                                 inputrec->ePBC == epbcNONE ? NULL : &pbc);
1748         }
1749
1750         /* Communicate the forces */
1751         if (PAR(cr))
1752         {
1753             wallcycle_start(wcycle, ewcMOVEF);
1754             if (DOMAINDECOMP(cr))
1755             {
1756                 dd_move_f(cr->dd, f, fr->fshift);
1757                 /* Do we need to communicate the separate force array
1758                  * for terms that do not contribute to the single sum virial?
1759                  * Position restraints and electric fields do not introduce
1760                  * inter-cg forces, only full electrostatics methods do.
1761                  * When we do not calculate the virial, fr->f_novirsum = f,
1762                  * so we have already communicated these forces.
1763                  */
1764                 if (EEL_FULL(fr->eeltype) && cr->dd->n_intercg_excl &&
1765                     (flags & GMX_FORCE_VIRIAL))
1766                 {
1767                     dd_move_f(cr->dd, fr->f_novirsum, NULL);
1768                 }
1769                 if (bSepLRF)
1770                 {
1771                     /* We should not update the shift forces here,
1772                      * since f_twin is already included in f.
1773                      */
1774                     dd_move_f(cr->dd, fr->f_twin, NULL);
1775                 }
1776             }
1777             else
1778             {
1779                 pd_move_f(cr, f, nrnb);
1780                 if (bSepLRF)
1781                 {
1782                     pd_move_f(cr, fr->f_twin, nrnb);
1783                 }
1784             }
1785             wallcycle_stop(wcycle, ewcMOVEF);
1786         }
1787
1788         /* If we have NoVirSum forces, but we do not calculate the virial,
1789      * we sum fr->f_novirsum = f later.
1790          */
1791         if (vsite && !(fr->bF_NoVirSum && !(flags & GMX_FORCE_VIRIAL)))
1792         {
1793             wallcycle_start(wcycle, ewcVSITESPREAD);
1794             spread_vsite_f(vsite, x, f, fr->fshift, FALSE, NULL, nrnb,
1795                            &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr);
1796             wallcycle_stop(wcycle, ewcVSITESPREAD);
1797
1798             if (bSepLRF)
1799             {
1800                 wallcycle_start(wcycle, ewcVSITESPREAD);
1801                 spread_vsite_f(vsite, x, fr->f_twin, NULL, FALSE, NULL,
1802                                nrnb,
1803                                &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr);
1804                 wallcycle_stop(wcycle, ewcVSITESPREAD);
1805             }
1806         }
1807
1808         if (flags & GMX_FORCE_VIRIAL)
1809         {
1810             /* Calculation of the virial must be done after vsites! */
1811             calc_virial(mdatoms->start, mdatoms->homenr, x, f,
1812                         vir_force, graph, box, nrnb, fr, inputrec->ePBC);
1813         }
1814     }
1815
1816     if (inputrec->ePull == epullUMBRELLA || inputrec->ePull == epullCONST_F)
1817     {
1818         pull_potential_wrapper(fplog, bSepDVDL, cr, inputrec, box, x,
1819                                f, vir_force, mdatoms, enerd, lambda, t);
1820     }
1821
1822     /* Add the forces from enforced rotation potentials (if any) */
1823     if (inputrec->bRot)
1824     {
1825         wallcycle_start(wcycle, ewcROTadd);
1826         enerd->term[F_COM_PULL] += add_rot_forces(inputrec->rot, f, cr, step, t);
1827         wallcycle_stop(wcycle, ewcROTadd);
1828     }
1829
1830     if (PAR(cr) && !(cr->duty & DUTY_PME))
1831     {
1832         /* In case of node-splitting, the PP nodes receive the long-range
1833          * forces, virial and energy from the PME nodes here.
1834          */
1835         pme_receive_force_ener(fplog, bSepDVDL, cr, wcycle, enerd, fr);
1836     }
1837
1838     if (bDoForces)
1839     {
1840         post_process_forces(cr, step, nrnb, wcycle,
1841                             top, box, x, f, vir_force, mdatoms, graph, fr, vsite,
1842                             flags);
1843     }
1844
1845     /* Sum the potential energy terms from group contributions */
1846     sum_epot(&(enerd->grpp), enerd->term);
1847 }
1848
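     /* do_force() is a thin dispatcher: depending on inputrec->cutoff_scheme
      * it calls either the Verlet-scheme or the group-scheme implementation.
      */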
1849 void do_force(FILE *fplog, t_commrec *cr,
1850               t_inputrec *inputrec,
1851               gmx_large_int_t step, t_nrnb *nrnb, gmx_wallcycle_t wcycle,
1852               gmx_localtop_t *top,
1853               gmx_groups_t *groups,
1854               matrix box, rvec x[], history_t *hist,
1855               rvec f[],
1856               tensor vir_force,
1857               t_mdatoms *mdatoms,
1858               gmx_enerdata_t *enerd, t_fcdata *fcd,
1859               real *lambda, t_graph *graph,
1860               t_forcerec *fr,
1861               gmx_vsite_t *vsite, rvec mu_tot,
1862               double t, FILE *field, gmx_edsam_t ed,
1863               gmx_bool bBornRadii,
1864               int flags)
1865 {
1866     /* modify force flag if not doing nonbonded */
1867     if (!fr->bNonbonded)
1868     {
1869         flags &= ~GMX_FORCE_NONBONDED;
1870     }
1871
1872     switch (inputrec->cutoff_scheme)
1873     {
1874         case ecutsVERLET:
1875             do_force_cutsVERLET(fplog, cr, inputrec,
1876                                 step, nrnb, wcycle,
1877                                 top,
1878                                 groups,
1879                                 box, x, hist,
1880                                 f, vir_force,
1881                                 mdatoms,
1882                                 enerd, fcd,
1883                                 lambda, graph,
1884                                 fr, fr->ic,
1885                                 vsite, mu_tot,
1886                                 t, field, ed,
1887                                 bBornRadii,
1888                                 flags);
1889             break;
1890         case ecutsGROUP:
1891             do_force_cutsGROUP(fplog, cr, inputrec,
1892                                step, nrnb, wcycle,
1893                                top,
1894                                groups,
1895                                box, x, hist,
1896                                f, vir_force,
1897                                mdatoms,
1898                                enerd, fcd,
1899                                lambda, graph,
1900                                fr, vsite, mu_tot,
1901                                t, field, ed,
1902                                bBornRadii,
1903                                flags);
1904             break;
1905         default:
1906             gmx_incons("Invalid cut-off scheme passed!");
1907     }
1908 }
1909
1910
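     /* Constrain the starting configuration: the coordinates at t=0 and,
      * for velocity Verlet, the initial velocities; for the other dynamical
      * integrators also the positions at t=-dt, obtained by stepping the
      * reversed velocities backwards, so that the constrained velocities at
      * t=-dt/2 are recovered.
      */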
1911 void do_constrain_first(FILE *fplog, gmx_constr_t constr,
1912                         t_inputrec *ir, t_mdatoms *md,
1913                         t_state *state, t_commrec *cr, t_nrnb *nrnb,
1914                         t_forcerec *fr, gmx_localtop_t *top)
1915 {
1916     int             i, m, start, end;
1917     gmx_large_int_t step;
1918     real            dt = ir->delta_t;
1919     real            dvdl_dum;
1920     rvec           *savex;
1921
1922     snew(savex, state->natoms);
1923
1924     start = md->start;
1925     end   = md->homenr + start;
1926
1927     if (debug)
1928     {
1929         fprintf(debug, "vcm: start=%d, homenr=%d, end=%d\n",
1930                 start, md->homenr, end);
1931     }
1932     /* Do a first constraining step to reset the particles... */
1933     step = ir->init_step;
1934     if (fplog)
1935     {
1936         char buf[STEPSTRSIZE];
1937         fprintf(fplog, "\nConstraining the starting coordinates (step %s)\n",
1938                 gmx_step_str(step, buf));
1939     }
1940     dvdl_dum = 0;
1941
1942     /* constrain the current position */
1943     constrain(NULL, TRUE, FALSE, constr, &(top->idef),
1944               ir, NULL, cr, step, 0, md,
1945               state->x, state->x, NULL,
1946               fr->bMolPBC, state->box,
1947               state->lambda[efptBONDED], &dvdl_dum,
1948               NULL, NULL, nrnb, econqCoord,
1949               ir->epc == epcMTTK, state->veta, state->veta);
1950     if (EI_VV(ir->eI))
1951     {
1952         /* constrain the initial velocity, and save it */
1953         /* this may also be useful if we need the ekin from the half step for velocity verlet */
1954         /* might not yet treat veta correctly */
1955         constrain(NULL, TRUE, FALSE, constr, &(top->idef),
1956                   ir, NULL, cr, step, 0, md,
1957                   state->x, state->v, state->v,
1958                   fr->bMolPBC, state->box,
1959                   state->lambda[efptBONDED], &dvdl_dum,
1960                   NULL, NULL, nrnb, econqVeloc,
1961                   ir->epc == epcMTTK, state->veta, state->veta);
1962     }
1963     /* constrain the initial velocities at t-dt/2 */
1964     if (EI_STATE_VELOCITY(ir->eI) && ir->eI != eiVV)
1965     {
1966         for (i = start; (i < end); i++)
1967         {
1968             for (m = 0; (m < DIM); m++)
1969             {
1970                 /* Reverse the velocity */
1971                 state->v[i][m] = -state->v[i][m];
1972                 /* Store the position at t-dt in savex */
1973                 savex[i][m] = state->x[i][m] + dt*state->v[i][m];
1974             }
1975         }
1976         /* Shake the positions at t=-dt with the positions at t=0
1977          * as reference coordinates.
1978          */
1979         if (fplog)
1980         {
1981             char buf[STEPSTRSIZE];
1982             fprintf(fplog, "\nConstraining the coordinates at t0-dt (step %s)\n",
1983                     gmx_step_str(step, buf));
1984         }
1985         dvdl_dum = 0;
1986         constrain(NULL, TRUE, FALSE, constr, &(top->idef),
1987                   ir, NULL, cr, step, -1, md,
1988                   state->x, savex, NULL,
1989                   fr->bMolPBC, state->box,
1990                   state->lambda[efptBONDED], &dvdl_dum,
1991                   state->v, NULL, nrnb, econqCoord,
1992                   ir->epc == epcMTTK, state->veta, state->veta);
1993
1994         for (i = start; i < end; i++)
1995         {
1996             for (m = 0; m < DIM; m++)
1997             {
1998                 /* Re-reverse the velocities */
1999                 state->v[i][m] = -state->v[i][m];
2000             }
2001         }
2002     }
2003     sfree(savex);
2004 }
2005
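     /* Precompute the energy and virial differences due to cutting off the
      * Lennard-Jones interaction, i.e. the tail corrections for the r^-6 and
      * r^-12 terms, stored in fr->enerdiffsix/twelve and fr->virdiffsix/twelve.
      * For a plain cut-off the energy tails are -4*pi/(3*rc^3) and
      * 4*pi/(9*rc^9) per unit of pair density and C6/C12; for switched or
      * shifted potentials the part missing inside the cut-off is integrated
      * numerically from the spline tables below.
      */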
2006 void calc_enervirdiff(FILE *fplog, int eDispCorr, t_forcerec *fr)
2007 {
2008     double eners[2], virs[2], enersum, virsum, y0, f, g, h;
2009     double r0, r1, r, rc3, rc9, ea, eb, ec, pa, pb, pc, pd;
2010     double invscale, invscale2, invscale3;
2011     int    ri0, ri1, ri, i, offstart, offset;
2012     real   scale, *vdwtab, tabfactor, tmp;
2013
2014     fr->enershiftsix    = 0;
2015     fr->enershifttwelve = 0;
2016     fr->enerdiffsix     = 0;
2017     fr->enerdifftwelve  = 0;
2018     fr->virdiffsix      = 0;
2019     fr->virdifftwelve   = 0;
2020
2021     if (eDispCorr != edispcNO)
2022     {
2023         for (i = 0; i < 2; i++)
2024         {
2025             eners[i] = 0;
2026             virs[i]  = 0;
2027         }
2028         if ((fr->vdwtype == evdwSWITCH) || (fr->vdwtype == evdwSHIFT))
2029         {
2030             if (fr->rvdw_switch == 0)
2031             {
2032                 gmx_fatal(FARGS,
2033                           "With dispersion correction rvdw-switch cannot be zero "
2034                           "for vdw-type = %s", evdw_names[fr->vdwtype]);
2035             }
2036
2037             scale  = fr->nblists[0].table_elec_vdw.scale;
2038             vdwtab = fr->nblists[0].table_vdw.data;
2039
2040             /* Round the cut-offs to exact table values for precision */
2041             ri0  = floor(fr->rvdw_switch*scale);
2042             ri1  = ceil(fr->rvdw*scale);
2043             r0   = ri0/scale;
2044             r1   = ri1/scale;
2045             rc3  = r0*r0*r0;
2046             rc9  = rc3*rc3*rc3;
2047
2048             if (fr->vdwtype == evdwSHIFT)
2049             {
2050                 /* Determine the constant energy shift below rvdw_switch.
2051                  * The table carries a scale factor: it was scaled down to compensate
2052                  * for scaling up c6/c12 with the derivative factors, which saves flops in the analytical kernels.
2053                  */
2054                 fr->enershiftsix    = (real)(-1.0/(rc3*rc3)) - 6.0*vdwtab[8*ri0];
2055                 fr->enershifttwelve = (real)( 1.0/(rc9*rc3)) - 12.0*vdwtab[8*ri0 + 4];
2056             }
2057             /* Add the constant part from 0 to rvdw_switch.
2058              * This integration from 0 to rvdw_switch overcounts the number
2059              * of interactions by 1, as it also counts the self interaction.
2060              * We will correct for this later.
2061              */
2062             eners[0] += 4.0*M_PI*fr->enershiftsix*rc3/3.0;
2063             eners[1] += 4.0*M_PI*fr->enershifttwelve*rc3/3.0;
2064
2065             invscale  = 1.0/(scale);
2066             invscale2 = invscale*invscale;
2067             invscale3 = invscale*invscale2;
2068
2069             /* following summation derived from cubic spline definition,
2070                Numerical Recipes in C, second edition, p. 113-116.  Exact
2071                for the cubic spline.  We first calculate the negative of
2072                the energy from rvdw to rvdw_switch, assuming that g(r)=1,
2073                and then add the more standard, abrupt cutoff correction to
2074                that result, yielding the long-range correction for a
2075                switched function.  We perform both the pressure and energy
2076                loops at the same time for simplicity, as the computational
2077                cost is low. */
2078
2079             for (i = 0; i < 2; i++)
2080             {
2081                 enersum = 0.0; virsum = 0.0;
2082                 if (i == 0)
2083                 {
2084                     offstart = 0;
2085                     /* Since the dispersion table has been scaled down a factor 6.0 and the repulsion
2086                      * a factor 12.0 to compensate for the c6/c12 parameters inside nbfp[] being scaled
2087                      * up (to save flops in kernels), we need to correct for this.
2088                      */
2089                     tabfactor = 6.0;
2090                 }
2091                 else
2092                 {
2093                     offstart  = 4;
2094                     tabfactor = 12.0;
2095                 }
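                     /* The ea/eb/ec (and pa/pb/pc/pd) factors below are such
                      * that, e.g., y0*(ea/3 + eb/2 + ec) equals the integral
                      * of y0*r^2 over one table interval of width 1/scale;
                      * the f, g and h spline coefficients contribute their
                      * higher moments analogously.
                      */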
2096                 for (ri = ri0; ri < ri1; ri++)
2097                 {
2098                     r  = ri*invscale;
2099                     ea = invscale3;
2100                     eb = 2.0*invscale2*r;
2101                     ec = invscale*r*r;
2102
2103                     pa = invscale3;
2104                     pb = 3.0*invscale2*r;
2105                     pc = 3.0*invscale*r*r;
2106                     pd = r*r*r;
2107
2108                     /* this "8" is from the packing in the vdwtab array - perhaps should be #define'ed? */
2109                     offset = 8*ri + offstart;
2110                     y0     = vdwtab[offset];
2111                     f      = vdwtab[offset+1];
2112                     g      = vdwtab[offset+2];
2113                     h      = vdwtab[offset+3];
2114
2115                     enersum += y0*(ea/3 + eb/2 + ec) + f*(ea/4 + eb/3 + ec/2) + g*(ea/5 + eb/4 + ec/3) + h*(ea/6 + eb/5 + ec/4);
2116                     virsum  += f*(pa/4 + pb/3 + pc/2 + pd) + 2*g*(pa/5 + pb/4 + pc/3 + pd/2) + 3*h*(pa/6 + pb/5 + pc/4 + pd/3);
2117                 }
2118
2119                 enersum  *= 4.0*M_PI*tabfactor;
2120                 virsum   *= 4.0*M_PI*tabfactor;
2121                 eners[i] -= enersum;
2122                 virs[i]  -= virsum;
2123             }
2124
2125             /* now add the correction for rvdw_switch to infinity */
2126             eners[0] += -4.0*M_PI/(3.0*rc3);
2127             eners[1] +=  4.0*M_PI/(9.0*rc9);
2128             virs[0]  +=  8.0*M_PI/rc3;
2129             virs[1]  += -16.0*M_PI/(3.0*rc9);
2130         }
2131         else if ((fr->vdwtype == evdwCUT) || (fr->vdwtype == evdwUSER))
2132         {
2133             if (fr->vdwtype == evdwUSER && fplog)
2134             {
2135                 fprintf(fplog,
2136                         "WARNING: using dispersion correction with user tables\n");
2137             }
2138             rc3  = fr->rvdw*fr->rvdw*fr->rvdw;
2139             rc9  = rc3*rc3*rc3;
2140             /* Contribution beyond the cut-off */
2141             eners[0] += -4.0*M_PI/(3.0*rc3);
2142             eners[1] +=  4.0*M_PI/(9.0*rc9);
2143             if (fr->vdw_modifier == eintmodPOTSHIFT)
2144             {
2145                 /* Contribution within the cut-off */
2146                 eners[0] += -4.0*M_PI/(3.0*rc3);
2147                 eners[1] +=  4.0*M_PI/(3.0*rc9);
2148             }
2149             /* Contribution beyond the cut-off */
2150             virs[0]  +=  8.0*M_PI/rc3;
2151             virs[1]  += -16.0*M_PI/(3.0*rc9);
2152         }
2153         else
2154         {
2155             gmx_fatal(FARGS,
2156                       "Dispersion correction is not implemented for vdw-type = %s",
2157                       evdw_names[fr->vdwtype]);
2158         }
2159         fr->enerdiffsix    = eners[0];
2160         fr->enerdifftwelve = eners[1];
2161         /* The 0.5 is due to the Gromacs definition of the virial */
2162         fr->virdiffsix     = 0.5*virs[0];
2163         fr->virdifftwelve  = 0.5*virs[1];
2164     }
2165 }
2166
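     /* Apply the long-range dispersion corrections precomputed in
      * calc_enervirdiff: given the current volume and particle number, add
      * the corrections to the energy and, if requested, to the pressure,
      * the virial and dV/dlambda.
      */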
2167 void calc_dispcorr(FILE *fplog, t_inputrec *ir, t_forcerec *fr,
2168                    gmx_large_int_t step, int natoms,
2169                    matrix box, real lambda, tensor pres, tensor virial,
2170                    real *prescorr, real *enercorr, real *dvdlcorr)
2171 {
2172     gmx_bool bCorrAll, bCorrPres;
2173     real     dvdlambda, invvol, dens, ninter, avcsix, avctwelve, enerdiff, svir = 0, spres = 0;
2174     int      m;
2175
2176     *prescorr = 0;
2177     *enercorr = 0;
2178     *dvdlcorr = 0;
2179
2180     clear_mat(virial);
2181     clear_mat(pres);
2182
2183     if (ir->eDispCorr != edispcNO)
2184     {
2185         bCorrAll  = (ir->eDispCorr == edispcAllEner ||
2186                      ir->eDispCorr == edispcAllEnerPres);
2187         bCorrPres = (ir->eDispCorr == edispcEnerPres ||
2188                      ir->eDispCorr == edispcAllEnerPres);
2189
2190         invvol = 1/det(box);
2191         if (fr->n_tpi)
2192         {
2193             /* Only correct for the interactions with the inserted molecule */
2194             dens   = (natoms - fr->n_tpi)*invvol;
2195             ninter = fr->n_tpi;
2196         }
2197         else
2198         {
2199             dens   = natoms*invvol;
2200             ninter = 0.5*natoms;
2201         }
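             /* ninter*dens is the pair density entering the tail corrections:
              * N*N/(2V) normally, or n_tpi*(N - n_tpi)/V when correcting only
              * for the interactions of a test-inserted molecule.
              */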
2202
2203         if (ir->efep == efepNO)
2204         {
2205             avcsix    = fr->avcsix[0];
2206             avctwelve = fr->avctwelve[0];
2207         }
2208         else
2209         {
2210             avcsix    = (1 - lambda)*fr->avcsix[0]    + lambda*fr->avcsix[1];
2211             avctwelve = (1 - lambda)*fr->avctwelve[0] + lambda*fr->avctwelve[1];
2212         }
2213
2214         enerdiff   = ninter*(dens*fr->enerdiffsix - fr->enershiftsix);
2215         *enercorr += avcsix*enerdiff;
2216         dvdlambda  = 0.0;
2217         if (ir->efep != efepNO)
2218         {
2219             dvdlambda += (fr->avcsix[1] - fr->avcsix[0])*enerdiff;
2220         }
2221         if (bCorrAll)
2222         {
2223             enerdiff   = ninter*(dens*fr->enerdifftwelve - fr->enershifttwelve);
2224             *enercorr += avctwelve*enerdiff;
2225             if (fr->efep != efepNO)
2226             {
2227                 dvdlambda += (fr->avctwelve[1] - fr->avctwelve[0])*enerdiff;
2228             }
2229         }
2230
2231         if (bCorrPres)
2232         {
2233             svir = ninter*dens*avcsix*fr->virdiffsix/3.0;
2234             if (ir->eDispCorr == edispcAllEnerPres)
2235             {
2236                 svir += ninter*dens*avctwelve*fr->virdifftwelve/3.0;
2237             }
2238             /* The factor 2 is because of the Gromacs virial definition */
2239             spres = -2.0*invvol*svir*PRESFAC;
2240
2241             for (m = 0; m < DIM; m++)
2242             {
2243                 virial[m][m] += svir;
2244                 pres[m][m]   += spres;
2245             }
2246             *prescorr += spres;
2247         }
2248
2249         /* Can't currently control when this prints; for now, just print when debugging */
2250         if (debug)
2251         {
2252             if (bCorrAll)
2253             {
2254                 fprintf(debug, "Long Range LJ corr.: <C6> %10.4e, <C12> %10.4e\n",
2255                         avcsix, avctwelve);
2256             }
2257             if (bCorrPres)
2258             {
2259                 fprintf(debug,
2260                         "Long Range LJ corr.: Epot %10g, Pres: %10g, Vir: %10g\n",
2261                         *enercorr, spres, svir);
2262             }
2263             else
2264             {
2265                 fprintf(debug, "Long Range LJ corr.: Epot %10g\n", *enercorr);
2266             }
2267         }
2268
2269         if (fr->bSepDVDL && do_per_step(step, ir->nstlog))
2270         {
2271             gmx_print_sepdvdl(fplog, "Dispersion correction", *enercorr, dvdlambda);
2272         }
2273         if (fr->efep != efepNO)
2274         {
2275             *dvdlcorr += dvdlambda;
2276         }
2277     }
2278 }
2279
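     /* Remove periodic jumps the first time: compute the shift vectors and,
      * when a connectivity graph is available, use it to make the molecules
      * whole again.
      */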
2280 void do_pbc_first(FILE *fplog, matrix box, t_forcerec *fr,
2281                   t_graph *graph, rvec x[])
2282 {
2283     if (fplog)
2284     {
2285         fprintf(fplog, "Removing pbc first time\n");
2286     }
2287     calc_shifts(box, fr->shift_vec);
2288     if (graph)
2289     {
2290         mk_mshift(fplog, graph, fr->ePBC, box, x);
2291         if (gmx_debug_at)
2292         {
2293             p_graph(debug, "do_pbc_first 1", graph);
2294         }
2295         shift_self(graph, box, x);
2296         /* By doing an extra mk_mshift the molecules that are broken
2297          * because they were e.g. imported from another software
2298          * will be made whole again. Such are the healing powers
2299          * of GROMACS.
2300          */
2301         mk_mshift(fplog, graph, fr->ePBC, box, x);
2302         if (gmx_debug_at)
2303         {
2304             p_graph(debug, "do_pbc_first 2", graph);
2305         }
2306     }
2307     if (fplog)
2308     {
2309         fprintf(fplog, "Done rmpbc\n");
2310     }
2311 }
2312
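     /* Make all molecules in the global topology whole, one molecule type
      * at a time, using a temporary connectivity graph per molecule type.
      */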
2313 static void low_do_pbc_mtop(FILE *fplog, int ePBC, matrix box,
2314                             gmx_mtop_t *mtop, rvec x[],
2315                             gmx_bool bFirst)
2316 {
2317     t_graph        *graph;
2318     int             mb, as, mol;
2319     gmx_molblock_t *molb;
2320
2321     if (bFirst && fplog)
2322     {
2323         fprintf(fplog, "Removing pbc first time\n");
2324     }
2325
2326     snew(graph, 1);
2327     as = 0;
2328     for (mb = 0; mb < mtop->nmolblock; mb++)
2329     {
2330         molb = &mtop->molblock[mb];
2331         if (molb->natoms_mol == 1 ||
2332             (!bFirst && mtop->moltype[molb->type].cgs.nr == 1))
2333         {
2334             /* Just one atom or charge group in the molecule, no PBC required */
2335             as += molb->nmol*molb->natoms_mol;
2336         }
2337         else
2338         {
2339             /* Pass NULL instead of fplog to avoid graph prints for each molecule type */
2340             mk_graph_ilist(NULL, mtop->moltype[molb->type].ilist,
2341                            0, molb->natoms_mol, FALSE, FALSE, graph);
2342
2343             for (mol = 0; mol < molb->nmol; mol++)
2344             {
2345                 mk_mshift(fplog, graph, ePBC, box, x+as);
2346
2347                 shift_self(graph, box, x+as);
2348                 /* The molecule is whole now.
2349                  * We don't need the second mk_mshift call as in do_pbc_first,
2350                  * since we no longer need this graph.
2351                  */
2352
2353                 as += molb->natoms_mol;
2354             }
2355             done_graph(graph);
2356         }
2357     }
2358     sfree(graph);
2359 }
2360
2361 void do_pbc_first_mtop(FILE *fplog, int ePBC, matrix box,
2362                        gmx_mtop_t *mtop, rvec x[])
2363 {
2364     low_do_pbc_mtop(fplog, ePBC, box, mtop, x, TRUE);
2365 }
2366
2367 void do_pbc_mtop(FILE *fplog, int ePBC, matrix box,
2368                  gmx_mtop_t *mtop, rvec x[])
2369 {
2370     low_do_pbc_mtop(fplog, ePBC, box, mtop, x, FALSE);
2371 }
2372
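     /* Sum the flop counters and timings over all ranks and print the final
      * accounting, load-balance, cycle-count and performance statistics.
      */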
2373 void finish_run(FILE *fplog, t_commrec *cr,
2374                 t_inputrec *inputrec,
2375                 t_nrnb nrnb[], gmx_wallcycle_t wcycle,
2376                 gmx_walltime_accounting_t walltime_accounting,
2377                 wallclock_gpu_t *gputimes,
2378                 gmx_bool bWriteStat)
2379 {
2380     int     i, j;
2381     t_nrnb *nrnb_tot = NULL;
2382     real    delta_t;
2383     double  nbfs, mflop;
2384     double  elapsed_time,
2385             elapsed_time_over_all_ranks,
2386             elapsed_time_over_all_threads,
2387             elapsed_time_over_all_threads_over_all_ranks;
2388     wallcycle_sum(cr, wcycle);
2389
2390     if (cr->nnodes > 1)
2391     {
2392         snew(nrnb_tot, 1);
2393 #ifdef GMX_MPI
2394         MPI_Allreduce(nrnb->n, nrnb_tot->n, eNRNB, MPI_DOUBLE, MPI_SUM,
2395                       cr->mpi_comm_mysim);
2396 #endif
2397     }
2398     else
2399     {
2400         nrnb_tot = nrnb;
2401     }
2402
2403     elapsed_time                                 = walltime_accounting_get_elapsed_time(walltime_accounting);
2404     elapsed_time_over_all_ranks                  = elapsed_time;
2405     elapsed_time_over_all_threads                = walltime_accounting_get_elapsed_time_over_all_threads(walltime_accounting);
2406     elapsed_time_over_all_threads_over_all_ranks = elapsed_time_over_all_threads;
2407 #ifdef GMX_MPI
2408     if (cr->nnodes > 1)
2409     {
2410         /* reduce elapsed_time over all MPI ranks in the current simulation */
2411         MPI_Allreduce(&elapsed_time,
2412                       &elapsed_time_over_all_ranks,
2413                       1, MPI_DOUBLE, MPI_SUM,
2414                       cr->mpi_comm_mysim);
2415         elapsed_time_over_all_ranks /= cr->nnodes;
2416         /* Reduce elapsed_time_over_all_threads over all MPI ranks in the
2417          * current simulation. */
2418         MPI_Allreduce(&elapsed_time_over_all_threads,
2419                       &elapsed_time_over_all_threads_over_all_ranks,
2420                       1, MPI_DOUBLE, MPI_SUM,
2421                       cr->mpi_comm_mysim);
2422     }
2423 #endif
2424
2425     if (SIMMASTER(cr))
2426     {
2427         print_flop(fplog, nrnb_tot, &nbfs, &mflop);
2428     }
2429     if (cr->nnodes > 1)
2430     {
2431         sfree(nrnb_tot);
2432     }
2433
2434     if ((cr->duty & DUTY_PP) && DOMAINDECOMP(cr))
2435     {
2436         print_dd_statistics(cr, inputrec, fplog);
2437     }
2438
2439 #ifdef GMX_MPI
2440     if (PARTDECOMP(cr))
2441     {
2442         if (MASTER(cr))
2443         {
2444             t_nrnb     *nrnb_all;
2445             int         s;
2446             MPI_Status  stat;
2447
2448             snew(nrnb_all, cr->nnodes);
2449             nrnb_all[0] = *nrnb;
2450             for (s = 1; s < cr->nnodes; s++)
2451             {
2452                 MPI_Recv(nrnb_all[s].n, eNRNB, MPI_DOUBLE, s, 0,
2453                          cr->mpi_comm_mysim, &stat);
2454             }
2455             pr_load(fplog, cr, nrnb_all);
2456             sfree(nrnb_all);
2457         }
2458         else
2459         {
2460             MPI_Send(nrnb->n, eNRNB, MPI_DOUBLE, MASTERRANK(cr), 0,
2461                      cr->mpi_comm_mysim);
2462         }
2463     }
2464 #endif
2465
2466     if (SIMMASTER(cr))
2467     {
2468         wallcycle_print(fplog, cr->nnodes, cr->npmenodes,
2469                         elapsed_time_over_all_ranks,
2470                         wcycle, gputimes);
2471
2472         if (EI_DYNAMICS(inputrec->eI))
2473         {
2474             delta_t = inputrec->delta_t;
2475         }
2476         else
2477         {
2478             delta_t = 0;
2479         }
2480
2481         if (fplog)
2482         {
2483             print_perf(fplog, elapsed_time_over_all_threads_over_all_ranks,
2484                        elapsed_time_over_all_ranks,
2485                        walltime_accounting_get_nsteps_done(walltime_accounting),
2486                        delta_t, nbfs, mflop);
2487         }
2488         if (bWriteStat)
2489         {
2490             print_perf(stderr, elapsed_time_over_all_threads_over_all_ranks,
2491                        elapsed_time_over_all_ranks,
2492                        walltime_accounting_get_nsteps_done(walltime_accounting),
2493                        delta_t, nbfs, mflop);
2494         }
2495     }
2496 }
2497
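     /* Set the initial lambda vector (and optionally lam0): all zero without
      * free energy or simulated tempering; otherwise taken from init_lambda
      * (backwards compatibility) or from the all_lambda table at the initial
      * fep state. With simulated tempering the reference temperatures are
      * also rescaled to match the initial state.
      */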
2498 extern void initialize_lambdas(FILE *fplog, t_inputrec *ir, int *fep_state, real *lambda, double *lam0)
2499 {
2500     /* this function works, but could probably use a logic rewrite to keep all the different
2501        types of efep straight. */
2502
2503     int       i;
2504     t_lambda *fep = ir->fepvals;
2505
2506     if ((ir->efep == efepNO) && (ir->bSimTemp == FALSE))
2507     {
2508         for (i = 0; i < efptNR; i++)
2509         {
2510             lambda[i] = 0.0;
2511             if (lam0)
2512             {
2513                 lam0[i] = 0.0;
2514             }
2515         }
2516         return;
2517     }
2518     else
2519     {
2520         *fep_state = fep->init_fep_state; /* this might overwrite the value read
2521                                              from a checkpoint -- for now a kludge
2522                                              is in place to prevent this. */
2523         for (i = 0; i < efptNR; i++)
2524         {
2525             /* for now, overwrite the lambda state with init_lambda for backwards compatibility */
2526             if (fep->init_lambda >= 0) /* if it's -1, it was never initialized */
2527             {
2528                 lambda[i] = fep->init_lambda;
2529                 if (lam0)
2530                 {
2531                     lam0[i] = lambda[i];
2532                 }
2533             }
2534             else
2535             {
2536                 lambda[i] = fep->all_lambda[i][*fep_state];
2537                 if (lam0)
2538                 {
2539                     lam0[i] = lambda[i];
2540                 }
2541             }
2542         }
2543         if (ir->bSimTemp)
2544         {
2545             /* need to rescale control temperatures to match current state */
2546             for (i = 0; i < ir->opts.ngtc; i++)
2547             {
2548                 if (ir->opts.ref_t[i] > 0)
2549                 {
2550                     ir->opts.ref_t[i] = ir->simtempvals->temperatures[*fep_state];
2551                 }
2552             }
2553         }
2554     }
2555
2556     /* Send to the log the information on the current lambdas */
2557     if (fplog != NULL)
2558     {
2559         fprintf(fplog, "Initial vector of lambda components:[ ");
2560         for (i = 0; i < efptNR; i++)
2561         {
2562             fprintf(fplog, "%10.4f ", lambda[i]);
2563         }
2564         fprintf(fplog, "]\n");
2565     }
2566     return;
2567 }
2568
2569
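     /* Set up the bookkeeping for a run: initial time, simulated annealing,
      * lambdas, the update and center-of-mass motion removal data,
      * citations, flop counters and, when output files are requested
      * (nfile != -1), the output and energy-term handlers.
      */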
2570 void init_md(FILE *fplog,
2571              t_commrec *cr, t_inputrec *ir, const output_env_t oenv,
2572              double *t, double *t0,
2573              real *lambda, int *fep_state, double *lam0,
2574              t_nrnb *nrnb, gmx_mtop_t *mtop,
2575              gmx_update_t *upd,
2576              int nfile, const t_filenm fnm[],
2577              gmx_mdoutf_t **outf, t_mdebin **mdebin,
2578              tensor force_vir, tensor shake_vir, rvec mu_tot,
2579              gmx_bool *bSimAnn, t_vcm **vcm, unsigned long Flags)
2580 {
2581     int  i, j, n;
2582     real tmpt, mod;
2583
2584     /* Initial values */
2585     *t = *t0       = ir->init_t;
2586
2587     *bSimAnn = FALSE;
2588     for (i = 0; i < ir->opts.ngtc; i++)
2589     {
2590         /* set bSimAnn if any group is being annealed */
2591         if (ir->opts.annealing[i] != eannNO)
2592         {
2593             *bSimAnn = TRUE;
2594         }
2595     }
2596     if (*bSimAnn)
2597     {
2598         update_annealing_target_temp(&(ir->opts), ir->init_t);
2599     }
2600
2601     /* Initialize lambda variables */
2602     initialize_lambdas(fplog, ir, fep_state, lambda, lam0);
2603
2604     if (upd)
2605     {
2606         *upd = init_update(ir);
2607     }
2608
2609
2610     if (vcm != NULL)
2611     {
2612         *vcm = init_vcm(fplog, &mtop->groups, ir);
2613     }
2614
2615     if (EI_DYNAMICS(ir->eI) && !(Flags & MD_APPENDFILES))
2616     {
2617         if (ir->etc == etcBERENDSEN)
2618         {
2619             please_cite(fplog, "Berendsen84a");
2620         }
2621         if (ir->etc == etcVRESCALE)
2622         {
2623             please_cite(fplog, "Bussi2007a");
2624         }
2625     }
2626
2627     init_nrnb(nrnb);
2628
2629     if (nfile != -1)
2630     {
2631         *outf = init_mdoutf(nfile, fnm, Flags, cr, ir, oenv);
2632
2633         *mdebin = init_mdebin((Flags & MD_APPENDFILES) ? NULL : (*outf)->fp_ene,
2634                               mtop, ir, (*outf)->fp_dhdl);
2635     }
2636
2637     if (ir->bAdress)
2638     {
2639         please_cite(fplog, "Fritsch12");
2640         please_cite(fplog, "Junghans10");
2641     }
2642     /* Initialize variables */
2643     clear_mat(force_vir);
2644     clear_mat(shake_vir);
2645     clear_rvec(mu_tot);
2646
2647     debug_gmx();
2648 }