File: | programs/mdrun/runner.c |
Location: | line 1729, column 53 |
Description: | Access to field 'cginfo_mb' results in a dereference of a null pointer (loaded from variable 'fr') |
1 | /* | |||
2 | * This file is part of the GROMACS molecular simulation package. | |||
3 | * | |||
4 | * Copyright (c) 1991-2000, University of Groningen, The Netherlands. | |||
5 | * Copyright (c) 2001-2004, The GROMACS development team. | |||
6 | * Copyright (c) 2011,2012,2013,2014, by the GROMACS development team, led by | |||
7 | * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, | |||
8 | * and including many others, as listed in the AUTHORS file in the | |||
9 | * top-level source directory and at http://www.gromacs.org. | |||
10 | * | |||
11 | * GROMACS is free software; you can redistribute it and/or | |||
12 | * modify it under the terms of the GNU Lesser General Public License | |||
13 | * as published by the Free Software Foundation; either version 2.1 | |||
14 | * of the License, or (at your option) any later version. | |||
15 | * | |||
16 | * GROMACS is distributed in the hope that it will be useful, | |||
17 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | |||
18 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |||
19 | * Lesser General Public License for more details. | |||
20 | * | |||
21 | * You should have received a copy of the GNU Lesser General Public | |||
22 | * License along with GROMACS; if not, see | |||
23 | * http://www.gnu.org/licenses, or write to the Free Software Foundation, | |||
24 | * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. | |||
25 | * | |||
26 | * If you want to redistribute modifications to GROMACS, please | |||
27 | * consider that scientific software is very special. Version | |||
28 | * control is crucial - bugs must be traceable. We will be happy to | |||
29 | * consider code for inclusion in the official distribution, but | |||
30 | * derived work must not be called official GROMACS. Details are found | |||
31 | * in the README & COPYING files - if they are missing, get the | |||
32 | * official version at http://www.gromacs.org. | |||
33 | * | |||
34 | * To help us fund GROMACS development, we humbly ask that you cite | |||
35 | * the research papers on the package. Check out http://www.gromacs.org. | |||
36 | */ | |||
37 | #ifdef HAVE_CONFIG_H1 | |||
38 | #include <config.h> | |||
39 | #endif | |||
40 | ||||
41 | #include <assert.h> | |||
42 | #include <signal.h> | |||
43 | #include <stdlib.h> | |||
44 | #include <string.h> | |||
45 | #ifdef HAVE_UNISTD_H | |||
46 | #include <unistd.h> | |||
47 | #endif | |||
48 | ||||
49 | #include "typedefs.h" | |||
50 | #include "copyrite.h" | |||
51 | #include "force.h" | |||
52 | #include "mdrun.h" | |||
53 | #include "md_logging.h" | |||
54 | #include "md_support.h" | |||
55 | #include "network.h" | |||
56 | #include "names.h" | |||
57 | #include "disre.h" | |||
58 | #include "orires.h" | |||
59 | #include "pme.h" | |||
60 | #include "mdatoms.h" | |||
61 | #include "repl_ex.h" | |||
62 | #include "deform.h" | |||
63 | #include "qmmm.h" | |||
64 | #include "domdec.h" | |||
65 | #include "coulomb.h" | |||
66 | #include "constr.h" | |||
67 | #include "mvdata.h" | |||
68 | #include "checkpoint.h" | |||
69 | #include "mtop_util.h" | |||
70 | #include "sighandler.h" | |||
71 | #include "txtdump.h" | |||
72 | #include "gmx_detect_hardware.h" | |||
73 | #include "gmx_omp_nthreads.h" | |||
74 | #include "gromacs/gmxpreprocess/calc_verletbuf.h" | |||
75 | #include "membed.h" | |||
76 | #include "macros.h" | |||
77 | #include "gmx_thread_affinity.h" | |||
78 | #include "inputrec.h" | |||
79 | #include "main.h" | |||
80 | ||||
81 | #include "gromacs/essentialdynamics/edsam.h" | |||
82 | #include "gromacs/fileio/tpxio.h" | |||
83 | #include "gromacs/math/vec.h" | |||
84 | #include "gromacs/mdlib/nbnxn_search.h" | |||
85 | #include "gromacs/mdlib/nbnxn_consts.h" | |||
86 | #include "gromacs/pulling/pull.h" | |||
87 | #include "gromacs/pulling/pull_rotation.h" | |||
88 | #include "gromacs/swap/swapcoords.h" | |||
89 | #include "gromacs/timing/wallcycle.h" | |||
90 | #include "gromacs/utility/gmxmpi.h" | |||
91 | #include "gromacs/utility/smalloc.h" | |||
92 | ||||
93 | #ifdef GMX_FAHCORE | |||
94 | #include "corewrap.h" | |||
95 | #endif | |||
96 | ||||
97 | #include "gpu_utils.h" | |||
98 | #include "nbnxn_cuda_data_mgmt.h" | |||
99 | ||||
100 | typedef struct { | |||
101 | gmx_integrator_t *func; | |||
102 | } gmx_intp_t; | |||
103 | ||||
104 | /* The array should match the eI array in include/types/enums.h */ | |||
105 | const gmx_intp_t integrator[eiNR] = { {do_md}, {do_steep}, {do_cg}, {do_md}, {do_md}, {do_nm}, {do_lbfgs}, {do_tpi}, {do_tpi}, {do_md}, {do_md}, {do_md}}; | |||
106 | ||||
107 | gmx_int64_t deform_init_init_step_tpx; | |||
108 | matrix deform_init_box_tpx; | |||
109 | tMPI_Thread_mutex_t deform_init_box_mutex = TMPI_THREAD_MUTEX_INITIALIZER{ {0}, ((void*)0) }; | |||
110 | ||||
111 | ||||
112 | #ifdef GMX_THREAD_MPI | |||
113 | /* The minimum number of atoms per tMPI thread. With fewer atoms than this, | |||
114 | * the number of threads will get lowered. | |||
115 | */ | |||
116 | #define MIN_ATOMS_PER_MPI_THREAD90 90 | |||
117 | #define MIN_ATOMS_PER_GPU900 900 | |||
118 | ||||
119 | struct mdrunner_arglist | |||
120 | { | |||
121 | gmx_hw_opt_t hw_opt; | |||
122 | FILE *fplog; | |||
123 | t_commrec *cr; | |||
124 | int nfile; | |||
125 | const t_filenm *fnm; | |||
126 | output_env_t oenv; | |||
127 | gmx_bool bVerbose; | |||
128 | gmx_bool bCompact; | |||
129 | int nstglobalcomm; | |||
130 | ivec ddxyz; | |||
131 | int dd_node_order; | |||
132 | real rdd; | |||
133 | real rconstr; | |||
134 | const char *dddlb_opt; | |||
135 | real dlb_scale; | |||
136 | const char *ddcsx; | |||
137 | const char *ddcsy; | |||
138 | const char *ddcsz; | |||
139 | const char *nbpu_opt; | |||
140 | int nstlist_cmdline; | |||
141 | gmx_int64_t nsteps_cmdline; | |||
142 | int nstepout; | |||
143 | int resetstep; | |||
144 | int nmultisim; | |||
145 | int repl_ex_nst; | |||
146 | int repl_ex_nex; | |||
147 | int repl_ex_seed; | |||
148 | real pforce; | |||
149 | real cpt_period; | |||
150 | real max_hours; | |||
151 | const char *deviceOptions; | |||
152 | int imdport; | |||
153 | unsigned long Flags; | |||
154 | }; | |||
155 | ||||
156 | ||||
157 | /* The function used for spawning threads. Extracts the mdrunner() | |||
158 | arguments from its one argument and calls mdrunner(), after making | |||
159 | a commrec. */ | |||
160 | static void mdrunner_start_fn(void *arg) | |||
161 | { | |||
162 | struct mdrunner_arglist *mda = (struct mdrunner_arglist*)arg; | |||
163 | struct mdrunner_arglist mc = *mda; /* copy the arg list to make sure | |||
164 | that it's thread-local. This doesn't | |||
165 | copy pointed-to items, of course, | |||
166 | but those are all const. */ | |||
167 | t_commrec *cr; /* we need a local version of this */ | |||
168 | FILE *fplog = NULL((void*)0); | |||
169 | t_filenm *fnm; | |||
170 | ||||
171 | fnm = dup_tfn(mc.nfile, mc.fnm); | |||
172 | ||||
173 | cr = reinitialize_commrec_for_this_thread(mc.cr); | |||
174 | ||||
175 | if (MASTER(cr)(((cr)->nodeid == 0) || !((cr)->nnodes > 1))) | |||
176 | { | |||
177 | fplog = mc.fplog; | |||
178 | } | |||
179 | ||||
180 | mdrunner(&mc.hw_opt, fplog, cr, mc.nfile, fnm, mc.oenv, | |||
181 | mc.bVerbose, mc.bCompact, mc.nstglobalcomm, | |||
182 | mc.ddxyz, mc.dd_node_order, mc.rdd, | |||
183 | mc.rconstr, mc.dddlb_opt, mc.dlb_scale, | |||
184 | mc.ddcsx, mc.ddcsy, mc.ddcsz, | |||
185 | mc.nbpu_opt, mc.nstlist_cmdline, | |||
186 | mc.nsteps_cmdline, mc.nstepout, mc.resetstep, | |||
187 | mc.nmultisim, mc.repl_ex_nst, mc.repl_ex_nex, mc.repl_ex_seed, mc.pforce, | |||
188 | mc.cpt_period, mc.max_hours, mc.deviceOptions, mc.imdport, mc.Flags); | |||
189 | } | |||
190 | ||||
191 | /* called by mdrunner() to start a specific number of threads (including | |||
192 | the main thread) for thread-parallel runs. This in turn calls mdrunner() | |||
193 | for each thread. | |||
194 | All options besides nthreads are the same as for mdrunner(). */ | |||
195 | static t_commrec *mdrunner_start_threads(gmx_hw_opt_t *hw_opt, | |||
196 | FILE *fplog, t_commrec *cr, int nfile, | |||
197 | const t_filenm fnm[], const output_env_t oenv, gmx_bool bVerbose, | |||
198 | gmx_bool bCompact, int nstglobalcomm, | |||
199 | ivec ddxyz, int dd_node_order, real rdd, real rconstr, | |||
200 | const char *dddlb_opt, real dlb_scale, | |||
201 | const char *ddcsx, const char *ddcsy, const char *ddcsz, | |||
202 | const char *nbpu_opt, int nstlist_cmdline, | |||
203 | gmx_int64_t nsteps_cmdline, | |||
204 | int nstepout, int resetstep, | |||
205 | int nmultisim, int repl_ex_nst, int repl_ex_nex, int repl_ex_seed, | |||
206 | real pforce, real cpt_period, real max_hours, | |||
207 | const char *deviceOptions, unsigned long Flags) | |||
208 | { | |||
209 | int ret; | |||
210 | struct mdrunner_arglist *mda; | |||
211 | t_commrec *crn; /* the new commrec */ | |||
212 | t_filenm *fnmn; | |||
213 | ||||
214 | /* first check whether we even need to start tMPI */ | |||
215 | if (hw_opt->nthreads_tmpi < 2) | |||
216 | { | |||
217 | return cr; | |||
218 | } | |||
219 | ||||
220 | /* a few small, one-time, almost unavoidable memory leaks: */ | |||
221 | snew(mda, 1)(mda) = save_calloc("mda", "/home/alexxy/Develop/gromacs/src/programs/mdrun/runner.c" , 221, (1), sizeof(*(mda))); | |||
222 | fnmn = dup_tfn(nfile, fnm); | |||
223 | ||||
224 | /* fill the data structure to pass as void pointer to thread start fn */ | |||
225 | /* hw_opt contains pointers, which should all be NULL at this stage */ | |||
226 | mda->hw_opt = *hw_opt; | |||
227 | mda->fplog = fplog; | |||
228 | mda->cr = cr; | |||
229 | mda->nfile = nfile; | |||
230 | mda->fnm = fnmn; | |||
231 | mda->oenv = oenv; | |||
232 | mda->bVerbose = bVerbose; | |||
233 | mda->bCompact = bCompact; | |||
234 | mda->nstglobalcomm = nstglobalcomm; | |||
235 | mda->ddxyz[XX0] = ddxyz[XX0]; | |||
236 | mda->ddxyz[YY1] = ddxyz[YY1]; | |||
237 | mda->ddxyz[ZZ2] = ddxyz[ZZ2]; | |||
238 | mda->dd_node_order = dd_node_order; | |||
239 | mda->rdd = rdd; | |||
240 | mda->rconstr = rconstr; | |||
241 | mda->dddlb_opt = dddlb_opt; | |||
242 | mda->dlb_scale = dlb_scale; | |||
243 | mda->ddcsx = ddcsx; | |||
244 | mda->ddcsy = ddcsy; | |||
245 | mda->ddcsz = ddcsz; | |||
246 | mda->nbpu_opt = nbpu_opt; | |||
247 | mda->nstlist_cmdline = nstlist_cmdline; | |||
248 | mda->nsteps_cmdline = nsteps_cmdline; | |||
249 | mda->nstepout = nstepout; | |||
250 | mda->resetstep = resetstep; | |||
251 | mda->nmultisim = nmultisim; | |||
252 | mda->repl_ex_nst = repl_ex_nst; | |||
253 | mda->repl_ex_nex = repl_ex_nex; | |||
254 | mda->repl_ex_seed = repl_ex_seed; | |||
255 | mda->pforce = pforce; | |||
256 | mda->cpt_period = cpt_period; | |||
257 | mda->max_hours = max_hours; | |||
258 | mda->deviceOptions = deviceOptions; | |||
259 | mda->Flags = Flags; | |||
260 | ||||
261 | /* now spawn new threads that start mdrunner_start_fn(), while | |||
262 | the main thread returns, we set thread affinity later */ | |||
263 | ret = tMPI_Init_fn(TRUE1, hw_opt->nthreads_tmpi, TMPI_AFFINITY_NONE, | |||
264 | mdrunner_start_fn, (void*)(mda) ); | |||
265 | if (ret != TMPI_SUCCESS) | |||
266 | { | |||
267 | return NULL((void*)0); | |||
268 | } | |||
269 | ||||
270 | crn = reinitialize_commrec_for_this_thread(cr); | |||
271 | return crn; | |||
272 | } | |||
273 | ||||
274 | ||||
275 | static int get_tmpi_omp_thread_division(const gmx_hw_info_t *hwinfo, | |||
276 | const gmx_hw_opt_t *hw_opt, | |||
277 | int nthreads_tot, | |||
278 | int ngpu) | |||
279 | { | |||
280 | int nthreads_tmpi; | |||
281 | ||||
282 | /* There are no separate PME nodes here, as we ensured in | |||
283 | * check_and_update_hw_opt that nthreads_tmpi>0 with PME nodes | |||
284 | * and a conditional ensures we would not have ended up here. | |||
285 | * Note that separate PME nodes might be switched on later. | |||
286 | */ | |||
287 | if (ngpu > 0) | |||
288 | { | |||
289 | nthreads_tmpi = ngpu; | |||
290 | if (nthreads_tot > 0 && nthreads_tot < nthreads_tmpi) | |||
291 | { | |||
292 | nthreads_tmpi = nthreads_tot; | |||
293 | } | |||
294 | } | |||
295 | else if (hw_opt->nthreads_omp > 0) | |||
296 | { | |||
297 | /* Here we could oversubscribe, when we do, we issue a warning later */ | |||
298 | nthreads_tmpi = max(1, nthreads_tot/hw_opt->nthreads_omp)(((1) > (nthreads_tot/hw_opt->nthreads_omp)) ? (1) : (nthreads_tot /hw_opt->nthreads_omp) ); | |||
299 | } | |||
300 | else | |||
301 | { | |||
302 | /* TODO choose nthreads_omp based on hardware topology | |||
303 | when we have a hardware topology detection library */ | |||
304 | /* In general, when running up to 4 threads, OpenMP should be faster. | |||
305 | * Note: on AMD Bulldozer we should avoid running OpenMP over two dies. | |||
306 | * On Intel>=Nehalem running OpenMP on a single CPU is always faster, | |||
307 | * even on two CPUs it's usually faster (but with many OpenMP threads | |||
308 | * it could be faster not to use HT, currently we always use HT). | |||
309 | * On Nehalem/Westmere we want to avoid running 16 threads over | |||
310 | * two CPUs with HT, so we need a limit<16; thus we use 12. | |||
311 | * A reasonable limit for Intel Sandy and Ivy bridge, | |||
312 | * not knowing the topology, is 16 threads. | |||
313 | */ | |||
314 | const int nthreads_omp_always_faster = 4; | |||
315 | const int nthreads_omp_always_faster_Nehalem = 12; | |||
316 | const int nthreads_omp_always_faster_SandyBridge = 16; | |||
317 | const int first_model_Nehalem = 0x1A; | |||
318 | const int first_model_SandyBridge = 0x2A; | |||
319 | gmx_bool bIntel_Family6; | |||
320 | ||||
321 | bIntel_Family6 = | |||
322 | (gmx_cpuid_vendor(hwinfo->cpuid_info) == GMX_CPUID_VENDOR_INTEL && | |||
323 | gmx_cpuid_family(hwinfo->cpuid_info) == 6); | |||
324 | ||||
325 | if (nthreads_tot <= nthreads_omp_always_faster || | |||
326 | (bIntel_Family6 && | |||
327 | ((gmx_cpuid_model(hwinfo->cpuid_info) >= nthreads_omp_always_faster_Nehalem && nthreads_tot <= nthreads_omp_always_faster_Nehalem) || | |||
328 | (gmx_cpuid_model(hwinfo->cpuid_info) >= nthreads_omp_always_faster_SandyBridge && nthreads_tot <= nthreads_omp_always_faster_SandyBridge)))) | |||
329 | { | |||
330 | /* Use pure OpenMP parallelization */ | |||
331 | nthreads_tmpi = 1; | |||
332 | } | |||
333 | else | |||
334 | { | |||
335 | /* Don't use OpenMP parallelization */ | |||
336 | nthreads_tmpi = nthreads_tot; | |||
337 | } | |||
338 | } | |||
339 | ||||
340 | return nthreads_tmpi; | |||
341 | } | |||
342 | ||||
343 | ||||
344 | /* Get the number of threads to use for thread-MPI based on how many | |||
345 | * were requested, which algorithms we're using, | |||
346 | * and how many particles there are. | |||
347 | * At the point we have already called check_and_update_hw_opt. | |||
348 | * Thus all options should be internally consistent and consistent | |||
349 | * with the hardware, except that ntmpi could be larger than #GPU. | |||
350 | */ | |||
351 | static int get_nthreads_mpi(const gmx_hw_info_t *hwinfo, | |||
352 | gmx_hw_opt_t *hw_opt, | |||
353 | t_inputrec *inputrec, gmx_mtop_t *mtop, | |||
354 | const t_commrec *cr, | |||
355 | FILE *fplog) | |||
356 | { | |||
357 | int nthreads_hw, nthreads_tot_max, nthreads_tmpi, nthreads_new, ngpu; | |||
358 | int min_atoms_per_mpi_thread; | |||
359 | char *env; | |||
360 | char sbuf[STRLEN4096]; | |||
361 | gmx_bool bCanUseGPU; | |||
362 | ||||
363 | if (hw_opt->nthreads_tmpi > 0) | |||
364 | { | |||
365 | /* Trivial, return right away */ | |||
366 | return hw_opt->nthreads_tmpi; | |||
367 | } | |||
368 | ||||
369 | nthreads_hw = hwinfo->nthreads_hw_avail; | |||
370 | ||||
371 | /* How many total (#tMPI*#OpenMP) threads can we start? */ | |||
372 | if (hw_opt->nthreads_tot > 0) | |||
373 | { | |||
374 | nthreads_tot_max = hw_opt->nthreads_tot; | |||
375 | } | |||
376 | else | |||
377 | { | |||
378 | nthreads_tot_max = nthreads_hw; | |||
379 | } | |||
380 | ||||
381 | bCanUseGPU = (inputrec->cutoff_scheme == ecutsVERLET && | |||
382 | hwinfo->gpu_info.ncuda_dev_compatible > 0); | |||
383 | if (bCanUseGPU) | |||
384 | { | |||
385 | ngpu = hwinfo->gpu_info.ncuda_dev_compatible; | |||
386 | } | |||
387 | else | |||
388 | { | |||
389 | ngpu = 0; | |||
390 | } | |||
391 | ||||
392 | if (inputrec->cutoff_scheme == ecutsGROUP) | |||
393 | { | |||
394 | /* We checked this before, but it doesn't hurt to do it once more */ | |||
395 | assert(hw_opt->nthreads_omp == 1)((void) (0)); | |||
396 | } | |||
397 | ||||
398 | nthreads_tmpi = | |||
399 | get_tmpi_omp_thread_division(hwinfo, hw_opt, nthreads_tot_max, ngpu); | |||
400 | ||||
401 | if (inputrec->eI == eiNM || EI_TPI(inputrec->eI)((inputrec->eI) == eiTPI || (inputrec->eI) == eiTPIC)) | |||
402 | { | |||
403 | /* Dims/steps are divided over the nodes iso splitting the atoms */ | |||
404 | min_atoms_per_mpi_thread = 0; | |||
405 | } | |||
406 | else | |||
407 | { | |||
408 | if (bCanUseGPU) | |||
409 | { | |||
410 | min_atoms_per_mpi_thread = MIN_ATOMS_PER_GPU900; | |||
411 | } | |||
412 | else | |||
413 | { | |||
414 | min_atoms_per_mpi_thread = MIN_ATOMS_PER_MPI_THREAD90; | |||
415 | } | |||
416 | } | |||
417 | ||||
418 | /* Check if an algorithm does not support parallel simulation. */ | |||
419 | if (nthreads_tmpi != 1 && | |||
420 | ( inputrec->eI == eiLBFGS || | |||
421 | inputrec->coulombtype == eelEWALD ) ) | |||
422 | { | |||
423 | nthreads_tmpi = 1; | |||
424 | ||||
425 | md_print_warn(cr, fplog, "The integration or electrostatics algorithm doesn't support parallel runs. Using a single thread-MPI thread.\n"); | |||
426 | if (hw_opt->nthreads_tmpi > nthreads_tmpi) | |||
427 | { | |||
428 | gmx_fatal(FARGS0, "/home/alexxy/Develop/gromacs/src/programs/mdrun/runner.c" , 428, "You asked for more than 1 thread-MPI thread, but an algorithm doesn't support that"); | |||
429 | } | |||
430 | } | |||
431 | else if (mtop->natoms/nthreads_tmpi < min_atoms_per_mpi_thread) | |||
432 | { | |||
433 | /* the thread number was chosen automatically, but there are too many | |||
434 | threads (too few atoms per thread) */ | |||
435 | nthreads_new = max(1, mtop->natoms/min_atoms_per_mpi_thread)(((1) > (mtop->natoms/min_atoms_per_mpi_thread)) ? (1) : (mtop->natoms/min_atoms_per_mpi_thread) ); | |||
436 | ||||
437 | /* Avoid partial use of Hyper-Threading */ | |||
438 | if (gmx_cpuid_x86_smt(hwinfo->cpuid_info) == GMX_CPUID_X86_SMT_ENABLED && | |||
439 | nthreads_new > nthreads_hw/2 && nthreads_new < nthreads_hw) | |||
440 | { | |||
441 | nthreads_new = nthreads_hw/2; | |||
442 | } | |||
443 | ||||
444 | /* Avoid large prime numbers in the thread count */ | |||
445 | if (nthreads_new >= 6) | |||
446 | { | |||
447 | /* Use only 6,8,10 with additional factors of 2 */ | |||
448 | int fac; | |||
449 | ||||
450 | fac = 2; | |||
451 | while (3*fac*2 <= nthreads_new) | |||
452 | { | |||
453 | fac *= 2; | |||
454 | } | |||
455 | ||||
456 | nthreads_new = (nthreads_new/fac)*fac; | |||
457 | } | |||
458 | else | |||
459 | { | |||
460 | /* Avoid 5 */ | |||
461 | if (nthreads_new == 5) | |||
462 | { | |||
463 | nthreads_new = 4; | |||
464 | } | |||
465 | } | |||
466 | ||||
467 | nthreads_tmpi = nthreads_new; | |||
468 | ||||
469 | fprintf(stderrstderr, "\n"); | |||
470 | fprintf(stderrstderr, "NOTE: Parallelization is limited by the small number of atoms,\n"); | |||
471 | fprintf(stderrstderr, " only starting %d thread-MPI threads.\n", nthreads_tmpi); | |||
472 | fprintf(stderrstderr, " You can use the -nt and/or -ntmpi option to optimize the number of threads.\n\n"); | |||
473 | } | |||
474 | ||||
475 | return nthreads_tmpi; | |||
476 | } | |||
477 | #endif /* GMX_THREAD_MPI */ | |||
478 | ||||
479 | ||||
/* We determine the extra cost of the non-bonded kernels compared to
 * a reference nstlist value of 10 (which is the default in grompp).
 */
static const int    nbnxn_reference_nstlist = 10;
/* The values to try when switching */
const int           nstlist_try[] = { 20, 25, 40 };
/* The number of entries in nstlist_try. The expansion is parenthesized
 * so that the macro can be used safely inside larger expressions.
 */
#define NNSTL  (sizeof(nstlist_try)/sizeof(nstlist_try[0]))
/* Increase nstlist until the non-bonded cost increases more than listfac_ok,
 * but never more than listfac_max.
 * A standard (protein+)water system at 300K with PME ewald_rtol=1e-5
 * needs 1.28 at rcoulomb=0.9 and 1.24 at rcoulomb=1.0 to get to nstlist=40.
 * Note that both CPU and GPU factors are conservative. Performance should
 * not go down due to this tuning, except with a relatively slow GPU.
 * On the other hand, at medium/high parallelization or with fast GPUs
 * nstlist will not be increased enough to reach optimal performance.
 */
/* CPU: pair-search is about a factor 1.5 slower than the non-bonded kernel */
static const float  nbnxn_cpu_listfac_ok    = 1.05;
static const float  nbnxn_cpu_listfac_max   = 1.09;
/* GPU: pair-search is a factor 1.5-3 slower than the non-bonded kernel */
static const float  nbnxn_gpu_listfac_ok    = 1.20;
static const float  nbnxn_gpu_listfac_max   = 1.30;
502 | ||||
503 | /* Try to increase nstlist when using the Verlet cut-off scheme */ | |||
504 | static void increase_nstlist(FILE *fp, t_commrec *cr, | |||
505 | t_inputrec *ir, int nstlist_cmdline, | |||
506 | const gmx_mtop_t *mtop, matrix box, | |||
507 | gmx_bool bGPU) | |||
508 | { | |||
509 | float listfac_ok, listfac_max; | |||
510 | int nstlist_orig, nstlist_prev; | |||
511 | verletbuf_list_setup_t ls; | |||
512 | real rlist_nstlist10, rlist_inc, rlist_ok, rlist_max; | |||
513 | real rlist_new, rlist_prev; | |||
514 | int nstlist_ind = 0; | |||
515 | t_state state_tmp; | |||
516 | gmx_bool bBox, bDD, bCont; | |||
517 | const char *nstl_gpu = "\nFor optimal performance with a GPU nstlist (now %d) should be larger.\nThe optimum depends on your CPU and GPU resources.\nYou might want to try several nstlist values.\n"; | |||
518 | const char *nve_err = "Can not increase nstlist because an NVE ensemble is used"; | |||
519 | const char *vbd_err = "Can not increase nstlist because verlet-buffer-tolerance is not set or used"; | |||
520 | const char *box_err = "Can not increase nstlist because the box is too small"; | |||
521 | const char *dd_err = "Can not increase nstlist because of domain decomposition limitations"; | |||
522 | char buf[STRLEN4096]; | |||
523 | ||||
524 | if (nstlist_cmdline <= 0) | |||
525 | { | |||
526 | if (fp != NULL((void*)0) && bGPU && ir->nstlist < nstlist_try[0]) | |||
527 | { | |||
528 | fprintf(fp, nstl_gpu, ir->nstlist); | |||
529 | } | |||
530 | nstlist_ind = 0; | |||
531 | while (nstlist_ind < NNSTLsizeof(nstlist_try)/sizeof(nstlist_try[0]) && ir->nstlist >= nstlist_try[nstlist_ind]) | |||
532 | { | |||
533 | nstlist_ind++; | |||
534 | } | |||
535 | if (nstlist_ind == NNSTLsizeof(nstlist_try)/sizeof(nstlist_try[0])) | |||
536 | { | |||
537 | /* There are no larger nstlist value to try */ | |||
538 | return; | |||
539 | } | |||
540 | } | |||
541 | ||||
542 | if (EI_MD(ir->eI)((ir->eI) == eiMD || ((ir->eI) == eiVV || (ir->eI) == eiVVAK)) && ir->etc == etcNO) | |||
543 | { | |||
544 | if (MASTER(cr)(((cr)->nodeid == 0) || !((cr)->nnodes > 1))) | |||
545 | { | |||
546 | fprintf(stderrstderr, "%s\n", nve_err); | |||
547 | } | |||
548 | if (fp != NULL((void*)0)) | |||
549 | { | |||
550 | fprintf(fp, "%s\n", nve_err); | |||
551 | } | |||
552 | ||||
553 | return; | |||
554 | } | |||
555 | ||||
556 | if (ir->verletbuf_tol == 0 && bGPU) | |||
557 | { | |||
558 | gmx_fatal(FARGS0, "/home/alexxy/Develop/gromacs/src/programs/mdrun/runner.c" , 558, "You are using an old tpr file with a GPU, please generate a new tpr file with an up to date version of grompp"); | |||
559 | } | |||
560 | ||||
561 | if (ir->verletbuf_tol < 0) | |||
562 | { | |||
563 | if (MASTER(cr)(((cr)->nodeid == 0) || !((cr)->nnodes > 1))) | |||
564 | { | |||
565 | fprintf(stderrstderr, "%s\n", vbd_err); | |||
566 | } | |||
567 | if (fp != NULL((void*)0)) | |||
568 | { | |||
569 | fprintf(fp, "%s\n", vbd_err); | |||
570 | } | |||
571 | ||||
572 | return; | |||
573 | } | |||
574 | ||||
575 | if (bGPU) | |||
576 | { | |||
577 | listfac_ok = nbnxn_gpu_listfac_ok; | |||
578 | listfac_max = nbnxn_gpu_listfac_max; | |||
579 | } | |||
580 | else | |||
581 | { | |||
582 | listfac_ok = nbnxn_cpu_listfac_ok; | |||
583 | listfac_max = nbnxn_cpu_listfac_max; | |||
584 | } | |||
585 | ||||
586 | nstlist_orig = ir->nstlist; | |||
587 | if (nstlist_cmdline > 0) | |||
588 | { | |||
589 | if (fp) | |||
590 | { | |||
591 | sprintf(buf, "Getting nstlist=%d from command line option", | |||
592 | nstlist_cmdline); | |||
593 | } | |||
594 | ir->nstlist = nstlist_cmdline; | |||
595 | } | |||
596 | ||||
597 | verletbuf_get_list_setup(bGPU, &ls); | |||
598 | ||||
599 | /* Allow rlist to make the list a given factor larger than the list | |||
600 | * would be with nstlist=10. | |||
601 | */ | |||
602 | nstlist_prev = ir->nstlist; | |||
603 | ir->nstlist = 10; | |||
604 | calc_verlet_buffer_size(mtop, det(box), ir, -1, &ls, NULL((void*)0), | |||
605 | &rlist_nstlist10); | |||
606 | ir->nstlist = nstlist_prev; | |||
607 | ||||
608 | /* Determine the pair list size increase due to zero interactions */ | |||
609 | rlist_inc = nbnxn_get_rlist_effective_inc(ls.cluster_size_j, | |||
610 | mtop->natoms/det(box)); | |||
611 | rlist_ok = (rlist_nstlist10 + rlist_inc)*pow(listfac_ok, 1.0/3.0) - rlist_inc; | |||
612 | rlist_max = (rlist_nstlist10 + rlist_inc)*pow(listfac_max, 1.0/3.0) - rlist_inc; | |||
613 | if (debug) | |||
614 | { | |||
615 | fprintf(debug, "nstlist tuning: rlist_inc %.3f rlist_ok %.3f rlist_max %.3f\n", | |||
616 | rlist_inc, rlist_ok, rlist_max); | |||
617 | } | |||
618 | ||||
619 | nstlist_prev = nstlist_orig; | |||
620 | rlist_prev = ir->rlist; | |||
621 | do | |||
622 | { | |||
623 | if (nstlist_cmdline <= 0) | |||
624 | { | |||
625 | ir->nstlist = nstlist_try[nstlist_ind]; | |||
626 | } | |||
627 | ||||
628 | /* Set the pair-list buffer size in ir */ | |||
629 | calc_verlet_buffer_size(mtop, det(box), ir, -1, &ls, NULL((void*)0), &rlist_new); | |||
630 | ||||
631 | /* Does rlist fit in the box? */ | |||
632 | bBox = (sqr(rlist_new) < max_cutoff2(ir->ePBC, box)); | |||
633 | bDD = TRUE1; | |||
634 | if (bBox && DOMAINDECOMP(cr)(((cr)->dd != ((void*)0)) && ((cr)->nnodes > 1))) | |||
635 | { | |||
636 | /* Check if rlist fits in the domain decomposition */ | |||
637 | if (inputrec2nboundeddim(ir) < DIM3) | |||
638 | { | |||
639 | gmx_incons("Changing nstlist with domain decomposition and unbounded dimensions is not implemented yet")_gmx_error("incons", "Changing nstlist with domain decomposition and unbounded dimensions is not implemented yet" , "/home/alexxy/Develop/gromacs/src/programs/mdrun/runner.c", 639); | |||
640 | } | |||
641 | copy_mat(box, state_tmp.box); | |||
642 | bDD = change_dd_cutoff(cr, &state_tmp, ir, rlist_new); | |||
643 | } | |||
644 | ||||
645 | if (debug) | |||
646 | { | |||
647 | fprintf(debug, "nstlist %d rlist %.3f bBox %d bDD %d\n", | |||
648 | ir->nstlist, rlist_new, bBox, bDD); | |||
649 | } | |||
650 | ||||
651 | bCont = FALSE0; | |||
652 | ||||
653 | if (nstlist_cmdline <= 0) | |||
654 | { | |||
655 | if (bBox && bDD && rlist_new <= rlist_max) | |||
656 | { | |||
657 | /* Increase nstlist */ | |||
658 | nstlist_prev = ir->nstlist; | |||
659 | rlist_prev = rlist_new; | |||
660 | bCont = (nstlist_ind+1 < NNSTLsizeof(nstlist_try)/sizeof(nstlist_try[0]) && rlist_new < rlist_ok); | |||
661 | } | |||
662 | else | |||
663 | { | |||
664 | /* Stick with the previous nstlist */ | |||
665 | ir->nstlist = nstlist_prev; | |||
666 | rlist_new = rlist_prev; | |||
667 | bBox = TRUE1; | |||
668 | bDD = TRUE1; | |||
669 | } | |||
670 | } | |||
671 | ||||
672 | nstlist_ind++; | |||
673 | } | |||
674 | while (bCont); | |||
675 | ||||
676 | if (!bBox || !bDD) | |||
677 | { | |||
678 | gmx_warning(!bBox ? box_err : dd_err); | |||
679 | if (fp != NULL((void*)0)) | |||
680 | { | |||
681 | fprintf(fp, "\n%s\n", bBox ? box_err : dd_err); | |||
682 | } | |||
683 | ir->nstlist = nstlist_orig; | |||
684 | } | |||
685 | else if (ir->nstlist != nstlist_orig || rlist_new != ir->rlist) | |||
686 | { | |||
687 | sprintf(buf, "Changing nstlist from %d to %d, rlist from %g to %g", | |||
688 | nstlist_orig, ir->nstlist, | |||
689 | ir->rlist, rlist_new); | |||
690 | if (MASTER(cr)(((cr)->nodeid == 0) || !((cr)->nnodes > 1))) | |||
691 | { | |||
692 | fprintf(stderrstderr, "%s\n\n", buf); | |||
693 | } | |||
694 | if (fp != NULL((void*)0)) | |||
695 | { | |||
696 | fprintf(fp, "%s\n\n", buf); | |||
697 | } | |||
698 | ir->rlist = rlist_new; | |||
699 | ir->rlistlong = rlist_new; | |||
700 | } | |||
701 | } | |||
702 | ||||
703 | static void prepare_verlet_scheme(FILE *fplog, | |||
704 | t_commrec *cr, | |||
705 | t_inputrec *ir, | |||
706 | int nstlist_cmdline, | |||
707 | const gmx_mtop_t *mtop, | |||
708 | matrix box, | |||
709 | gmx_bool bUseGPU) | |||
710 | { | |||
711 | /* For NVE simulations, we will retain the initial list buffer */ | |||
712 | if (ir->verletbuf_tol > 0 && !(EI_MD(ir->eI)((ir->eI) == eiMD || ((ir->eI) == eiVV || (ir->eI) == eiVVAK)) && ir->etc == etcNO)) | |||
713 | { | |||
714 | /* Update the Verlet buffer size for the current run setup */ | |||
715 | verletbuf_list_setup_t ls; | |||
716 | real rlist_new; | |||
717 | ||||
718 | /* Here we assume SIMD-enabled kernels are being used. But as currently | |||
719 | * calc_verlet_buffer_size gives the same results for 4x8 and 4x4 | |||
720 | * and 4x2 gives a larger buffer than 4x4, this is ok. | |||
721 | */ | |||
722 | verletbuf_get_list_setup(bUseGPU, &ls); | |||
723 | ||||
724 | calc_verlet_buffer_size(mtop, det(box), ir, -1, &ls, NULL((void*)0), &rlist_new); | |||
725 | ||||
726 | if (rlist_new != ir->rlist) | |||
727 | { | |||
728 | if (fplog != NULL((void*)0)) | |||
729 | { | |||
730 | fprintf(fplog, "\nChanging rlist from %g to %g for non-bonded %dx%d atom kernels\n\n", | |||
731 | ir->rlist, rlist_new, | |||
732 | ls.cluster_size_i, ls.cluster_size_j); | |||
733 | } | |||
734 | ir->rlist = rlist_new; | |||
735 | ir->rlistlong = rlist_new; | |||
736 | } | |||
737 | } | |||
738 | ||||
739 | if (nstlist_cmdline > 0 && (!EI_DYNAMICS(ir->eI)(((ir->eI) == eiMD || ((ir->eI) == eiVV || (ir->eI) == eiVVAK)) || ((ir->eI) == eiSD1 || (ir->eI) == eiSD2) || (ir->eI) == eiBD) || ir->verletbuf_tol <= 0)) | |||
740 | { | |||
741 | gmx_fatal(FARGS0, "/home/alexxy/Develop/gromacs/src/programs/mdrun/runner.c" , 741, "Can not set nstlist without %s", | |||
742 | !EI_DYNAMICS(ir->eI)(((ir->eI) == eiMD || ((ir->eI) == eiVV || (ir->eI) == eiVVAK)) || ((ir->eI) == eiSD1 || (ir->eI) == eiSD2) || (ir->eI) == eiBD) ? "dynamics" : "verlet-buffer-tolerance"); | |||
743 | } | |||
744 | ||||
745 | if (EI_DYNAMICS(ir->eI)(((ir->eI) == eiMD || ((ir->eI) == eiVV || (ir->eI) == eiVVAK)) || ((ir->eI) == eiSD1 || (ir->eI) == eiSD2) || (ir->eI) == eiBD)) | |||
746 | { | |||
747 | /* Set or try nstlist values */ | |||
748 | increase_nstlist(fplog, cr, ir, nstlist_cmdline, mtop, box, bUseGPU); | |||
749 | } | |||
750 | } | |||
751 | ||||
752 | static void convert_to_verlet_scheme(FILE *fplog, | |||
753 | t_inputrec *ir, | |||
754 | gmx_mtop_t *mtop, real box_vol) | |||
755 | { | |||
756 | char *conv_mesg = "Converting input file with group cut-off scheme to the Verlet cut-off scheme"; | |||
757 | ||||
758 | md_print_warn(NULL((void*)0), fplog, "%s\n", conv_mesg); | |||
759 | ||||
760 | ir->cutoff_scheme = ecutsVERLET; | |||
761 | ir->verletbuf_tol = 0.005; | |||
762 | ||||
763 | if (ir->rcoulomb != ir->rvdw) | |||
764 | { | |||
765 | gmx_fatal(FARGS0, "/home/alexxy/Develop/gromacs/src/programs/mdrun/runner.c" , 765, "The VdW and Coulomb cut-offs are different, whereas the Verlet scheme only supports equal cut-offs"); | |||
766 | } | |||
767 | ||||
768 | if (ir->vdwtype == evdwUSER || EEL_USER(ir->coulombtype)((ir->coulombtype) == eelUSER || (ir->coulombtype) == eelPMEUSER || (ir->coulombtype) == (eelPMEUSERSWITCH))) | |||
769 | { | |||
770 | gmx_fatal(FARGS0, "/home/alexxy/Develop/gromacs/src/programs/mdrun/runner.c" , 770, "User non-bonded potentials are not (yet) supported with the Verlet scheme"); | |||
771 | } | |||
772 | else if (ir_vdw_switched(ir) || ir_coulomb_switched(ir)) | |||
773 | { | |||
774 | if (ir_vdw_switched(ir) && ir->vdw_modifier == eintmodNONE) | |||
775 | { | |||
776 | ir->vdwtype = evdwCUT; | |||
777 | ||||
778 | switch (ir->vdwtype) | |||
779 | { | |||
780 | case evdwSHIFT: ir->vdw_modifier = eintmodFORCESWITCH; break; | |||
781 | case evdwSWITCH: ir->vdw_modifier = eintmodPOTSWITCH; break; | |||
782 | default: gmx_fatal(FARGS0, "/home/alexxy/Develop/gromacs/src/programs/mdrun/runner.c" , 782, "The Verlet scheme does not support Van der Waals interactions of type '%s'", evdw_names[ir->vdwtype]); | |||
783 | } | |||
784 | } | |||
785 | if (ir_coulomb_switched(ir) && ir->coulomb_modifier == eintmodNONE) | |||
786 | { | |||
787 | if (EEL_FULL(ir->coulombtype)((((ir->coulombtype) == eelPME || (ir->coulombtype) == eelPMESWITCH || (ir->coulombtype) == eelPMEUSER || (ir->coulombtype ) == eelPMEUSERSWITCH || (ir->coulombtype) == eelP3M_AD) || (ir->coulombtype) == eelEWALD) || (ir->coulombtype) == eelPOISSON)) | |||
788 | { | |||
789 | /* With full electrostatic only PME can be switched */ | |||
790 | ir->coulombtype = eelPME; | |||
791 | ir->coulomb_modifier = eintmodPOTSHIFT; | |||
792 | } | |||
793 | else | |||
794 | { | |||
795 | md_print_warn(NULL((void*)0), fplog, "NOTE: Replacing %s electrostatics with reaction-field with epsilon-rf=inf\n", eel_names[ir->coulombtype]); | |||
796 | ir->coulombtype = eelRF; | |||
797 | ir->epsilon_rf = 0.0; | |||
798 | ir->coulomb_modifier = eintmodPOTSHIFT; | |||
799 | } | |||
800 | } | |||
801 | ||||
802 | /* We set the pair energy error tolerance to a small number. | |||
803 | * Note that this is only for testing. For production the user | |||
804 | * should think about this and set the mdp options. | |||
805 | */ | |||
806 | ir->verletbuf_tol = 1e-4; | |||
807 | } | |||
808 | ||||
809 | if (inputrec2nboundeddim(ir) != 3) | |||
810 | { | |||
811 | gmx_fatal(FARGS0, "/home/alexxy/Develop/gromacs/src/programs/mdrun/runner.c" , 811, "Can only convert old tpr files to the Verlet cut-off scheme with 3D pbc"); | |||
812 | } | |||
813 | ||||
814 | if (ir->efep != efepNO || ir->implicit_solvent != eisNO) | |||
815 | { | |||
816 | gmx_fatal(FARGS0, "/home/alexxy/Develop/gromacs/src/programs/mdrun/runner.c" , 816, "Will not convert old tpr files to the Verlet cut-off scheme with free-energy calculations or implicit solvent"); | |||
817 | } | |||
818 | ||||
819 | if (EI_DYNAMICS(ir->eI)(((ir->eI) == eiMD || ((ir->eI) == eiVV || (ir->eI) == eiVVAK)) || ((ir->eI) == eiSD1 || (ir->eI) == eiSD2) || (ir->eI) == eiBD) && !(EI_MD(ir->eI)((ir->eI) == eiMD || ((ir->eI) == eiVV || (ir->eI) == eiVVAK)) && ir->etc == etcNO)) | |||
820 | { | |||
821 | verletbuf_list_setup_t ls; | |||
822 | ||||
823 | verletbuf_get_list_setup(FALSE0, &ls); | |||
824 | calc_verlet_buffer_size(mtop, box_vol, ir, -1, &ls, NULL((void*)0), &ir->rlist); | |||
825 | } | |||
826 | else | |||
827 | { | |||
828 | real rlist_fac; | |||
829 | ||||
830 | if (EI_MD(ir->eI)((ir->eI) == eiMD || ((ir->eI) == eiVV || (ir->eI) == eiVVAK))) | |||
831 | { | |||
832 | rlist_fac = 1 + verlet_buffer_ratio_NVE_T0; | |||
833 | } | |||
834 | else | |||
835 | { | |||
836 | rlist_fac = 1 + verlet_buffer_ratio_nodynamics; | |||
837 | } | |||
838 | ir->verletbuf_tol = -1; | |||
839 | ir->rlist = rlist_fac*max(ir->rvdw, ir->rcoulomb)(((ir->rvdw) > (ir->rcoulomb)) ? (ir->rvdw) : (ir ->rcoulomb) ); | |||
840 | } | |||
841 | ||||
842 | gmx_mtop_remove_chargegroups(mtop); | |||
843 | } | |||
844 | ||||
845 | static void print_hw_opt(FILE *fp, const gmx_hw_opt_t *hw_opt) | |||
846 | { | |||
847 | fprintf(fp, "hw_opt: nt %d ntmpi %d ntomp %d ntomp_pme %d gpu_id '%s'\n", | |||
848 | hw_opt->nthreads_tot, | |||
849 | hw_opt->nthreads_tmpi, | |||
850 | hw_opt->nthreads_omp, | |||
851 | hw_opt->nthreads_omp_pme, | |||
852 | hw_opt->gpu_opt.gpu_id != NULL((void*)0) ? hw_opt->gpu_opt.gpu_id : ""); | |||
853 | } | |||
854 | ||||
855 | /* Checks we can do when we don't (yet) know the cut-off scheme */ | |||
856 | static void check_and_update_hw_opt_1(gmx_hw_opt_t *hw_opt, | |||
857 | gmx_bool bIsSimMaster) | |||
858 | { | |||
859 | gmx_omp_nthreads_read_env(&hw_opt->nthreads_omp, bIsSimMaster); | |||
860 | ||||
861 | #ifndef GMX_THREAD_MPI | |||
862 | if (hw_opt->nthreads_tot > 0) | |||
863 | { | |||
864 | gmx_fatal(FARGS0, "/home/alexxy/Develop/gromacs/src/programs/mdrun/runner.c" , 864, "Setting the total number of threads is only supported with thread-MPI and Gromacs was compiled without thread-MPI"); | |||
865 | } | |||
866 | if (hw_opt->nthreads_tmpi > 0) | |||
867 | { | |||
868 | gmx_fatal(FARGS0, "/home/alexxy/Develop/gromacs/src/programs/mdrun/runner.c" , 868, "Setting the number of thread-MPI threads is only supported with thread-MPI and Gromacs was compiled without thread-MPI"); | |||
869 | } | |||
870 | #endif | |||
871 | ||||
872 | #ifndef GMX_OPENMP | |||
873 | if (hw_opt->nthreads_omp > 1) | |||
874 | { | |||
875 | gmx_fatal(FARGS0, "/home/alexxy/Develop/gromacs/src/programs/mdrun/runner.c" , 875, "More than 1 OpenMP thread requested, but Gromacs was compiled without OpenMP support"); | |||
876 | } | |||
877 | hw_opt->nthreads_omp = 1; | |||
878 | #endif | |||
879 | ||||
880 | if (hw_opt->nthreads_tot > 0 && hw_opt->nthreads_omp_pme <= 0) | |||
881 | { | |||
882 | /* We have the same number of OpenMP threads for PP and PME processes, | |||
883 | * thus we can perform several consistency checks. | |||
884 | */ | |||
885 | if (hw_opt->nthreads_tmpi > 0 && | |||
886 | hw_opt->nthreads_omp > 0 && | |||
887 | hw_opt->nthreads_tot != hw_opt->nthreads_tmpi*hw_opt->nthreads_omp) | |||
888 | { | |||
889 | gmx_fatal(FARGS0, "/home/alexxy/Develop/gromacs/src/programs/mdrun/runner.c" , 889, "The total number of threads requested (%d) does not match the thread-MPI threads (%d) times the OpenMP threads (%d) requested", | |||
890 | hw_opt->nthreads_tot, hw_opt->nthreads_tmpi, hw_opt->nthreads_omp); | |||
891 | } | |||
892 | ||||
893 | if (hw_opt->nthreads_tmpi > 0 && | |||
894 | hw_opt->nthreads_tot % hw_opt->nthreads_tmpi != 0) | |||
895 | { | |||
896 | gmx_fatal(FARGS0, "/home/alexxy/Develop/gromacs/src/programs/mdrun/runner.c" , 896, "The total number of threads requested (%d) is not divisible by the number of thread-MPI threads requested (%d)", | |||
897 | hw_opt->nthreads_tot, hw_opt->nthreads_tmpi); | |||
898 | } | |||
899 | ||||
900 | if (hw_opt->nthreads_omp > 0 && | |||
901 | hw_opt->nthreads_tot % hw_opt->nthreads_omp != 0) | |||
902 | { | |||
903 | gmx_fatal(FARGS0, "/home/alexxy/Develop/gromacs/src/programs/mdrun/runner.c" , 903, "The total number of threads requested (%d) is not divisible by the number of OpenMP threads requested (%d)", | |||
904 | hw_opt->nthreads_tot, hw_opt->nthreads_omp); | |||
905 | } | |||
906 | ||||
907 | if (hw_opt->nthreads_tmpi > 0 && | |||
908 | hw_opt->nthreads_omp <= 0) | |||
909 | { | |||
910 | hw_opt->nthreads_omp = hw_opt->nthreads_tot/hw_opt->nthreads_tmpi; | |||
911 | } | |||
912 | } | |||
913 | ||||
914 | #ifndef GMX_OPENMP | |||
915 | if (hw_opt->nthreads_omp > 1) | |||
916 | { | |||
917 | gmx_fatal(FARGS0, "/home/alexxy/Develop/gromacs/src/programs/mdrun/runner.c" , 917, "OpenMP threads are requested, but Gromacs was compiled without OpenMP support"); | |||
918 | } | |||
919 | #endif | |||
920 | ||||
921 | if (hw_opt->nthreads_omp_pme > 0 && hw_opt->nthreads_omp <= 0) | |||
922 | { | |||
923 | gmx_fatal(FARGS0, "/home/alexxy/Develop/gromacs/src/programs/mdrun/runner.c" , 923, "You need to specify -ntomp in addition to -ntomp_pme"); | |||
924 | } | |||
925 | ||||
926 | if (hw_opt->nthreads_tot == 1) | |||
927 | { | |||
928 | hw_opt->nthreads_tmpi = 1; | |||
929 | ||||
930 | if (hw_opt->nthreads_omp > 1) | |||
931 | { | |||
932 | gmx_fatal(FARGS0, "/home/alexxy/Develop/gromacs/src/programs/mdrun/runner.c" , 932, "You requested %d OpenMP threads with %d total threads", | |||
933 | hw_opt->nthreads_tmpi, hw_opt->nthreads_tot); | |||
934 | } | |||
935 | hw_opt->nthreads_omp = 1; | |||
936 | } | |||
937 | ||||
938 | if (hw_opt->nthreads_omp_pme <= 0 && hw_opt->nthreads_omp > 0) | |||
939 | { | |||
940 | hw_opt->nthreads_omp_pme = hw_opt->nthreads_omp; | |||
941 | } | |||
942 | ||||
943 | /* Parse GPU IDs, if provided. | |||
944 | * We check consistency with the tMPI thread count later. | |||
945 | */ | |||
946 | gmx_parse_gpu_ids(&hw_opt->gpu_opt); | |||
947 | ||||
948 | #ifdef GMX_THREAD_MPI | |||
949 | if (hw_opt->gpu_opt.ncuda_dev_use > 0 && hw_opt->nthreads_tmpi == 0) | |||
950 | { | |||
951 | /* Set the number of MPI threads equal to the number of GPUs */ | |||
952 | hw_opt->nthreads_tmpi = hw_opt->gpu_opt.ncuda_dev_use; | |||
953 | ||||
954 | if (hw_opt->nthreads_tot > 0 && | |||
955 | hw_opt->nthreads_tmpi > hw_opt->nthreads_tot) | |||
956 | { | |||
957 | /* We have more GPUs than total threads requested. | |||
958 | * We choose to (later) generate a mismatch error, | |||
959 | * instead of launching more threads than requested. | |||
960 | */ | |||
961 | hw_opt->nthreads_tmpi = hw_opt->nthreads_tot; | |||
962 | } | |||
963 | } | |||
964 | #endif | |||
965 | ||||
966 | if (debug) | |||
967 | { | |||
968 | print_hw_opt(debug, hw_opt); | |||
969 | } | |||
970 | } | |||
971 | ||||
972 | /* Checks we can do when we know the cut-off scheme */ | |||
973 | static void check_and_update_hw_opt_2(gmx_hw_opt_t *hw_opt, | |||
974 | int cutoff_scheme) | |||
975 | { | |||
976 | if (cutoff_scheme == ecutsGROUP) | |||
977 | { | |||
978 | /* We only have OpenMP support for PME only nodes */ | |||
979 | if (hw_opt->nthreads_omp > 1) | |||
980 | { | |||
981 | gmx_fatal(FARGS0, "/home/alexxy/Develop/gromacs/src/programs/mdrun/runner.c" , 981, "OpenMP threads have been requested with cut-off scheme %s, but these are only supported with cut-off scheme %s", | |||
982 | ecutscheme_names[cutoff_scheme], | |||
983 | ecutscheme_names[ecutsVERLET]); | |||
984 | } | |||
985 | hw_opt->nthreads_omp = 1; | |||
986 | } | |||
987 | ||||
988 | if (hw_opt->nthreads_omp_pme <= 0 && hw_opt->nthreads_omp > 0) | |||
989 | { | |||
990 | hw_opt->nthreads_omp_pme = hw_opt->nthreads_omp; | |||
991 | } | |||
992 | ||||
993 | if (debug) | |||
994 | { | |||
995 | print_hw_opt(debug, hw_opt); | |||
996 | } | |||
997 | } | |||
998 | ||||
999 | ||||
1000 | /* Override the value in inputrec with value passed on the command line (if any) */ | |||
1001 | static void override_nsteps_cmdline(FILE *fplog, | |||
1002 | gmx_int64_t nsteps_cmdline, | |||
1003 | t_inputrec *ir, | |||
1004 | const t_commrec *cr) | |||
1005 | { | |||
1006 | char sbuf[STEPSTRSIZE22]; | |||
1007 | ||||
1008 | assert(ir)((void) (0)); | |||
1009 | assert(cr)((void) (0)); | |||
1010 | ||||
1011 | /* override with anything else than the default -2 */ | |||
1012 | if (nsteps_cmdline > -2) | |||
1013 | { | |||
1014 | char stmp[STRLEN4096]; | |||
1015 | ||||
1016 | ir->nsteps = nsteps_cmdline; | |||
1017 | if (EI_DYNAMICS(ir->eI)(((ir->eI) == eiMD || ((ir->eI) == eiVV || (ir->eI) == eiVVAK)) || ((ir->eI) == eiSD1 || (ir->eI) == eiSD2) || (ir->eI) == eiBD)) | |||
1018 | { | |||
1019 | sprintf(stmp, "Overriding nsteps with value passed on the command line: %s steps, %.3f ps", | |||
1020 | gmx_step_str(nsteps_cmdline, sbuf), | |||
1021 | nsteps_cmdline*ir->delta_t); | |||
1022 | } | |||
1023 | else | |||
1024 | { | |||
1025 | sprintf(stmp, "Overriding nsteps with value passed on the command line: %s steps", | |||
1026 | gmx_step_str(nsteps_cmdline, sbuf)); | |||
1027 | } | |||
1028 | ||||
1029 | md_print_warn(cr, fplog, "%s\n", stmp); | |||
1030 | } | |||
1031 | } | |||
1032 | ||||
1033 | /* Frees GPU memory and destroys the CUDA context. | |||
1034 | * | |||
1035 | * Note that this function needs to be called even if GPUs are not used | |||
1036 | * in this run because the PME ranks have no knowledge of whether GPUs | |||
1037 | * are used or not, but all ranks need to enter the barrier below. | |||
1038 | */ | |||
1039 | static void free_gpu_resources(const t_forcerec *fr, | |||
1040 | const t_commrec *cr) | |||
1041 | { | |||
1042 | gmx_bool bIsPPrankUsingGPU; | |||
1043 | char gpu_err_str[STRLEN4096]; | |||
1044 | ||||
1045 | bIsPPrankUsingGPU = (cr->duty & DUTY_PP(1<<0)) && fr->nbv != NULL((void*)0) && fr->nbv->bUseGPU; | |||
1046 | ||||
1047 | if (bIsPPrankUsingGPU) | |||
1048 | { | |||
1049 | /* free nbnxn data in GPU memory */ | |||
1050 | nbnxn_cuda_free(fr->nbv->cu_nbv); | |||
1051 | ||||
1052 | /* With tMPI we need to wait for all ranks to finish deallocation before | |||
1053 | * destroying the context in free_gpu() as some ranks may be sharing | |||
1054 | * GPU and context. | |||
1055 | * Note: as only PP ranks need to free GPU resources, so it is safe to | |||
1056 | * not call the barrier on PME ranks. | |||
1057 | */ | |||
1058 | #ifdef GMX_THREAD_MPI | |||
1059 | if (PAR(cr)((cr)->nnodes > 1)) | |||
1060 | { | |||
1061 | gmx_barrier(cr); | |||
1062 | } | |||
1063 | #endif /* GMX_THREAD_MPI */ | |||
1064 | ||||
1065 | /* uninitialize GPU (by destroying the context) */ | |||
1066 | if (!free_gpu(gpu_err_str)) | |||
1067 | { | |||
1068 | gmx_warning("On node %d failed to free GPU #%d: %s", | |||
1069 | cr->nodeid, get_current_gpu_device_id(), gpu_err_str); | |||
1070 | } | |||
1071 | } | |||
1072 | } | |||
1073 | ||||
1074 | int mdrunner(gmx_hw_opt_t *hw_opt, | |||
1075 | FILE *fplog, t_commrec *cr, int nfile, | |||
1076 | const t_filenm fnm[], const output_env_t oenv, gmx_bool bVerbose, | |||
1077 | gmx_bool bCompact, int nstglobalcomm, | |||
1078 | ivec ddxyz, int dd_node_order, real rdd, real rconstr, | |||
1079 | const char *dddlb_opt, real dlb_scale, | |||
1080 | const char *ddcsx, const char *ddcsy, const char *ddcsz, | |||
1081 | const char *nbpu_opt, int nstlist_cmdline, | |||
1082 | gmx_int64_t nsteps_cmdline, int nstepout, int resetstep, | |||
1083 | int gmx_unused__attribute__ ((unused)) nmultisim, int repl_ex_nst, int repl_ex_nex, | |||
1084 | int repl_ex_seed, real pforce, real cpt_period, real max_hours, | |||
1085 | const char *deviceOptions, int imdport, unsigned long Flags) | |||
1086 | { | |||
1087 | gmx_bool bForceUseGPU, bTryUseGPU; | |||
1088 | double nodetime = 0, realtime; | |||
1089 | t_inputrec *inputrec; | |||
1090 | t_state *state = NULL((void*)0); | |||
1091 | matrix box; | |||
1092 | gmx_ddbox_t ddbox = {0}; | |||
1093 | int npme_major, npme_minor; | |||
1094 | real tmpr1, tmpr2; | |||
1095 | t_nrnb *nrnb; | |||
1096 | gmx_mtop_t *mtop = NULL((void*)0); | |||
1097 | t_mdatoms *mdatoms = NULL((void*)0); | |||
1098 | t_forcerec *fr = NULL((void*)0); | |||
| ||||
1099 | t_fcdata *fcd = NULL((void*)0); | |||
1100 | real ewaldcoeff_q = 0; | |||
1101 | real ewaldcoeff_lj = 0; | |||
1102 | gmx_pme_t *pmedata = NULL((void*)0); | |||
1103 | gmx_vsite_t *vsite = NULL((void*)0); | |||
1104 | gmx_constr_t constr; | |||
1105 | int i, m, nChargePerturbed = -1, nTypePerturbed = 0, status, nalloc; | |||
1106 | char *gro; | |||
1107 | gmx_wallcycle_t wcycle; | |||
1108 | gmx_bool bReadEkin; | |||
1109 | int list; | |||
1110 | gmx_walltime_accounting_t walltime_accounting = NULL((void*)0); | |||
1111 | int rc; | |||
1112 | gmx_int64_t reset_counters; | |||
1113 | gmx_edsam_t ed = NULL((void*)0); | |||
1114 | t_commrec *cr_old = cr; | |||
1115 | int nthreads_pme = 1; | |||
1116 | int nthreads_pp = 1; | |||
1117 | gmx_membed_t membed = NULL((void*)0); | |||
1118 | gmx_hw_info_t *hwinfo = NULL((void*)0); | |||
1119 | /* The master rank decides early on bUseGPU and broadcasts this later */ | |||
1120 | gmx_bool bUseGPU = FALSE0; | |||
1121 | ||||
1122 | /* CAUTION: threads may be started later on in this function, so | |||
1123 | cr doesn't reflect the final parallel state right now */ | |||
1124 | snew(inputrec, 1)(inputrec) = save_calloc("inputrec", "/home/alexxy/Develop/gromacs/src/programs/mdrun/runner.c" , 1124, (1), sizeof(*(inputrec))); | |||
1125 | snew(mtop, 1)(mtop) = save_calloc("mtop", "/home/alexxy/Develop/gromacs/src/programs/mdrun/runner.c" , 1125, (1), sizeof(*(mtop))); | |||
1126 | ||||
1127 | if (Flags & MD_APPENDFILES(1<<15)) | |||
1128 | { | |||
1129 | fplog = NULL((void*)0); | |||
1130 | } | |||
1131 | ||||
1132 | bForceUseGPU = (strncmp(nbpu_opt, "gpu", 3)(__extension__ (__builtin_constant_p (3) && ((__builtin_constant_p (nbpu_opt) && strlen (nbpu_opt) < ((size_t) (3))) || (__builtin_constant_p ("gpu") && strlen ("gpu") < ((size_t) (3)))) ? __extension__ ({ size_t __s1_len, __s2_len ; (__builtin_constant_p (nbpu_opt) && __builtin_constant_p ("gpu") && (__s1_len = strlen (nbpu_opt), __s2_len = strlen ("gpu"), (!((size_t)(const void *)((nbpu_opt) + 1) - ( size_t)(const void *)(nbpu_opt) == 1) || __s1_len >= 4) && (!((size_t)(const void *)(("gpu") + 1) - (size_t)(const void *)("gpu") == 1) || __s2_len >= 4)) ? __builtin_strcmp (nbpu_opt , "gpu") : (__builtin_constant_p (nbpu_opt) && ((size_t )(const void *)((nbpu_opt) + 1) - (size_t)(const void *)(nbpu_opt ) == 1) && (__s1_len = strlen (nbpu_opt), __s1_len < 4) ? (__builtin_constant_p ("gpu") && ((size_t)(const void *)(("gpu") + 1) - (size_t)(const void *)("gpu") == 1) ? __builtin_strcmp (nbpu_opt, "gpu") : (__extension__ ({ const unsigned char *__s2 = (const unsigned char *) (const char *) ("gpu"); int __result = (((const unsigned char *) (const char *) (nbpu_opt))[0] - __s2[0]); if (__s1_len > 0 && __result == 0) { __result = (((const unsigned char *) (const char *) (nbpu_opt))[1] - __s2[1]); if (__s1_len > 1 && __result == 0) { __result = (((const unsigned char *) (const char *) (nbpu_opt))[2] - __s2[2]); if (__s1_len > 2 && __result == 0) __result = (((const unsigned char *) (const char *) (nbpu_opt))[3] - __s2[3]); } } __result; }))) : (__builtin_constant_p ("gpu") && ((size_t)(const void *)(("gpu") + 1) - (size_t )(const void *)("gpu") == 1) && (__s2_len = strlen ("gpu" ), __s2_len < 4) ? (__builtin_constant_p (nbpu_opt) && ((size_t)(const void *)((nbpu_opt) + 1) - (size_t)(const void *)(nbpu_opt) == 1) ? 
__builtin_strcmp (nbpu_opt, "gpu") : (- (__extension__ ({ const unsigned char *__s2 = (const unsigned char *) (const char *) (nbpu_opt); int __result = (((const unsigned char *) (const char *) ("gpu"))[0] - __s2[0]); if (__s2_len > 0 && __result == 0) { __result = (((const unsigned char *) (const char *) ("gpu"))[1] - __s2[1]); if (__s2_len > 1 && __result == 0) { __result = (((const unsigned char *) (const char *) ("gpu"))[2] - __s2[2]); if (__s2_len > 2 && __result == 0) __result = (((const unsigned char * ) (const char *) ("gpu"))[3] - __s2[3]); } } __result; })))) : __builtin_strcmp (nbpu_opt, "gpu")))); }) : strncmp (nbpu_opt , "gpu", 3))) == 0); | |||
1133 | bTryUseGPU = (strncmp(nbpu_opt, "auto", 4)(__extension__ (__builtin_constant_p (4) && ((__builtin_constant_p (nbpu_opt) && strlen (nbpu_opt) < ((size_t) (4))) || (__builtin_constant_p ("auto") && strlen ("auto") < ((size_t) (4)))) ? __extension__ ({ size_t __s1_len, __s2_len ; (__builtin_constant_p (nbpu_opt) && __builtin_constant_p ("auto") && (__s1_len = strlen (nbpu_opt), __s2_len = strlen ("auto"), (!((size_t)(const void *)((nbpu_opt) + 1) - (size_t)(const void *)(nbpu_opt) == 1) || __s1_len >= 4) && (!((size_t)(const void *)(("auto") + 1) - (size_t)(const void *)("auto") == 1) || __s2_len >= 4)) ? __builtin_strcmp (nbpu_opt , "auto") : (__builtin_constant_p (nbpu_opt) && ((size_t )(const void *)((nbpu_opt) + 1) - (size_t)(const void *)(nbpu_opt ) == 1) && (__s1_len = strlen (nbpu_opt), __s1_len < 4) ? (__builtin_constant_p ("auto") && ((size_t)(const void *)(("auto") + 1) - (size_t)(const void *)("auto") == 1) ? __builtin_strcmp (nbpu_opt, "auto") : (__extension__ ({ const unsigned char *__s2 = (const unsigned char *) (const char *) ("auto"); int __result = (((const unsigned char *) (const char *) (nbpu_opt))[0] - __s2[0]); if (__s1_len > 0 && __result == 0) { __result = (((const unsigned char *) (const char *) (nbpu_opt))[1] - __s2[1]); if (__s1_len > 1 && __result == 0) { __result = (((const unsigned char *) (const char *) (nbpu_opt))[2] - __s2[2]); if (__s1_len > 2 && __result == 0) __result = (((const unsigned char *) (const char *) (nbpu_opt))[3] - __s2[3]); } } __result; }))) : (__builtin_constant_p ("auto") && ((size_t)(const void *)(("auto") + 1) - ( size_t)(const void *)("auto") == 1) && (__s2_len = strlen ("auto"), __s2_len < 4) ? (__builtin_constant_p (nbpu_opt ) && ((size_t)(const void *)((nbpu_opt) + 1) - (size_t )(const void *)(nbpu_opt) == 1) ? 
__builtin_strcmp (nbpu_opt, "auto") : (- (__extension__ ({ const unsigned char *__s2 = ( const unsigned char *) (const char *) (nbpu_opt); int __result = (((const unsigned char *) (const char *) ("auto"))[0] - __s2 [0]); if (__s2_len > 0 && __result == 0) { __result = (((const unsigned char *) (const char *) ("auto"))[1] - __s2 [1]); if (__s2_len > 1 && __result == 0) { __result = (((const unsigned char *) (const char *) ("auto"))[2] - __s2 [2]); if (__s2_len > 2 && __result == 0) __result = (((const unsigned char *) (const char *) ("auto"))[3] - __s2 [3]); } } __result; })))) : __builtin_strcmp (nbpu_opt, "auto" )))); }) : strncmp (nbpu_opt, "auto", 4))) == 0) || bForceUseGPU; | |||
1134 | ||||
1135 | /* Detect hardware, gather information. This is an operation that is | |||
1136 | * global for this process (MPI rank). */ | |||
1137 | hwinfo = gmx_detect_hardware(fplog, cr, bTryUseGPU); | |||
1138 | ||||
1139 | ||||
1140 | snew(state, 1)(state) = save_calloc("state", "/home/alexxy/Develop/gromacs/src/programs/mdrun/runner.c" , 1140, (1), sizeof(*(state))); | |||
1141 | if (SIMMASTER(cr)(((((cr)->nodeid == 0) || !((cr)->nnodes > 1)) && ((cr)->duty & (1<<0))) || !((cr)->nnodes > 1))) | |||
1142 | { | |||
1143 | /* Read (nearly) all data required for the simulation */ | |||
1144 | read_tpx_state(ftp2fn(efTPX, nfile, fnm), inputrec, state, NULL((void*)0), mtop); | |||
1145 | ||||
1146 | if (inputrec->cutoff_scheme != ecutsVERLET && | |||
1147 | ((Flags & MD_TESTVERLET(1<<22)) || getenv("GMX_VERLET_SCHEME") != NULL((void*)0))) | |||
1148 | { | |||
1149 | convert_to_verlet_scheme(fplog, inputrec, mtop, det(state->box)); | |||
1150 | } | |||
1151 | ||||
1152 | if (inputrec->cutoff_scheme == ecutsVERLET) | |||
1153 | { | |||
1154 | /* Here the master rank decides if all ranks will use GPUs */ | |||
1155 | bUseGPU = (hwinfo->gpu_info.ncuda_dev_compatible > 0 || | |||
1156 | getenv("GMX_EMULATE_GPU") != NULL((void*)0)); | |||
1157 | ||||
1158 | /* TODO add GPU kernels for this and replace this check by: | |||
1159 | * (bUseGPU && (ir->vdwtype == evdwPME && | |||
1160 | * ir->ljpme_combination_rule == eljpmeLB)) | |||
1161 | * update the message text and the content of nbnxn_acceleration_supported. | |||
1162 | */ | |||
1163 | if (bUseGPU && | |||
1164 | !nbnxn_acceleration_supported(fplog, cr, inputrec, bUseGPU)) | |||
1165 | { | |||
1166 | /* Fallback message printed by nbnxn_acceleration_supported */ | |||
1167 | if (bForceUseGPU) | |||
1168 | { | |||
1169 | gmx_fatal(FARGS0, "/home/alexxy/Develop/gromacs/src/programs/mdrun/runner.c" , 1169, "GPU acceleration requested, but not supported with the given input settings"); | |||
1170 | } | |||
1171 | bUseGPU = FALSE0; | |||
1172 | } | |||
1173 | ||||
1174 | prepare_verlet_scheme(fplog, cr, | |||
1175 | inputrec, nstlist_cmdline, mtop, state->box, | |||
1176 | bUseGPU); | |||
1177 | } | |||
1178 | else | |||
1179 | { | |||
1180 | if (nstlist_cmdline > 0) | |||
1181 | { | |||
1182 | gmx_fatal(FARGS0, "/home/alexxy/Develop/gromacs/src/programs/mdrun/runner.c" , 1182, "Can not set nstlist with the group cut-off scheme"); | |||
1183 | } | |||
1184 | ||||
1185 | if (hwinfo->gpu_info.ncuda_dev_compatible > 0) | |||
1186 | { | |||
1187 | md_print_warn(cr, fplog, | |||
1188 | "NOTE: GPU(s) found, but the current simulation can not use GPUs\n" | |||
1189 | " To use a GPU, set the mdp option: cutoff-scheme = Verlet\n" | |||
1190 | " (for quick performance testing you can use the -testverlet option)\n"); | |||
1191 | } | |||
1192 | ||||
1193 | if (bForceUseGPU) | |||
1194 | { | |||
1195 | gmx_fatal(FARGS0, "/home/alexxy/Develop/gromacs/src/programs/mdrun/runner.c" , 1195, "GPU requested, but can't be used without cutoff-scheme=Verlet"); | |||
1196 | } | |||
1197 | ||||
1198 | #ifdef GMX_TARGET_BGQ | |||
1199 | md_print_warn(cr, fplog, | |||
1200 | "NOTE: There is no SIMD implementation of the group scheme kernels on\n" | |||
1201 | " BlueGene/Q. You will observe better performance from using the\n" | |||
1202 | " Verlet cut-off scheme.\n"); | |||
1203 | #endif | |||
1204 | } | |||
1205 | } | |||
1206 | ||||
1207 | /* Check and update the hardware options for internal consistency */ | |||
1208 | check_and_update_hw_opt_1(hw_opt, SIMMASTER(cr)(((((cr)->nodeid == 0) || !((cr)->nnodes > 1)) && ((cr)->duty & (1<<0))) || !((cr)->nnodes > 1))); | |||
1209 | ||||
1210 | /* Early check for externally set process affinity. */ | |||
1211 | gmx_check_thread_affinity_set(fplog, cr, | |||
1212 | hw_opt, hwinfo->nthreads_hw_avail, FALSE0); | |||
1213 | if (SIMMASTER(cr)(((((cr)->nodeid == 0) || !((cr)->nnodes > 1)) && ((cr)->duty & (1<<0))) || !((cr)->nnodes > 1))) | |||
1214 | { | |||
1215 | ||||
1216 | #ifdef GMX_THREAD_MPI | |||
1217 | if (cr->npmenodes > 0 && hw_opt->nthreads_tmpi <= 0) | |||
1218 | { | |||
1219 | gmx_fatal(FARGS0, "/home/alexxy/Develop/gromacs/src/programs/mdrun/runner.c" , 1219, "You need to explicitly specify the number of MPI threads (-ntmpi) when using separate PME nodes"); | |||
1220 | } | |||
1221 | #endif | |||
1222 | ||||
1223 | if (hw_opt->nthreads_omp_pme != hw_opt->nthreads_omp && | |||
1224 | cr->npmenodes <= 0) | |||
1225 | { | |||
1226 | gmx_fatal(FARGS0, "/home/alexxy/Develop/gromacs/src/programs/mdrun/runner.c" , 1226, "You need to explicitly specify the number of PME nodes (-npme) when using different number of OpenMP threads for PP and PME nodes"); | |||
1227 | } | |||
1228 | } | |||
1229 | ||||
1230 | #ifdef GMX_THREAD_MPI | |||
1231 | if (SIMMASTER(cr)(((((cr)->nodeid == 0) || !((cr)->nnodes > 1)) && ((cr)->duty & (1<<0))) || !((cr)->nnodes > 1))) | |||
1232 | { | |||
1233 | /* Since the master knows the cut-off scheme, update hw_opt for this. | |||
1234 | * This is done later for normal MPI and also once more with tMPI | |||
1235 | * for all tMPI ranks. | |||
1236 | */ | |||
1237 | check_and_update_hw_opt_2(hw_opt, inputrec->cutoff_scheme); | |||
1238 | ||||
1239 | /* NOW the threads will be started: */ | |||
1240 | hw_opt->nthreads_tmpi = get_nthreads_mpi(hwinfo, | |||
1241 | hw_opt, | |||
1242 | inputrec, mtop, | |||
1243 | cr, fplog); | |||
1244 | if (hw_opt->nthreads_tot > 0 && hw_opt->nthreads_omp <= 0) | |||
1245 | { | |||
1246 | hw_opt->nthreads_omp = hw_opt->nthreads_tot/hw_opt->nthreads_tmpi; | |||
1247 | } | |||
1248 | ||||
1249 | if (hw_opt->nthreads_tmpi > 1) | |||
1250 | { | |||
1251 | /* now start the threads. */ | |||
1252 | cr = mdrunner_start_threads(hw_opt, fplog, cr_old, nfile, fnm, | |||
1253 | oenv, bVerbose, bCompact, nstglobalcomm, | |||
1254 | ddxyz, dd_node_order, rdd, rconstr, | |||
1255 | dddlb_opt, dlb_scale, ddcsx, ddcsy, ddcsz, | |||
1256 | nbpu_opt, nstlist_cmdline, | |||
1257 | nsteps_cmdline, nstepout, resetstep, nmultisim, | |||
1258 | repl_ex_nst, repl_ex_nex, repl_ex_seed, pforce, | |||
1259 | cpt_period, max_hours, deviceOptions, | |||
1260 | Flags); | |||
1261 | /* the main thread continues here with a new cr. We don't deallocate | |||
1262 | the old cr because other threads may still be reading it. */ | |||
1263 | if (cr == NULL((void*)0)) | |||
1264 | { | |||
1265 | gmx_comm("Failed to spawn threads")_gmx_error("comm", "Failed to spawn threads", "/home/alexxy/Develop/gromacs/src/programs/mdrun/runner.c" , 1265); | |||
1266 | } | |||
1267 | } | |||
1268 | } | |||
1269 | #endif | |||
1270 | /* END OF CAUTION: cr is now reliable */ | |||
1271 | ||||
1272 | /* g_membed initialisation * | |||
1273 | * Because we change the mtop, init_membed is called before the init_parallel * | |||
1274 | * (in case we ever want to make it run in parallel) */ | |||
1275 | if (opt2bSet("-membed", nfile, fnm)) | |||
1276 | { | |||
1277 | if (MASTER(cr)(((cr)->nodeid == 0) || !((cr)->nnodes > 1))) | |||
1278 | { | |||
1279 | fprintf(stderrstderr, "Initializing membed"); | |||
1280 | } | |||
1281 | membed = init_membed(fplog, nfile, fnm, mtop, inputrec, state, cr, &cpt_period); | |||
1282 | } | |||
1283 | ||||
1284 | if (PAR(cr)((cr)->nnodes > 1)) | |||
1285 | { | |||
1286 | /* now broadcast everything to the non-master nodes/threads: */ | |||
1287 | init_parallel(cr, inputrec, mtop); | |||
1288 | } | |||
1289 | if (fplog != NULL((void*)0)) | |||
1290 | { | |||
1291 | pr_inputrec(fplog, 0, "Input Parameters", inputrec, FALSE0); | |||
1292 | } | |||
1293 | ||||
1294 | /* now make sure the state is initialized and propagated */ | |||
1295 | set_state_entries(state, inputrec); | |||
1296 | ||||
1297 | /* A parallel command line option consistency check that we can | |||
1298 | only do after any threads have started. */ | |||
1299 | if (!PAR(cr)((cr)->nnodes > 1) && | |||
1300 | (ddxyz[XX0] > 1 || ddxyz[YY1] > 1 || ddxyz[ZZ2] > 1 || cr->npmenodes > 0)) | |||
1301 | { | |||
1302 | gmx_fatal(FARGS0, "/home/alexxy/Develop/gromacs/src/programs/mdrun/runner.c" , 1302, | |||
1303 | "The -dd or -npme option request a parallel simulation, " | |||
1304 | #ifndef GMX_MPI | |||
1305 | "but %s was compiled without threads or MPI enabled" | |||
1306 | #else | |||
1307 | #ifdef GMX_THREAD_MPI | |||
1308 | "but the number of threads (option -nt) is 1" | |||
1309 | #else | |||
1310 | "but %s was not started through mpirun/mpiexec or only one process was requested through mpirun/mpiexec" | |||
1311 | #endif | |||
1312 | #endif | |||
1313 | , ShortProgram() | |||
1314 | ); | |||
1315 | } | |||
1316 | ||||
1317 | if ((Flags & MD_RERUN(1<<4)) && | |||
1318 | (EI_ENERGY_MINIMIZATION(inputrec->eI)((inputrec->eI) == eiSteep || (inputrec->eI) == eiCG || (inputrec->eI) == eiLBFGS) || eiNM == inputrec->eI)) | |||
1319 | { | |||
1320 | gmx_fatal(FARGS0, "/home/alexxy/Develop/gromacs/src/programs/mdrun/runner.c" , 1320, "The .mdp file specified an energy mininization or normal mode algorithm, and these are not compatible with mdrun -rerun"); | |||
1321 | } | |||
1322 | ||||
1323 | if (can_use_allvsall(inputrec, TRUE1, cr, fplog) && DOMAINDECOMP(cr)(((cr)->dd != ((void*)0)) && ((cr)->nnodes > 1))) | |||
1324 | { | |||
1325 | gmx_fatal(FARGS0, "/home/alexxy/Develop/gromacs/src/programs/mdrun/runner.c" , 1325, "All-vs-all loops do not work with domain decomposition, use a single MPI rank"); | |||
1326 | } | |||
1327 | ||||
1328 | if (!(EEL_PME(inputrec->coulombtype)((inputrec->coulombtype) == eelPME || (inputrec->coulombtype ) == eelPMESWITCH || (inputrec->coulombtype) == eelPMEUSER || (inputrec->coulombtype) == eelPMEUSERSWITCH || (inputrec ->coulombtype) == eelP3M_AD) || EVDW_PME(inputrec->vdwtype)((inputrec->vdwtype) == evdwPME))) | |||
1329 | { | |||
1330 | if (cr->npmenodes > 0) | |||
1331 | { | |||
1332 | gmx_fatal_collective(FARGS0, "/home/alexxy/Develop/gromacs/src/programs/mdrun/runner.c" , 1332, cr, NULL((void*)0), | |||
1333 | "PME nodes are requested, but the system does not use PME electrostatics or LJ-PME"); | |||
1334 | } | |||
1335 | ||||
1336 | cr->npmenodes = 0; | |||
1337 | } | |||
1338 | ||||
1339 | #ifdef GMX_FAHCORE | |||
1340 | if (MASTER(cr)(((cr)->nodeid == 0) || !((cr)->nnodes > 1))) | |||
1341 | { | |||
1342 | fcRegisterSteps(inputrec->nsteps, inputrec->init_step); | |||
1343 | } | |||
1344 | #endif | |||
1345 | ||||
1346 | /* NMR restraints must be initialized before load_checkpoint, | |||
1347 | * since with time averaging the history is added to t_state. | |||
1348 | * For proper consistency check we therefore need to extend | |||
1349 | * t_state here. | |||
1350 | * So the PME-only nodes (if present) will also initialize | |||
1351 | * the distance restraints. | |||
1352 | */ | |||
1353 | snew(fcd, 1)(fcd) = save_calloc("fcd", "/home/alexxy/Develop/gromacs/src/programs/mdrun/runner.c" , 1353, (1), sizeof(*(fcd))); | |||
1354 | ||||
1355 | /* This needs to be called before read_checkpoint to extend the state */ | |||
1356 | init_disres(fplog, mtop, inputrec, cr, fcd, state, repl_ex_nst > 0); | |||
1357 | ||||
1358 | init_orires(fplog, mtop, state->x, inputrec, cr, &(fcd->orires), | |||
1359 | state); | |||
1360 | ||||
1361 | if (DEFORM(*inputrec)((*inputrec).deform[0][0] != 0 || (*inputrec).deform[1][1] != 0 || (*inputrec).deform[2][2] != 0 || (*inputrec).deform[1][ 0] != 0 || (*inputrec).deform[2][0] != 0 || (*inputrec).deform [2][1] != 0)) | |||
1362 | { | |||
1363 | /* Store the deform reference box before reading the checkpoint */ | |||
1364 | if (SIMMASTER(cr)(((((cr)->nodeid == 0) || !((cr)->nnodes > 1)) && ((cr)->duty & (1<<0))) || !((cr)->nnodes > 1))) | |||
1365 | { | |||
1366 | copy_mat(state->box, box); | |||
1367 | } | |||
1368 | if (PAR(cr)((cr)->nnodes > 1)) | |||
1369 | { | |||
1370 | gmx_bcast(sizeof(box), box, cr); | |||
1371 | } | |||
1372 | /* Because we do not have the update struct available yet | |||
1373 | * in which the reference values should be stored, | |||
1374 | * we store them temporarily in static variables. | |||
1375 | * This should be thread safe, since they are only written once | |||
1376 | * and with identical values. | |||
1377 | */ | |||
1378 | tMPI_Thread_mutex_lock(&deform_init_box_mutex); | |||
1379 | deform_init_init_step_tpx = inputrec->init_step; | |||
1380 | copy_mat(box, deform_init_box_tpx); | |||
1381 | tMPI_Thread_mutex_unlock(&deform_init_box_mutex); | |||
1382 | } | |||
1383 | ||||
1384 | if (opt2bSet("-cpi", nfile, fnm)) | |||
1385 | { | |||
1386 | /* Check if checkpoint file exists before doing continuation. | |||
1387 | * This way we can use identical input options for the first and subsequent runs... | |||
1388 | */ | |||
1389 | if (gmx_fexist_master(opt2fn_master("-cpi", nfile, fnm, cr), cr) ) | |||
1390 | { | |||
1391 | load_checkpoint(opt2fn_master("-cpi", nfile, fnm, cr), &fplog, | |||
1392 | cr, ddxyz, | |||
1393 | inputrec, state, &bReadEkin, | |||
1394 | (Flags & MD_APPENDFILES(1<<15)), | |||
1395 | (Flags & MD_APPENDFILESSET(1<<21))); | |||
1396 | ||||
1397 | if (bReadEkin) | |||
1398 | { | |||
1399 | Flags |= MD_READ_EKIN(1<<17); | |||
1400 | } | |||
1401 | } | |||
1402 | } | |||
1403 | ||||
1404 | if (((MASTER(cr)(((cr)->nodeid == 0) || !((cr)->nnodes > 1)) || (Flags & MD_SEPPOT(1<<7))) && (Flags & MD_APPENDFILES(1<<15))) | |||
1405 | #ifdef GMX_THREAD_MPI | |||
1406 | /* With thread MPI only the master node/thread exists in mdrun.c, | |||
1407 | * therefore non-master nodes need to open the "seppot" log file here. | |||
1408 | */ | |||
1409 | || (!MASTER(cr)(((cr)->nodeid == 0) || !((cr)->nnodes > 1)) && (Flags & MD_SEPPOT(1<<7))) | |||
1410 | #endif | |||
1411 | ) | |||
1412 | { | |||
1413 | gmx_log_open(ftp2fn(efLOG, nfile, fnm), cr, !(Flags & MD_SEPPOT(1<<7)), | |||
1414 | Flags, &fplog); | |||
1415 | } | |||
1416 | ||||
1417 | /* override nsteps with value from cmdline */ | |||
1418 | override_nsteps_cmdline(fplog, nsteps_cmdline, inputrec, cr); | |||
1419 | ||||
1420 | if (SIMMASTER(cr)(((((cr)->nodeid == 0) || !((cr)->nnodes > 1)) && ((cr)->duty & (1<<0))) || !((cr)->nnodes > 1))) | |||
1421 | { | |||
1422 | copy_mat(state->box, box); | |||
1423 | } | |||
1424 | ||||
1425 | if (PAR(cr)((cr)->nnodes > 1)) | |||
1426 | { | |||
1427 | gmx_bcast(sizeof(box), box, cr); | |||
1428 | } | |||
1429 | ||||
1430 | /* Essential dynamics */ | |||
1431 | if (opt2bSet("-ei", nfile, fnm)) | |||
1432 | { | |||
1433 | /* Open input and output files, allocate space for ED data structure */ | |||
1434 | ed = ed_open(mtop->natoms, &state->edsamstate, nfile, fnm, Flags, oenv, cr); | |||
1435 | } | |||
1436 | ||||
1437 | if (PAR(cr)((cr)->nnodes > 1) && !(EI_TPI(inputrec->eI)((inputrec->eI) == eiTPI || (inputrec->eI) == eiTPIC) || | |||
1438 | inputrec->eI == eiNM)) | |||
1439 | { | |||
1440 | cr->dd = init_domain_decomposition(fplog, cr, Flags, ddxyz, rdd, rconstr, | |||
1441 | dddlb_opt, dlb_scale, | |||
1442 | ddcsx, ddcsy, ddcsz, | |||
1443 | mtop, inputrec, | |||
1444 | box, state->x, | |||
1445 | &ddbox, &npme_major, &npme_minor); | |||
1446 | ||||
1447 | make_dd_communicators(fplog, cr, dd_node_order); | |||
1448 | ||||
1449 | /* Set overallocation to avoid frequent reallocation of arrays */ | |||
1450 | set_over_alloc_dd(TRUE1); | |||
1451 | } | |||
1452 | else | |||
1453 | { | |||
1454 | /* PME, if used, is done on all nodes with 1D decomposition */ | |||
1455 | cr->npmenodes = 0; | |||
1456 | cr->duty = (DUTY_PP(1<<0) | DUTY_PME(1<<1)); | |||
1457 | npme_major = 1; | |||
1458 | npme_minor = 1; | |||
1459 | ||||
1460 | if (inputrec->ePBC == epbcSCREW) | |||
1461 | { | |||
1462 | gmx_fatal(FARGS0, "/home/alexxy/Develop/gromacs/src/programs/mdrun/runner.c" , 1462, | |||
1463 | "pbc=%s is only implemented with domain decomposition", | |||
1464 | epbc_names[inputrec->ePBC]); | |||
1465 | } | |||
1466 | } | |||
1467 | ||||
1468 | if (PAR(cr)((cr)->nnodes > 1)) | |||
1469 | { | |||
1470 | /* After possible communicator splitting in make_dd_communicators. | |||
1471 | * we can set up the intra/inter node communication. | |||
1472 | */ | |||
1473 | gmx_setup_nodecomm(fplog, cr); | |||
1474 | } | |||
1475 | ||||
1476 | /* Initialize per-physical-node MPI process/thread ID and counters. */ | |||
1477 | gmx_init_intranode_counters(cr); | |||
1478 | ||||
1479 | #ifdef GMX_MPI | |||
1480 | md_print_info(cr, fplog, "Using %d MPI %s\n", | |||
1481 | cr->nnodes, | |||
1482 | #ifdef GMX_THREAD_MPI | |||
1483 | cr->nnodes == 1 ? "thread" : "threads" | |||
1484 | #else | |||
1485 | cr->nnodes == 1 ? "process" : "processes" | |||
1486 | #endif | |||
1487 | ); | |||
1488 | fflush(stderrstderr); | |||
1489 | #endif | |||
1490 | ||||
1491 | /* Check and update hw_opt for the cut-off scheme */ | |||
1492 | check_and_update_hw_opt_2(hw_opt, inputrec->cutoff_scheme); | |||
1493 | ||||
1494 | gmx_omp_nthreads_init(fplog, cr, | |||
1495 | hwinfo->nthreads_hw_avail, | |||
1496 | hw_opt->nthreads_omp, | |||
1497 | hw_opt->nthreads_omp_pme, | |||
1498 | (cr->duty & DUTY_PP(1<<0)) == 0, | |||
1499 | inputrec->cutoff_scheme == ecutsVERLET); | |||
1500 | ||||
1501 | if (PAR(cr)((cr)->nnodes > 1)) | |||
1502 | { | |||
1503 | /* The master rank decided on the use of GPUs, | |||
1504 | * broadcast this information to all ranks. | |||
1505 | */ | |||
1506 | gmx_bcast_sim(sizeof(bUseGPU), &bUseGPU, cr); | |||
1507 | } | |||
1508 | ||||
1509 | if (bUseGPU) | |||
1510 | { | |||
1511 | if (cr->npmenodes == -1) | |||
1512 | { | |||
1513 | /* Don't automatically use PME-only nodes with GPUs */ | |||
1514 | cr->npmenodes = 0; | |||
1515 | } | |||
1516 | ||||
1517 | /* Select GPU id's to use */ | |||
1518 | gmx_select_gpu_ids(fplog, cr, &hwinfo->gpu_info, bForceUseGPU, | |||
1519 | &hw_opt->gpu_opt); | |||
1520 | } | |||
1521 | else | |||
1522 | { | |||
1523 | /* Ignore (potentially) manually selected GPUs */ | |||
1524 | hw_opt->gpu_opt.ncuda_dev_use = 0; | |||
1525 | } | |||
1526 | ||||
1527 | /* check consistency across ranks of things like SIMD | |||
1528 | * support and number of GPUs selected */ | |||
1529 | gmx_check_hw_runconf_consistency(fplog, hwinfo, cr, hw_opt, bUseGPU); | |||
1530 | ||||
1531 | if (DOMAINDECOMP(cr)(((cr)->dd != ((void*)0)) && ((cr)->nnodes > 1))) | |||
1532 | { | |||
1533 | /* When we share GPUs over ranks, we need to know this for the DLB */ | |||
1534 | dd_setup_dlb_resource_sharing(cr, hwinfo, hw_opt); | |||
1535 | } | |||
1536 | ||||
1537 | /* getting number of PP/PME threads | |||
1538 | PME: env variable should be read only on one node to make sure it is | |||
1539 | identical everywhere; | |||
1540 | */ | |||
1541 | /* TODO nthreads_pp is only used for pinning threads. | |||
1542 | * This is a temporary solution until we have a hw topology library. | |||
1543 | */ | |||
1544 | nthreads_pp = gmx_omp_nthreads_get(emntNonbonded); | |||
1545 | nthreads_pme = gmx_omp_nthreads_get(emntPME); | |||
1546 | ||||
1547 | wcycle = wallcycle_init(fplog, resetstep, cr, nthreads_pp, nthreads_pme); | |||
1548 | ||||
1549 | if (PAR(cr)((cr)->nnodes > 1)) | |||
1550 | { | |||
1551 | /* Master synchronizes its value of reset_counters with all nodes | |||
1552 | * including PME only nodes */ | |||
1553 | reset_counters = wcycle_get_reset_counters(wcycle); | |||
1554 | gmx_bcast_sim(sizeof(reset_counters), &reset_counters, cr); | |||
1555 | wcycle_set_reset_counters(wcycle, reset_counters); | |||
1556 | } | |||
1557 | ||||
1558 | snew(nrnb, 1)(nrnb) = save_calloc("nrnb", "/home/alexxy/Develop/gromacs/src/programs/mdrun/runner.c" , 1558, (1), sizeof(*(nrnb))); | |||
1559 | if (cr->duty & DUTY_PP(1<<0)) | |||
1560 | { | |||
1561 | bcast_state(cr, state); | |||
1562 | ||||
1563 | /* Initiate forcerecord */ | |||
1564 | fr = mk_forcerec(); | |||
1565 | fr->hwinfo = hwinfo; | |||
1566 | fr->gpu_opt = &hw_opt->gpu_opt; | |||
1567 | init_forcerec(fplog, oenv, fr, fcd, inputrec, mtop, cr, box, | |||
1568 | opt2fn("-table", nfile, fnm), | |||
1569 | opt2fn("-tabletf", nfile, fnm), | |||
1570 | opt2fn("-tablep", nfile, fnm), | |||
1571 | opt2fn("-tableb", nfile, fnm), | |||
1572 | nbpu_opt, | |||
1573 | FALSE0, | |||
1574 | pforce); | |||
1575 | ||||
1576 | /* version for PCA_NOT_READ_NODE (see md.c) */ | |||
1577 | /*init_forcerec(fplog,fr,fcd,inputrec,mtop,cr,box,FALSE, | |||
1578 | "nofile","nofile","nofile","nofile",FALSE,pforce); | |||
1579 | */ | |||
1580 | fr->bSepDVDL = ((Flags & MD_SEPPOT(1<<7)) == MD_SEPPOT(1<<7)); | |||
1581 | ||||
1582 | /* Initialize QM-MM */ | |||
1583 | if (fr->bQMMM) | |||
1584 | { | |||
1585 | init_QMMMrec(cr, mtop, inputrec, fr); | |||
1586 | } | |||
1587 | ||||
1588 | /* Initialize the mdatoms structure. | |||
1589 | * mdatoms is not filled with atom data, | |||
1590 | * as this can not be done now with domain decomposition. | |||
1591 | */ | |||
1592 | mdatoms = init_mdatoms(fplog, mtop, inputrec->efep != efepNO); | |||
1593 | ||||
1594 | /* Initialize the virtual site communication */ | |||
1595 | vsite = init_vsite(mtop, cr, FALSE0); | |||
1596 | ||||
1597 | calc_shifts(box, fr->shift_vec); | |||
1598 | ||||
1599 | /* With periodic molecules the charge groups should be whole at start up | |||
1600 | * and the virtual sites should not be far from their proper positions. | |||
1601 | */ | |||
1602 | if (!inputrec->bContinuation && MASTER(cr)(((cr)->nodeid == 0) || !((cr)->nnodes > 1)) && | |||
1603 | !(inputrec->ePBC != epbcNONE && inputrec->bPeriodicMols)) | |||
1604 | { | |||
1605 | /* Make molecules whole at start of run */ | |||
1606 | if (fr->ePBC != epbcNONE) | |||
1607 | { | |||
1608 | do_pbc_first_mtop(fplog, inputrec->ePBC, box, mtop, state->x); | |||
1609 | } | |||
1610 | if (vsite) | |||
1611 | { | |||
1612 | /* Correct initial vsite positions are required | |||
1613 | * for the initial distribution in the domain decomposition | |||
1614 | * and for the initial shell prediction. | |||
1615 | */ | |||
1616 | construct_vsites_mtop(vsite, mtop, state->x); | |||
1617 | } | |||
1618 | } | |||
1619 | ||||
1620 | if (EEL_PME(fr->eeltype)((fr->eeltype) == eelPME || (fr->eeltype) == eelPMESWITCH || (fr->eeltype) == eelPMEUSER || (fr->eeltype) == eelPMEUSERSWITCH || (fr->eeltype) == eelP3M_AD) || EVDW_PME(fr->vdwtype)((fr->vdwtype) == evdwPME)) | |||
1621 | { | |||
1622 | ewaldcoeff_q = fr->ewaldcoeff_q; | |||
1623 | ewaldcoeff_lj = fr->ewaldcoeff_lj; | |||
1624 | pmedata = &fr->pmedata; | |||
1625 | } | |||
1626 | else | |||
1627 | { | |||
1628 | pmedata = NULL((void*)0); | |||
1629 | } | |||
1630 | } | |||
1631 | else | |||
1632 | { | |||
1633 | /* This is a PME only node */ | |||
1634 | ||||
1635 | /* We don't need the state */ | |||
1636 | done_state(state); | |||
1637 | ||||
1638 | ewaldcoeff_q = calc_ewaldcoeff_q(inputrec->rcoulomb, inputrec->ewald_rtol); | |||
1639 | ewaldcoeff_lj = calc_ewaldcoeff_lj(inputrec->rvdw, inputrec->ewald_rtol_lj); | |||
1640 | snew(pmedata, 1)(pmedata) = save_calloc("pmedata", "/home/alexxy/Develop/gromacs/src/programs/mdrun/runner.c" , 1640, (1), sizeof(*(pmedata))); | |||
1641 | } | |||
1642 | ||||
1643 | if (hw_opt->thread_affinity != threadaffOFF) | |||
1644 | { | |||
1645 | /* Before setting affinity, check whether the affinity has changed | |||
1646 | * - which indicates that probably the OpenMP library has changed it | |||
1647 | * since we first checked). | |||
1648 | */ | |||
1649 | gmx_check_thread_affinity_set(fplog, cr, | |||
1650 | hw_opt, hwinfo->nthreads_hw_avail, TRUE1); | |||
1651 | ||||
1652 | /* Set the CPU affinity */ | |||
1653 | gmx_set_thread_affinity(fplog, cr, hw_opt, hwinfo); | |||
1654 | } | |||
1655 | ||||
1656 | /* Initiate PME if necessary, | |||
1657 | * either on all nodes or on dedicated PME nodes only. */ | |||
1658 | if (EEL_PME(inputrec->coulombtype)((inputrec->coulombtype) == eelPME || (inputrec->coulombtype ) == eelPMESWITCH || (inputrec->coulombtype) == eelPMEUSER || (inputrec->coulombtype) == eelPMEUSERSWITCH || (inputrec ->coulombtype) == eelP3M_AD) || EVDW_PME(inputrec->vdwtype)((inputrec->vdwtype) == evdwPME)) | |||
1659 | { | |||
1660 | if (mdatoms) | |||
1661 | { | |||
1662 | nChargePerturbed = mdatoms->nChargePerturbed; | |||
1663 | if (EVDW_PME(inputrec->vdwtype)((inputrec->vdwtype) == evdwPME)) | |||
1664 | { | |||
1665 | nTypePerturbed = mdatoms->nTypePerturbed; | |||
1666 | } | |||
1667 | } | |||
1668 | if (cr->npmenodes > 0) | |||
1669 | { | |||
1670 | /* The PME only nodes need to know nChargePerturbed(FEP on Q) and nTypePerturbed(FEP on LJ)*/ | |||
1671 | gmx_bcast_sim(sizeof(nChargePerturbed), &nChargePerturbed, cr); | |||
1672 | gmx_bcast_sim(sizeof(nTypePerturbed), &nTypePerturbed, cr); | |||
1673 | } | |||
1674 | ||||
1675 | if (cr->duty & DUTY_PME(1<<1)) | |||
1676 | { | |||
1677 | status = gmx_pme_init(pmedata, cr, npme_major, npme_minor, inputrec, | |||
1678 | mtop ? mtop->natoms : 0, nChargePerturbed, nTypePerturbed, | |||
1679 | (Flags & MD_REPRODUCIBLE(1<<13)), nthreads_pme); | |||
1680 | if (status != 0) | |||
1681 | { | |||
1682 | gmx_fatal(FARGS0, "/home/alexxy/Develop/gromacs/src/programs/mdrun/runner.c" , 1682, "Error %d initializing PME", status); | |||
1683 | } | |||
1684 | } | |||
1685 | } | |||
1686 | ||||
1687 | ||||
1688 | if (integrator[inputrec->eI].func == do_md) | |||
1689 | { | |||
1690 | /* Turn on signal handling on all nodes */ | |||
1691 | /* | |||
1692 | * (A user signal from the PME nodes (if any) | |||
1693 | * is communicated to the PP nodes. | |||
1694 | */ | |||
1695 | signal_handler_install(); | |||
1696 | } | |||
1697 | ||||
1698 | if (cr->duty & DUTY_PP(1<<0)) | |||
1699 | { | |||
1700 | /* Assumes uniform use of the number of OpenMP threads */ | |||
1701 | walltime_accounting = walltime_accounting_init(gmx_omp_nthreads_get(emntDefault)); | |||
1702 | ||||
1703 | if (inputrec->ePull != epullNO) | |||
1704 | { | |||
1705 | /* Initialize pull code */ | |||
1706 | init_pull(fplog, inputrec, nfile, fnm, mtop, cr, oenv, inputrec->fepvals->init_lambda, | |||
1707 | EI_DYNAMICS(inputrec->eI)(((inputrec->eI) == eiMD || ((inputrec->eI) == eiVV || ( inputrec->eI) == eiVVAK)) || ((inputrec->eI) == eiSD1 || (inputrec->eI) == eiSD2) || (inputrec->eI) == eiBD) && MASTER(cr)(((cr)->nodeid == 0) || !((cr)->nnodes > 1)), Flags); | |||
1708 | } | |||
1709 | ||||
1710 | if (inputrec->bRot) | |||
1711 | { | |||
1712 | /* Initialize enforced rotation code */ | |||
1713 | init_rot(fplog, inputrec, nfile, fnm, cr, state->x, box, mtop, oenv, | |||
1714 | bVerbose, Flags); | |||
1715 | } | |||
1716 | ||||
1717 | if (inputrec->eSwapCoords != eswapNO) | |||
1718 | { | |||
1719 | /* Initialize ion swapping code */ | |||
1720 | init_swapcoords(fplog, bVerbose, inputrec, opt2fn_master("-swap", nfile, fnm, cr), | |||
1721 | mtop, state->x, state->box, &state->swapstate, cr, oenv, Flags); | |||
1722 | } | |||
1723 | ||||
1724 | constr = init_constraints(fplog, mtop, inputrec, ed, state, cr); | |||
1725 | ||||
1726 | if (DOMAINDECOMP(cr)(((cr)->dd != ((void*)0)) && ((cr)->nnodes > 1))) | |||
1727 | { | |||
1728 | dd_init_bondeds(fplog, cr->dd, mtop, vsite, inputrec, | |||
1729 | Flags & MD_DDBONDCHECK(1<<10), fr->cginfo_mb); | |||
| ||||
1730 | ||||
1731 | set_dd_parameters(fplog, cr->dd, dlb_scale, inputrec, &ddbox); | |||
1732 | ||||
1733 | setup_dd_grid(fplog, cr->dd); | |||
1734 | } | |||
1735 | ||||
1736 | /* Now do whatever the user wants us to do (how flexible...) */ | |||
1737 | integrator[inputrec->eI].func(fplog, cr, nfile, fnm, | |||
1738 | oenv, bVerbose, bCompact, | |||
1739 | nstglobalcomm, | |||
1740 | vsite, constr, | |||
1741 | nstepout, inputrec, mtop, | |||
1742 | fcd, state, | |||
1743 | mdatoms, nrnb, wcycle, ed, fr, | |||
1744 | repl_ex_nst, repl_ex_nex, repl_ex_seed, | |||
1745 | membed, | |||
1746 | cpt_period, max_hours, | |||
1747 | deviceOptions, | |||
1748 | imdport, | |||
1749 | Flags, | |||
1750 | walltime_accounting); | |||
1751 | ||||
1752 | if (inputrec->ePull != epullNO) | |||
1753 | { | |||
1754 | finish_pull(inputrec->pull); | |||
1755 | } | |||
1756 | ||||
1757 | if (inputrec->bRot) | |||
1758 | { | |||
1759 | finish_rot(inputrec->rot); | |||
1760 | } | |||
1761 | ||||
1762 | } | |||
1763 | else | |||
1764 | { | |||
1765 | /* do PME only */ | |||
1766 | walltime_accounting = walltime_accounting_init(gmx_omp_nthreads_get(emntPME)); | |||
1767 | gmx_pmeonly(*pmedata, cr, nrnb, wcycle, walltime_accounting, ewaldcoeff_q, ewaldcoeff_lj, inputrec); | |||
1768 | } | |||
1769 | ||||
1770 | wallcycle_stop(wcycle, ewcRUN); | |||
1771 | ||||
1772 | /* Finish up, write some stuff | |||
1773 | * if rerunMD, don't write last frame again | |||
1774 | */ | |||
1775 | finish_run(fplog, cr, | |||
1776 | inputrec, nrnb, wcycle, walltime_accounting, | |||
1777 | fr != NULL((void*)0) && fr->nbv != NULL((void*)0) && fr->nbv->bUseGPU ? | |||
1778 | nbnxn_cuda_get_timings(fr->nbv->cu_nbv) : NULL((void*)0), | |||
1779 | EI_DYNAMICS(inputrec->eI)(((inputrec->eI) == eiMD || ((inputrec->eI) == eiVV || ( inputrec->eI) == eiVVAK)) || ((inputrec->eI) == eiSD1 || (inputrec->eI) == eiSD2) || (inputrec->eI) == eiBD) && !MULTISIM(cr)((cr)->ms)); | |||
1780 | ||||
1781 | ||||
1782 | /* Free GPU memory and context */ | |||
1783 | free_gpu_resources(fr, cr); | |||
1784 | ||||
1785 | if (opt2bSet("-membed", nfile, fnm)) | |||
1786 | { | |||
1787 | sfree(membed)save_free("membed", "/home/alexxy/Develop/gromacs/src/programs/mdrun/runner.c" , 1787, (membed)); | |||
1788 | } | |||
1789 | ||||
1790 | gmx_hardware_info_free(hwinfo); | |||
1791 | ||||
1792 | /* Does what it says */ | |||
1793 | print_date_and_time(fplog, cr->nodeid, "Finished mdrun", gmx_gettime()); | |||
1794 | walltime_accounting_destroy(walltime_accounting); | |||
1795 | ||||
1796 | /* Close logfile already here if we were appending to it */ | |||
1797 | if (MASTER(cr)(((cr)->nodeid == 0) || !((cr)->nnodes > 1)) && (Flags & MD_APPENDFILES(1<<15))) | |||
1798 | { | |||
1799 | gmx_log_close(fplog); | |||
1800 | } | |||
1801 | ||||
1802 | rc = (int)gmx_get_stop_condition(); | |||
1803 | ||||
1804 | #ifdef GMX_THREAD_MPI | |||
1805 | /* we need to join all threads. The sub-threads join when they | |||
1806 | exit this function, but the master thread needs to be told to | |||
1807 | wait for that. */ | |||
1808 | if (PAR(cr)((cr)->nnodes > 1) && MASTER(cr)(((cr)->nodeid == 0) || !((cr)->nnodes > 1))) | |||
1809 | { | |||
1810 | tMPI_Finalize(); | |||
1811 | } | |||
1812 | #endif | |||
1813 | ||||
1814 | return rc; | |||
1815 | } |