Merge release-4-6 into master
[alexxy/gromacs.git] / src / programs / mdrun / pme_loadbal.c
1 /* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
2  *
3  * 
4  *                This source code is part of
5  * 
6  *                 G   R   O   M   A   C   S
7  * 
8  *          GROningen MAchine for Chemical Simulations
9  * 
10  *                        VERSION 4.6.0
11  * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
12  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
13  * Copyright (c) 2001-2011, The GROMACS development team,
14  * check out http://www.gromacs.org for more information.
15
16  * This program is free software; you can redistribute it and/or
17  * modify it under the terms of the GNU General Public License
18  * as published by the Free Software Foundation; either version 2
19  * of the License, or (at your option) any later version.
20  * 
21  * If you want to redistribute modifications, please consider that
22  * scientific software is very special. Version control is crucial -
23  * bugs must be traceable. We will be happy to consider code for
24  * inclusion in the official distribution, but derived work must not
25  * be called official GROMACS. Details are found in the README & COPYING
26  * files - if they are missing, get the official version at www.gromacs.org.
27  * 
28  * To help us fund GROMACS development, we humbly ask that you cite
29  * the papers on the package - you can find them in the top README file.
30  * 
31  * For more info, check our website at http://www.gromacs.org
32  * 
33  * And Hey:
34  * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
35  */
36 #ifdef HAVE_CONFIG_H
37 #include <config.h>
38 #endif
39
40 #include "smalloc.h"
41 #include "network.h"
42 #include "calcgrid.h"
43 #include "pme.h"
44 #include "vec.h"
45 #include "domdec.h"
46 #include "nbnxn_cuda_data_mgmt.h"
47 #include "force.h"
48 #include "macros.h"
49 #include "pme_loadbal.h"
50
51 /* Parameters and setting for one PP-PME setup */
52 typedef struct {
53     real rcut;            /* Coulomb cut-off                              */
54     real rlist;           /* pair-list cut-off                            */
55     real spacing;         /* (largest) PME grid spacing                   */
56     ivec grid;            /* the PME grid dimensions                      */
57     real grid_efficiency; /* ineffiency factor for non-uniform grids <= 1 */
58     real ewaldcoeff;      /* the Ewald coefficient                        */
59     gmx_pme_t pmedata;    /* the data structure used in the PME code      */
60
61     int  count;           /* number of times this setup has been timed    */
62     double cycles;        /* the fastest time for this setup in cycles    */
63 } pme_setup_t;
64
65 /* In the initial scan, step by grids that are at least a factor 0.8 coarser */
66 #define PME_LB_GRID_SCALE_FAC  0.8
67 /* In the initial scan, try to skip grids with uneven x/y/z spacing,
68  * checking if the "efficiency" is more than 5% worse than the previous grid.
69  */
70 #define PME_LB_GRID_EFFICIENCY_REL_FAC  1.05
71 /* Rerun up till 12% slower setups than the fastest up till now */
72 #define PME_LB_SLOW_FAC  1.12
73 /* If setups get more than 2% faster, do another round to avoid
74  * choosing a slower setup due to acceleration or fluctuations.
75  */
76 #define PME_LB_ACCEL_TOL 1.02
77
78 enum { epmelblimNO, epmelblimBOX, epmelblimDD, epmelblimNR };
79
80 const char *pmelblim_str[epmelblimNR] =
81 { "no", "box size", "domain decompostion" };
82
83 struct pme_load_balancing {
84     int  nstage;        /* the current maximum number of stages */
85
86     real cut_spacing;   /* the minimum cutoff / PME grid spacing ratio */
87     real rbuf;          /* the pairlist buffer size */
88     matrix box_start;   /* the initial simulation box */
89     int n;              /* the count of setup as well as the allocation size */
90     pme_setup_t *setup; /* the PME+cutoff setups */
91     int cur;            /* the current setup */
92     int fastest;        /* fastest setup up till now */
93     int start;          /* start of setup range to consider in stage>0 */
94     int end;            /* end   of setup range to consider in stage>0 */
95     int elimited;       /* was the balancing limited, uses enum above */
96
97     int stage;          /* the current stage */
98 };
99
100 void pme_loadbal_init(pme_load_balancing_t *pme_lb_p,
101                       const t_inputrec *ir,matrix box,
102                       const interaction_const_t *ic,
103                       gmx_pme_t pmedata)
104 {
105     pme_load_balancing_t pme_lb;
106     real spm,sp;
107     int  d;
108
109     snew(pme_lb,1);
110
111     /* Any number of stages >= 2 is supported */
112     pme_lb->nstage   = 2;
113
114     pme_lb->rbuf = ic->rlist - ic->rcoulomb;
115
116     copy_mat(box,pme_lb->box_start);
117     if (ir->ePBC==epbcXY && ir->nwall==2)
118     {
119         svmul(ir->wall_ewald_zfac,pme_lb->box_start[ZZ],pme_lb->box_start[ZZ]);
120     }
121
122     pme_lb->n = 1;
123     snew(pme_lb->setup,pme_lb->n);
124
125     pme_lb->cur = 0;
126     pme_lb->setup[0].rcut       = ic->rcoulomb;
127     pme_lb->setup[0].rlist      = ic->rlist;
128     pme_lb->setup[0].grid[XX]   = ir->nkx;
129     pme_lb->setup[0].grid[YY]   = ir->nky;
130     pme_lb->setup[0].grid[ZZ]   = ir->nkz;
131     pme_lb->setup[0].ewaldcoeff = ic->ewaldcoeff;
132
133     pme_lb->setup[0].pmedata  = pmedata;
134     
135     spm = 0;
136     for(d=0; d<DIM; d++)
137     {
138         sp = norm(pme_lb->box_start[d])/pme_lb->setup[0].grid[d];
139         if (sp > spm)
140         {
141             spm = sp;
142         }
143     }
144     pme_lb->setup[0].spacing = spm;
145
146     if (ir->fourier_spacing > 0)
147     {
148         pme_lb->cut_spacing = ir->rcoulomb/ir->fourier_spacing;
149     }
150     else
151     {
152         pme_lb->cut_spacing = ir->rcoulomb/pme_lb->setup[0].spacing;
153     }
154
155     pme_lb->stage = 0;
156
157     pme_lb->fastest  = 0;
158     pme_lb->start    = 0;
159     pme_lb->end      = 0;
160     pme_lb->elimited = epmelblimNO;
161
162     *pme_lb_p = pme_lb;
163 }
164
165 static gmx_bool pme_loadbal_increase_cutoff(pme_load_balancing_t pme_lb,
166                                             int pme_order)
167 {
168     pme_setup_t *set;
169     real fac,sp;
170     int d;
171
172     /* Try to add a new setup with next larger cut-off to the list */
173     pme_lb->n++;
174     srenew(pme_lb->setup,pme_lb->n);
175     set = &pme_lb->setup[pme_lb->n-1];
176     set->pmedata = NULL;
177
178     fac = 1;
179     do
180     {
181         fac *= 1.01;
182         clear_ivec(set->grid);
183         sp = calc_grid(NULL,pme_lb->box_start,
184                        fac*pme_lb->setup[pme_lb->cur].spacing,
185                        &set->grid[XX],
186                        &set->grid[YY],
187                        &set->grid[ZZ]);
188
189         /* In parallel we can't have grids smaller than 2*pme_order,
190          * and we would anyhow not gain much speed at these grid sizes.
191          */
192         for(d=0; d<DIM; d++)
193         {
194             if (set->grid[d] <= 2*pme_order)
195             {
196                 pme_lb->n--;
197
198                 return FALSE;
199             }
200         }
201     }
202     while (sp <= 1.001*pme_lb->setup[pme_lb->cur].spacing);
203
204     set->rcut    = pme_lb->cut_spacing*sp;
205     set->rlist   = set->rcut + pme_lb->rbuf;
206     set->spacing = sp;
207     /* The grid efficiency is the size wrt a grid with uniform x/y/z spacing */
208     set->grid_efficiency = 1;
209     for(d=0; d<DIM; d++)
210     {
211         set->grid_efficiency *= (set->grid[d]*sp)/norm(pme_lb->box_start[d]);
212     }
213     /* The Ewald coefficient is inversly proportional to the cut-off */
214     set->ewaldcoeff =
215         pme_lb->setup[0].ewaldcoeff*pme_lb->setup[0].rcut/set->rcut;
216
217     set->count   = 0;
218     set->cycles  = 0;
219
220     if (debug)
221     {
222         fprintf(debug,"PME loadbal: grid %d %d %d, cutoff %f\n",
223                 set->grid[XX],set->grid[YY],set->grid[ZZ],set->rcut);
224     }
225
226     return TRUE;
227 }
228
229 static void print_grid(FILE *fp_err,FILE *fp_log,
230                        const char *pre,
231                        const char *desc,
232                        const pme_setup_t *set,
233                        double cycles)
234 {
235     char buf[STRLEN],buft[STRLEN];
236     
237     if (cycles >= 0)
238     {
239         sprintf(buft,": %.1f M-cycles",cycles*1e-6);
240     }
241     else
242     {
243         buft[0] = '\0';
244     }
245     sprintf(buf,"%-11s%10s pme grid %d %d %d, cutoff %.3f%s",
246             pre,
247             desc,set->grid[XX],set->grid[YY],set->grid[ZZ],set->rcut,
248             buft);
249     if (fp_err != NULL)
250     {
251         fprintf(fp_err,"\r%s\n",buf);
252     }
253     if (fp_log != NULL)
254     {
255         fprintf(fp_log,"%s\n",buf);
256     }
257 }
258
259 static int pme_loadbal_end(pme_load_balancing_t pme_lb)
260 {
261     /* In the initial stage only n is set; end is not set yet */
262     if (pme_lb->end > 0)
263     {
264         return pme_lb->end;
265     }
266     else
267     {
268         return pme_lb->n;
269     }
270 }
271
272 static void print_loadbal_limited(FILE *fp_err,FILE *fp_log,
273                                   gmx_large_int_t step,
274                                   pme_load_balancing_t pme_lb)
275 {
276     char buf[STRLEN],sbuf[22];
277
278     sprintf(buf,"step %4s: the %s limited the PME load balancing to a cut-off of %.3f",
279             gmx_step_str(step,sbuf),
280             pmelblim_str[pme_lb->elimited],
281             pme_lb->setup[pme_loadbal_end(pme_lb)-1].rcut);
282     if (fp_err != NULL)
283     {
284         fprintf(fp_err,"\r%s\n",buf);
285     }
286     if (fp_log != NULL)
287     {
288         fprintf(fp_log,"%s\n",buf);
289     }
290 }
291
292 static void switch_to_stage1(pme_load_balancing_t pme_lb)
293 {
294     pme_lb->start = 0;
295     while (pme_lb->start+1 < pme_lb->n &&
296            (pme_lb->setup[pme_lb->start].count == 0 ||
297             pme_lb->setup[pme_lb->start].cycles >
298             pme_lb->setup[pme_lb->fastest].cycles*PME_LB_SLOW_FAC))
299     {
300         pme_lb->start++;
301     }
302     while (pme_lb->start > 0 && pme_lb->setup[pme_lb->start-1].cycles == 0)
303     {
304         pme_lb->start--;
305     }
306
307     pme_lb->end = pme_lb->n;
308     if (pme_lb->setup[pme_lb->end-1].count > 0 &&
309         pme_lb->setup[pme_lb->end-1].cycles >
310         pme_lb->setup[pme_lb->fastest].cycles*PME_LB_SLOW_FAC)
311     {
312         pme_lb->end--;
313     }
314
315     pme_lb->stage = 1;
316
317     /* Next we want to choose setup pme_lb->start, but as we will increase
318      * pme_ln->cur by one right after returning, we subtract 1 here.
319      */
320     pme_lb->cur = pme_lb->start - 1;
321 }
322
323 gmx_bool pme_load_balance(pme_load_balancing_t pme_lb,
324                           t_commrec *cr,
325                           FILE *fp_err,
326                           FILE *fp_log,
327                           t_inputrec *ir,
328                           t_state *state,
329                           double cycles,
330                           interaction_const_t *ic,
331                           nonbonded_verlet_t *nbv,
332                           gmx_pme_t *pmedata,
333                           gmx_large_int_t step)
334 {
335     gmx_bool OK;
336     pme_setup_t *set;
337     double cycles_fast;
338     char buf[STRLEN],sbuf[22];
339
340     if (pme_lb->stage == pme_lb->nstage)
341     {
342         return FALSE;
343     }
344
345     if (PAR(cr))
346     {
347         gmx_sumd(1,&cycles,cr);
348         cycles /= cr->nnodes;
349     }
350
351     set = &pme_lb->setup[pme_lb->cur];
352
353     set->count++;
354     if (set->count % 2 == 1)
355     {
356         /* Skip the first cycle, because the first step after a switch
357          * is much slower due to allocation and/or caching effects.
358          */
359         return TRUE;
360     }
361
362     sprintf(buf, "step %4s: ", gmx_step_str(step,sbuf));
363     print_grid(fp_err,fp_log,buf,"timed with",set,cycles);
364
365     if (set->count <= 2)
366     {
367         set->cycles = cycles;
368     }
369     else
370     {
371         if (cycles*PME_LB_ACCEL_TOL < set->cycles &&
372             pme_lb->stage == pme_lb->nstage - 1)
373         {
374             /* The performance went up a lot (due to e.g. DD load balancing).
375              * Add a stage, keep the minima, but rescan all setups.
376              */
377             pme_lb->nstage++;
378
379             if (debug)
380             {
381                 fprintf(debug,"The performance for grid %d %d %d went from %.3f to %.1f M-cycles, this is more than %f\n"
382                         "Increased the number stages to %d"
383                         " and ignoring the previous performance\n",
384                         set->grid[XX],set->grid[YY],set->grid[ZZ],
385                         cycles*1e-6,set->cycles*1e-6,PME_LB_ACCEL_TOL,
386                         pme_lb->nstage);
387             }
388         }
389         set->cycles = min(set->cycles,cycles);
390     }
391
392     if (set->cycles < pme_lb->setup[pme_lb->fastest].cycles)
393     {
394         pme_lb->fastest = pme_lb->cur;
395     }
396     cycles_fast = pme_lb->setup[pme_lb->fastest].cycles;
397
398     /* Check in stage 0 if we should stop scanning grids.
399      * Stop when the time is more than SLOW_FAC longer than the fastest.
400      */
401     if (pme_lb->stage == 0 && pme_lb->cur > 0 &&
402         cycles > pme_lb->setup[pme_lb->fastest].cycles*PME_LB_SLOW_FAC)
403     {
404         pme_lb->n = pme_lb->cur + 1;
405         /* Done with scanning, go to stage 1 */
406         switch_to_stage1(pme_lb);
407     }
408
409     if (pme_lb->stage == 0)
410     {
411         int gridsize_start;
412
413         gridsize_start = set->grid[XX]*set->grid[YY]*set->grid[ZZ];
414
415         do
416         {
417             if (pme_lb->cur+1 < pme_lb->n)
418             {
419                 /* We had already generated the next setup */
420                 OK = TRUE;
421             }
422             else
423             {
424                 /* Find the next setup */
425                 OK = pme_loadbal_increase_cutoff(pme_lb,ir->pme_order);
426             }
427                 
428             if (OK && ir->ePBC != epbcNONE)
429             {
430                 OK = (sqr(pme_lb->setup[pme_lb->cur+1].rlist)
431                       <= max_cutoff2(ir->ePBC,state->box));
432                 if (!OK)
433                 {
434                     pme_lb->elimited = epmelblimBOX;
435                 }
436             }
437
438             if (OK)
439             {
440                 pme_lb->cur++;
441
442                 if (DOMAINDECOMP(cr))
443                 {
444                     OK = change_dd_cutoff(cr,state,ir,
445                                           pme_lb->setup[pme_lb->cur].rlist);
446                     if (!OK)
447                     {
448                         /* Failed: do not use this setup */
449                         pme_lb->cur--;
450                         pme_lb->elimited = epmelblimDD;
451                     }
452                 }
453             }
454             if (!OK)
455             {
456                 /* We hit the upper limit for the cut-off,
457                  * the setup should not go further than cur.
458                  */
459                 pme_lb->n = pme_lb->cur + 1;
460                 print_loadbal_limited(fp_err,fp_log,step,pme_lb);
461                 /* Switch to the next stage */
462                 switch_to_stage1(pme_lb);
463             }
464         }
465         while (OK &&
466                !(pme_lb->setup[pme_lb->cur].grid[XX]*
467                  pme_lb->setup[pme_lb->cur].grid[YY]*
468                  pme_lb->setup[pme_lb->cur].grid[ZZ] <
469                  gridsize_start*PME_LB_GRID_SCALE_FAC
470                  &&
471                  pme_lb->setup[pme_lb->cur].grid_efficiency <
472                  pme_lb->setup[pme_lb->cur-1].grid_efficiency*PME_LB_GRID_EFFICIENCY_REL_FAC));
473     }
474
475     if (pme_lb->stage > 0 && pme_lb->end == 1)
476     {
477         pme_lb->cur = 0;
478         pme_lb->stage = pme_lb->nstage;
479     }
480     else if (pme_lb->stage > 0 && pme_lb->end > 1)
481     {
482         /* If stage = nstage-1:
483          *   scan over all setups, rerunning only those setups
484          *   which are not much slower than the fastest
485          * else:
486          *   use the next setup
487          */
488         do
489         {
490             pme_lb->cur++;
491             if (pme_lb->cur == pme_lb->end)
492             {
493                 pme_lb->stage++;
494                 pme_lb->cur = pme_lb->start;
495             }
496         }
497         while (pme_lb->stage == pme_lb->nstage - 1 &&
498                pme_lb->setup[pme_lb->cur].count > 0 &&
499                pme_lb->setup[pme_lb->cur].cycles > cycles_fast*PME_LB_SLOW_FAC);
500
501         if (pme_lb->stage == pme_lb->nstage)
502         {
503             /* We are done optimizing, use the fastest setup we found */
504             pme_lb->cur = pme_lb->fastest;
505         }
506     }
507
508     if (DOMAINDECOMP(cr) && pme_lb->stage > 0)
509     {
510         OK = change_dd_cutoff(cr,state,ir,pme_lb->setup[pme_lb->cur].rlist);
511         if (!OK)
512         {
513             /* Failsafe solution */
514             if (pme_lb->cur > 1 && pme_lb->stage == pme_lb->nstage)
515             {
516                 pme_lb->stage--;
517             }
518             pme_lb->fastest  = 0;
519             pme_lb->start    = 0;
520             pme_lb->end      = pme_lb->cur;
521             pme_lb->cur      = pme_lb->start;
522             pme_lb->elimited = epmelblimDD;
523             print_loadbal_limited(fp_err,fp_log,step,pme_lb);
524         }
525     }
526
527     /* Change the Coulomb cut-off and the PME grid */
528
529     set = &pme_lb->setup[pme_lb->cur];
530
531     ic->rcoulomb   = set->rcut;
532     ic->rlist      = set->rlist;
533     ic->ewaldcoeff = set->ewaldcoeff;
534
535     if (nbv->grp[0].kernel_type == nbk8x8x8_CUDA)
536     {
537         nbnxn_cuda_pme_loadbal_update_param(nbv->cu_nbv,ic);
538     }
539     else
540     {
541         init_interaction_const_tables(NULL,ic,nbv->grp[0].kernel_type);
542     }
543
544     if (nbv->ngrp > 1)
545     {
546         init_interaction_const_tables(NULL,ic,nbv->grp[1].kernel_type);
547     }
548
549     if (cr->duty & DUTY_PME)
550     {
551         if (pme_lb->setup[pme_lb->cur].pmedata == NULL)
552         {
553             /* Generate a new PME data structure,
554              * copying part of the old pointers.
555              */
556             gmx_pme_reinit(&set->pmedata,
557                            cr,pme_lb->setup[0].pmedata,ir,
558                            set->grid);
559         }
560         *pmedata = set->pmedata;
561     }
562     else
563     {
564         /* Tell our PME-only node to switch grid */
565         gmx_pme_send_switch(cr, set->grid, set->ewaldcoeff);
566     }
567
568     if (debug)
569     {
570         print_grid(NULL,debug,"","switched to",set,-1);
571     }
572
573     if (pme_lb->stage == pme_lb->nstage)
574     {
575         print_grid(fp_err,fp_log,"","optimal",set,-1);
576     }
577
578     return TRUE;
579 }
580
581 void restart_pme_loadbal(pme_load_balancing_t pme_lb, int n)
582 {
583     pme_lb->nstage += n;
584 }
585
586 static int pme_grid_points(const pme_setup_t *setup)
587 {
588     return setup->grid[XX]*setup->grid[YY]*setup->grid[ZZ];
589 }
590
591 static void print_pme_loadbal_setting(FILE *fplog,
592                                      char *name,
593                                      const pme_setup_t *setup)
594 {
595     fprintf(fplog,
596             "   %-7s %6.3f nm %6.3f nm     %3d %3d %3d   %5.3f nm  %5.3f nm\n",
597             name,
598             setup->rcut,setup->rlist,
599             setup->grid[XX],setup->grid[YY],setup->grid[ZZ],
600             setup->spacing,1/setup->ewaldcoeff);
601 }
602
603 static void print_pme_loadbal_settings(pme_load_balancing_t pme_lb,
604                                        FILE *fplog)
605 {
606     double pp_ratio,grid_ratio;
607
608     pp_ratio   = pow(pme_lb->setup[pme_lb->cur].rlist/pme_lb->setup[0].rlist,3.0);
609     grid_ratio = pme_grid_points(&pme_lb->setup[pme_lb->cur])/
610         (double)pme_grid_points(&pme_lb->setup[0]);
611
612     fprintf(fplog,"\n");
613     fprintf(fplog,"       P P   -   P M E   L O A D   B A L A N C I N G\n");
614     fprintf(fplog,"\n");
615     /* Here we only warn when the optimal setting is the last one */
616     if (pme_lb->elimited != epmelblimNO &&
617         pme_lb->cur == pme_loadbal_end(pme_lb)-1)
618     {
619         fprintf(fplog," NOTE: The PP/PME load balancing was limited by the %s,\n",
620                 pmelblim_str[pme_lb->elimited]);
621         fprintf(fplog,"       you might not have reached a good load balance.\n");
622         if (pme_lb->elimited == epmelblimDD)
623         {
624             fprintf(fplog,"       Try different mdrun -dd settings or lower the -dds value.\n");
625         }
626         fprintf(fplog,"\n");
627     }
628     fprintf(fplog," PP/PME load balancing changed the cut-off and PME settings:\n");
629     fprintf(fplog,"           particle-particle                    PME\n");
630     fprintf(fplog,"            rcoulomb  rlist            grid      spacing   1/beta\n");
631     print_pme_loadbal_setting(fplog,"initial",&pme_lb->setup[0]);
632     print_pme_loadbal_setting(fplog,"final"  ,&pme_lb->setup[pme_lb->cur]);
633     fprintf(fplog," cost-ratio           %4.2f             %4.2f\n",
634             pp_ratio,grid_ratio);
635     fprintf(fplog," (note that these numbers concern only part of the total PP and PME load)\n");
636     fprintf(fplog,"\n");
637 }
638
639 void pme_loadbal_done(pme_load_balancing_t pme_lb, FILE *fplog)
640 {
641     if (fplog != NULL && (pme_lb->cur > 0 || pme_lb->elimited != epmelblimNO))
642     {
643         print_pme_loadbal_settings(pme_lb,fplog);
644     }
645
646     /* TODO: Here we should free all pointers in pme_lb,
647      * but as it contains pme data structures,
648      * we need to first make pme.c free all data.
649      */
650 }