added Verlet scheme and NxN non-bonded functionality
[alexxy/gromacs.git] / src / kernel / pme_switch.c
/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 *
 *
 *                This source code is part of
 *
 *                 G   R   O   M   A   C   S
 *
 *          GROningen MAchine for Chemical Simulations
 *
 *                        VERSION 4.6.0
 * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 * Copyright (c) 2001-2011, The GROMACS development team,
 * check out http://www.gromacs.org for more information.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * If you want to redistribute modifications, please consider that
 * scientific software is very special. Version control is crucial -
 * bugs must be traceable. We will be happy to consider code for
 * inclusion in the official distribution, but derived work must not
 * be called official GROMACS. Details are found in the README & COPYING
 * files - if they are missing, get the official version at www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the papers on the package - you can find them in the top README file.
 *
 * For more info, check our website at http://www.gromacs.org
 *
 * And Hey:
 * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
 */
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif

#include "smalloc.h"
#include "network.h"
#include "calcgrid.h"
#include "pme.h"
#include "vec.h"
#include "domdec.h"
#include "nbnxn_cuda_data_mgmt.h"
#include "force.h"
#include "pme_switch.h"

typedef struct {
    real rcut;          /* the Coulomb cut-off */
    real rlist;         /* the pair-list cut-off */
    real spacing;       /* the maximum PME grid spacing */
    ivec grid;          /* the PME grid dimensions */
    real grid_eff;      /* the grid size relative to a grid with uniform x/y/z spacing */
    real coeff;         /* the Ewald coefficient */
    gmx_pme_t pmedata;  /* the data structure used by the PME routines */

    int  count;         /* number of times this setup has been timed */
    double cycles;      /* the fastest observed cycle count for this setup */
} pme_setup_t;

/* In the initial scan, step to grids that are at least a factor 0.8 coarser,
 * i.e. that have at most 0.8 times the total number of grid points.
 */
#define PMES_GRID_SCALE_FAC  0.8
/* In the initial scan, try to skip grids with uneven x/y/z spacing,
 * checking if the "efficiency" is more than 5% worse than the previous grid.
 */
#define PMES_GRID_EFF_FAC  1.05
/* Re-run only setups that are at most 12% slower than the fastest setup so far */
#define PMES_SLOW_FAC  1.12
/* If a setup gets more than 2% faster, do another round to avoid
 * choosing a slower setup due to acceleration or fluctuations.
 */
#define PMES_ACCEL_TOL 1.02
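/* Illustrative note on how the constants above interact (the numbers in this
 * example are made up, not measured): starting from e.g. a 64x64x64 grid,
 * the stage-0 scan advances to the first coarser setup whose total number of
 * grid points is below PMES_GRID_SCALE_FAC*64*64*64 and whose x/y/z spacing
 * is not more than 5% less uniform than that of the previous setup
 * (PMES_GRID_EFF_FAC). If the fastest setup so far took 10.0 M-cycles, only
 * setups timed at 11.2 M-cycles or less (PMES_SLOW_FAC) are re-timed in the
 * later stages, and a setup that is suddenly more than 2% faster than its
 * previous best (PMES_ACCEL_TOL) triggers an extra tuning stage.
 */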

typedef struct pme_switch {
    int  nstage;        /* the current maximum number of stages */

    real cut_spacing;   /* the minimum cutoff / PME grid spacing ratio */
    real rbuf;          /* the pairlist buffer size */
    matrix box_start;   /* the initial simulation box */
    int n;              /* the number of setups as well as the allocation size */
    pme_setup_t *setup; /* the PME+cutoff setups */
    int cur;            /* the current setup */
    int fastest;        /* the fastest setup so far */
    int start;          /* start of setup range to consider in stage>0 */
    int end;            /* end   of setup range to consider in stage>0 */

    int stage;          /* the current stage */
} t_pme_switch;

void switch_pme_init(pme_switch_t *pmes_p,
                     const t_inputrec *ir,matrix box,
                     const interaction_const_t *ic,
                     gmx_pme_t pmedata)
{
    pme_switch_t pmes;
    real spm,sp;
    int  d;

    snew(pmes,1);

    /* Any number of stages >= 2 is supported */
    pmes->nstage   = 2;

    pmes->rbuf = ic->rlist - ic->rcoulomb;

    copy_mat(box,pmes->box_start);
    if (ir->ePBC==epbcXY && ir->nwall==2)
    {
        svmul(ir->wall_ewald_zfac,pmes->box_start[ZZ],pmes->box_start[ZZ]);
    }

    pmes->n = 1;
    snew(pmes->setup,pmes->n);

    pmes->cur = 0;
    pmes->setup[0].rcut     = ic->rcoulomb;
    pmes->setup[0].rlist    = ic->rlist;
    pmes->setup[0].grid[XX] = ir->nkx;
    pmes->setup[0].grid[YY] = ir->nky;
    pmes->setup[0].grid[ZZ] = ir->nkz;
    pmes->setup[0].coeff    = ic->ewaldcoeff;

    pmes->setup[0].pmedata  = pmedata;

    spm = 0;
    for(d=0; d<DIM; d++)
    {
        sp = norm(pmes->box_start[d])/pmes->setup[0].grid[d];
        if (sp > spm)
        {
            spm = sp;
        }
    }
    pmes->setup[0].spacing = spm;

    if (ir->fourier_spacing > 0)
    {
        pmes->cut_spacing = ir->rcoulomb/ir->fourier_spacing;
    }
    else
    {
        pmes->cut_spacing = ir->rcoulomb/pmes->setup[0].spacing;
    }

    pmes->stage = 0;

    pmes->fastest = 0;
    pmes->start   = 0;

    *pmes_p = pmes;
}

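/* Try to add a setup with the next coarser PME grid, and hence larger
 * Coulomb cut-off, to pmes->setup. Returns FALSE when no valid coarser grid
 * exists, i.e. when a grid dimension would drop to 2*pme_order or less.
 * Interpretation of the coefficient scaling below (added comment, not a
 * statement by the original authors): the new Ewald coefficient is chosen
 * such that coeff*rcut matches that of setup[0], which keeps the relative
 * real-space truncation error at the cut-off approximately unchanged.
 */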
static gmx_bool switch_pme_increase_cutoff(pme_switch_t pmes,int pme_order)
{
    pme_setup_t *set;
    real fac,sp;
    int d;

    /* Try to add a new setup with next larger cut-off to the list */
    pmes->n++;
    srenew(pmes->setup,pmes->n);
    set = &pmes->setup[pmes->n-1];
    set->pmedata = NULL;

    fac = 1;
    do
    {
        fac *= 1.01;
        clear_ivec(set->grid);
        sp = calc_grid(NULL,pmes->box_start,
                       fac*pmes->setup[pmes->cur].spacing,
                       &set->grid[XX],
                       &set->grid[YY],
                       &set->grid[ZZ]);

        /* In parallel we can't have grids smaller than 2*pme_order,
         * and we would anyhow not gain much speed at these grid sizes.
         */
        for(d=0; d<DIM; d++)
        {
            if (set->grid[d] <= 2*pme_order)
            {
                pmes->n--;

                return FALSE;
            }
        }
    }
    while (sp <= 1.001*pmes->setup[pmes->cur].spacing);

    set->rcut    = pmes->cut_spacing*sp;
    set->rlist   = set->rcut + pmes->rbuf;
    set->spacing = sp;
    /* The grid efficiency is the size wrt a grid with uniform x/y/z spacing */
    set->grid_eff = 1;
    for(d=0; d<DIM; d++)
    {
        set->grid_eff *= (set->grid[d]*sp)/norm(pmes->box_start[d]);
    }
    /* The Ewald coefficient is inversely proportional to the cut-off */
    set->coeff   = pmes->setup[0].coeff*pmes->setup[0].rcut/set->rcut;

    set->count   = 0;
    set->cycles  = 0;

    if (debug)
    {
        fprintf(debug,"PME switch grid %d %d %d, cutoff %f\n",
                set->grid[XX],set->grid[YY],set->grid[ZZ],set->rcut);
    }

    return TRUE;
}

static void print_grid(FILE *fp_err,FILE *fp_log,
                       const char *pre,
                       const char *desc,
                       const pme_setup_t *set,
                       double cycles)
{
    char buf[STRLEN],buft[STRLEN];

    if (cycles >= 0)
    {
        sprintf(buft,": %.1f M-cycles",cycles*1e-6);
    }
    else
    {
        buft[0] = '\0';
    }
    sprintf(buf,"%-11s%10s pme grid %d %d %d, cutoff %.3f%s",
            pre,
            desc,set->grid[XX],set->grid[YY],set->grid[ZZ],set->rcut,
            buft);
    if (fp_err != NULL)
    {
        fprintf(fp_err,"%s\n",buf);
    }
    if (fp_log != NULL)
    {
        fprintf(fp_log,"%s\n",buf);
    }
}

static void switch_to_stage1(pme_switch_t pmes)
{
    pmes->start = 0;
    while (pmes->start+1 < pmes->n &&
           (pmes->setup[pmes->start].count == 0 ||
            pmes->setup[pmes->start].cycles >
            pmes->setup[pmes->fastest].cycles*PMES_SLOW_FAC))
    {
        pmes->start++;
    }
    while (pmes->start > 0 && pmes->setup[pmes->start-1].cycles == 0)
    {
        pmes->start--;
    }

    pmes->end = pmes->n;
    if (pmes->setup[pmes->end-1].count > 0 &&
        pmes->setup[pmes->end-1].cycles >
        pmes->setup[pmes->fastest].cycles*PMES_SLOW_FAC)
    {
        pmes->end--;
    }

    pmes->stage = 1;

    /* Start at start; 1 will be added immediately after returning */
    pmes->cur = pmes->start - 1;
}

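/* Process the cycle count of the last timed MD step and possibly switch to
 * a different PME grid + Coulomb cut-off setup. Summary of the scheme below
 * (descriptive comment added for clarity): in stage 0 the cut-off is
 * increased step by step until a setup is clearly slower than the fastest
 * one found, or a cut-off/DD limit is reached; in the remaining stage(s)
 * only the competitive setups are re-timed; when the last stage completes,
 * the fastest setup is selected and applied. Returns FALSE when tuning has
 * already completed.
 */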
gmx_bool switch_pme(pme_switch_t pmes,
                    t_commrec *cr,
                    FILE *fp_err,
                    FILE *fp_log,
                    t_inputrec *ir,
                    t_state *state,
                    double cycles,
                    interaction_const_t *ic,
                    nonbonded_verlet_t *nbv,
                    gmx_pme_t *pmedata,
                    int step)
{
    gmx_bool OK;
    pme_setup_t *set;
    double cycles_fast;
    char buf[STRLEN];

    if (pmes->stage == pmes->nstage)
    {
        return FALSE;
    }

    if (PAR(cr))
    {
        gmx_sumd(1,&cycles,cr);
        cycles /= cr->nnodes;
    }

    set = &pmes->setup[pmes->cur];

    set->count++;
    if (set->count % 2 == 1)
    {
        /* Skip the first cycle, because the first step after a switch
         * is much slower due to allocation and/or caching effects.
         */
        return TRUE;
    }

    sprintf(buf, "step %4d: ", step);
    print_grid(fp_err,fp_log,buf,"timed with",set,cycles);

    if (set->count <= 2)
    {
        set->cycles = cycles;
    }
    else
    {
        if (cycles*PMES_ACCEL_TOL < set->cycles &&
            pmes->stage == pmes->nstage - 1)
        {
            /* The performance went up a lot (due to e.g. DD load balancing).
             * Add a stage, keep the minima, but rescan all setups.
             */
            pmes->nstage++;

            if (debug)
            {
                fprintf(debug,"The performance for grid %d %d %d went from %.3f to %.1f M-cycles, this is more than %f\n"
                        "Increasing the number of stages to %d"
                        " and ignoring the previous performance\n",
                        set->grid[XX],set->grid[YY],set->grid[ZZ],
                        cycles*1e-6,set->cycles*1e-6,PMES_ACCEL_TOL,
                        pmes->nstage);
            }
        }
        set->cycles = min(set->cycles,cycles);
    }

    if (set->cycles < pmes->setup[pmes->fastest].cycles)
    {
        pmes->fastest = pmes->cur;
    }
    cycles_fast = pmes->setup[pmes->fastest].cycles;

    /* Check in stage 0 if we should stop scanning grids.
     * Stop when the time is more than a factor PMES_SLOW_FAC longer than the fastest.
     */
    if (pmes->stage == 0 && pmes->cur > 0 &&
        cycles > pmes->setup[pmes->fastest].cycles*PMES_SLOW_FAC)
    {
        pmes->n = pmes->cur + 1;
        /* Done with scanning, go to stage 1 */
        switch_to_stage1(pmes);
    }

    if (pmes->stage == 0)
    {
        int gridsize_start;

        gridsize_start = set->grid[XX]*set->grid[YY]*set->grid[ZZ];

        do
        {
            if (pmes->cur+1 < pmes->n)
            {
                /* We had already generated the next setup */
                OK = TRUE;
            }
            else
            {
                /* Find the next setup */
                OK = switch_pme_increase_cutoff(pmes,ir->pme_order);
            }

            if (OK && ir->ePBC != epbcNONE)
            {
                OK = (sqr(pmes->setup[pmes->cur+1].rlist)
                      <= max_cutoff2(ir->ePBC,state->box));
            }

            if (OK)
            {
                pmes->cur++;

                if (DOMAINDECOMP(cr))
                {
                    OK = change_dd_cutoff(cr,state,ir,
                                          pmes->setup[pmes->cur].rlist);
                    if (!OK)
                    {
                        /* Failed: do not use this setup */
                        pmes->cur--;
                    }
                }
            }
            if (!OK)
            {
                /* We hit the upper limit for the cut-off;
                 * the setup list should not go beyond cur.
                 */
                pmes->n = pmes->cur + 1;
                /* Switch to the next stage */
                switch_to_stage1(pmes);
            }
        }
        while (OK &&
               !(pmes->setup[pmes->cur].grid[XX]*
                 pmes->setup[pmes->cur].grid[YY]*
                 pmes->setup[pmes->cur].grid[ZZ] <
                 gridsize_start*PMES_GRID_SCALE_FAC
                 &&
                 pmes->setup[pmes->cur].grid_eff <
                 pmes->setup[pmes->cur-1].grid_eff*PMES_GRID_EFF_FAC));
    }

    if (pmes->stage > 0 && pmes->end == 1)
    {
        pmes->cur = 0;
        pmes->stage = pmes->nstage;
    }
    else if (pmes->stage > 0 && pmes->end > 1)
    {
        /* If stage = nstage-1:
         *   scan over all setups, re-running only those setups
         *   which are not much slower than the fastest
         * else:
         *   use the next setup
         */
        do
        {
            pmes->cur++;
            if (pmes->cur == pmes->end)
            {
                pmes->stage++;
                pmes->cur = pmes->start;
            }
        }
        while (pmes->stage == pmes->nstage - 1 &&
               pmes->setup[pmes->cur].count > 0 &&
               pmes->setup[pmes->cur].cycles > cycles_fast*PMES_SLOW_FAC);

        if (pmes->stage == pmes->nstage)
        {
            /* We are done optimizing, use the fastest setup we found */
            pmes->cur = pmes->fastest;
        }
    }

    if (DOMAINDECOMP(cr) && pmes->stage > 0)
    {
        OK = change_dd_cutoff(cr,state,ir,pmes->setup[pmes->cur].rlist);
        if (!OK)
        {
            /* Failsafe solution */
            if (pmes->cur > 1 && pmes->stage == pmes->nstage)
            {
                pmes->stage--;
            }
            pmes->fastest = 0;
            pmes->start   = 0;
            pmes->end     = pmes->cur;
            pmes->cur     = pmes->start;
        }
    }

    /* Change the Coulomb cut-off and the PME grid */

    set = &pmes->setup[pmes->cur];

    ic->rcoulomb   = set->rcut;
    ic->rlist      = set->rlist;
    ic->ewaldcoeff = set->coeff;

    if (nbv->grp[0].kernel_type == nbk8x8x8_CUDA)
    {
        nbnxn_cuda_pmetune_update_param(nbv->cu_nbv,ic);
    }
    else
    {
        init_interaction_const_tables(NULL,ic,nbv->grp[0].kernel_type);
    }

    if (nbv->ngrp > 1)
    {
        init_interaction_const_tables(NULL,ic,nbv->grp[1].kernel_type);
    }

    if (cr->duty & DUTY_PME)
    {
        if (pmes->setup[pmes->cur].pmedata == NULL)
        {
            /* Generate a new PME data structure,
             * copying part of the old pointers.
             */
            gmx_pme_reinit(&set->pmedata,
                           cr,pmes->setup[0].pmedata,ir,
                           set->grid);
        }
        *pmedata = set->pmedata;
    }
    else
    {
        /* Tell our PME-only node to switch grid */
        gmx_pme_send_switch(cr, set->grid, set->coeff);
    }

    if (debug)
    {
        print_grid(NULL,debug,"","switched to",set,-1);
    }

    if (pmes->stage == pmes->nstage)
    {
        print_grid(fp_err,fp_log,"","optimal",set,-1);
    }

    return TRUE;
}

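/* Allow n additional tuning stages, so that the setups are timed again;
 * descriptive comment added for clarity: the function itself only raises
 * the maximum stage count.
 */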
void restart_switch_pme(pme_switch_t pmes, int n)
{
    pmes->nstage += n;
}
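
/* Minimal usage sketch (illustrative only; the real caller is the mdrun
 * machinery, which may pass different file pointers and arguments):
 *
 *     pme_switch_t pmes;
 *
 *     switch_pme_init(&pmes, ir, state->box, ic, pmedata);
 *
 *     // Each MD step while tuning, with "cycles" the cycle count measured
 *     // for the previous step:
 *     bTuning = switch_pme(pmes, cr, stderr, fplog, ir, state, cycles,
 *                          ic, nbv, &pmedata, step);
 *     // switch_pme returns FALSE once the optimal setup has been selected
 *     // and applied.
 */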