added Verlet scheme and NxN non-bonded functionality
[alexxy/gromacs.git] / src / kernel / pme_switch.c
/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 *
 *
 *                This source code is part of
 *
 *                 G   R   O   M   A   C   S
 *
 *          GROningen MAchine for Chemical Simulations
 *
 *                        VERSION 4.6.0
 * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 * Copyright (c) 2001-2011, The GROMACS development team,
 * check out http://www.gromacs.org for more information.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * If you want to redistribute modifications, please consider that
 * scientific software is very special. Version control is crucial -
 * bugs must be traceable. We will be happy to consider code for
 * inclusion in the official distribution, but derived work must not
 * be called official GROMACS. Details are found in the README & COPYING
 * files - if they are missing, get the official version at www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the papers on the package - you can find them in the top README file.
 *
 * For more info, check our website at http://www.gromacs.org
 *
 * And Hey:
 * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
 */
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif

#include "smalloc.h"
#include "network.h"
#include "calcgrid.h"
#include "pme.h"
#include "vec.h"
#include "domdec.h"
#include "nbnxn_cuda_data_mgmt.h"
#include "force.h"
#include "pme_switch.h"

typedef struct {
    real rcut;          /* the Coulomb cut-off */
    real rlist;         /* the pair-list cut-off */
    real spacing;       /* the maximum PME grid spacing */
    ivec grid;          /* the PME grid dimensions */
    real grid_eff;      /* the grid size relative to a grid with uniform x/y/z spacing */
    real coeff;         /* the Ewald coefficient */
    gmx_pme_t pmedata;  /* the data structure used by the PME routines */

    int  count;         /* number of times this setup has been timed */
    double cycles;      /* the fastest observed cycle count for this setup */
} pme_setup_t;

/* In the initial scan, step to grids that are at least a factor 0.8 coarser,
 * i.e. that have at most 0.8 times the total number of grid points.
 */
#define PMES_GRID_SCALE_FAC  0.8
/* In the initial scan, try to skip grids with uneven x/y/z spacing,
 * checking if the "efficiency" is more than 5% worse than the previous grid.
 */
#define PMES_GRID_EFF_FAC  1.05
/* Re-run only setups that are at most 12% slower than the fastest setup so far */
#define PMES_SLOW_FAC  1.12
/* If a setup gets more than 2% faster, do another round to avoid
 * choosing a slower setup due to acceleration or fluctuations.
 */
#define PMES_ACCEL_TOL 1.02
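/* Illustrative note on how the constants above interact (the numbers in this
 * example are made up, not measured): starting from e.g. a 64x64x64 grid,
 * the stage-0 scan advances to the first coarser setup whose total number of
 * grid points is below PMES_GRID_SCALE_FAC*64*64*64 and whose x/y/z spacing
 * is not more than 5% less uniform than that of the previous setup
 * (PMES_GRID_EFF_FAC). If the fastest setup so far took 10.0 M-cycles, only
 * setups timed at 11.2 M-cycles or less (PMES_SLOW_FAC) are re-timed in the
 * later stages, and a setup that is suddenly more than 2% faster than its
 * previous best (PMES_ACCEL_TOL) triggers an extra tuning stage.
 */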

typedef struct pme_switch {
    int  nstage;        /* the current maximum number of stages */

    real cut_spacing;   /* the minimum cutoff / PME grid spacing ratio */
    real rbuf;          /* the pairlist buffer size */
    matrix box_start;   /* the initial simulation box */
    int n;              /* the number of setups as well as the allocation size */
    pme_setup_t *setup; /* the PME+cutoff setups */
    int cur;            /* the current setup */
    int fastest;        /* the fastest setup so far */
    int start;          /* start of setup range to consider in stage>0 */
    int end;            /* end   of setup range to consider in stage>0 */

    int stage;          /* the current stage */
} t_pme_switch;

void switch_pme_init(pme_switch_t *pmes_p,
                     const t_inputrec *ir,matrix box,
                     const interaction_const_t *ic,
                     gmx_pme_t pmedata)
{
    pme_switch_t pmes;
    real spm,sp;
    int  d;

    snew(pmes,1);

    /* Any number of stages >= 2 is supported */
    pmes->nstage   = 2;

    pmes->rbuf = ic->rlist - ic->rcoulomb;

    copy_mat(box,pmes->box_start);
    if (ir->ePBC==epbcXY && ir->nwall==2)
    {
        svmul(ir->wall_ewald_zfac,pmes->box_start[ZZ],pmes->box_start[ZZ]);
    }

    pmes->n = 1;
    snew(pmes->setup,pmes->n);

    pmes->cur = 0;
    pmes->setup[0].rcut     = ic->rcoulomb;
    pmes->setup[0].rlist    = ic->rlist;
    pmes->setup[0].grid[XX] = ir->nkx;
    pmes->setup[0].grid[YY] = ir->nky;
    pmes->setup[0].grid[ZZ] = ir->nkz;
    pmes->setup[0].coeff    = ic->ewaldcoeff;

    pmes->setup[0].pmedata  = pmedata;

    spm = 0;
    for(d=0; d<DIM; d++)
    {
        sp = norm(pmes->box_start[d])/pmes->setup[0].grid[d];
        if (sp > spm)
        {
            spm = sp;
        }
    }
    pmes->setup[0].spacing = spm;

    if (ir->fourier_spacing > 0)
    {
        pmes->cut_spacing = ir->rcoulomb/ir->fourier_spacing;
    }
    else
    {
        pmes->cut_spacing = ir->rcoulomb/pmes->setup[0].spacing;
    }

    pmes->stage = 0;

    pmes->fastest = 0;
    pmes->start   = 0;

    *pmes_p = pmes;
}

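/* Try to add a setup with the next coarser PME grid, and hence larger
 * Coulomb cut-off, to pmes->setup. Returns FALSE when no valid coarser grid
 * exists, i.e. when a grid dimension would drop to 2*pme_order or less.
 * Interpretation of the coefficient scaling below (added comment, not a
 * statement by the original authors): the new Ewald coefficient is chosen
 * such that coeff*rcut matches that of setup[0], which keeps the relative
 * real-space truncation error at the cut-off approximately unchanged.
 */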
static gmx_bool switch_pme_increase_cutoff(pme_switch_t pmes,int pme_order)
{
    pme_setup_t *set;
    real fac,sp;
    int d;

    /* Try to add a new setup with next larger cut-off to the list */
    pmes->n++;
    srenew(pmes->setup,pmes->n);
    set = &pmes->setup[pmes->n-1];
    set->pmedata = NULL;

    fac = 1;
    do
    {
        fac *= 1.01;
        clear_ivec(set->grid);
        sp = calc_grid(NULL,pmes->box_start,
                       fac*pmes->setup[pmes->cur].spacing,
                       &set->grid[XX],
                       &set->grid[YY],
                       &set->grid[ZZ]);

        /* In parallel we can't have grids smaller than 2*pme_order,
         * and we would anyhow not gain much speed at these grid sizes.
         */
        for(d=0; d<DIM; d++)
        {
            if (set->grid[d] <= 2*pme_order)
            {
                pmes->n--;

                return FALSE;
            }
        }
    }
    while (sp <= 1.001*pmes->setup[pmes->cur].spacing);

    set->rcut    = pmes->cut_spacing*sp;
    set->rlist   = set->rcut + pmes->rbuf;
    set->spacing = sp;
    /* The grid efficiency is the size wrt a grid with uniform x/y/z spacing */
    set->grid_eff = 1;
    for(d=0; d<DIM; d++)
    {
        set->grid_eff *= (set->grid[d]*sp)/norm(pmes->box_start[d]);
    }
    /* The Ewald coefficient is inversely proportional to the cut-off */
    set->coeff   = pmes->setup[0].coeff*pmes->setup[0].rcut/set->rcut;

    set->count   = 0;
    set->cycles  = 0;

    if (debug)
    {
        fprintf(debug,"PME switch grid %d %d %d, cutoff %f\n",
                set->grid[XX],set->grid[YY],set->grid[ZZ],set->rcut);
    }

    return TRUE;
}

static void print_grid(FILE *fp_err,FILE *fp_log,
                       const char *pre,
                       const char *desc,
                       const pme_setup_t *set,
                       double cycles)
{
    char buf[STRLEN],buft[STRLEN];

    if (cycles >= 0)
    {
        sprintf(buft,": %.1f M-cycles",cycles*1e-6);
    }
    else
    {
        buft[0] = '\0';
    }
    sprintf(buf,"%-11s%10s pme grid %d %d %d, cutoff %.3f%s",
            pre,
            desc,set->grid[XX],set->grid[YY],set->grid[ZZ],set->rcut,
            buft);
    if (fp_err != NULL)
    {
        fprintf(fp_err,"%s\n",buf);
    }
    if (fp_log != NULL)
    {
        fprintf(fp_log,"%s\n",buf);
    }
}

static void switch_to_stage1(pme_switch_t pmes)
{
    pmes->start = 0;
    while (pmes->start+1 < pmes->n &&
           (pmes->setup[pmes->start].count == 0 ||
            pmes->setup[pmes->start].cycles >
            pmes->setup[pmes->fastest].cycles*PMES_SLOW_FAC))
    {
        pmes->start++;
    }
    while (pmes->start > 0 && pmes->setup[pmes->start-1].cycles == 0)
    {
        pmes->start--;
    }

    pmes->end = pmes->n;
    if (pmes->setup[pmes->end-1].count > 0 &&
        pmes->setup[pmes->end-1].cycles >
        pmes->setup[pmes->fastest].cycles*PMES_SLOW_FAC)
    {
        pmes->end--;
    }

    pmes->stage = 1;

    /* Start at start; 1 will be added immediately after returning */
    pmes->cur = pmes->start - 1;
}

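/* Process the cycle count of the last timed MD step and possibly switch to
 * a different PME grid + Coulomb cut-off setup. Summary of the scheme below
 * (descriptive comment added for clarity): in stage 0 the cut-off is
 * increased step by step until a setup is clearly slower than the fastest
 * one found, or a cut-off/DD limit is reached; in the remaining stage(s)
 * only the competitive setups are re-timed; when the last stage completes,
 * the fastest setup is selected and applied. Returns FALSE when tuning has
 * already completed.
 */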
gmx_bool switch_pme(pme_switch_t pmes,
                    t_commrec *cr,
                    FILE *fp_err,
                    FILE *fp_log,
                    t_inputrec *ir,
                    t_state *state,
                    double cycles,
                    interaction_const_t *ic,
                    nonbonded_verlet_t *nbv,
                    gmx_pme_t *pmedata,
                    int step)
{
    gmx_bool OK;
    pme_setup_t *set;
    double cycles_fast;
    char buf[STRLEN];

    if (pmes->stage == pmes->nstage)
    {
        return FALSE;
    }

    if (PAR(cr))
    {
        gmx_sumd(1,&cycles,cr);
        cycles /= cr->nnodes;
    }

    set = &pmes->setup[pmes->cur];

    set->count++;
    if (set->count % 2 == 1)
    {
        /* Skip the first cycle, because the first step after a switch
         * is much slower due to allocation and/or caching effects.
         */
        return TRUE;
    }

    sprintf(buf, "step %4d: ", step);
    print_grid(fp_err,fp_log,buf,"timed with",set,cycles);

    if (set->count <= 2)
    {
        set->cycles = cycles;
    }
    else
    {
        if (cycles*PMES_ACCEL_TOL < set->cycles &&
            pmes->stage == pmes->nstage - 1)
        {
            /* The performance went up a lot (due to e.g. DD load balancing).
             * Add a stage, keep the minima, but rescan all setups.
             */
            pmes->nstage++;

            if (debug)
            {
                fprintf(debug,"The performance for grid %d %d %d went from %.3f to %.1f M-cycles, this is more than %f\n"
                        "Increasing the number of stages to %d"
                        " and ignoring the previous performance\n",
                        set->grid[XX],set->grid[YY],set->grid[ZZ],
                        cycles*1e-6,set->cycles*1e-6,PMES_ACCEL_TOL,
                        pmes->nstage);
            }
        }
        set->cycles = min(set->cycles,cycles);
    }

    if (set->cycles < pmes->setup[pmes->fastest].cycles)
    {
        pmes->fastest = pmes->cur;
    }
    cycles_fast = pmes->setup[pmes->fastest].cycles;

    /* Check in stage 0 if we should stop scanning grids.
     * Stop when the time is more than a factor PMES_SLOW_FAC longer than the fastest.
     */
    if (pmes->stage == 0 && pmes->cur > 0 &&
        cycles > pmes->setup[pmes->fastest].cycles*PMES_SLOW_FAC)
    {
        pmes->n = pmes->cur + 1;
        /* Done with scanning, go to stage 1 */
        switch_to_stage1(pmes);
    }

    if (pmes->stage == 0)
    {
        int gridsize_start;

        gridsize_start = set->grid[XX]*set->grid[YY]*set->grid[ZZ];

        do
        {
            if (pmes->cur+1 < pmes->n)
            {
                /* We had already generated the next setup */
                OK = TRUE;
            }
            else
            {
                /* Find the next setup */
                OK = switch_pme_increase_cutoff(pmes,ir->pme_order);
            }

            if (OK && ir->ePBC != epbcNONE)
            {
                OK = (sqr(pmes->setup[pmes->cur+1].rlist)
                      <= max_cutoff2(ir->ePBC,state->box));
            }

            if (OK)
            {
                pmes->cur++;

                if (DOMAINDECOMP(cr))
                {
                    OK = change_dd_cutoff(cr,state,ir,
                                          pmes->setup[pmes->cur].rlist);
                    if (!OK)
                    {
                        /* Failed: do not use this setup */
                        pmes->cur--;
                    }
                }
            }
            if (!OK)
            {
                /* We hit the upper limit for the cut-off;
                 * the setup list should not go beyond cur.
                 */
                pmes->n = pmes->cur + 1;
                /* Switch to the next stage */
                switch_to_stage1(pmes);
            }
        }
        while (OK &&
               !(pmes->setup[pmes->cur].grid[XX]*
                 pmes->setup[pmes->cur].grid[YY]*
                 pmes->setup[pmes->cur].grid[ZZ] <
                 gridsize_start*PMES_GRID_SCALE_FAC
                 &&
                 pmes->setup[pmes->cur].grid_eff <
                 pmes->setup[pmes->cur-1].grid_eff*PMES_GRID_EFF_FAC));
    }

    if (pmes->stage > 0 && pmes->end == 1)
    {
        pmes->cur = 0;
        pmes->stage = pmes->nstage;
    }
    else if (pmes->stage > 0 && pmes->end > 1)
    {
        /* If stage = nstage-1:
         *   scan over all setups, re-running only those setups
         *   which are not much slower than the fastest
         * else:
         *   use the next setup
         */
        do
        {
            pmes->cur++;
            if (pmes->cur == pmes->end)
            {
                pmes->stage++;
                pmes->cur = pmes->start;
            }
        }
        while (pmes->stage == pmes->nstage - 1 &&
               pmes->setup[pmes->cur].count > 0 &&
               pmes->setup[pmes->cur].cycles > cycles_fast*PMES_SLOW_FAC);

        if (pmes->stage == pmes->nstage)
        {
            /* We are done optimizing, use the fastest setup we found */
            pmes->cur = pmes->fastest;
        }
    }

    if (DOMAINDECOMP(cr) && pmes->stage > 0)
    {
        OK = change_dd_cutoff(cr,state,ir,pmes->setup[pmes->cur].rlist);
        if (!OK)
        {
            /* Failsafe solution */
            if (pmes->cur > 1 && pmes->stage == pmes->nstage)
            {
                pmes->stage--;
            }
            pmes->fastest = 0;
            pmes->start   = 0;
            pmes->end     = pmes->cur;
            pmes->cur     = pmes->start;
        }
    }

    /* Change the Coulomb cut-off and the PME grid */

    set = &pmes->setup[pmes->cur];

    ic->rcoulomb   = set->rcut;
    ic->rlist      = set->rlist;
    ic->ewaldcoeff = set->coeff;

    if (nbv->grp[0].kernel_type == nbk8x8x8_CUDA)
    {
        nbnxn_cuda_pmetune_update_param(nbv->cu_nbv,ic);
    }
    else
    {
        init_interaction_const_tables(NULL,ic,nbv->grp[0].kernel_type);
    }

    if (nbv->ngrp > 1)
    {
        init_interaction_const_tables(NULL,ic,nbv->grp[1].kernel_type);
    }

    if (cr->duty & DUTY_PME)
    {
        if (pmes->setup[pmes->cur].pmedata == NULL)
        {
            /* Generate a new PME data structure,
             * copying part of the old pointers.
             */
            gmx_pme_reinit(&set->pmedata,
                           cr,pmes->setup[0].pmedata,ir,
                           set->grid);
        }
        *pmedata = set->pmedata;
    }
    else
    {
        /* Tell our PME-only node to switch grid */
        gmx_pme_send_switch(cr, set->grid, set->coeff);
    }

    if (debug)
    {
        print_grid(NULL,debug,"","switched to",set,-1);
    }

    if (pmes->stage == pmes->nstage)
    {
        print_grid(fp_err,fp_log,"","optimal",set,-1);
    }

    return TRUE;
}

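/* Allow n additional tuning stages, so that the setups are timed again;
 * descriptive comment added for clarity: the function itself only raises
 * the maximum stage count.
 */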
void restart_switch_pme(pme_switch_t pmes, int n)
{
    pmes->nstage += n;
}
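
/* Minimal usage sketch (illustrative only; the real caller is the mdrun
 * machinery, which may pass different file pointers and arguments):
 *
 *     pme_switch_t pmes;
 *
 *     switch_pme_init(&pmes, ir, state->box, ic, pmedata);
 *
 *     // Each MD step while tuning, with "cycles" the cycle count measured
 *     // for the previous step:
 *     bTuning = switch_pme(pmes, cr, stderr, fplog, ir, state, cycles,
 *                          ic, nbv, &pmedata, step);
 *     // switch_pme returns FALSE once the optimal setup has been selected
 *     // and applied.
 */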