src/gromacs/ewald/pme-spread.c
/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 * Copyright (c) 2001-2004, The GROMACS development team.
 * Copyright (c) 2013,2014,2015, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
#include "gmxpre.h"

#include "pme-spread.h"

#include "config.h"

#include <assert.h>

#include "gromacs/ewald/pme-internal.h"
#include "gromacs/ewald/pme-simd.h"
#include "gromacs/ewald/pme-spline-work.h"
#include "gromacs/legacyheaders/macros.h"
#include "gromacs/utility/smalloc.h"

static void calc_interpolation_idx(struct gmx_pme_t *pme, pme_atomcomm_t *atc,
                                   int start, int grid_index, int end, int thread)
{
    int             i;
    int            *idxptr, tix, tiy, tiz;
    real           *xptr, *fptr, tx, ty, tz;
    real            rxx, ryx, ryy, rzx, rzy, rzz;
    int             nx, ny, nz;
    int             start_ix, start_iy, start_iz;
    int            *g2tx, *g2ty, *g2tz;
    gmx_bool        bThreads;
    int            *thread_idx = NULL;
    thread_plist_t *tpl        = NULL;
    int            *tpl_n      = NULL;
    int             thread_i;

    nx  = pme->nkx;
    ny  = pme->nky;
    nz  = pme->nkz;

    start_ix = pme->pmegrid_start_ix;
    start_iy = pme->pmegrid_start_iy;
    start_iz = pme->pmegrid_start_iz;

    rxx = pme->recipbox[XX][XX];
    ryx = pme->recipbox[YY][XX];
    ryy = pme->recipbox[YY][YY];
    rzx = pme->recipbox[ZZ][XX];
    rzy = pme->recipbox[ZZ][YY];
    rzz = pme->recipbox[ZZ][ZZ];

    g2tx = pme->pmegrid[grid_index].g2t[XX];
    g2ty = pme->pmegrid[grid_index].g2t[YY];
    g2tz = pme->pmegrid[grid_index].g2t[ZZ];

    bThreads = (atc->nthread > 1);
    if (bThreads)
    {
        thread_idx = atc->thread_idx;

        tpl   = &atc->thread_plist[thread];
        tpl_n = tpl->n;
        for (i = 0; i < atc->nthread; i++)
        {
            tpl_n[i] = 0;
        }
    }

    for (i = start; i < end; i++)
    {
        xptr   = atc->x[i];
        idxptr = atc->idx[i];
        fptr   = atc->fractx[i];

        /* Fractional coordinates along box vectors, add 2.0 to make 100% sure we are positive for triclinic boxes */
        tx = nx * ( xptr[XX] * rxx + xptr[YY] * ryx + xptr[ZZ] * rzx + 2.0 );
        ty = ny * (                  xptr[YY] * ryy + xptr[ZZ] * rzy + 2.0 );
        tz = nz * (                                   xptr[ZZ] * rzz + 2.0 );

        tix = (int)(tx);
        tiy = (int)(ty);
        tiz = (int)(tz);

        /* Because decomposition only occurs in x and y,
         * we never have a fraction correction in z.
         */
        fptr[XX] = tx - tix + pme->fshx[tix];
        fptr[YY] = ty - tiy + pme->fshy[tiy];
        fptr[ZZ] = tz - tiz;

        idxptr[XX] = pme->nnx[tix];
        idxptr[YY] = pme->nny[tiy];
        idxptr[ZZ] = pme->nnz[tiz];

#ifdef DEBUG
        range_check(idxptr[XX], 0, pme->pmegrid_nx);
        range_check(idxptr[YY], 0, pme->pmegrid_ny);
        range_check(idxptr[ZZ], 0, pme->pmegrid_nz);
#endif

        if (bThreads)
        {
            thread_i      = g2tx[idxptr[XX]] + g2ty[idxptr[YY]] + g2tz[idxptr[ZZ]];
            thread_idx[i] = thread_i;
            tpl_n[thread_i]++;
        }
    }

    if (bThreads)
    {
        /* Make a list of particle indices sorted on thread */

        /* Get the cumulative count */
        for (i = 1; i < atc->nthread; i++)
        {
            tpl_n[i] += tpl_n[i-1];
        }
        /* The current implementation distributes particles equally
         * over the threads, so we could actually allocate for that
         * in pme_realloc_atomcomm_things.
         */
        if (tpl_n[atc->nthread-1] > tpl->nalloc)
        {
            tpl->nalloc = over_alloc_large(tpl_n[atc->nthread-1]);
            srenew(tpl->i, tpl->nalloc);
        }
        /* Set tpl_n to the cumulative start */
        for (i = atc->nthread-1; i >= 1; i--)
        {
            tpl_n[i] = tpl_n[i-1];
        }
        tpl_n[0] = 0;

        /* Fill our thread local array with indices sorted on thread */
        for (i = start; i < end; i++)
        {
            tpl->i[tpl_n[atc->thread_idx[i]]++] = i;
        }
        /* Now tpl_n contains the cumulative count again */
    }
}
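/* For reference: the thread-sorted index built above is a counting sort on
 * the destination thread (per-thread counts, then cumulative start offsets,
 * then a scatter pass). A minimal, self-contained sketch of the same pattern
 * follows; the names are hypothetical and the block is excluded from
 * compilation.
 */
#if 0
static void counting_sort_by_bucket(const int *bucket_of, int n,
                                    int nbuckets, int *count, int *order)
{
    /* count[] needs room for nbuckets entries; it first holds per-bucket
     * counts and is then converted to start offsets.
     */
    int i, b, sum, tmp;

    for (b = 0; b < nbuckets; b++)
    {
        count[b] = 0;
    }
    for (i = 0; i < n; i++)
    {
        count[bucket_of[i]]++;
    }
    sum = 0;
    for (b = 0; b < nbuckets; b++)
    {
        tmp      = count[b];
        count[b] = sum;   /* start offset of bucket b */
        sum     += tmp;
    }
    for (i = 0; i < n; i++)
    {
        order[count[bucket_of[i]]++] = i;
    }
}
#endif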

static void make_thread_local_ind(pme_atomcomm_t *atc,
                                  int thread, splinedata_t *spline)
{
    int             n, t, i, start, end;
    thread_plist_t *tpl;

    /* Combine the indices made by each thread into one index */

    n     = 0;
    start = 0;
    for (t = 0; t < atc->nthread; t++)
    {
        tpl = &atc->thread_plist[t];
        /* Copy our part (start - end) from the list of thread t */
        if (thread > 0)
        {
            start = tpl->n[thread-1];
        }
        end = tpl->n[thread];
        for (i = start; i < end; i++)
        {
            spline->ind[n++] = tpl->i[i];
        }
    }

    spline->n = n;
}

/* Macro to force loop unrolling by fixing order.
 * This gives a significant performance gain.
 */
#define CALC_SPLINE(order)                     \
    {                                              \
        int  j, k, l;                                 \
        real dr, div;                               \
        real data[PME_ORDER_MAX];                  \
        real ddata[PME_ORDER_MAX];                 \
                                               \
        for (j = 0; (j < DIM); j++)                     \
        {                                          \
            dr  = xptr[j];                         \
                                               \
            /* dr is relative offset from lower cell limit */ \
            data[order-1] = 0;                     \
            data[1]       = dr;                          \
            data[0]       = 1 - dr;                      \
                                               \
            for (k = 3; (k < order); k++)               \
            {                                      \
                div       = 1.0/(k - 1.0);               \
                data[k-1] = div*dr*data[k-2];      \
                for (l = 1; (l < (k-1)); l++)           \
                {                                  \
                    data[k-l-1] = div*((dr+l)*data[k-l-2]+(k-l-dr)* \
                                       data[k-l-1]);                \
                }                                  \
                data[0] = div*(1-dr)*data[0];      \
            }                                      \
            /* differentiate */                    \
            ddata[0] = -data[0];                   \
            for (k = 1; (k < order); k++)               \
            {                                      \
                ddata[k] = data[k-1] - data[k];    \
            }                                      \
                                               \
            div           = 1.0/(order - 1);                 \
            data[order-1] = div*dr*data[order-2];  \
            for (l = 1; (l < (order-1)); l++)           \
            {                                      \
                data[order-l-1] = div*((dr+l)*data[order-l-2]+    \
                                       (order-l-dr)*data[order-l-1]); \
            }                                      \
            data[0] = div*(1 - dr)*data[0];        \
                                               \
            for (k = 0; k < order; k++)                 \
            {                                      \
                theta[j][i*order+k]  = data[k];    \
                dtheta[j][i*order+k] = ddata[k];   \
            }                                      \
        }                                          \
    }
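
/* For reference: CALC_SPLINE is the standard recursion for uniform cardinal
 * B-splines, raising the order one step at a time; the derivative weights are
 * differences of the weights one order lower. Worked example for order 4 and
 * dr = 0.5: the order-3 pass gives {0.125, 0.75, 0.125}, the derivative
 * weights are {-0.125, -0.625, 0.625, 0.125} (summing to 0), and the final
 * pass gives {1/48, 23/48, 23/48, 1/48} (summing to 1). The sketch below is a
 * plain-function transliteration for fixed order 4 with hypothetical names,
 * excluded from compilation.
 */
#if 0
static void bspline4_weights(real dr, real w[4], real dw[4])
{
    int  k, l;
    real div;

    /* order 2 */
    w[0] = 1 - dr;
    w[1] = dr;
    w[2] = 0;
    w[3] = 0;
    /* raise to order 3 */
    div  = 0.5;
    w[2] = div*dr*w[1];
    w[1] = div*((dr + 1)*w[0] + (2 - dr)*w[1]);
    w[0] = div*(1 - dr)*w[0];
    /* derivative weights of the order-4 spline from the order-3 weights */
    dw[0] = -w[0];
    for (k = 1; k < 4; k++)
    {
        dw[k] = w[k-1] - w[k];
    }
    /* raise to order 4 */
    div  = 1.0/3.0;
    w[3] = div*dr*w[2];
    for (l = 1; l < 3; l++)
    {
        w[4-l-1] = div*((dr + l)*w[4-l-2] + (4 - l - dr)*w[4-l-1]);
    }
    w[0] = div*(1 - dr)*w[0];
}
#endif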

static void make_bsplines(splinevec theta, splinevec dtheta, int order,
                          rvec fractx[], int nr, int ind[], real coefficient[],
                          gmx_bool bDoSplines)
{
    /* construct splines for local atoms */
    int   i, ii;
    real *xptr;

    for (i = 0; i < nr; i++)
    {
        /* With free energy we do not use the coefficient check.
         * In most cases this will be more efficient than calling make_bsplines
         * twice, since usually more than half the particles have non-zero coefficients.
         */
        ii = ind[i];
        if (bDoSplines || coefficient[ii] != 0.0)
        {
            xptr = fractx[ii];
            switch (order)
            {
                case 4:  CALC_SPLINE(4);     break;
                case 5:  CALC_SPLINE(5);     break;
                default: CALC_SPLINE(order); break;
            }
        }
    }
}

/* This has to be a macro to enable full compiler optimization with xlC (and probably others too) */
#define DO_BSPLINE(order)                            \
    for (ithx = 0; (ithx < order); ithx++)                    \
    {                                                    \
        index_x = (i0+ithx)*pny*pnz;                     \
        valx    = coefficient*thx[ithx];                          \
                                                     \
        for (ithy = 0; (ithy < order); ithy++)                \
        {                                                \
            valxy    = valx*thy[ithy];                   \
            index_xy = index_x+(j0+ithy)*pnz;            \
                                                     \
            for (ithz = 0; (ithz < order); ithz++)            \
            {                                            \
                index_xyz        = index_xy+(k0+ithz);   \
                grid[index_xyz] += valxy*thz[ithz];      \
            }                                            \
        }                                                \
    }
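
/* For reference: DO_BSPLINE scatters one atom onto order^3 grid points
 * (64 for the default order 4), adding coefficient*thx[i]*thy[j]*thz[k] at
 * each point. The grid is stored row-major, so point (x, y, z) of a
 * pnx*pny*pnz grid lives at flat index (x*pny + y)*pnz + z; the macro merely
 * hoists the x and x,y partial index and weight products out of the inner
 * loops. Hypothetical helper below, excluded from compilation.
 */
#if 0
static int grid_flat_index(int x, int y, int z, int pny, int pnz)
{
    return (x*pny + y)*pnz + z;
}
#endif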


static void spread_coefficients_bsplines_thread(pmegrid_t                         *pmegrid,
                                                pme_atomcomm_t                    *atc,
                                                splinedata_t                      *spline,
                                                struct pme_spline_work gmx_unused *work)
{

    /* spread coefficients from home atoms to local grid */
    real          *grid;
    pme_overlap_t *ol;
    int            b, i, nn, n, ithx, ithy, ithz, i0, j0, k0;
    int       *    idxptr;
    int            order, norder, index_x, index_xy, index_xyz;
    real           valx, valxy, coefficient;
    real          *thx, *thy, *thz;
    int            localsize, bndsize;
    int            pnx, pny, pnz, ndatatot;
    int            offx, offy, offz;

#if defined PME_SIMD4_SPREAD_GATHER && !defined PME_SIMD4_UNALIGNED
    real           thz_buffer[GMX_SIMD4_WIDTH*3], *thz_aligned;

    thz_aligned = gmx_simd4_align_r(thz_buffer);
#endif

    pnx = pmegrid->s[XX];
    pny = pmegrid->s[YY];
    pnz = pmegrid->s[ZZ];

    offx = pmegrid->offset[XX];
    offy = pmegrid->offset[YY];
    offz = pmegrid->offset[ZZ];

    ndatatot = pnx*pny*pnz;
    grid     = pmegrid->grid;
    for (i = 0; i < ndatatot; i++)
    {
        grid[i] = 0;
    }

    order = pmegrid->order;

    for (nn = 0; nn < spline->n; nn++)
    {
        n           = spline->ind[nn];
        coefficient = atc->coefficient[n];

        if (coefficient != 0)
        {
            idxptr = atc->idx[n];
            norder = nn*order;

            i0   = idxptr[XX] - offx;
            j0   = idxptr[YY] - offy;
            k0   = idxptr[ZZ] - offz;

            thx = spline->theta[XX] + norder;
            thy = spline->theta[YY] + norder;
            thz = spline->theta[ZZ] + norder;

            switch (order)
            {
                case 4:
#ifdef PME_SIMD4_SPREAD_GATHER
#ifdef PME_SIMD4_UNALIGNED
#define PME_SPREAD_SIMD4_ORDER4
#else
#define PME_SPREAD_SIMD4_ALIGNED
#define PME_ORDER 4
#endif
#include "gromacs/ewald/pme-simd4.h"
#else
                    DO_BSPLINE(4);
#endif
                    break;
                case 5:
#ifdef PME_SIMD4_SPREAD_GATHER
#define PME_SPREAD_SIMD4_ALIGNED
#define PME_ORDER 5
#include "gromacs/ewald/pme-simd4.h"
#else
                    DO_BSPLINE(5);
#endif
                    break;
                default:
                    DO_BSPLINE(order);
                    break;
            }
        }
    }
}

static void copy_local_grid(struct gmx_pme_t *pme, pmegrids_t *pmegrids,
                            int grid_index, int thread, real *fftgrid)
{
    ivec local_fft_ndata, local_fft_offset, local_fft_size;
    int  fft_my, fft_mz;
    int  nsx, nsy, nsz;
    ivec nf;
    int  offx, offy, offz, x, y, z, i0, i0t;
    int  d;
    pmegrid_t *pmegrid;
    real *grid_th;

    gmx_parallel_3dfft_real_limits(pme->pfft_setup[grid_index],
                                   local_fft_ndata,
                                   local_fft_offset,
                                   local_fft_size);
    fft_my = local_fft_size[YY];
    fft_mz = local_fft_size[ZZ];

    pmegrid = &pmegrids->grid_th[thread];

    nsx = pmegrid->s[XX];
    nsy = pmegrid->s[YY];
    nsz = pmegrid->s[ZZ];

    for (d = 0; d < DIM; d++)
    {
        nf[d] = min(pmegrid->n[d] - (pmegrid->order - 1),
                    local_fft_ndata[d] - pmegrid->offset[d]);
    }

    offx = pmegrid->offset[XX];
    offy = pmegrid->offset[YY];
    offz = pmegrid->offset[ZZ];

    /* Directly copy the non-overlapping parts of the local grids.
     * This also initializes the full grid.
     */
    grid_th = pmegrid->grid;
    for (x = 0; x < nf[XX]; x++)
    {
        for (y = 0; y < nf[YY]; y++)
        {
            i0  = ((offx + x)*fft_my + (offy + y))*fft_mz + offz;
            i0t = (x*nsy + y)*nsz;
            for (z = 0; z < nf[ZZ]; z++)
            {
                fftgrid[i0+z] = grid_th[i0t+z];
            }
        }
    }
}
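
/* For reference: nf[d] above selects the interior of the thread-local grid,
 * i.e. the part not shared with the next thread in that dimension. With
 * order 4, a thread-local grid of, say, 20 points in x owns the first
 * 20 - (4 - 1) = 17 lines; the remaining order-1 lines overlap the
 * neighbouring region and are summed into the FFT grid afterwards by
 * reduce_threadgrid_overlap() (or sent to another rank via sum_fftgrid_dd()).
 * The numbers here are illustrative only.
 */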

static void
reduce_threadgrid_overlap(struct gmx_pme_t *pme,
                          const pmegrids_t *pmegrids, int thread,
                          real *fftgrid, real *commbuf_x, real *commbuf_y,
                          int grid_index)
{
    ivec local_fft_ndata, local_fft_offset, local_fft_size;
    int  fft_nx, fft_ny, fft_nz;
    int  fft_my, fft_mz;
    int  buf_my = -1;
    int  nsx, nsy, nsz;
    ivec localcopy_end, commcopy_end;
    int  offx, offy, offz, x, y, z, i0, i0t;
    int  sx, sy, sz, fx, fy, fz, tx1, ty1, tz1, ox, oy, oz;
    gmx_bool bClearBufX, bClearBufY, bClearBufXY, bClearBuf;
    gmx_bool bCommX, bCommY;
    int  d;
    int  thread_f;
    const pmegrid_t *pmegrid, *pmegrid_g, *pmegrid_f;
    const real *grid_th;
    real *commbuf = NULL;

    gmx_parallel_3dfft_real_limits(pme->pfft_setup[grid_index],
                                   local_fft_ndata,
                                   local_fft_offset,
                                   local_fft_size);
    fft_nx = local_fft_ndata[XX];
    fft_ny = local_fft_ndata[YY];
    fft_nz = local_fft_ndata[ZZ];

    fft_my = local_fft_size[YY];
    fft_mz = local_fft_size[ZZ];

    /* This routine is called when all threads have finished spreading.
     * Here each thread sums grid contributions calculated by other threads
     * to the thread local grid volume.
     * To minimize the number of grid copying operations,
     * this routine sums immediately from the pmegrid to the fftgrid.
     */

    /* Determine which part of the full node grid we should operate on;
     * this is our thread local part of the full grid.
     */
    pmegrid = &pmegrids->grid_th[thread];

    for (d = 0; d < DIM; d++)
    {
        /* Determine up to where our thread needs to copy from the
         * thread-local charge spreading grid to the rank-local FFT grid.
         * This is up to our spreading grid end minus order-1 and
         * not beyond the local FFT grid.
         */
        localcopy_end[d] =
            min(pmegrid->offset[d] + pmegrid->n[d] - (pmegrid->order - 1),
                local_fft_ndata[d]);

        /* Determine up to where our thread needs to copy from the
         * thread-local charge spreading grid to the communication buffer.
         * Note: only relevant with communication, ignored otherwise.
         */
        commcopy_end[d]  = localcopy_end[d];
        if (pmegrid->ci[d] == pmegrids->nc[d] - 1)
        {
            /* The last thread should copy up to the last pme grid line.
             * When the rank-local FFT grid is narrower than pme-order,
             * we need the max below to ensure copying of all data.
             */
            commcopy_end[d] = max(commcopy_end[d], pme->pme_order);
        }
    }

    offx = pmegrid->offset[XX];
    offy = pmegrid->offset[YY];
    offz = pmegrid->offset[ZZ];


    bClearBufX  = TRUE;
    bClearBufY  = TRUE;
    bClearBufXY = TRUE;

    /* Now loop over all the thread data blocks that contribute
     * to the grid region we (our thread) are operating on.
     */
    /* Note that fft_nx/y is equal to the number of grid points
     * between the first point of our node grid and the one of the next node.
     */
    for (sx = 0; sx >= -pmegrids->nthread_comm[XX]; sx--)
    {
        fx     = pmegrid->ci[XX] + sx;
        ox     = 0;
        bCommX = FALSE;
        if (fx < 0)
        {
            fx    += pmegrids->nc[XX];
            ox    -= fft_nx;
            bCommX = (pme->nnodes_major > 1);
        }
        pmegrid_g = &pmegrids->grid_th[fx*pmegrids->nc[YY]*pmegrids->nc[ZZ]];
        ox       += pmegrid_g->offset[XX];
        /* Determine the end of our part of the source grid.
         * Use our thread local source grid and target grid part
         */
        tx1 = min(ox + pmegrid_g->n[XX],
                  !bCommX ? localcopy_end[XX] : commcopy_end[XX]);

        for (sy = 0; sy >= -pmegrids->nthread_comm[YY]; sy--)
        {
            fy     = pmegrid->ci[YY] + sy;
            oy     = 0;
            bCommY = FALSE;
            if (fy < 0)
            {
                fy    += pmegrids->nc[YY];
                oy    -= fft_ny;
                bCommY = (pme->nnodes_minor > 1);
            }
            pmegrid_g = &pmegrids->grid_th[fy*pmegrids->nc[ZZ]];
            oy       += pmegrid_g->offset[YY];
            /* Determine the end of our part of the source grid.
             * Use our thread local source grid and target grid part
             */
            ty1 = min(oy + pmegrid_g->n[YY],
                      !bCommY ? localcopy_end[YY] : commcopy_end[YY]);

            for (sz = 0; sz >= -pmegrids->nthread_comm[ZZ]; sz--)
            {
                fz = pmegrid->ci[ZZ] + sz;
                oz = 0;
                if (fz < 0)
                {
                    fz += pmegrids->nc[ZZ];
                    oz -= fft_nz;
                }
                pmegrid_g = &pmegrids->grid_th[fz];
                oz       += pmegrid_g->offset[ZZ];
                tz1       = min(oz + pmegrid_g->n[ZZ], localcopy_end[ZZ]);

                if (sx == 0 && sy == 0 && sz == 0)
                {
                    /* We have already added our local contribution
                     * before calling this routine, so skip it here.
                     */
                    continue;
                }

                thread_f = (fx*pmegrids->nc[YY] + fy)*pmegrids->nc[ZZ] + fz;

                pmegrid_f = &pmegrids->grid_th[thread_f];

                grid_th = pmegrid_f->grid;

                nsx = pmegrid_f->s[XX];
                nsy = pmegrid_f->s[YY];
                nsz = pmegrid_f->s[ZZ];

#ifdef DEBUG_PME_REDUCE
                printf("n%d t%d add %d  %2d %2d %2d  %2d %2d %2d  %2d-%2d %2d-%2d, %2d-%2d %2d-%2d, %2d-%2d %2d-%2d\n",
                       pme->nodeid, thread, thread_f,
                       pme->pmegrid_start_ix,
                       pme->pmegrid_start_iy,
                       pme->pmegrid_start_iz,
                       sx, sy, sz,
                       offx-ox, tx1-ox, offx, tx1,
                       offy-oy, ty1-oy, offy, ty1,
                       offz-oz, tz1-oz, offz, tz1);
#endif

                if (!(bCommX || bCommY))
                {
                    /* Copy from the thread local grid to the node grid */
                    for (x = offx; x < tx1; x++)
                    {
                        for (y = offy; y < ty1; y++)
                        {
                            i0  = (x*fft_my + y)*fft_mz;
                            i0t = ((x - ox)*nsy + (y - oy))*nsz - oz;
                            for (z = offz; z < tz1; z++)
                            {
                                fftgrid[i0+z] += grid_th[i0t+z];
                            }
                        }
                    }
                }
                else
                {
                    /* The order of this conditional decides
                     * where the corner volume gets stored with x+y decomp.
                     */
                    if (bCommY)
                    {
                        commbuf = commbuf_y;
                        /* The y-size of the communication buffer is set by
                         * the overlap of the grid part of our local slab
                         * with the part starting at the next slab.
                         */
                        buf_my  =
                            pme->overlap[1].s2g1[pme->nodeid_minor] -
                            pme->overlap[1].s2g0[pme->nodeid_minor+1];
                        if (bCommX)
                        {
                            /* We index commbuf modulo the local grid size */
                            commbuf += buf_my*fft_nx*fft_nz;

                            bClearBuf   = bClearBufXY;
                            bClearBufXY = FALSE;
                        }
                        else
                        {
                            bClearBuf  = bClearBufY;
                            bClearBufY = FALSE;
                        }
                    }
                    else
                    {
                        commbuf    = commbuf_x;
                        buf_my     = fft_ny;
                        bClearBuf  = bClearBufX;
                        bClearBufX = FALSE;
                    }

                    /* Copy to the communication buffer */
                    for (x = offx; x < tx1; x++)
                    {
                        for (y = offy; y < ty1; y++)
                        {
                            i0  = (x*buf_my + y)*fft_nz;
                            i0t = ((x - ox)*nsy + (y - oy))*nsz - oz;

                            if (bClearBuf)
                            {
                                /* First access of commbuf, initialize it */
                                for (z = offz; z < tz1; z++)
                                {
                                    commbuf[i0+z]  = grid_th[i0t+z];
                                }
                            }
                            else
                            {
                                for (z = offz; z < tz1; z++)
                                {
                                    commbuf[i0+z] += grid_th[i0t+z];
                                }
                            }
                        }
                    }
                }
            }
        }
    }
}
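
/* For reference: in the loops above a negative neighbour cell index means the
 * contribution comes from the thread block that wraps around the start of the
 * rank-local grid, so the cell index is wrapped and the grid offset is shifted
 * by the local FFT extent; with more than one rank in that dimension the data
 * goes to a communication buffer instead of the local FFT grid. A minimal
 * sketch of just the wrap (hypothetical names, excluded from compilation):
 */
#if 0
static void wrap_thread_cell(int *cell, int *offset, int ncells, int fft_n)
{
    if (*cell < 0)
    {
        *cell   += ncells;   /* periodic wrap within the thread grid */
        *offset -= fft_n;    /* shift by the rank-local FFT extent   */
    }
}
#endif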


static void sum_fftgrid_dd(struct gmx_pme_t *pme, real *fftgrid, int grid_index)
{
    ivec local_fft_ndata, local_fft_offset, local_fft_size;
    pme_overlap_t *overlap;
    int  send_index0, send_nindex;
    int  recv_nindex;
#ifdef GMX_MPI
    MPI_Status stat;
#endif
    int  send_size_y, recv_size_y;
    int  ipulse, send_id, recv_id, datasize, gridsize, size_yx;
    real *sendptr, *recvptr;
    int  x, y, z, indg, indb;

    /* Note that this routine is only used for forward communication.
     * Since the force gathering, unlike the coefficient spreading,
     * can be trivially parallelized over the particles,
     * the backwards process is much simpler and can use the "old"
     * communication setup.
     */

    gmx_parallel_3dfft_real_limits(pme->pfft_setup[grid_index],
                                   local_fft_ndata,
                                   local_fft_offset,
                                   local_fft_size);

    if (pme->nnodes_minor > 1)
    {
        /* Minor dimension */
        overlap = &pme->overlap[1];

        if (pme->nnodes_major > 1)
        {
            size_yx = pme->overlap[0].comm_data[0].send_nindex;
        }
        else
        {
            size_yx = 0;
        }
        datasize = (local_fft_ndata[XX] + size_yx)*local_fft_ndata[ZZ];

        send_size_y = overlap->send_size;

        for (ipulse = 0; ipulse < overlap->noverlap_nodes; ipulse++)
        {
            send_id       = overlap->send_id[ipulse];
            recv_id       = overlap->recv_id[ipulse];
            send_index0   =
                overlap->comm_data[ipulse].send_index0 -
                overlap->comm_data[0].send_index0;
            send_nindex   = overlap->comm_data[ipulse].send_nindex;
            /* We don't use recv_index0, as we always receive starting at 0 */
            recv_nindex   = overlap->comm_data[ipulse].recv_nindex;
            recv_size_y   = overlap->comm_data[ipulse].recv_size;

            sendptr = overlap->sendbuf + send_index0*local_fft_ndata[ZZ];
            recvptr = overlap->recvbuf;

            if (debug != NULL)
            {
                fprintf(debug, "PME fftgrid comm y %2d x %2d x %2d\n",
                        local_fft_ndata[XX], send_nindex, local_fft_ndata[ZZ]);
            }

#ifdef GMX_MPI
            MPI_Sendrecv(sendptr, send_size_y*datasize, GMX_MPI_REAL,
                         send_id, ipulse,
                         recvptr, recv_size_y*datasize, GMX_MPI_REAL,
                         recv_id, ipulse,
                         overlap->mpi_comm, &stat);
#endif

            for (x = 0; x < local_fft_ndata[XX]; x++)
            {
                for (y = 0; y < recv_nindex; y++)
                {
                    indg = (x*local_fft_size[YY] + y)*local_fft_size[ZZ];
                    indb = (x*recv_size_y        + y)*local_fft_ndata[ZZ];
                    for (z = 0; z < local_fft_ndata[ZZ]; z++)
                    {
                        fftgrid[indg+z] += recvptr[indb+z];
                    }
                }
            }

            if (pme->nnodes_major > 1)
            {
                /* Copy from the received buffer to the send buffer for dim 0 */
                sendptr = pme->overlap[0].sendbuf;
                for (x = 0; x < size_yx; x++)
                {
                    for (y = 0; y < recv_nindex; y++)
                    {
                        indg = (x*local_fft_ndata[YY] + y)*local_fft_ndata[ZZ];
                        indb = ((local_fft_ndata[XX] + x)*recv_size_y + y)*local_fft_ndata[ZZ];
                        for (z = 0; z < local_fft_ndata[ZZ]; z++)
                        {
                            sendptr[indg+z] += recvptr[indb+z];
                        }
                    }
                }
            }
        }
    }

    /* We only support a single pulse here.
     * This is not a severe limitation, as this code is only used
     * with OpenMP and with OpenMP the (PME) domains can be larger.
     */
    if (pme->nnodes_major > 1)
    {
        /* Major dimension */
        overlap = &pme->overlap[0];

        datasize = local_fft_ndata[YY]*local_fft_ndata[ZZ];
        gridsize = local_fft_size[YY] *local_fft_size[ZZ];

        ipulse = 0;

        send_id       = overlap->send_id[ipulse];
        recv_id       = overlap->recv_id[ipulse];
        send_nindex   = overlap->comm_data[ipulse].send_nindex;
        /* We don't use recv_index0, as we always receive starting at 0 */
        recv_nindex   = overlap->comm_data[ipulse].recv_nindex;

        sendptr = overlap->sendbuf;
        recvptr = overlap->recvbuf;

        if (debug != NULL)
        {
            fprintf(debug, "PME fftgrid comm x %2d x %2d x %2d\n",
                    send_nindex, local_fft_ndata[YY], local_fft_ndata[ZZ]);
        }

#ifdef GMX_MPI
        MPI_Sendrecv(sendptr, send_nindex*datasize, GMX_MPI_REAL,
                     send_id, ipulse,
                     recvptr, recv_nindex*datasize, GMX_MPI_REAL,
                     recv_id, ipulse,
                     overlap->mpi_comm, &stat);
#endif

        for (x = 0; x < recv_nindex; x++)
        {
            for (y = 0; y < local_fft_ndata[YY]; y++)
            {
                indg = (x*local_fft_size[YY]  + y)*local_fft_size[ZZ];
                indb = (x*local_fft_ndata[YY] + y)*local_fft_ndata[ZZ];
                for (z = 0; z < local_fft_ndata[ZZ]; z++)
                {
                    fftgrid[indg+z] += recvptr[indb+z];
                }
            }
        }
    }
}
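
/* For reference: the receive buffers above are packed with the compact
 * extents from local_fft_ndata, while fftgrid uses the padded extents from
 * local_fft_size, which is why each reduction recomputes the flat index with
 * its own strides. Minimal sketch of the pattern (hypothetical names,
 * excluded from compilation):
 */
#if 0
static void add_packed_to_padded(real *padded, const real *packed,
                                 int nx, int ny, int nz, /* compact extents  */
                                 int my, int mz)         /* padded y,z sizes */
{
    int x, y, z;

    for (x = 0; x < nx; x++)
    {
        for (y = 0; y < ny; y++)
        {
            for (z = 0; z < nz; z++)
            {
                padded[(x*my + y)*mz + z] += packed[(x*ny + y)*nz + z];
            }
        }
    }
}
#endif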

void spread_on_grid(struct gmx_pme_t *pme,
                    pme_atomcomm_t *atc, pmegrids_t *grids,
                    gmx_bool bCalcSplines, gmx_bool bSpread,
                    real *fftgrid, gmx_bool bDoSplines, int grid_index)
{
    int nthread, thread;
#ifdef PME_TIME_THREADS
    gmx_cycles_t c1, c2, c3, ct1a, ct1b, ct1c;
    static double cs1     = 0, cs2 = 0, cs3 = 0;
    static double cs1a[6] = {0, 0, 0, 0, 0, 0};
    static int cnt        = 0;
#endif

    nthread = pme->nthread;
    assert(nthread > 0);

#ifdef PME_TIME_THREADS
    c1 = omp_cyc_start();
#endif
    if (bCalcSplines)
    {
#pragma omp parallel for num_threads(nthread) schedule(static)
        for (thread = 0; thread < nthread; thread++)
        {
            int start, end;

            start = atc->n* thread   /nthread;
            end   = atc->n*(thread+1)/nthread;

            /* Compute fftgrid index for all atoms,
             * with the help of some extra variables.
             */
            calc_interpolation_idx(pme, atc, start, grid_index, end, thread);
        }
    }
#ifdef PME_TIME_THREADS
    c1   = omp_cyc_end(c1);
    cs1 += (double)c1;
#endif

#ifdef PME_TIME_THREADS
    c2 = omp_cyc_start();
#endif
#pragma omp parallel for num_threads(nthread) schedule(static)
    for (thread = 0; thread < nthread; thread++)
    {
        splinedata_t *spline;
        pmegrid_t *grid = NULL;

        /* make local bsplines  */
        if (grids == NULL || !pme->bUseThreads)
        {
            spline = &atc->spline[0];

            spline->n = atc->n;

            if (bSpread)
            {
                grid = &grids->grid;
            }
        }
        else
        {
            spline = &atc->spline[thread];

            if (grids->nthread == 1)
            {
                /* One thread, we operate on all coefficients */
                spline->n = atc->n;
            }
            else
            {
                /* Get the indices our thread should operate on */
                make_thread_local_ind(atc, thread, spline);
            }

            grid = &grids->grid_th[thread];
        }

        if (bCalcSplines)
        {
            make_bsplines(spline->theta, spline->dtheta, pme->pme_order,
                          atc->fractx, spline->n, spline->ind, atc->coefficient, bDoSplines);
        }

        if (bSpread)
        {
            /* put local atoms on grid. */
#ifdef PME_TIME_SPREAD
            ct1a = omp_cyc_start();
#endif
            spread_coefficients_bsplines_thread(grid, atc, spline, pme->spline_work);

            if (pme->bUseThreads)
            {
                copy_local_grid(pme, grids, grid_index, thread, fftgrid);
            }
#ifdef PME_TIME_SPREAD
            ct1a          = omp_cyc_end(ct1a);
            cs1a[thread] += (double)ct1a;
#endif
        }
    }
#ifdef PME_TIME_THREADS
    c2   = omp_cyc_end(c2);
    cs2 += (double)c2;
#endif

    if (bSpread && pme->bUseThreads)
    {
#ifdef PME_TIME_THREADS
        c3 = omp_cyc_start();
#endif
#pragma omp parallel for num_threads(grids->nthread) schedule(static)
        for (thread = 0; thread < grids->nthread; thread++)
        {
            reduce_threadgrid_overlap(pme, grids, thread,
                                      fftgrid,
                                      pme->overlap[0].sendbuf,
                                      pme->overlap[1].sendbuf,
                                      grid_index);
        }
#ifdef PME_TIME_THREADS
        c3   = omp_cyc_end(c3);
        cs3 += (double)c3;
#endif

        if (pme->nnodes > 1)
        {
            /* Communicate the overlapping part of the fftgrid.
             * For this communication call we need to check pme->bUseThreads
             * to have all ranks communicate here, regardless of pme->nthread.
             */
            sum_fftgrid_dd(pme, fftgrid, grid_index);
        }
    }

#ifdef PME_TIME_THREADS
    cnt++;
    if (cnt % 20 == 0)
    {
        printf("idx %.2f spread %.2f red %.2f",
               cs1*1e-9, cs2*1e-9, cs3*1e-9);
#ifdef PME_TIME_SPREAD
        for (thread = 0; thread < nthread; thread++)
        {
            printf(" %.2f", cs1a[thread]*1e-9);
        }
#endif
        printf("\n");
    }
#endif
}
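
/* For reference: the atom ranges used in spread_on_grid() give thread t the
 * half-open interval [n*t/nthread, n*(t+1)/nthread). Consecutive intervals
 * tile 0..n-1 exactly and their sizes differ by at most one atom, e.g. for
 * n = 10 and nthread = 4 the ranges are [0,2), [2,5), [5,7) and [7,10).
 * Hypothetical demo below, excluded from compilation (would need <stdio.h>).
 */
#if 0
static void print_thread_ranges(int n, int nthread)
{
    int t;

    for (t = 0; t < nthread; t++)
    {
        int start = n*t/nthread;
        int end   = n*(t + 1)/nthread;

        printf("thread %d: atoms [%d, %d)\n", t, start, end);
    }
}
#endif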