/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 * Copyright (c) 2001-2004, The GROMACS development team.
 * Copyright (c) 2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
#include "gmxpre.h"

#include "pme_spread.h"

#include "config.h"

#include <cassert>

#include <algorithm>

#include "gromacs/ewald/pme.h"
#include "gromacs/fft/parallel_3dfft.h"
#include "gromacs/simd/simd.h"
#include "gromacs/utility/basedefinitions.h"
#include "gromacs/utility/exceptions.h"
#include "gromacs/utility/fatalerror.h"
#include "gromacs/utility/smalloc.h"

#include "pme_grid.h"
#include "pme_internal.h"
#include "pme_simd.h"
#include "pme_spline_work.h"

/* TODO consider split of pme-spline from this file */
static void calc_interpolation_idx(const gmx_pme_t *pme, PmeAtomComm *atc,
                                   int start, int grid_index, int end, int thread)
{
    int         i;
    int        *idxptr, tix, tiy, tiz;
    const real *xptr;
    real       *fptr, tx, ty, tz;
    real        rxx, ryx, ryy, rzx, rzy, rzz;
    int         nx, ny, nz;
    int        *g2tx, *g2ty, *g2tz;
    gmx_bool    bThreads;
    int        *thread_idx = nullptr;
    int        *tpl_n      = nullptr;
    int         thread_i;

    nx = pme->nkx;
    ny = pme->nky;
    nz = pme->nkz;

    rxx = pme->recipbox[XX][XX];
    ryx = pme->recipbox[YY][XX];
    ryy = pme->recipbox[YY][YY];
    rzx = pme->recipbox[ZZ][XX];
    rzy = pme->recipbox[ZZ][YY];
    rzz = pme->recipbox[ZZ][ZZ];

    g2tx = pme->pmegrid[grid_index].g2t[XX];
    g2ty = pme->pmegrid[grid_index].g2t[YY];
    g2tz = pme->pmegrid[grid_index].g2t[ZZ];

    bThreads = (atc->nthread > 1);
    if (bThreads)
    {
        thread_idx = atc->thread_idx.data();

        tpl_n = atc->threadMap[thread].n.data();
        for (i = 0; i < atc->nthread; i++)
        {
            tpl_n[i] = 0;
        }
    }

    const real shift = c_pmeMaxUnitcellShift;

    for (i = start; i < end; i++)
    {
        xptr   = atc->x[i];
        idxptr = atc->idx[i];
        fptr   = atc->fractx[i];

        /* Fractional coordinates along box vectors, add a positive shift to ensure tx/ty/tz are positive for triclinic boxes */
        tx = nx * ( xptr[XX] * rxx + xptr[YY] * ryx + xptr[ZZ] * rzx + shift );
        ty = ny * (                  xptr[YY] * ryy + xptr[ZZ] * rzy + shift );
        tz = nz * (                                   xptr[ZZ] * rzz + shift );
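        /* recipbox is triangular because the GROMACS box matrix has its
         * first vector along x and its second in the xy-plane, so ty needs
         * no x contribution and tz only a z contribution. E.g. for a
         * rectangular box rxx = 1/box_x, ryy = 1/box_y, rzz = 1/box_z and
         * all cross terms vanish.
         */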
        tix = static_cast<int>(tx);
        tiy = static_cast<int>(ty);
        tiz = static_cast<int>(tz);

#ifdef DEBUG
        range_check(tix, 0, c_pmeNeighborUnitcellCount * nx);
        range_check(tiy, 0, c_pmeNeighborUnitcellCount * ny);
        range_check(tiz, 0, c_pmeNeighborUnitcellCount * nz);
#endif
        /* Because decomposition only occurs in x and y,
         * we never have a fraction correction in z.
         */
        fptr[XX] = tx - tix + pme->fshx[tix];
        fptr[YY] = ty - tiy + pme->fshy[tiy];
        fptr[ZZ] = tz - tiz;

        idxptr[XX] = pme->nnx[tix];
        idxptr[YY] = pme->nny[tiy];
        idxptr[ZZ] = pme->nnz[tiz];

#ifdef DEBUG
        range_check(idxptr[XX], 0, pme->pmegrid_nx);
        range_check(idxptr[YY], 0, pme->pmegrid_ny);
        range_check(idxptr[ZZ], 0, pme->pmegrid_nz);
#endif

        if (bThreads)
        {
            thread_i      = g2tx[idxptr[XX]] + g2ty[idxptr[YY]] + g2tz[idxptr[ZZ]];
            thread_idx[i] = thread_i;
            tpl_n[thread_i]++;
        }
    }
    if (bThreads)
    {
        /* Make a list of particle indices sorted on thread */

        /* Get the cumulative count */
        for (i = 1; i < atc->nthread; i++)
        {
            tpl_n[i] += tpl_n[i-1];
        }
        /* The current implementation distributes particles equally
         * over the threads, so we could actually allocate for that
         * in pme_realloc_atomcomm_things.
         */
        AtomToThreadMap &threadMap = atc->threadMap[thread];
        threadMap.i.resize(tpl_n[atc->nthread - 1]);
        /* Set tpl_n to the cumulative start */
        for (i = atc->nthread-1; i >= 1; i--)
        {
            tpl_n[i] = tpl_n[i-1];
        }
        tpl_n[0] = 0;

        /* Fill our thread local array with indices sorted on thread */
        for (i = start; i < end; i++)
        {
            threadMap.i[tpl_n[atc->thread_idx[i]]++] = i;
        }
        /* Now tpl_n contains the cumulative count again */
    }
}
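
/* Sketch of what calc_interpolation_idx does with tpl_n above: the three
 * passes amount to a counting sort of atom indices by destination thread.
 * Per-thread counts, e.g. {3,1,2}, become cumulative counts {3,4,6}, are
 * shifted to start offsets {0,3,4}, and the fill loop then scatters each
 * atom index into its thread's slot range.
 */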
static void make_thread_local_ind(const PmeAtomComm *atc,
                                  int thread, splinedata_t *spline)
{
    int n, t, i, start, end;

    /* Combine the indices made by each thread into one index */
    n     = 0;
    start = 0;
    for (t = 0; t < atc->nthread; t++)
    {
        const AtomToThreadMap &threadMap = atc->threadMap[t];
        /* Copy our part (start - end) from the list of thread t */
        if (thread > 0)
        {
            start = threadMap.n[thread-1];
        }
        end = threadMap.n[thread];
        for (i = start; i < end; i++)
        {
            spline->ind[n++] = threadMap.i[i];
        }
    }
    spline->n = n;
}
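
/* Example of the gather above with two spreading threads: thread 0 reads
 * the range [0, n[0]) from both thread maps, thread 1 reads [n[0], n[1]),
 * so each spline ends up owning exactly the atoms whose charges land in
 * its thread-local grid volume.
 */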
// At run time, the values of order used and asserted upon mean that
// indexing out of bounds does not occur. However compilers don't
// always understand that, so we suppress this warning for this code
// region.
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Warray-bounds"
/* Macro to force loop unrolling by fixing order.
 * This gives a significant performance gain.
 */
#define CALC_SPLINE(order)                                                \
    {                                                                     \
        for (int j = 0; (j < DIM); j++)                                   \
        {                                                                 \
            real dr, div;                                                 \
            real data[PME_ORDER_MAX];                                     \
                                                                          \
            dr = xptr[j];                                                 \
                                                                          \
            /* dr is relative offset from lower cell limit */             \
            data[(order)-1] = 0;                                          \
            data[1]         = dr;                                         \
            data[0]         = 1 - dr;                                     \
                                                                          \
            for (int k = 3; (k < (order)); k++)                           \
            {                                                             \
                div       = 1.0/(k - 1.0);                                \
                data[k-1] = div*dr*data[k-2];                             \
                for (int l = 1; (l < (k-1)); l++)                         \
                {                                                         \
                    data[k-l-1] = div*((dr+l)*data[k-l-2]+(k-l-dr)*       \
                                       data[k-l-1]);                      \
                }                                                         \
                data[0] = div*(1-dr)*data[0];                             \
            }                                                             \
            /* differentiate */                                           \
            dtheta[j][i*(order)+0] = -data[0];                            \
            for (int k = 1; (k < (order)); k++)                           \
            {                                                             \
                dtheta[j][i*(order)+k] = data[k-1] - data[k];             \
            }                                                             \
                                                                          \
            div             = 1.0/((order) - 1);                          \
            data[(order)-1] = div*dr*data[(order)-2];                     \
            for (int l = 1; (l < ((order)-1)); l++)                       \
            {                                                             \
                data[(order)-l-1] = div*((dr+l)*data[(order)-l-2]+        \
                                         ((order)-l-dr)*data[(order)-l-1]); \
            }                                                             \
            data[0] = div*(1 - dr)*data[0];                               \
                                                                          \
            for (int k = 0; k < (order); k++)                             \
            {                                                             \
                theta[j][i*(order)+k] = data[k];                          \
            }                                                             \
        }                                                                 \
    }
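
/* The recursion above is the standard B-spline construction of smooth PME
 * (Essmann et al., J. Chem. Phys. 103, 8577 (1995)): starting from the
 * order-2 weights {1-dr, dr}, each pass raises the order by one via
 * M_n(u) = (u*M_{n-1}(u) + (n-u)*M_{n-1}(u-1))/(n-1), and the derivative
 * weights follow from dM_n/du = M_{n-1}(u) - M_{n-1}(u-1).
 */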
static void make_bsplines(splinevec theta, splinevec dtheta, int order,
                          rvec fractx[], int nr, const int ind[], const real coefficient[],
                          gmx_bool bDoSplines)
{
    /* construct splines for local atoms */
    int   i, ii;
    real *xptr;

    for (i = 0; i < nr; i++)
    {
        /* With free energy we do not use the coefficient check.
         * In most cases this will be more efficient than calling make_bsplines
         * twice, since usually more than half the particles have non-zero coefficients.
         */
        ii = ind[i];
        if (bDoSplines || coefficient[ii] != 0.0)
        {
            xptr = fractx[ii];
            assert(order >= 3 && order <= PME_ORDER_MAX);
            switch (order)
            {
                case 4:  CALC_SPLINE(4);     break;
                case 5:  CALC_SPLINE(5);     break;
                default: CALC_SPLINE(order); break;
            }
        }
    }
}

#pragma GCC diagnostic pop
/* This has to be a macro to enable full compiler optimization with xlC (and probably others too) */
#define DO_BSPLINE(order)                            \
    for (ithx = 0; (ithx < (order)); ithx++)         \
    {                                                \
        index_x = (i0+ithx)*pny*pnz;                 \
        valx    = coefficient*thx[ithx];             \
                                                     \
        for (ithy = 0; (ithy < (order)); ithy++)     \
        {                                            \
            valxy    = valx*thy[ithy];               \
            index_xy = index_x+(j0+ithy)*pnz;        \
                                                     \
            for (ithz = 0; (ithz < (order)); ithz++) \
            {                                        \
                index_xyz        = index_xy+(k0+ithz); \
                grid[index_xyz] += valxy*thz[ithz];  \
            }                                        \
        }                                            \
    }
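
/* Because the B-spline weights factorize per dimension, each atom updates
 * an order^3 block of grid points with the tensor product
 * coefficient*thx[ix]*thy[iy]*thz[iz]; for the default PME order of 4
 * that is 64 grid points per atom.
 */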
static void spread_coefficients_bsplines_thread(const pmegrid_t                    *pmegrid,
                                                const PmeAtomComm                  *atc,
                                                splinedata_t                       *spline,
                                                struct pme_spline_work gmx_unused *work)
{
    /* spread coefficients from home atoms to local grid */
    real       *grid;
    int         i, nn, n, ithx, ithy, ithz, i0, j0, k0;
    const int  *idxptr;
    int         order, norder, index_x, index_xy, index_xyz;
    real        valx, valxy, coefficient;
    real       *thx, *thy, *thz;
    int         pnx, pny, pnz, ndatatot;
    int         offx, offy, offz;

#if defined PME_SIMD4_SPREAD_GATHER && !defined PME_SIMD4_UNALIGNED
    alignas(GMX_SIMD_ALIGNMENT) real  thz_aligned[GMX_SIMD4_WIDTH*2];
#endif

    pnx = pmegrid->s[XX];
    pny = pmegrid->s[YY];
    pnz = pmegrid->s[ZZ];

    offx = pmegrid->offset[XX];
    offy = pmegrid->offset[YY];
    offz = pmegrid->offset[ZZ];

    ndatatot = pnx*pny*pnz;
    grid     = pmegrid->grid;
    for (i = 0; i < ndatatot; i++)
    {
        grid[i] = 0;
    }

    order = pmegrid->order;

    for (nn = 0; nn < spline->n; nn++)
    {
        n           = spline->ind[nn];
        coefficient = atc->coefficient[n];

        if (coefficient != 0)
        {
            idxptr = atc->idx[n];
            norder = nn*order;

            i0 = idxptr[XX] - offx;
            j0 = idxptr[YY] - offy;
            k0 = idxptr[ZZ] - offz;

            thx = spline->theta.coefficients[XX] + norder;
            thy = spline->theta.coefficients[YY] + norder;
            thz = spline->theta.coefficients[ZZ] + norder;

            switch (order)
            {
                case 4:
#ifdef PME_SIMD4_SPREAD_GATHER
#ifdef PME_SIMD4_UNALIGNED
#define PME_SPREAD_SIMD4_ORDER4
#else
#define PME_SPREAD_SIMD4_ALIGNED
#define PME_ORDER 4
#endif
#include "pme_simd4.h"
#else
                    DO_BSPLINE(4);
#endif
                    break;
                case 5:
#ifdef PME_SIMD4_SPREAD_GATHER
#define PME_SPREAD_SIMD4_ALIGNED
#define PME_ORDER 5
#include "pme_simd4.h"
#else
                    DO_BSPLINE(5);
#endif
                    break;
                default:
                    DO_BSPLINE(order);
                    break;
            }
        }
    }
}
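
/* Note on the #include statements above: pme_simd4.h is written to be
 * textually included several times; the PME_SPREAD_SIMD4_* and PME_ORDER
 * macros select which SIMD4 kernel variant its code expands to at each
 * inclusion point, much like a poor man's template.
 */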
static void copy_local_grid(const gmx_pme_t *pme, const pmegrids_t *pmegrids,
                            int grid_index, int thread, real *fftgrid)
{
    ivec        local_fft_ndata, local_fft_offset, local_fft_size;
    int         fft_my, fft_mz;
    int         nsy, nsz;
    ivec        nf;
    int         offx, offy, offz, x, y, z, i0, i0t;
    int         d;
    const real *grid_th;

    gmx_parallel_3dfft_real_limits(pme->pfft_setup[grid_index],
                                   local_fft_ndata,
                                   local_fft_offset,
                                   local_fft_size);
    fft_my = local_fft_size[YY];
    fft_mz = local_fft_size[ZZ];

    const pmegrid_t *pmegrid = &pmegrids->grid_th[thread];

    nsy = pmegrid->s[YY];
    nsz = pmegrid->s[ZZ];

    for (d = 0; d < DIM; d++)
    {
        nf[d] = std::min(pmegrid->n[d] - (pmegrid->order - 1),
                         local_fft_ndata[d] - pmegrid->offset[d]);
    }

    offx = pmegrid->offset[XX];
    offy = pmegrid->offset[YY];
    offz = pmegrid->offset[ZZ];

    /* Directly copy the non-overlapping parts of the local grids.
     * This also initializes the full grid.
     */
    grid_th = pmegrid->grid;
    for (x = 0; x < nf[XX]; x++)
    {
        for (y = 0; y < nf[YY]; y++)
        {
            i0  = ((offx + x)*fft_my + (offy + y))*fft_mz + offz;
            i0t = (x*nsy + y)*nsz;
            for (z = 0; z < nf[ZZ]; z++)
            {
                fftgrid[i0+z] = grid_th[i0t+z];
            }
        }
    }
}
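
/* Indexing sketch for the copy above: both grids are flattened row-major,
 * so point (x, y, z) lives at (x*stride_y + y)*stride_z + z. The FFT grid
 * uses the padded strides fft_my/fft_mz from local_fft_size, while the
 * thread-local grid uses its own strides nsy/nsz, hence the two separate
 * base indices i0 and i0t.
 */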
static void
reduce_threadgrid_overlap(const gmx_pme_t *pme,
                          const pmegrids_t *pmegrids, int thread,
                          real *fftgrid, real *commbuf_x, real *commbuf_y,
                          int grid_index)
{
    ivec             local_fft_ndata, local_fft_offset, local_fft_size;
    int              fft_nx, fft_ny, fft_nz;
    int              fft_my, fft_mz;
    int              buf_my = -1;
    int              nsy, nsz;
    ivec             localcopy_end, commcopy_end;
    int              offx, offy, offz, x, y, z, i0, i0t;
    int              sx, sy, sz, fx, fy, fz, tx1, ty1, tz1, ox, oy, oz;
    gmx_bool         bClearBufX, bClearBufY, bClearBufXY, bClearBuf;
    gmx_bool         bCommX, bCommY;
    int              d;
    int              thread_f;
    const pmegrid_t *pmegrid, *pmegrid_g, *pmegrid_f;
    const real      *grid_th;
    real            *commbuf = nullptr;

    gmx_parallel_3dfft_real_limits(pme->pfft_setup[grid_index],
                                   local_fft_ndata,
                                   local_fft_offset,
                                   local_fft_size);
    fft_nx = local_fft_ndata[XX];
    fft_ny = local_fft_ndata[YY];
    fft_nz = local_fft_ndata[ZZ];

    fft_my = local_fft_size[YY];
    fft_mz = local_fft_size[ZZ];

    /* This routine is called when all threads have finished spreading.
     * Here each thread sums grid contributions calculated by other threads
     * to the thread local grid volume.
     * To minimize the number of grid copying operations,
     * this routine sums immediately from the pmegrid to the fftgrid.
     */

    /* Determine which part of the full node grid we should operate on,
     * this is our thread local part of the full grid.
     */
    pmegrid = &pmegrids->grid_th[thread];

    for (d = 0; d < DIM; d++)
    {
        /* Determine up to where our thread needs to copy from the
         * thread-local charge spreading grid to the rank-local FFT grid.
         * This is up to our spreading grid end minus order-1 and
         * not beyond the local FFT grid.
         */
        localcopy_end[d] =
            std::min(pmegrid->offset[d] + pmegrid->n[d] - (pmegrid->order - 1),
                     local_fft_ndata[d]);

        /* Determine up to where our thread needs to copy from the
         * thread-local charge spreading grid to the communication buffer.
         * Note: only relevant with communication, ignored otherwise.
         */
        commcopy_end[d] = localcopy_end[d];
        if (pmegrid->ci[d] == pmegrids->nc[d] - 1)
        {
            /* The last thread should copy up to the last pme grid line.
             * When the rank-local FFT grid is narrower than pme-order,
             * we need the max below to ensure copying of all data.
             */
            commcopy_end[d] = std::max(commcopy_end[d], pme->pme_order);
        }
    }

    offx = pmegrid->offset[XX];
    offy = pmegrid->offset[YY];
    offz = pmegrid->offset[ZZ];

    bClearBufX  = TRUE;
    bClearBufY  = TRUE;
    bClearBufXY = TRUE;

    /* Now loop over all the thread data blocks that contribute
     * to the grid region we (our thread) are operating on.
     */
    /* Note that fft_nx/y is equal to the number of grid points
     * between the first point of our node grid and the one of the next node.
     */
    for (sx = 0; sx >= -pmegrids->nthread_comm[XX]; sx--)
    {
        fx     = pmegrid->ci[XX] + sx;
        ox     = 0;
        bCommX = FALSE;
        if (fx < 0)
        {
            fx    += pmegrids->nc[XX];
            ox    -= fft_nx;
            bCommX = (pme->nnodes_major > 1);
        }
        pmegrid_g = &pmegrids->grid_th[fx*pmegrids->nc[YY]*pmegrids->nc[ZZ]];
        ox       += pmegrid_g->offset[XX];
        /* Determine the end of our part of the source grid.
         * Use our thread local source grid and target grid part
         */
        tx1 = std::min(ox + pmegrid_g->n[XX],
                       !bCommX ? localcopy_end[XX] : commcopy_end[XX]);

        for (sy = 0; sy >= -pmegrids->nthread_comm[YY]; sy--)
        {
            fy     = pmegrid->ci[YY] + sy;
            oy     = 0;
            bCommY = FALSE;
            if (fy < 0)
            {
                fy    += pmegrids->nc[YY];
                oy    -= fft_ny;
                bCommY = (pme->nnodes_minor > 1);
            }
            pmegrid_g = &pmegrids->grid_th[fy*pmegrids->nc[ZZ]];
            oy       += pmegrid_g->offset[YY];
            /* Determine the end of our part of the source grid.
             * Use our thread local source grid and target grid part
             */
            ty1 = std::min(oy + pmegrid_g->n[YY],
                           !bCommY ? localcopy_end[YY] : commcopy_end[YY]);

            for (sz = 0; sz >= -pmegrids->nthread_comm[ZZ]; sz--)
            {
                fz = pmegrid->ci[ZZ] + sz;
                oz = 0;
                if (fz < 0)
                {
                    fz += pmegrids->nc[ZZ];
                    oz -= fft_nz;
                }
                pmegrid_g = &pmegrids->grid_th[fz];
                oz       += pmegrid_g->offset[ZZ];
                tz1       = std::min(oz + pmegrid_g->n[ZZ], localcopy_end[ZZ]);

                if (sx == 0 && sy == 0 && sz == 0)
                {
                    /* We have already added our local contribution
                     * before calling this routine, so skip it here.
                     */
                    continue;
                }

                thread_f = (fx*pmegrids->nc[YY] + fy)*pmegrids->nc[ZZ] + fz;

                pmegrid_f = &pmegrids->grid_th[thread_f];

                grid_th = pmegrid_f->grid;

                nsy = pmegrid_f->s[YY];
                nsz = pmegrid_f->s[ZZ];

#ifdef DEBUG_PME_REDUCE
                printf("n%d t%d add %d  %2d %2d %2d  %2d %2d %2d  %2d-%2d %2d-%2d, %2d-%2d %2d-%2d, %2d-%2d %2d-%2d\n",
                       pme->nodeid, thread, thread_f,
                       pme->pmegrid_start_ix,
                       pme->pmegrid_start_iy,
                       pme->pmegrid_start_iz,
                       sx, sy, sz,
                       offx-ox, tx1-ox, offx, tx1,
                       offy-oy, ty1-oy, offy, ty1,
                       offz-oz, tz1-oz, offz, tz1);
#endif

                if (!(bCommX || bCommY))
                {
                    /* Copy from the thread local grid to the node grid */
                    for (x = offx; x < tx1; x++)
                    {
                        for (y = offy; y < ty1; y++)
                        {
                            i0  = (x*fft_my + y)*fft_mz;
                            i0t = ((x - ox)*nsy + (y - oy))*nsz - oz;
                            for (z = offz; z < tz1; z++)
                            {
                                fftgrid[i0+z] += grid_th[i0t+z];
                            }
                        }
                    }
                }
                else
                {
                    /* The order of this conditional decides
                     * where the corner volume gets stored with x+y decomp.
                     */
                    if (bCommY)
                    {
                        commbuf = commbuf_y;
                        /* The y-size of the communication buffer is set by
                         * the overlap of the grid part of our local slab
                         * with the part starting at the next slab.
                         */
                        buf_my =
                            pme->overlap[1].s2g1[pme->nodeid_minor] -
                            pme->overlap[1].s2g0[pme->nodeid_minor+1];
                        if (bCommX)
                        {
                            /* We index commbuf modulo the local grid size */
                            commbuf += buf_my*fft_nx*fft_nz;

                            bClearBuf   = bClearBufXY;
                            bClearBufXY = FALSE;
                        }
                        else
                        {
                            bClearBuf  = bClearBufY;
                            bClearBufY = FALSE;
                        }
                    }
                    else
                    {
                        commbuf    = commbuf_x;
                        buf_my     = fft_ny;
                        bClearBuf  = bClearBufX;
                        bClearBufX = FALSE;
                    }

                    /* Copy to the communication buffer */
                    for (x = offx; x < tx1; x++)
                    {
                        for (y = offy; y < ty1; y++)
                        {
                            i0  = (x*buf_my + y)*fft_nz;
                            i0t = ((x - ox)*nsy + (y - oy))*nsz - oz;

                            if (bClearBuf)
                            {
                                /* First access of commbuf, initialize it */
                                for (z = offz; z < tz1; z++)
                                {
                                    commbuf[i0+z] = grid_th[i0t+z];
                                }
                            }
                            else
                            {
                                for (z = offz; z < tz1; z++)
                                {
                                    commbuf[i0+z] += grid_th[i0t+z];
                                }
                            }
                        }
                    }
                }
            }
        }
    }
}
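
/* Sketch of why reduce_threadgrid_overlap loops backwards (sx, sy, sz <= 0):
 * spreading writes up to order-1 points past a thread's block, so with
 * pme_order = 4 each block overlaps the next by 3 grid lines in every
 * dimension. A thread therefore pulls those fringes in from the preceding
 * blocks (a gather) instead of pushing its own fringe forward, which
 * avoids write conflicts between threads.
 */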
static void sum_fftgrid_dd(const gmx_pme_t *pme, real *fftgrid, int grid_index)
{
    ivec local_fft_ndata, local_fft_offset, local_fft_size;
    int  send_index0, send_nindex;
    int  recv_nindex;
#if GMX_MPI
    MPI_Status stat;
#endif
    int  recv_size_y;
    int  size_yx;
    int  x, y, z, indg, indb;

    /* Note that this routine is only used for forward communication.
     * Since the force gathering, unlike the coefficient spreading,
     * can be trivially parallelized over the particles,
     * the backwards process is much simpler and can use the "old"
     * communication setup.
     */

    gmx_parallel_3dfft_real_limits(pme->pfft_setup[grid_index],
                                   local_fft_ndata,
                                   local_fft_offset,
                                   local_fft_size);

    if (pme->nnodes_minor > 1)
    {
        /* Minor dimension */
        const pme_overlap_t *overlap = &pme->overlap[1];

        if (pme->nnodes_major > 1)
        {
            size_yx = pme->overlap[0].comm_data[0].send_nindex;
        }
        else
        {
            size_yx = 0;
        }
#if GMX_MPI
        int datasize = (local_fft_ndata[XX] + size_yx)*local_fft_ndata[ZZ];

        int send_size_y = overlap->send_size;
#endif

        for (size_t ipulse = 0; ipulse < overlap->comm_data.size(); ipulse++)
        {
            send_index0 =
                overlap->comm_data[ipulse].send_index0 -
                overlap->comm_data[0].send_index0;
            send_nindex = overlap->comm_data[ipulse].send_nindex;
            /* We don't use recv_index0, as we always receive starting at 0 */
            recv_nindex = overlap->comm_data[ipulse].recv_nindex;
            recv_size_y = overlap->comm_data[ipulse].recv_size;

            auto *sendptr = const_cast<real *>(overlap->sendbuf.data()) + send_index0 * local_fft_ndata[ZZ];
            auto *recvptr = const_cast<real *>(overlap->recvbuf.data());

            if (debug != nullptr)
            {
                fprintf(debug, "PME fftgrid comm y %2d x %2d x %2d\n",
                        local_fft_ndata[XX], send_nindex, local_fft_ndata[ZZ]);
            }

#if GMX_MPI
            int send_id = overlap->comm_data[ipulse].send_id;
            int recv_id = overlap->comm_data[ipulse].recv_id;
            MPI_Sendrecv(sendptr, send_size_y*datasize, GMX_MPI_REAL,
                         send_id, ipulse,
                         recvptr, recv_size_y*datasize, GMX_MPI_REAL,
                         recv_id, ipulse,
                         overlap->mpi_comm, &stat);
#endif

            for (x = 0; x < local_fft_ndata[XX]; x++)
            {
                for (y = 0; y < recv_nindex; y++)
                {
                    indg = (x*local_fft_size[YY] + y)*local_fft_size[ZZ];
                    indb = (x*recv_size_y + y)*local_fft_ndata[ZZ];
                    for (z = 0; z < local_fft_ndata[ZZ]; z++)
                    {
                        fftgrid[indg+z] += recvptr[indb+z];
                    }
                }
            }

            if (pme->nnodes_major > 1)
            {
                /* Copy from the received buffer to the send buffer for dim 0 */
                sendptr = const_cast<real *>(pme->overlap[0].sendbuf.data());
                for (x = 0; x < size_yx; x++)
                {
                    for (y = 0; y < recv_nindex; y++)
                    {
                        indg = (x*local_fft_ndata[YY] + y)*local_fft_ndata[ZZ];
                        indb = ((local_fft_ndata[XX] + x)*recv_size_y + y)*local_fft_ndata[ZZ];
                        for (z = 0; z < local_fft_ndata[ZZ]; z++)
                        {
                            sendptr[indg+z] += recvptr[indb+z];
                        }
                    }
                }
            }
        }
    }

    /* We only support a single pulse here.
     * This is not a severe limitation, as this code is only used
     * with OpenMP and with OpenMP the (PME) domains can be larger.
     */
    if (pme->nnodes_major > 1)
    {
        /* Major dimension */
        const pme_overlap_t *overlap = &pme->overlap[0];

        size_t ipulse = 0;

        send_nindex = overlap->comm_data[ipulse].send_nindex;
        /* We don't use recv_index0, as we always receive starting at 0 */
        recv_nindex = overlap->comm_data[ipulse].recv_nindex;

        if (debug != nullptr)
        {
            fprintf(debug, "PME fftgrid comm x %2d x %2d x %2d\n",
                    send_nindex, local_fft_ndata[YY], local_fft_ndata[ZZ]);
        }

#if GMX_MPI
        int   datasize = local_fft_ndata[YY]*local_fft_ndata[ZZ];
        int   send_id  = overlap->comm_data[ipulse].send_id;
        int   recv_id  = overlap->comm_data[ipulse].recv_id;
        auto *sendptr  = const_cast<real *>(overlap->sendbuf.data());
        auto *recvptr  = const_cast<real *>(overlap->recvbuf.data());
        MPI_Sendrecv(sendptr, send_nindex*datasize, GMX_MPI_REAL,
                     send_id, ipulse,
                     recvptr, recv_nindex*datasize, GMX_MPI_REAL,
                     recv_id, ipulse,
                     overlap->mpi_comm, &stat);
#endif

        for (x = 0; x < recv_nindex; x++)
        {
            for (y = 0; y < local_fft_ndata[YY]; y++)
            {
                indg = (x*local_fft_size[YY] + y)*local_fft_size[ZZ];
                indb = (x*local_fft_ndata[YY] + y)*local_fft_ndata[ZZ];
                for (z = 0; z < local_fft_ndata[ZZ]; z++)
                {
                    fftgrid[indg + z] += overlap->recvbuf[indb + z];
                }
            }
        }
    }
}
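
/* Note on the "pulses" above: each pulse is one neighbor-to-neighbor
 * MPI_Sendrecv hop, so grid overlap wider than a single neighbor slab
 * needs several pulses in y, while the x dimension is restricted to a
 * single pulse (see the comment preceding the major-dimension block).
 */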
void spread_on_grid(const gmx_pme_t *pme,
                    PmeAtomComm *atc, const pmegrids_t *grids,
                    gmx_bool bCalcSplines, gmx_bool bSpread,
                    real *fftgrid, gmx_bool bDoSplines, int grid_index)
{
    int nthread, thread;
#ifdef PME_TIME_THREADS
    gmx_cycles_t  c1, c2, c3, ct1a, ct1b, ct1c;
    static double cs1     = 0, cs2 = 0, cs3 = 0;
    static double cs1a[6] = {0, 0, 0, 0, 0, 0};
    static int    cnt     = 0;
#endif

    nthread = pme->nthread;
    assert(nthread > 0);

#ifdef PME_TIME_THREADS
    c1 = omp_cyc_start();
#endif
    if (bCalcSplines)
    {
#pragma omp parallel for num_threads(nthread) schedule(static)
        for (thread = 0; thread < nthread; thread++)
        {
            try
            {
                int start, end;

                start = atc->numAtoms()* thread   /nthread;
                end   = atc->numAtoms()*(thread+1)/nthread;
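                /* Static decomposition of the atom range, e.g. 10 atoms on
                 * 4 threads gives start:end pairs 0:2, 2:5, 5:7, 7:10, so
                 * the integer rounding never drops or duplicates an atom.
                 */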
                /* Compute fftgrid index for all atoms,
                 * with help of some extra variables.
                 */
                calc_interpolation_idx(pme, atc, start, grid_index, end, thread);
            }
            GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
        }
    }
#ifdef PME_TIME_THREADS
    c1   = omp_cyc_end(c1);
    cs1 += (double)c1;
#endif

#ifdef PME_TIME_THREADS
    c2 = omp_cyc_start();
#endif
#pragma omp parallel for num_threads(nthread) schedule(static)
    for (thread = 0; thread < nthread; thread++)
    {
        try
        {
            splinedata_t *spline;

            /* make local bsplines */
            if (grids == nullptr || !pme->bUseThreads)
            {
                spline = &atc->spline[0];

                spline->n = atc->numAtoms();
            }
            else
            {
                spline = &atc->spline[thread];

                if (grids->nthread == 1)
                {
                    /* One thread, we operate on all coefficients */
                    spline->n = atc->numAtoms();
                }
                else
                {
                    /* Get the indices our thread should operate on */
                    make_thread_local_ind(atc, thread, spline);
                }
            }

            if (bCalcSplines)
            {
                make_bsplines(spline->theta.coefficients,
                              spline->dtheta.coefficients,
                              pme->pme_order,
                              as_rvec_array(atc->fractx.data()), spline->n, spline->ind.data(),
                              atc->coefficient.data(), bDoSplines);
            }

            if (bSpread)
            {
                /* put local atoms on grid. */
                const pmegrid_t *grid = pme->bUseThreads ? &grids->grid_th[thread] : &grids->grid;

#ifdef PME_TIME_SPREAD
                ct1a = omp_cyc_start();
#endif
                spread_coefficients_bsplines_thread(grid, atc, spline, pme->spline_work);

                if (pme->bUseThreads)
                {
                    copy_local_grid(pme, grids, grid_index, thread, fftgrid);
                }
#ifdef PME_TIME_SPREAD
                ct1a          = omp_cyc_end(ct1a);
                cs1a[thread] += (double)ct1a;
#endif
            }
        }
        GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
    }
#ifdef PME_TIME_THREADS
    c2   = omp_cyc_end(c2);
    cs2 += (double)c2;
#endif

    if (bSpread && pme->bUseThreads)
    {
#ifdef PME_TIME_THREADS
        c3 = omp_cyc_start();
#endif
#pragma omp parallel for num_threads(grids->nthread) schedule(static)
        for (thread = 0; thread < grids->nthread; thread++)
        {
            try
            {
                reduce_threadgrid_overlap(pme, grids, thread,
                                          fftgrid,
                                          const_cast<real *>(pme->overlap[0].sendbuf.data()),
                                          const_cast<real *>(pme->overlap[1].sendbuf.data()),
                                          grid_index);
            }
            GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
        }
#ifdef PME_TIME_THREADS
        c3   = omp_cyc_end(c3);
        cs3 += (double)c3;
#endif

        if (pme->nnodes > 1)
        {
            /* Communicate the overlapping part of the fftgrid.
             * For this communication call we need to check pme->bUseThreads
             * to have all ranks communicate here, regardless of pme->nthread.
             */
            sum_fftgrid_dd(pme, fftgrid, grid_index);
        }
    }

#ifdef PME_TIME_THREADS
    cnt++;
    if (cnt % 20 == 0)
    {
        printf("idx %.2f spread %.2f red %.2f",
               cs1*1e-9, cs2*1e-9, cs3*1e-9);
#ifdef PME_TIME_SPREAD
        for (thread = 0; thread < nthread; thread++)
        {
            printf(" %.2f", cs1a[thread]*1e-9);
        }
#endif
        printf("\n");
    }
#endif
}