2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5 * Copyright (c) 2001-2004, The GROMACS development team,
6 * check out http://www.gromacs.org for more information.
7 * Copyright (c) 2012,2013, by the GROMACS development team, led by
8 * David van der Spoel, Berk Hess, Erik Lindahl, and including many
9 * others, as listed in the AUTHORS file in the top-level source
10 * directory and at http://www.gromacs.org.
12 * GROMACS is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU Lesser General Public License
14 * as published by the Free Software Foundation; either version 2.1
15 * of the License, or (at your option) any later version.
17 * GROMACS is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 * Lesser General Public License for more details.
22 * You should have received a copy of the GNU Lesser General Public
23 * License along with GROMACS; if not, see
24 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
25 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
27 * If you want to redistribute modifications to GROMACS, please
28 * consider that scientific software is very special. Version
29 * control is crucial - bugs must be traceable. We will be happy to
30 * consider code for inclusion in the official distribution, but
31 * derived work must not be called official GROMACS. Details are found
32 * in the README & COPYING files - if they are missing, get the
33 * official version at http://www.gromacs.org.
35 * To help us fund GROMACS development, we humbly ask that you cite
36 * the research papers on the package. Check out http://www.gromacs.org.
38 /* IMPORTANT FOR DEVELOPERS:
40 * Triclinic pme stuff isn't entirely trivial, and we've experienced
41 * some bugs during development (many of them due to me). To avoid
42 * this in the future, please check the following things if you make
43 * changes in this file:
45 * 1. You should obtain identical (at least to the PME precision)
46 * energies, forces, and virial for
47 * a rectangular box and a triclinic one where the z (or y) axis is
48 * tilted a whole box side. For instance you could use these boxes:
50 * rectangular triclinic
55 * 2. You should check the energy conservation in a triclinic box.
57 * It might seem an overkill, but better safe than sorry.
79 #include "gmxcomplex.h"
83 #include "gmx_fatal.h"
89 #include "gmx_wallcycle.h"
90 #include "gmx_parallel_3dfft.h"
92 #include "gmx_cyclecounter.h"
95 /* Single precision, with SSE2 or higher available */
96 #if defined(GMX_X86_SSE2) && !defined(GMX_DOUBLE)
98 #include "gmx_x86_simd_single.h"
101 /* Some old AMD processors could have problems with unaligned loads+stores */
103 #define PME_SSE_UNALIGNED
107 #include "mpelogging.h"
110 /* #define PRT_FORCE */
111 /* conditions for on the fly time-measurement */
112 /* #define TAKETIME (step > 1 && timesteps < 10) */
113 #define TAKETIME FALSE
115 /* #define PME_TIME_THREADS */
118 #define mpi_type MPI_DOUBLE
120 #define mpi_type MPI_FLOAT
123 /* GMX_CACHE_SEP should be a multiple of 16 to preserve alignment */
124 #define GMX_CACHE_SEP 64
126 /* We only define a maximum to be able to use local arrays without allocation.
127 * An order larger than 12 should never be needed, even for test cases.
128 * If needed it can be changed here.
130 #define PME_ORDER_MAX 12
132 /* Internal datastructures */
138 int recv_size; /* Receive buffer width, used with OpenMP */
149 int *send_id, *recv_id;
150 int send_size; /* Send buffer width, used with OpenMP */
151 pme_grid_comm_t *comm_data;
157 int *n; /* Cumulative counts of the number of particles per thread */
158 int nalloc; /* Allocation size of i */
159 int *i; /* Particle indices ordered on thread index (n) */
173 int dimind; /* The index of the dimension, 0=x, 1=y */
180 int *node_dest; /* The nodes to send x and q to with DD */
181 int *node_src; /* The nodes to receive x and q from with DD */
182 int *buf_index; /* Index for commnode into the buffers */
189 int *count; /* The number of atoms to send to each node */
191 int *rcount; /* The number of atoms to receive */
198 gmx_bool bSpread; /* These coordinates are used for spreading */
201 rvec *fractx; /* Fractional coordinate relative to the
202 * lower cell boundary
205 int *thread_idx; /* Which thread should spread which charge */
206 thread_plist_t *thread_plist;
207 splinedata_t *spline;
214 ivec ci; /* The spatial location of this grid */
215 ivec n; /* The used size of *grid, including order-1 */
216 ivec offset; /* The grid offset from the full node grid */
217 int order; /* PME spreading order */
218 ivec s; /* The allocated size of *grid, s >= n */
219 real *grid; /* The grid local thread, size n */
223 pmegrid_t grid; /* The full node grid (non thread-local) */
224 int nthread; /* The number of threads operating on this grid */
225 ivec nc; /* The local spatial decomposition over the threads */
226 pmegrid_t *grid_th; /* Array of grids for each thread */
227 real *grid_all; /* Allocated array for the grids in *grid_th */
228 int **g2t; /* The grid to thread index */
229 ivec nthread_comm; /* The number of threads to communicate with */
235 /* Masks for SSE aligned spreading and gathering */
236 __m128 mask_SSE0[6], mask_SSE1[6];
238 int dummy; /* C89 requires that struct has at least one member */
243 /* work data for solve_pme */
259 typedef struct gmx_pme {
260 int ndecompdim; /* The number of decomposition dimensions */
261 int nodeid; /* Our nodeid in mpi->mpi_comm */
264 int nnodes; /* The number of nodes doing PME */
269 MPI_Comm mpi_comm_d[2]; /* Indexed on dimension, 0=x, 1=y */
271 MPI_Datatype rvec_mpi; /* the pme vector's MPI type */
274 gmx_bool bUseThreads; /* Does any of the PME ranks have nthread>1 ? */
275 int nthread; /* The number of threads doing PME on our rank */
277 gmx_bool bPPnode; /* Node also does particle-particle forces */
278 gmx_bool bFEP; /* Compute Free energy contribution */
279 int nkx, nky, nkz; /* Grid dimensions */
280 gmx_bool bP3M; /* Do P3M: optimize the influence function */
284 pmegrids_t pmegridA; /* Grids on which we do spreading/interpolation, includes overlap */
286 /* The PME charge spreading grid sizes/strides, includes pme_order-1 */
287 int pmegrid_nx, pmegrid_ny, pmegrid_nz;
288 /* pmegrid_nz might be larger than strictly necessary to ensure
289 * memory alignment, pmegrid_nz_base gives the real base size.
292 /* The local PME grid starting indices */
293 int pmegrid_start_ix, pmegrid_start_iy, pmegrid_start_iz;
295 /* Work data for spreading and gathering */
296 pme_spline_work_t *spline_work;
298 real *fftgridA; /* Grids for FFT. With 1D FFT decomposition this can be a pointer */
299 real *fftgridB; /* inside the interpolation grid, but separate for 2D PME decomp. */
300 int fftgrid_nx, fftgrid_ny, fftgrid_nz;
302 t_complex *cfftgridA; /* Grids for complex FFT data */
303 t_complex *cfftgridB;
304 int cfftgrid_nx, cfftgrid_ny, cfftgrid_nz;
306 gmx_parallel_3dfft_t pfft_setupA;
307 gmx_parallel_3dfft_t pfft_setupB;
309 int *nnx, *nny, *nnz;
310 real *fshx, *fshy, *fshz;
312 pme_atomcomm_t atc[2]; /* Indexed on decomposition index */
316 pme_overlap_t overlap[2]; /* Indexed on dimension, 0=x, 1=y */
318 pme_atomcomm_t atc_energy; /* Only for gmx_pme_calc_energy */
320 rvec *bufv; /* Communication buffer */
321 real *bufr; /* Communication buffer */
322 int buf_nalloc; /* The communication buffer size */
324 /* thread local work data for solve_pme */
327 /* Work data for PME_redist */
328 gmx_bool redist_init;
336 int redist_buf_nalloc;
338 /* Work data for sum_qgrid */
339 real * sum_qgrid_tmp;
340 real * sum_qgrid_dd_tmp;
344 static void calc_interpolation_idx(gmx_pme_t pme, pme_atomcomm_t *atc,
345 int start, int end, int thread)
348 int *idxptr, tix, tiy, tiz;
349 real *xptr, *fptr, tx, ty, tz;
350 real rxx, ryx, ryy, rzx, rzy, rzz;
352 int start_ix, start_iy, start_iz;
353 int *g2tx, *g2ty, *g2tz;
355 int *thread_idx = NULL;
356 thread_plist_t *tpl = NULL;
364 start_ix = pme->pmegrid_start_ix;
365 start_iy = pme->pmegrid_start_iy;
366 start_iz = pme->pmegrid_start_iz;
368 rxx = pme->recipbox[XX][XX];
369 ryx = pme->recipbox[YY][XX];
370 ryy = pme->recipbox[YY][YY];
371 rzx = pme->recipbox[ZZ][XX];
372 rzy = pme->recipbox[ZZ][YY];
373 rzz = pme->recipbox[ZZ][ZZ];
375 g2tx = pme->pmegridA.g2t[XX];
376 g2ty = pme->pmegridA.g2t[YY];
377 g2tz = pme->pmegridA.g2t[ZZ];
379 bThreads = (atc->nthread > 1);
382 thread_idx = atc->thread_idx;
384 tpl = &atc->thread_plist[thread];
386 for (i = 0; i < atc->nthread; i++)
392 for (i = start; i < end; i++)
395 idxptr = atc->idx[i];
396 fptr = atc->fractx[i];
398 /* Fractional coordinates along box vectors, add 2.0 to make 100% sure we are positive for triclinic boxes */
399 tx = nx * ( xptr[XX] * rxx + xptr[YY] * ryx + xptr[ZZ] * rzx + 2.0 );
400 ty = ny * ( xptr[YY] * ryy + xptr[ZZ] * rzy + 2.0 );
401 tz = nz * ( xptr[ZZ] * rzz + 2.0 );
407 /* Because decomposition only occurs in x and y,
408 * we never have a fraction correction in z.
410 fptr[XX] = tx - tix + pme->fshx[tix];
411 fptr[YY] = ty - tiy + pme->fshy[tiy];
414 idxptr[XX] = pme->nnx[tix];
415 idxptr[YY] = pme->nny[tiy];
416 idxptr[ZZ] = pme->nnz[tiz];
419 range_check(idxptr[XX], 0, pme->pmegrid_nx);
420 range_check(idxptr[YY], 0, pme->pmegrid_ny);
421 range_check(idxptr[ZZ], 0, pme->pmegrid_nz);
426 thread_i = g2tx[idxptr[XX]] + g2ty[idxptr[YY]] + g2tz[idxptr[ZZ]];
427 thread_idx[i] = thread_i;
434 /* Make a list of particle indices sorted on thread */
436 /* Get the cumulative count */
437 for (i = 1; i < atc->nthread; i++)
439 tpl_n[i] += tpl_n[i-1];
441 /* The current implementation distributes particles equally
442 * over the threads, so we could actually allocate for that
443 * in pme_realloc_atomcomm_things.
445 if (tpl_n[atc->nthread-1] > tpl->nalloc)
447 tpl->nalloc = over_alloc_large(tpl_n[atc->nthread-1]);
448 srenew(tpl->i, tpl->nalloc);
450 /* Set tpl_n to the cumulative start */
451 for (i = atc->nthread-1; i >= 1; i--)
453 tpl_n[i] = tpl_n[i-1];
457 /* Fill our thread local array with indices sorted on thread */
458 for (i = start; i < end; i++)
460 tpl->i[tpl_n[atc->thread_idx[i]]++] = i;
462 /* Now tpl_n contains the cummulative count again */
466 static void make_thread_local_ind(pme_atomcomm_t *atc,
467 int thread, splinedata_t *spline)
469 int n, t, i, start, end;
472 /* Combine the indices made by each thread into one index */
476 for (t = 0; t < atc->nthread; t++)
478 tpl = &atc->thread_plist[t];
479 /* Copy our part (start - end) from the list of thread t */
482 start = tpl->n[thread-1];
484 end = tpl->n[thread];
485 for (i = start; i < end; i++)
487 spline->ind[n++] = tpl->i[i];
495 static void pme_calc_pidx(int start, int end,
496 matrix recipbox, rvec x[],
497 pme_atomcomm_t *atc, int *count)
502 real rxx, ryx, rzx, ryy, rzy;
505 /* Calculate PME task index (pidx) for each grid index.
506 * Here we always assign equally sized slabs to each node
507 * for load balancing reasons (the PME grid spacing is not used).
513 /* Reset the count */
514 for (i = 0; i < nslab; i++)
519 if (atc->dimind == 0)
521 rxx = recipbox[XX][XX];
522 ryx = recipbox[YY][XX];
523 rzx = recipbox[ZZ][XX];
524 /* Calculate the node index in x-dimension */
525 for (i = start; i < end; i++)
528 /* Fractional coordinates along box vectors */
529 s = nslab*(xptr[XX]*rxx + xptr[YY]*ryx + xptr[ZZ]*rzx);
530 si = (int)(s + 2*nslab) % nslab;
537 ryy = recipbox[YY][YY];
538 rzy = recipbox[ZZ][YY];
539 /* Calculate the node index in y-dimension */
540 for (i = start; i < end; i++)
543 /* Fractional coordinates along box vectors */
544 s = nslab*(xptr[YY]*ryy + xptr[ZZ]*rzy);
545 si = (int)(s + 2*nslab) % nslab;
552 static void pme_calc_pidx_wrapper(int natoms, matrix recipbox, rvec x[],
555 int nthread, thread, slab;
557 nthread = atc->nthread;
559 #pragma omp parallel for num_threads(nthread) schedule(static)
560 for (thread = 0; thread < nthread; thread++)
562 pme_calc_pidx(natoms* thread /nthread,
563 natoms*(thread+1)/nthread,
564 recipbox, x, atc, atc->count_thread[thread]);
566 /* Non-parallel reduction, since nslab is small */
568 for (thread = 1; thread < nthread; thread++)
570 for (slab = 0; slab < atc->nslab; slab++)
572 atc->count_thread[0][slab] += atc->count_thread[thread][slab];
577 static void realloc_splinevec(splinevec th, real **ptr_z, int nalloc)
579 const int padding = 4;
582 srenew(th[XX], nalloc);
583 srenew(th[YY], nalloc);
584 /* In z we add padding, this is only required for the aligned SSE code */
585 srenew(*ptr_z, nalloc+2*padding);
586 th[ZZ] = *ptr_z + padding;
588 for (i = 0; i < padding; i++)
591 (*ptr_z)[padding+nalloc+i] = 0;
595 static void pme_realloc_splinedata(splinedata_t *spline, pme_atomcomm_t *atc)
599 srenew(spline->ind, atc->nalloc);
600 /* Initialize the index to identity so it works without threads */
601 for (i = 0; i < atc->nalloc; i++)
606 realloc_splinevec(spline->theta, &spline->ptr_theta_z,
607 atc->pme_order*atc->nalloc);
608 realloc_splinevec(spline->dtheta, &spline->ptr_dtheta_z,
609 atc->pme_order*atc->nalloc);
612 static void pme_realloc_atomcomm_things(pme_atomcomm_t *atc)
614 int nalloc_old, i, j, nalloc_tpl;
616 /* We have to avoid a NULL pointer for atc->x to avoid
617 * possible fatal errors in MPI routines.
619 if (atc->n > atc->nalloc || atc->nalloc == 0)
621 nalloc_old = atc->nalloc;
622 atc->nalloc = over_alloc_dd(max(atc->n, 1));
626 srenew(atc->x, atc->nalloc);
627 srenew(atc->q, atc->nalloc);
628 srenew(atc->f, atc->nalloc);
629 for (i = nalloc_old; i < atc->nalloc; i++)
631 clear_rvec(atc->f[i]);
636 srenew(atc->fractx, atc->nalloc);
637 srenew(atc->idx, atc->nalloc);
639 if (atc->nthread > 1)
641 srenew(atc->thread_idx, atc->nalloc);
644 for (i = 0; i < atc->nthread; i++)
646 pme_realloc_splinedata(&atc->spline[i], atc);
652 static void pmeredist_pd(gmx_pme_t pme, gmx_bool forw,
653 int n, gmx_bool bXF, rvec *x_f, real *charge,
655 /* Redistribute particle data for PME calculation */
656 /* domain decomposition by x coordinate */
661 if (FALSE == pme->redist_init)
663 snew(pme->scounts, atc->nslab);
664 snew(pme->rcounts, atc->nslab);
665 snew(pme->sdispls, atc->nslab);
666 snew(pme->rdispls, atc->nslab);
667 snew(pme->sidx, atc->nslab);
668 pme->redist_init = TRUE;
670 if (n > pme->redist_buf_nalloc)
672 pme->redist_buf_nalloc = over_alloc_dd(n);
673 srenew(pme->redist_buf, pme->redist_buf_nalloc*DIM);
681 /* forward, redistribution from pp to pme */
683 /* Calculate send counts and exchange them with other nodes */
684 for (i = 0; (i < atc->nslab); i++)
688 for (i = 0; (i < n); i++)
690 pme->scounts[pme->idxa[i]]++;
692 MPI_Alltoall( pme->scounts, 1, MPI_INT, pme->rcounts, 1, MPI_INT, atc->mpi_comm);
694 /* Calculate send and receive displacements and index into send
699 for (i = 1; i < atc->nslab; i++)
701 pme->sdispls[i] = pme->sdispls[i-1]+pme->scounts[i-1];
702 pme->rdispls[i] = pme->rdispls[i-1]+pme->rcounts[i-1];
703 pme->sidx[i] = pme->sdispls[i];
705 /* Total # of particles to be received */
706 atc->n = pme->rdispls[atc->nslab-1] + pme->rcounts[atc->nslab-1];
708 pme_realloc_atomcomm_things(atc);
710 /* Copy particle coordinates into send buffer and exchange*/
711 for (i = 0; (i < n); i++)
713 ii = DIM*pme->sidx[pme->idxa[i]];
714 pme->sidx[pme->idxa[i]]++;
715 pme->redist_buf[ii+XX] = x_f[i][XX];
716 pme->redist_buf[ii+YY] = x_f[i][YY];
717 pme->redist_buf[ii+ZZ] = x_f[i][ZZ];
719 MPI_Alltoallv(pme->redist_buf, pme->scounts, pme->sdispls,
720 pme->rvec_mpi, atc->x, pme->rcounts, pme->rdispls,
721 pme->rvec_mpi, atc->mpi_comm);
725 /* Copy charge into send buffer and exchange*/
726 for (i = 0; i < atc->nslab; i++)
728 pme->sidx[i] = pme->sdispls[i];
730 for (i = 0; (i < n); i++)
732 ii = pme->sidx[pme->idxa[i]];
733 pme->sidx[pme->idxa[i]]++;
734 pme->redist_buf[ii] = charge[i];
736 MPI_Alltoallv(pme->redist_buf, pme->scounts, pme->sdispls, mpi_type,
737 atc->q, pme->rcounts, pme->rdispls, mpi_type,
740 else /* backward, redistribution from pme to pp */
742 MPI_Alltoallv(atc->f, pme->rcounts, pme->rdispls, pme->rvec_mpi,
743 pme->redist_buf, pme->scounts, pme->sdispls,
744 pme->rvec_mpi, atc->mpi_comm);
746 /* Copy data from receive buffer */
747 for (i = 0; i < atc->nslab; i++)
749 pme->sidx[i] = pme->sdispls[i];
751 for (i = 0; (i < n); i++)
753 ii = DIM*pme->sidx[pme->idxa[i]];
754 x_f[i][XX] += pme->redist_buf[ii+XX];
755 x_f[i][YY] += pme->redist_buf[ii+YY];
756 x_f[i][ZZ] += pme->redist_buf[ii+ZZ];
757 pme->sidx[pme->idxa[i]]++;
763 static void pme_dd_sendrecv(pme_atomcomm_t *atc,
764 gmx_bool bBackward, int shift,
765 void *buf_s, int nbyte_s,
766 void *buf_r, int nbyte_r)
772 if (bBackward == FALSE)
774 dest = atc->node_dest[shift];
775 src = atc->node_src[shift];
779 dest = atc->node_src[shift];
780 src = atc->node_dest[shift];
783 if (nbyte_s > 0 && nbyte_r > 0)
785 MPI_Sendrecv(buf_s, nbyte_s, MPI_BYTE,
787 buf_r, nbyte_r, MPI_BYTE,
789 atc->mpi_comm, &stat);
791 else if (nbyte_s > 0)
793 MPI_Send(buf_s, nbyte_s, MPI_BYTE,
797 else if (nbyte_r > 0)
799 MPI_Recv(buf_r, nbyte_r, MPI_BYTE,
801 atc->mpi_comm, &stat);
806 static void dd_pmeredist_x_q(gmx_pme_t pme,
807 int n, gmx_bool bX, rvec *x, real *charge,
810 int *commnode, *buf_index;
811 int nnodes_comm, i, nsend, local_pos, buf_pos, node, scount, rcount;
813 commnode = atc->node_dest;
814 buf_index = atc->buf_index;
816 nnodes_comm = min(2*atc->maxshift, atc->nslab-1);
819 for (i = 0; i < nnodes_comm; i++)
821 buf_index[commnode[i]] = nsend;
822 nsend += atc->count[commnode[i]];
826 if (atc->count[atc->nodeid] + nsend != n)
828 gmx_fatal(FARGS, "%d particles communicated to PME node %d are more than 2/3 times the cut-off out of the domain decomposition cell of their charge group in dimension %c.\n"
829 "This usually means that your system is not well equilibrated.",
830 n - (atc->count[atc->nodeid] + nsend),
831 pme->nodeid, 'x'+atc->dimind);
834 if (nsend > pme->buf_nalloc)
836 pme->buf_nalloc = over_alloc_dd(nsend);
837 srenew(pme->bufv, pme->buf_nalloc);
838 srenew(pme->bufr, pme->buf_nalloc);
841 atc->n = atc->count[atc->nodeid];
842 for (i = 0; i < nnodes_comm; i++)
844 scount = atc->count[commnode[i]];
845 /* Communicate the count */
848 fprintf(debug, "dimind %d PME node %d send to node %d: %d\n",
849 atc->dimind, atc->nodeid, commnode[i], scount);
851 pme_dd_sendrecv(atc, FALSE, i,
852 &scount, sizeof(int),
853 &atc->rcount[i], sizeof(int));
854 atc->n += atc->rcount[i];
857 pme_realloc_atomcomm_things(atc);
861 for (i = 0; i < n; i++)
864 if (node == atc->nodeid)
866 /* Copy direct to the receive buffer */
869 copy_rvec(x[i], atc->x[local_pos]);
871 atc->q[local_pos] = charge[i];
876 /* Copy to the send buffer */
879 copy_rvec(x[i], pme->bufv[buf_index[node]]);
881 pme->bufr[buf_index[node]] = charge[i];
887 for (i = 0; i < nnodes_comm; i++)
889 scount = atc->count[commnode[i]];
890 rcount = atc->rcount[i];
891 if (scount > 0 || rcount > 0)
895 /* Communicate the coordinates */
896 pme_dd_sendrecv(atc, FALSE, i,
897 pme->bufv[buf_pos], scount*sizeof(rvec),
898 atc->x[local_pos], rcount*sizeof(rvec));
900 /* Communicate the charges */
901 pme_dd_sendrecv(atc, FALSE, i,
902 pme->bufr+buf_pos, scount*sizeof(real),
903 atc->q+local_pos, rcount*sizeof(real));
905 local_pos += atc->rcount[i];
910 static void dd_pmeredist_f(gmx_pme_t pme, pme_atomcomm_t *atc,
914 int *commnode, *buf_index;
915 int nnodes_comm, local_pos, buf_pos, i, scount, rcount, node;
917 commnode = atc->node_dest;
918 buf_index = atc->buf_index;
920 nnodes_comm = min(2*atc->maxshift, atc->nslab-1);
922 local_pos = atc->count[atc->nodeid];
924 for (i = 0; i < nnodes_comm; i++)
926 scount = atc->rcount[i];
927 rcount = atc->count[commnode[i]];
928 if (scount > 0 || rcount > 0)
930 /* Communicate the forces */
931 pme_dd_sendrecv(atc, TRUE, i,
932 atc->f[local_pos], scount*sizeof(rvec),
933 pme->bufv[buf_pos], rcount*sizeof(rvec));
936 buf_index[commnode[i]] = buf_pos;
943 for (i = 0; i < n; i++)
946 if (node == atc->nodeid)
948 /* Add from the local force array */
949 rvec_inc(f[i], atc->f[local_pos]);
954 /* Add from the receive buffer */
955 rvec_inc(f[i], pme->bufv[buf_index[node]]);
962 for (i = 0; i < n; i++)
965 if (node == atc->nodeid)
967 /* Copy from the local force array */
968 copy_rvec(atc->f[local_pos], f[i]);
973 /* Copy from the receive buffer */
974 copy_rvec(pme->bufv[buf_index[node]], f[i]);
983 gmx_sum_qgrid_dd(gmx_pme_t pme, real *grid, int direction)
985 pme_overlap_t *overlap;
986 int send_index0, send_nindex;
987 int recv_index0, recv_nindex;
989 int i, j, k, ix, iy, iz, icnt;
990 int ipulse, send_id, recv_id, datasize;
992 real *sendptr, *recvptr;
994 /* Start with minor-rank communication. This is a bit of a pain since it is not contiguous */
995 overlap = &pme->overlap[1];
997 for (ipulse = 0; ipulse < overlap->noverlap_nodes; ipulse++)
999 /* Since we have already (un)wrapped the overlap in the z-dimension,
1000 * we only have to communicate 0 to nkz (not pmegrid_nz).
1002 if (direction == GMX_SUM_QGRID_FORWARD)
1004 send_id = overlap->send_id[ipulse];
1005 recv_id = overlap->recv_id[ipulse];
1006 send_index0 = overlap->comm_data[ipulse].send_index0;
1007 send_nindex = overlap->comm_data[ipulse].send_nindex;
1008 recv_index0 = overlap->comm_data[ipulse].recv_index0;
1009 recv_nindex = overlap->comm_data[ipulse].recv_nindex;
1013 send_id = overlap->recv_id[ipulse];
1014 recv_id = overlap->send_id[ipulse];
1015 send_index0 = overlap->comm_data[ipulse].recv_index0;
1016 send_nindex = overlap->comm_data[ipulse].recv_nindex;
1017 recv_index0 = overlap->comm_data[ipulse].send_index0;
1018 recv_nindex = overlap->comm_data[ipulse].send_nindex;
1021 /* Copy data to contiguous send buffer */
1024 fprintf(debug, "PME send node %d %d -> %d grid start %d Communicating %d to %d\n",
1025 pme->nodeid, overlap->nodeid, send_id,
1026 pme->pmegrid_start_iy,
1027 send_index0-pme->pmegrid_start_iy,
1028 send_index0-pme->pmegrid_start_iy+send_nindex);
1031 for (i = 0; i < pme->pmegrid_nx; i++)
1034 for (j = 0; j < send_nindex; j++)
1036 iy = j + send_index0 - pme->pmegrid_start_iy;
1037 for (k = 0; k < pme->nkz; k++)
1040 overlap->sendbuf[icnt++] = grid[ix*(pme->pmegrid_ny*pme->pmegrid_nz)+iy*(pme->pmegrid_nz)+iz];
1045 datasize = pme->pmegrid_nx * pme->nkz;
1047 MPI_Sendrecv(overlap->sendbuf, send_nindex*datasize, GMX_MPI_REAL,
1049 overlap->recvbuf, recv_nindex*datasize, GMX_MPI_REAL,
1051 overlap->mpi_comm, &stat);
1053 /* Get data from contiguous recv buffer */
1056 fprintf(debug, "PME recv node %d %d <- %d grid start %d Communicating %d to %d\n",
1057 pme->nodeid, overlap->nodeid, recv_id,
1058 pme->pmegrid_start_iy,
1059 recv_index0-pme->pmegrid_start_iy,
1060 recv_index0-pme->pmegrid_start_iy+recv_nindex);
1063 for (i = 0; i < pme->pmegrid_nx; i++)
1066 for (j = 0; j < recv_nindex; j++)
1068 iy = j + recv_index0 - pme->pmegrid_start_iy;
1069 for (k = 0; k < pme->nkz; k++)
1072 if (direction == GMX_SUM_QGRID_FORWARD)
1074 grid[ix*(pme->pmegrid_ny*pme->pmegrid_nz)+iy*(pme->pmegrid_nz)+iz] += overlap->recvbuf[icnt++];
1078 grid[ix*(pme->pmegrid_ny*pme->pmegrid_nz)+iy*(pme->pmegrid_nz)+iz] = overlap->recvbuf[icnt++];
1085 /* Major dimension is easier, no copying required,
1086 * but we might have to sum to separate array.
1087 * Since we don't copy, we have to communicate up to pmegrid_nz,
1088 * not nkz as for the minor direction.
1090 overlap = &pme->overlap[0];
1092 for (ipulse = 0; ipulse < overlap->noverlap_nodes; ipulse++)
1094 if (direction == GMX_SUM_QGRID_FORWARD)
1096 send_id = overlap->send_id[ipulse];
1097 recv_id = overlap->recv_id[ipulse];
1098 send_index0 = overlap->comm_data[ipulse].send_index0;
1099 send_nindex = overlap->comm_data[ipulse].send_nindex;
1100 recv_index0 = overlap->comm_data[ipulse].recv_index0;
1101 recv_nindex = overlap->comm_data[ipulse].recv_nindex;
1102 recvptr = overlap->recvbuf;
1106 send_id = overlap->recv_id[ipulse];
1107 recv_id = overlap->send_id[ipulse];
1108 send_index0 = overlap->comm_data[ipulse].recv_index0;
1109 send_nindex = overlap->comm_data[ipulse].recv_nindex;
1110 recv_index0 = overlap->comm_data[ipulse].send_index0;
1111 recv_nindex = overlap->comm_data[ipulse].send_nindex;
1112 recvptr = grid + (recv_index0-pme->pmegrid_start_ix)*(pme->pmegrid_ny*pme->pmegrid_nz);
1115 sendptr = grid + (send_index0-pme->pmegrid_start_ix)*(pme->pmegrid_ny*pme->pmegrid_nz);
1116 datasize = pme->pmegrid_ny * pme->pmegrid_nz;
1120 fprintf(debug, "PME send node %d %d -> %d grid start %d Communicating %d to %d\n",
1121 pme->nodeid, overlap->nodeid, send_id,
1122 pme->pmegrid_start_ix,
1123 send_index0-pme->pmegrid_start_ix,
1124 send_index0-pme->pmegrid_start_ix+send_nindex);
1125 fprintf(debug, "PME recv node %d %d <- %d grid start %d Communicating %d to %d\n",
1126 pme->nodeid, overlap->nodeid, recv_id,
1127 pme->pmegrid_start_ix,
1128 recv_index0-pme->pmegrid_start_ix,
1129 recv_index0-pme->pmegrid_start_ix+recv_nindex);
1132 MPI_Sendrecv(sendptr, send_nindex*datasize, GMX_MPI_REAL,
1134 recvptr, recv_nindex*datasize, GMX_MPI_REAL,
1136 overlap->mpi_comm, &stat);
1138 /* ADD data from contiguous recv buffer */
1139 if (direction == GMX_SUM_QGRID_FORWARD)
1141 p = grid + (recv_index0-pme->pmegrid_start_ix)*(pme->pmegrid_ny*pme->pmegrid_nz);
1142 for (i = 0; i < recv_nindex*datasize; i++)
1144 p[i] += overlap->recvbuf[i];
1153 copy_pmegrid_to_fftgrid(gmx_pme_t pme, real *pmegrid, real *fftgrid)
1155 ivec local_fft_ndata, local_fft_offset, local_fft_size;
1156 ivec local_pme_size;
1160 /* Dimensions should be identical for A/B grid, so we just use A here */
1161 gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
1166 local_pme_size[0] = pme->pmegrid_nx;
1167 local_pme_size[1] = pme->pmegrid_ny;
1168 local_pme_size[2] = pme->pmegrid_nz;
1170 /* The fftgrid is always 'justified' to the lower-left corner of the PME grid,
1171 the offset is identical, and the PME grid always has more data (due to overlap)
1176 char fn[STRLEN], format[STRLEN];
1178 sprintf(fn, "pmegrid%d.pdb", pme->nodeid);
1179 fp = ffopen(fn, "w");
1180 sprintf(fn, "pmegrid%d.txt", pme->nodeid);
1181 fp2 = ffopen(fn, "w");
1182 sprintf(format, "%s%s\n", pdbformat, "%6.2f%6.2f");
1185 for (ix = 0; ix < local_fft_ndata[XX]; ix++)
1187 for (iy = 0; iy < local_fft_ndata[YY]; iy++)
1189 for (iz = 0; iz < local_fft_ndata[ZZ]; iz++)
1191 pmeidx = ix*(local_pme_size[YY]*local_pme_size[ZZ])+iy*(local_pme_size[ZZ])+iz;
1192 fftidx = ix*(local_fft_size[YY]*local_fft_size[ZZ])+iy*(local_fft_size[ZZ])+iz;
1193 fftgrid[fftidx] = pmegrid[pmeidx];
1195 val = 100*pmegrid[pmeidx];
1196 if (pmegrid[pmeidx] != 0)
1198 fprintf(fp, format, "ATOM", pmeidx, "CA", "GLY", ' ', pmeidx, ' ',
1199 5.0*ix, 5.0*iy, 5.0*iz, 1.0, val);
1201 if (pmegrid[pmeidx] != 0)
1203 fprintf(fp2, "%-12s %5d %5d %5d %12.5e\n",
1205 pme->pmegrid_start_ix + ix,
1206 pme->pmegrid_start_iy + iy,
1207 pme->pmegrid_start_iz + iz,
1223 static gmx_cycles_t omp_cyc_start()
1225 return gmx_cycles_read();
1228 static gmx_cycles_t omp_cyc_end(gmx_cycles_t c)
1230 return gmx_cycles_read() - c;
1235 copy_fftgrid_to_pmegrid(gmx_pme_t pme, const real *fftgrid, real *pmegrid,
1236 int nthread, int thread)
1238 ivec local_fft_ndata, local_fft_offset, local_fft_size;
1239 ivec local_pme_size;
1240 int ixy0, ixy1, ixy, ix, iy, iz;
1242 #ifdef PME_TIME_THREADS
1244 static double cs1 = 0;
1248 #ifdef PME_TIME_THREADS
1249 c1 = omp_cyc_start();
1251 /* Dimensions should be identical for A/B grid, so we just use A here */
1252 gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
1257 local_pme_size[0] = pme->pmegrid_nx;
1258 local_pme_size[1] = pme->pmegrid_ny;
1259 local_pme_size[2] = pme->pmegrid_nz;
1261 /* The fftgrid is always 'justified' to the lower-left corner of the PME grid,
1262 the offset is identical, and the PME grid always has more data (due to overlap)
1264 ixy0 = ((thread )*local_fft_ndata[XX]*local_fft_ndata[YY])/nthread;
1265 ixy1 = ((thread+1)*local_fft_ndata[XX]*local_fft_ndata[YY])/nthread;
1267 for (ixy = ixy0; ixy < ixy1; ixy++)
1269 ix = ixy/local_fft_ndata[YY];
1270 iy = ixy - ix*local_fft_ndata[YY];
1272 pmeidx = (ix*local_pme_size[YY] + iy)*local_pme_size[ZZ];
1273 fftidx = (ix*local_fft_size[YY] + iy)*local_fft_size[ZZ];
1274 for (iz = 0; iz < local_fft_ndata[ZZ]; iz++)
1276 pmegrid[pmeidx+iz] = fftgrid[fftidx+iz];
1280 #ifdef PME_TIME_THREADS
1281 c1 = omp_cyc_end(c1);
1286 printf("copy %.2f\n", cs1*1e-9);
1295 wrap_periodic_pmegrid(gmx_pme_t pme, real *pmegrid)
1297 int nx, ny, nz, pnx, pny, pnz, ny_x, overlap, ix, iy, iz;
1303 pnx = pme->pmegrid_nx;
1304 pny = pme->pmegrid_ny;
1305 pnz = pme->pmegrid_nz;
1307 overlap = pme->pme_order - 1;
1309 /* Add periodic overlap in z */
1310 for (ix = 0; ix < pme->pmegrid_nx; ix++)
1312 for (iy = 0; iy < pme->pmegrid_ny; iy++)
1314 for (iz = 0; iz < overlap; iz++)
1316 pmegrid[(ix*pny+iy)*pnz+iz] +=
1317 pmegrid[(ix*pny+iy)*pnz+nz+iz];
1322 if (pme->nnodes_minor == 1)
1324 for (ix = 0; ix < pme->pmegrid_nx; ix++)
1326 for (iy = 0; iy < overlap; iy++)
1328 for (iz = 0; iz < nz; iz++)
1330 pmegrid[(ix*pny+iy)*pnz+iz] +=
1331 pmegrid[(ix*pny+ny+iy)*pnz+iz];
1337 if (pme->nnodes_major == 1)
1339 ny_x = (pme->nnodes_minor == 1 ? ny : pme->pmegrid_ny);
1341 for (ix = 0; ix < overlap; ix++)
1343 for (iy = 0; iy < ny_x; iy++)
1345 for (iz = 0; iz < nz; iz++)
1347 pmegrid[(ix*pny+iy)*pnz+iz] +=
1348 pmegrid[((nx+ix)*pny+iy)*pnz+iz];
1357 unwrap_periodic_pmegrid(gmx_pme_t pme, real *pmegrid)
1359 int nx, ny, nz, pnx, pny, pnz, ny_x, overlap, ix;
1365 pnx = pme->pmegrid_nx;
1366 pny = pme->pmegrid_ny;
1367 pnz = pme->pmegrid_nz;
1369 overlap = pme->pme_order - 1;
1371 if (pme->nnodes_major == 1)
1373 ny_x = (pme->nnodes_minor == 1 ? ny : pme->pmegrid_ny);
1375 for (ix = 0; ix < overlap; ix++)
1379 for (iy = 0; iy < ny_x; iy++)
1381 for (iz = 0; iz < nz; iz++)
1383 pmegrid[((nx+ix)*pny+iy)*pnz+iz] =
1384 pmegrid[(ix*pny+iy)*pnz+iz];
1390 if (pme->nnodes_minor == 1)
1392 #pragma omp parallel for num_threads(pme->nthread) schedule(static)
1393 for (ix = 0; ix < pme->pmegrid_nx; ix++)
1397 for (iy = 0; iy < overlap; iy++)
1399 for (iz = 0; iz < nz; iz++)
1401 pmegrid[(ix*pny+ny+iy)*pnz+iz] =
1402 pmegrid[(ix*pny+iy)*pnz+iz];
1408 /* Copy periodic overlap in z */
1409 #pragma omp parallel for num_threads(pme->nthread) schedule(static)
1410 for (ix = 0; ix < pme->pmegrid_nx; ix++)
1414 for (iy = 0; iy < pme->pmegrid_ny; iy++)
1416 for (iz = 0; iz < overlap; iz++)
1418 pmegrid[(ix*pny+iy)*pnz+nz+iz] =
1419 pmegrid[(ix*pny+iy)*pnz+iz];
1425 static void clear_grid(int nx, int ny, int nz, real *grid,
1427 int fx, int fy, int fz,
1431 int fsx, fsy, fsz, gx, gy, gz, g0x, g0y, x, y, z;
1434 nc = 2 + (order - 2)/FLBS;
1435 ncz = 2 + (order - 2)/FLBSZ;
1437 for (fsx = fx; fsx < fx+nc; fsx++)
1439 for (fsy = fy; fsy < fy+nc; fsy++)
1441 for (fsz = fz; fsz < fz+ncz; fsz++)
1443 flind = (fsx*fs[YY] + fsy)*fs[ZZ] + fsz;
1444 if (flag[flind] == 0)
1449 g0x = (gx*ny + gy)*nz + gz;
1450 for (x = 0; x < FLBS; x++)
1453 for (y = 0; y < FLBS; y++)
1455 for (z = 0; z < FLBSZ; z++)
1471 /* This has to be a macro to enable full compiler optimization with xlC (and probably others too) */
1472 #define DO_BSPLINE(order) \
1473 for (ithx = 0; (ithx < order); ithx++) \
1475 index_x = (i0+ithx)*pny*pnz; \
1476 valx = qn*thx[ithx]; \
1478 for (ithy = 0; (ithy < order); ithy++) \
1480 valxy = valx*thy[ithy]; \
1481 index_xy = index_x+(j0+ithy)*pnz; \
1483 for (ithz = 0; (ithz < order); ithz++) \
1485 index_xyz = index_xy+(k0+ithz); \
1486 grid[index_xyz] += valxy*thz[ithz]; \
1492 static void spread_q_bsplines_thread(pmegrid_t *pmegrid,
1493 pme_atomcomm_t *atc, splinedata_t *spline,
1494 pme_spline_work_t *work)
1497 /* spread charges from home atoms to local grid */
1500 int b, i, nn, n, ithx, ithy, ithz, i0, j0, k0;
1502 int order, norder, index_x, index_xy, index_xyz;
1503 real valx, valxy, qn;
1504 real *thx, *thy, *thz;
1505 int localsize, bndsize;
1506 int pnx, pny, pnz, ndatatot;
1507 int offx, offy, offz;
1509 pnx = pmegrid->s[XX];
1510 pny = pmegrid->s[YY];
1511 pnz = pmegrid->s[ZZ];
1513 offx = pmegrid->offset[XX];
1514 offy = pmegrid->offset[YY];
1515 offz = pmegrid->offset[ZZ];
1517 ndatatot = pnx*pny*pnz;
1518 grid = pmegrid->grid;
1519 for (i = 0; i < ndatatot; i++)
1524 order = pmegrid->order;
1526 for (nn = 0; nn < spline->n; nn++)
1528 n = spline->ind[nn];
1533 idxptr = atc->idx[n];
1536 i0 = idxptr[XX] - offx;
1537 j0 = idxptr[YY] - offy;
1538 k0 = idxptr[ZZ] - offz;
1540 thx = spline->theta[XX] + norder;
1541 thy = spline->theta[YY] + norder;
1542 thz = spline->theta[ZZ] + norder;
1548 #ifdef PME_SSE_UNALIGNED
1549 #define PME_SPREAD_SSE_ORDER4
1551 #define PME_SPREAD_SSE_ALIGNED
1554 #include "pme_sse_single.h"
1561 #define PME_SPREAD_SSE_ALIGNED
1563 #include "pme_sse_single.h"
1576 static void set_grid_alignment(int *pmegrid_nz, int pme_order)
1580 #ifndef PME_SSE_UNALIGNED
1585 /* Round nz up to a multiple of 4 to ensure alignment */
1586 *pmegrid_nz = ((*pmegrid_nz + 3) & ~3);
1591 static void set_gridsize_alignment(int *gridsize, int pme_order)
1594 #ifndef PME_SSE_UNALIGNED
1597 /* Add extra elements to ensured aligned operations do not go
1598 * beyond the allocated grid size.
1599 * Note that for pme_order=5, the pme grid z-size alignment
1600 * ensures that we will not go beyond the grid size.
1608 static void pmegrid_init(pmegrid_t *grid,
1609 int cx, int cy, int cz,
1610 int x0, int y0, int z0,
1611 int x1, int y1, int z1,
1612 gmx_bool set_alignment,
1621 grid->offset[XX] = x0;
1622 grid->offset[YY] = y0;
1623 grid->offset[ZZ] = z0;
1624 grid->n[XX] = x1 - x0 + pme_order - 1;
1625 grid->n[YY] = y1 - y0 + pme_order - 1;
1626 grid->n[ZZ] = z1 - z0 + pme_order - 1;
1627 copy_ivec(grid->n, grid->s);
1630 set_grid_alignment(&nz, pme_order);
1635 else if (nz != grid->s[ZZ])
1637 gmx_incons("pmegrid_init call with an unaligned z size");
1640 grid->order = pme_order;
1643 gridsize = grid->s[XX]*grid->s[YY]*grid->s[ZZ];
1644 set_gridsize_alignment(&gridsize, pme_order);
1645 snew_aligned(grid->grid, gridsize, 16);
1653 static int div_round_up(int enumerator, int denominator)
1655 return (enumerator + denominator - 1)/denominator;
1658 static void make_subgrid_division(const ivec n, int ovl, int nthread,
1661 int gsize_opt, gsize;
1666 for (nsx = 1; nsx <= nthread; nsx++)
1668 if (nthread % nsx == 0)
1670 for (nsy = 1; nsy <= nthread; nsy++)
1672 if (nsx*nsy <= nthread && nthread % (nsx*nsy) == 0)
1674 nsz = nthread/(nsx*nsy);
1676 /* Determine the number of grid points per thread */
1678 (div_round_up(n[XX], nsx) + ovl)*
1679 (div_round_up(n[YY], nsy) + ovl)*
1680 (div_round_up(n[ZZ], nsz) + ovl);
1682 /* Minimize the number of grids points per thread
1683 * and, secondarily, the number of cuts in minor dimensions.
1685 if (gsize_opt == -1 ||
1686 gsize < gsize_opt ||
1687 (gsize == gsize_opt &&
1688 (nsz < nsub[ZZ] || (nsz == nsub[ZZ] && nsy < nsub[YY]))))
1700 env = getenv("GMX_PME_THREAD_DIVISION");
1703 sscanf(env, "%d %d %d", &nsub[XX], &nsub[YY], &nsub[ZZ]);
1706 if (nsub[XX]*nsub[YY]*nsub[ZZ] != nthread)
1708 gmx_fatal(FARGS, "PME grid thread division (%d x %d x %d) does not match the total number of threads (%d)", nsub[XX], nsub[YY], nsub[ZZ], nthread);
1712 static void pmegrids_init(pmegrids_t *grids,
1713 int nx, int ny, int nz, int nz_base,
1715 gmx_bool bUseThreads,
1720 ivec n, n_base, g0, g1;
1721 int t, x, y, z, d, i, tfac;
1722 int max_comm_lines = -1;
1724 n[XX] = nx - (pme_order - 1);
1725 n[YY] = ny - (pme_order - 1);
1726 n[ZZ] = nz - (pme_order - 1);
1728 copy_ivec(n, n_base);
1729 n_base[ZZ] = nz_base;
1731 pmegrid_init(&grids->grid, 0, 0, 0, 0, 0, 0, n[XX], n[YY], n[ZZ], FALSE, pme_order,
1734 grids->nthread = nthread;
1736 make_subgrid_division(n_base, pme_order-1, grids->nthread, grids->nc);
1743 for (d = 0; d < DIM; d++)
1745 nst[d] = div_round_up(n[d], grids->nc[d]) + pme_order - 1;
1747 set_grid_alignment(&nst[ZZ], pme_order);
1751 fprintf(debug, "pmegrid thread local division: %d x %d x %d\n",
1752 grids->nc[XX], grids->nc[YY], grids->nc[ZZ]);
1753 fprintf(debug, "pmegrid %d %d %d max thread pmegrid %d %d %d\n",
1755 nst[XX], nst[YY], nst[ZZ]);
1758 snew(grids->grid_th, grids->nthread);
1760 gridsize = nst[XX]*nst[YY]*nst[ZZ];
1761 set_gridsize_alignment(&gridsize, pme_order);
1762 snew_aligned(grids->grid_all,
1763 grids->nthread*gridsize+(grids->nthread+1)*GMX_CACHE_SEP,
1766 for (x = 0; x < grids->nc[XX]; x++)
1768 for (y = 0; y < grids->nc[YY]; y++)
1770 for (z = 0; z < grids->nc[ZZ]; z++)
1772 pmegrid_init(&grids->grid_th[t],
1774 (n[XX]*(x ))/grids->nc[XX],
1775 (n[YY]*(y ))/grids->nc[YY],
1776 (n[ZZ]*(z ))/grids->nc[ZZ],
1777 (n[XX]*(x+1))/grids->nc[XX],
1778 (n[YY]*(y+1))/grids->nc[YY],
1779 (n[ZZ]*(z+1))/grids->nc[ZZ],
1782 grids->grid_all+GMX_CACHE_SEP+t*(gridsize+GMX_CACHE_SEP));
1790 grids->grid_th = NULL;
1793 snew(grids->g2t, DIM);
1795 for (d = DIM-1; d >= 0; d--)
1797 snew(grids->g2t[d], n[d]);
1799 for (i = 0; i < n[d]; i++)
1801 /* The second check should match the parameters
1802 * of the pmegrid_init call above.
1804 while (t + 1 < grids->nc[d] && i >= (n[d]*(t+1))/grids->nc[d])
1808 grids->g2t[d][i] = t*tfac;
1811 tfac *= grids->nc[d];
1815 case XX: max_comm_lines = overlap_x; break;
1816 case YY: max_comm_lines = overlap_y; break;
1817 case ZZ: max_comm_lines = pme_order - 1; break;
1819 grids->nthread_comm[d] = 0;
1820 while ((n[d]*grids->nthread_comm[d])/grids->nc[d] < max_comm_lines &&
1821 grids->nthread_comm[d] < grids->nc[d])
1823 grids->nthread_comm[d]++;
1827 fprintf(debug, "pmegrid thread grid communication range in %c: %d\n",
1828 'x'+d, grids->nthread_comm[d]);
1830 /* It should be possible to make grids->nthread_comm[d]==grids->nc[d]
1831 * work, but this is not a problematic restriction.
1833 if (grids->nc[d] > 1 && grids->nthread_comm[d] > grids->nc[d])
1835 gmx_fatal(FARGS, "Too many threads for PME (%d) compared to the number of grid lines, reduce the number of threads doing PME", grids->nthread);
1841 static void pmegrids_destroy(pmegrids_t *grids)
1845 if (grids->grid.grid != NULL)
1847 sfree(grids->grid.grid);
1849 if (grids->nthread > 0)
1851 for (t = 0; t < grids->nthread; t++)
1853 sfree(grids->grid_th[t].grid);
1855 sfree(grids->grid_th);
1861 static void realloc_work(pme_work_t *work, int nkx)
1863 if (nkx > work->nalloc)
1866 srenew(work->mhx, work->nalloc);
1867 srenew(work->mhy, work->nalloc);
1868 srenew(work->mhz, work->nalloc);
1869 srenew(work->m2, work->nalloc);
1870 /* Allocate an aligned pointer for SSE operations, including 3 extra
1871 * elements at the end since SSE operates on 4 elements at a time.
1873 sfree_aligned(work->denom);
1874 sfree_aligned(work->tmp1);
1875 sfree_aligned(work->eterm);
1876 snew_aligned(work->denom, work->nalloc+3, 16);
1877 snew_aligned(work->tmp1, work->nalloc+3, 16);
1878 snew_aligned(work->eterm, work->nalloc+3, 16);
1879 srenew(work->m2inv, work->nalloc);
1884 static void free_work(pme_work_t *work)
1890 sfree_aligned(work->denom);
1891 sfree_aligned(work->tmp1);
1892 sfree_aligned(work->eterm);
1898 /* Calculate exponentials through SSE in float precision */
1899 inline static void calc_exponentials(int start, int end, real f, real *d_aligned, real *r_aligned, real *e_aligned)
1902 const __m128 two = _mm_set_ps(2.0f, 2.0f, 2.0f, 2.0f);
1905 __m128 tmp_d1, d_inv, tmp_r, tmp_e;
1907 f_sse = _mm_load1_ps(&f);
1908 for (kx = 0; kx < end; kx += 4)
1910 tmp_d1 = _mm_load_ps(d_aligned+kx);
1911 lu = _mm_rcp_ps(tmp_d1);
1912 d_inv = _mm_mul_ps(lu, _mm_sub_ps(two, _mm_mul_ps(lu, tmp_d1)));
1913 tmp_r = _mm_load_ps(r_aligned+kx);
1914 tmp_r = gmx_mm_exp_ps(tmp_r);
1915 tmp_e = _mm_mul_ps(f_sse, d_inv);
1916 tmp_e = _mm_mul_ps(tmp_e, tmp_r);
1917 _mm_store_ps(e_aligned+kx, tmp_e);
1922 inline static void calc_exponentials(int start, int end, real f, real *d, real *r, real *e)
1925 for (kx = start; kx < end; kx++)
1929 for (kx = start; kx < end; kx++)
1933 for (kx = start; kx < end; kx++)
1935 e[kx] = f*r[kx]*d[kx];
1941 static int solve_pme_yzx(gmx_pme_t pme, t_complex *grid,
1942 real ewaldcoeff, real vol,
1944 int nthread, int thread)
1946 /* do recip sum over local cells in grid */
1947 /* y major, z middle, x minor or continuous */
1949 int kx, ky, kz, maxkx, maxky, maxkz;
1950 int nx, ny, nz, iyz0, iyz1, iyz, iy, iz, kxstart, kxend;
1952 real factor = M_PI*M_PI/(ewaldcoeff*ewaldcoeff);
1953 real ets2, struct2, vfactor, ets2vf;
1954 real d1, d2, energy = 0;
1956 real virxx = 0, virxy = 0, virxz = 0, viryy = 0, viryz = 0, virzz = 0;
1957 real rxx, ryx, ryy, rzx, rzy, rzz;
1959 real *mhx, *mhy, *mhz, *m2, *denom, *tmp1, *eterm, *m2inv;
1960 real mhxk, mhyk, mhzk, m2k;
1963 ivec local_ndata, local_offset, local_size;
1966 elfac = ONE_4PI_EPS0/pme->epsilon_r;
1972 /* Dimensions should be identical for A/B grid, so we just use A here */
1973 gmx_parallel_3dfft_complex_limits(pme->pfft_setupA,
1979 rxx = pme->recipbox[XX][XX];
1980 ryx = pme->recipbox[YY][XX];
1981 ryy = pme->recipbox[YY][YY];
1982 rzx = pme->recipbox[ZZ][XX];
1983 rzy = pme->recipbox[ZZ][YY];
1984 rzz = pme->recipbox[ZZ][ZZ];
1990 work = &pme->work[thread];
1995 denom = work->denom;
1997 eterm = work->eterm;
1998 m2inv = work->m2inv;
2000 iyz0 = local_ndata[YY]*local_ndata[ZZ]* thread /nthread;
2001 iyz1 = local_ndata[YY]*local_ndata[ZZ]*(thread+1)/nthread;
2003 for (iyz = iyz0; iyz < iyz1; iyz++)
2005 iy = iyz/local_ndata[ZZ];
2006 iz = iyz - iy*local_ndata[ZZ];
2008 ky = iy + local_offset[YY];
2019 by = M_PI*vol*pme->bsp_mod[YY][ky];
2021 kz = iz + local_offset[ZZ];
2025 bz = pme->bsp_mod[ZZ][kz];
2027 /* 0.5 correction for corner points */
2029 if (kz == 0 || kz == (nz+1)/2)
2034 p0 = grid + iy*local_size[ZZ]*local_size[XX] + iz*local_size[XX];
2036 /* We should skip the k-space point (0,0,0) */
2037 if (local_offset[XX] > 0 || ky > 0 || kz > 0)
2039 kxstart = local_offset[XX];
2043 kxstart = local_offset[XX] + 1;
2046 kxend = local_offset[XX] + local_ndata[XX];
2050 /* More expensive inner loop, especially because of the storage
2051 * of the mh elements in array's.
2052 * Because x is the minor grid index, all mh elements
2053 * depend on kx for triclinic unit cells.
2056 /* Two explicit loops to avoid a conditional inside the loop */
2057 for (kx = kxstart; kx < maxkx; kx++)
2062 mhyk = mx * ryx + my * ryy;
2063 mhzk = mx * rzx + my * rzy + mz * rzz;
2064 m2k = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
2069 denom[kx] = m2k*bz*by*pme->bsp_mod[XX][kx];
2070 tmp1[kx] = -factor*m2k;
2073 for (kx = maxkx; kx < kxend; kx++)
2078 mhyk = mx * ryx + my * ryy;
2079 mhzk = mx * rzx + my * rzy + mz * rzz;
2080 m2k = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
2085 denom[kx] = m2k*bz*by*pme->bsp_mod[XX][kx];
2086 tmp1[kx] = -factor*m2k;
2089 for (kx = kxstart; kx < kxend; kx++)
2091 m2inv[kx] = 1.0/m2[kx];
2094 calc_exponentials(kxstart, kxend, elfac, denom, tmp1, eterm);
2096 for (kx = kxstart; kx < kxend; kx++, p0++)
2101 p0->re = d1*eterm[kx];
2102 p0->im = d2*eterm[kx];
2104 struct2 = 2.0*(d1*d1+d2*d2);
2106 tmp1[kx] = eterm[kx]*struct2;
2109 for (kx = kxstart; kx < kxend; kx++)
2111 ets2 = corner_fac*tmp1[kx];
2112 vfactor = (factor*m2[kx] + 1.0)*2.0*m2inv[kx];
2115 ets2vf = ets2*vfactor;
2116 virxx += ets2vf*mhx[kx]*mhx[kx] - ets2;
2117 virxy += ets2vf*mhx[kx]*mhy[kx];
2118 virxz += ets2vf*mhx[kx]*mhz[kx];
2119 viryy += ets2vf*mhy[kx]*mhy[kx] - ets2;
2120 viryz += ets2vf*mhy[kx]*mhz[kx];
2121 virzz += ets2vf*mhz[kx]*mhz[kx] - ets2;
2126 /* We don't need to calculate the energy and the virial.
2127 * In this case the triclinic overhead is small.
2130 /* Two explicit loops to avoid a conditional inside the loop */
2132 for (kx = kxstart; kx < maxkx; kx++)
2137 mhyk = mx * ryx + my * ryy;
2138 mhzk = mx * rzx + my * rzy + mz * rzz;
2139 m2k = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
2140 denom[kx] = m2k*bz*by*pme->bsp_mod[XX][kx];
2141 tmp1[kx] = -factor*m2k;
2144 for (kx = maxkx; kx < kxend; kx++)
2149 mhyk = mx * ryx + my * ryy;
2150 mhzk = mx * rzx + my * rzy + mz * rzz;
2151 m2k = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
2152 denom[kx] = m2k*bz*by*pme->bsp_mod[XX][kx];
2153 tmp1[kx] = -factor*m2k;
2156 calc_exponentials(kxstart, kxend, elfac, denom, tmp1, eterm);
2158 for (kx = kxstart; kx < kxend; kx++, p0++)
2163 p0->re = d1*eterm[kx];
2164 p0->im = d2*eterm[kx];
2171 /* Update virial with local values.
2172 * The virial is symmetric by definition.
2173 * this virial seems ok for isotropic scaling, but I'm
2174 * experiencing problems on semiisotropic membranes.
2175 * IS THAT COMMENT STILL VALID??? (DvdS, 2001/02/07).
2177 work->vir[XX][XX] = 0.25*virxx;
2178 work->vir[YY][YY] = 0.25*viryy;
2179 work->vir[ZZ][ZZ] = 0.25*virzz;
2180 work->vir[XX][YY] = work->vir[YY][XX] = 0.25*virxy;
2181 work->vir[XX][ZZ] = work->vir[ZZ][XX] = 0.25*virxz;
2182 work->vir[YY][ZZ] = work->vir[ZZ][YY] = 0.25*viryz;
2184 /* This energy should be corrected for a charged system */
2185 work->energy = 0.5*energy;
2188 /* Return the loop count */
2189 return local_ndata[YY]*local_ndata[XX];
2192 static void get_pme_ener_vir(const gmx_pme_t pme, int nthread,
2193 real *mesh_energy, matrix vir)
2195 /* This function sums output over threads
2196 * and should therefore only be called after thread synchronization.
2200 *mesh_energy = pme->work[0].energy;
2201 copy_mat(pme->work[0].vir, vir);
2203 for (thread = 1; thread < nthread; thread++)
2205 *mesh_energy += pme->work[thread].energy;
2206 m_add(vir, pme->work[thread].vir, vir);
2210 #define DO_FSPLINE(order) \
2211 for (ithx = 0; (ithx < order); ithx++) \
2213 index_x = (i0+ithx)*pny*pnz; \
2217 for (ithy = 0; (ithy < order); ithy++) \
2219 index_xy = index_x+(j0+ithy)*pnz; \
2224 for (ithz = 0; (ithz < order); ithz++) \
2226 gval = grid[index_xy+(k0+ithz)]; \
2227 fxy1 += thz[ithz]*gval; \
2228 fz1 += dthz[ithz]*gval; \
2237 static void gather_f_bsplines(gmx_pme_t pme, real *grid,
2238 gmx_bool bClearF, pme_atomcomm_t *atc,
2239 splinedata_t *spline,
2242 /* sum forces for local particles */
2243 int nn, n, ithx, ithy, ithz, i0, j0, k0;
2244 int index_x, index_xy;
2245 int nx, ny, nz, pnx, pny, pnz;
2247 real tx, ty, dx, dy, qn;
2248 real fx, fy, fz, gval;
2250 real *thx, *thy, *thz, *dthx, *dthy, *dthz;
2252 real rxx, ryx, ryy, rzx, rzy, rzz;
2255 pme_spline_work_t *work;
2257 work = pme->spline_work;
2259 order = pme->pme_order;
2260 thx = spline->theta[XX];
2261 thy = spline->theta[YY];
2262 thz = spline->theta[ZZ];
2263 dthx = spline->dtheta[XX];
2264 dthy = spline->dtheta[YY];
2265 dthz = spline->dtheta[ZZ];
2269 pnx = pme->pmegrid_nx;
2270 pny = pme->pmegrid_ny;
2271 pnz = pme->pmegrid_nz;
2273 rxx = pme->recipbox[XX][XX];
2274 ryx = pme->recipbox[YY][XX];
2275 ryy = pme->recipbox[YY][YY];
2276 rzx = pme->recipbox[ZZ][XX];
2277 rzy = pme->recipbox[ZZ][YY];
2278 rzz = pme->recipbox[ZZ][ZZ];
2280 for (nn = 0; nn < spline->n; nn++)
2282 n = spline->ind[nn];
2283 qn = scale*atc->q[n];
2296 idxptr = atc->idx[n];
2303 /* Pointer arithmetic alert, next six statements */
2304 thx = spline->theta[XX] + norder;
2305 thy = spline->theta[YY] + norder;
2306 thz = spline->theta[ZZ] + norder;
2307 dthx = spline->dtheta[XX] + norder;
2308 dthy = spline->dtheta[YY] + norder;
2309 dthz = spline->dtheta[ZZ] + norder;
2315 #ifdef PME_SSE_UNALIGNED
2316 #define PME_GATHER_F_SSE_ORDER4
2318 #define PME_GATHER_F_SSE_ALIGNED
2321 #include "pme_sse_single.h"
2328 #define PME_GATHER_F_SSE_ALIGNED
2330 #include "pme_sse_single.h"
2340 atc->f[n][XX] += -qn*( fx*nx*rxx );
2341 atc->f[n][YY] += -qn*( fx*nx*ryx + fy*ny*ryy );
2342 atc->f[n][ZZ] += -qn*( fx*nx*rzx + fy*ny*rzy + fz*nz*rzz );
2345 /* Since the energy and not forces are interpolated
2346 * the net force might not be exactly zero.
2347 * This can be solved by also interpolating F, but
2348 * that comes at a cost.
2349 * A better hack is to remove the net force every
2350 * step, but that must be done at a higher level
2351 * since this routine doesn't see all atoms if running
2352 * in parallel. Don't know how important it is? EL 990726
2357 static real gather_energy_bsplines(gmx_pme_t pme, real *grid,
2358 pme_atomcomm_t *atc)
2360 splinedata_t *spline;
2361 int n, ithx, ithy, ithz, i0, j0, k0;
2362 int index_x, index_xy;
2364 real energy, pot, tx, ty, qn, gval;
2365 real *thx, *thy, *thz;
2369 spline = &atc->spline[0];
2371 order = pme->pme_order;
2374 for (n = 0; (n < atc->n); n++)
2380 idxptr = atc->idx[n];
2387 /* Pointer arithmetic alert, next three statements */
2388 thx = spline->theta[XX] + norder;
2389 thy = spline->theta[YY] + norder;
2390 thz = spline->theta[ZZ] + norder;
2393 for (ithx = 0; (ithx < order); ithx++)
2395 index_x = (i0+ithx)*pme->pmegrid_ny*pme->pmegrid_nz;
2398 for (ithy = 0; (ithy < order); ithy++)
2400 index_xy = index_x+(j0+ithy)*pme->pmegrid_nz;
2403 for (ithz = 0; (ithz < order); ithz++)
2405 gval = grid[index_xy+(k0+ithz)];
2406 pot += tx*ty*thz[ithz]*gval;
2419 /* Macro to force loop unrolling by fixing order.
2420 * This gives a significant performance gain.
2422 #define CALC_SPLINE(order) \
2426 real data[PME_ORDER_MAX]; \
2427 real ddata[PME_ORDER_MAX]; \
2429 for (j = 0; (j < DIM); j++) \
2433 /* dr is relative offset from lower cell limit */ \
2434 data[order-1] = 0; \
2438 for (k = 3; (k < order); k++) \
2440 div = 1.0/(k - 1.0); \
2441 data[k-1] = div*dr*data[k-2]; \
2442 for (l = 1; (l < (k-1)); l++) \
2444 data[k-l-1] = div*((dr+l)*data[k-l-2]+(k-l-dr)* \
2447 data[0] = div*(1-dr)*data[0]; \
2449 /* differentiate */ \
2450 ddata[0] = -data[0]; \
2451 for (k = 1; (k < order); k++) \
2453 ddata[k] = data[k-1] - data[k]; \
2456 div = 1.0/(order - 1); \
2457 data[order-1] = div*dr*data[order-2]; \
2458 for (l = 1; (l < (order-1)); l++) \
2460 data[order-l-1] = div*((dr+l)*data[order-l-2]+ \
2461 (order-l-dr)*data[order-l-1]); \
2463 data[0] = div*(1 - dr)*data[0]; \
2465 for (k = 0; k < order; k++) \
2467 theta[j][i*order+k] = data[k]; \
2468 dtheta[j][i*order+k] = ddata[k]; \
2473 void make_bsplines(splinevec theta, splinevec dtheta, int order,
2474 rvec fractx[], int nr, int ind[], real charge[],
2475 gmx_bool bFreeEnergy)
2477 /* construct splines for local atoms */
2481 for (i = 0; i < nr; i++)
2483 /* With free energy we do not use the charge check.
2484 * In most cases this will be more efficient than calling make_bsplines
2485 * twice, since usually more than half the particles have charges.
2488 if (bFreeEnergy || charge[ii] != 0.0)
2493 case 4: CALC_SPLINE(4); break;
2494 case 5: CALC_SPLINE(5); break;
2495 default: CALC_SPLINE(order); break;
2502 void make_dft_mod(real *mod, real *data, int ndata)
2507 for (i = 0; i < ndata; i++)
2510 for (j = 0; j < ndata; j++)
2512 arg = (2.0*M_PI*i*j)/ndata;
2513 sc += data[j]*cos(arg);
2514 ss += data[j]*sin(arg);
2516 mod[i] = sc*sc+ss*ss;
2518 for (i = 0; i < ndata; i++)
2522 mod[i] = (mod[i-1]+mod[i+1])*0.5;
2528 static void make_bspline_moduli(splinevec bsp_mod,
2529 int nx, int ny, int nz, int order)
2531 int nmax = max(nx, max(ny, nz));
2532 real *data, *ddata, *bsp_data;
2538 snew(bsp_data, nmax);
2544 for (k = 3; k < order; k++)
2548 for (l = 1; l < (k-1); l++)
2550 data[k-l-1] = div*(l*data[k-l-2]+(k-l)*data[k-l-1]);
2552 data[0] = div*data[0];
2555 ddata[0] = -data[0];
2556 for (k = 1; k < order; k++)
2558 ddata[k] = data[k-1]-data[k];
2560 div = 1.0/(order-1);
2562 for (l = 1; l < (order-1); l++)
2564 data[order-l-1] = div*(l*data[order-l-2]+(order-l)*data[order-l-1]);
2566 data[0] = div*data[0];
2568 for (i = 0; i < nmax; i++)
2572 for (i = 1; i <= order; i++)
2574 bsp_data[i] = data[i-1];
2577 make_dft_mod(bsp_mod[XX], bsp_data, nx);
2578 make_dft_mod(bsp_mod[YY], bsp_data, ny);
2579 make_dft_mod(bsp_mod[ZZ], bsp_data, nz);
2587 /* Return the P3M optimal influence function */
2588 static double do_p3m_influence(double z, int order)
2595 /* The formula and most constants can be found in:
2596 * Ballenegger et al., JCTC 8, 936 (2012)
2601 return 1.0 - 2.0*z2/3.0;
2604 return 1.0 - z2 + 2.0*z4/15.0;
2607 return 1.0 - 4.0*z2/3.0 + 2.0*z4/5.0 + 4.0*z2*z4/315.0;
2610 return 1.0 - 5.0*z2/3.0 + 7.0*z4/9.0 - 17.0*z2*z4/189.0 + 2.0*z4*z4/2835.0;
2613 return 1.0 - 2.0*z2 + 19.0*z4/15.0 - 256.0*z2*z4/945.0 + 62.0*z4*z4/4725.0 + 4.0*z2*z4*z4/155925.0;
2616 return 1.0 - 7.0*z2/3.0 + 28.0*z4/15.0 - 16.0*z2*z4/27.0 + 26.0*z4*z4/405.0 - 2.0*z2*z4*z4/1485.0 + 4.0*z4*z4*z4/6081075.0;
2618 return 1.0 - 8.0*z2/3.0 + 116.0*z4/45.0 - 344.0*z2*z4/315.0 + 914.0*z4*z4/4725.0 - 248.0*z4*z4*z2/22275.0 + 21844.0*z4*z4*z4/212837625.0 - 8.0*z4*z4*z4*z2/638512875.0;
2625 /* Calculate the P3M B-spline moduli for one dimension */
2626 static void make_p3m_bspline_moduli_dim(real *bsp_mod, int n, int order)
2628 double zarg, zai, sinzai, infl;
2633 gmx_fatal(FARGS, "The current P3M code only supports orders up to 8");
2640 for (i = -maxk; i < 0; i++)
2644 infl = do_p3m_influence(sinzai, order);
2645 bsp_mod[n+i] = infl*infl*pow(sinzai/zai, -2.0*order);
2648 for (i = 1; i < maxk; i++)
2652 infl = do_p3m_influence(sinzai, order);
2653 bsp_mod[i] = infl*infl*pow(sinzai/zai, -2.0*order);
2657 /* Calculate the P3M B-spline moduli */
2658 static void make_p3m_bspline_moduli(splinevec bsp_mod,
2659 int nx, int ny, int nz, int order)
2661 make_p3m_bspline_moduli_dim(bsp_mod[XX], nx, order);
2662 make_p3m_bspline_moduli_dim(bsp_mod[YY], ny, order);
2663 make_p3m_bspline_moduli_dim(bsp_mod[ZZ], nz, order);
2667 static void setup_coordinate_communication(pme_atomcomm_t *atc)
2675 for (i = 1; i <= nslab/2; i++)
2677 fw = (atc->nodeid + i) % nslab;
2678 bw = (atc->nodeid - i + nslab) % nslab;
2681 atc->node_dest[n] = fw;
2682 atc->node_src[n] = bw;
2687 atc->node_dest[n] = bw;
2688 atc->node_src[n] = fw;
2694 int gmx_pme_destroy(FILE *log, gmx_pme_t *pmedata)
2700 fprintf(log, "Destroying PME data structures.\n");
2703 sfree((*pmedata)->nnx);
2704 sfree((*pmedata)->nny);
2705 sfree((*pmedata)->nnz);
2707 pmegrids_destroy(&(*pmedata)->pmegridA);
2709 sfree((*pmedata)->fftgridA);
2710 sfree((*pmedata)->cfftgridA);
2711 gmx_parallel_3dfft_destroy((*pmedata)->pfft_setupA);
2713 if ((*pmedata)->pmegridB.grid.grid != NULL)
2715 pmegrids_destroy(&(*pmedata)->pmegridB);
2716 sfree((*pmedata)->fftgridB);
2717 sfree((*pmedata)->cfftgridB);
2718 gmx_parallel_3dfft_destroy((*pmedata)->pfft_setupB);
2720 for (thread = 0; thread < (*pmedata)->nthread; thread++)
2722 free_work(&(*pmedata)->work[thread]);
2724 sfree((*pmedata)->work);
2732 static int mult_up(int n, int f)
2734 return ((n + f - 1)/f)*f;
2738 static double pme_load_imbalance(gmx_pme_t pme)
2743 nma = pme->nnodes_major;
2744 nmi = pme->nnodes_minor;
2746 n1 = mult_up(pme->nkx, nma)*mult_up(pme->nky, nmi)*pme->nkz;
2747 n2 = mult_up(pme->nkx, nma)*mult_up(pme->nkz, nmi)*pme->nky;
2748 n3 = mult_up(pme->nky, nma)*mult_up(pme->nkz, nmi)*pme->nkx;
2750 /* pme_solve is roughly double the cost of an fft */
2752 return (n1 + n2 + 3*n3)/(double)(6*pme->nkx*pme->nky*pme->nkz);
2755 static void init_atomcomm(gmx_pme_t pme, pme_atomcomm_t *atc, t_commrec *cr,
2756 int dimind, gmx_bool bSpread)
2758 int nk, k, s, thread;
2760 atc->dimind = dimind;
2765 if (pme->nnodes > 1)
2767 atc->mpi_comm = pme->mpi_comm_d[dimind];
2768 MPI_Comm_size(atc->mpi_comm, &atc->nslab);
2769 MPI_Comm_rank(atc->mpi_comm, &atc->nodeid);
2773 fprintf(debug, "For PME atom communication in dimind %d: nslab %d rank %d\n", atc->dimind, atc->nslab, atc->nodeid);
2777 atc->bSpread = bSpread;
2778 atc->pme_order = pme->pme_order;
2782 /* These three allocations are not required for particle decomp. */
2783 snew(atc->node_dest, atc->nslab);
2784 snew(atc->node_src, atc->nslab);
2785 setup_coordinate_communication(atc);
2787 snew(atc->count_thread, pme->nthread);
2788 for (thread = 0; thread < pme->nthread; thread++)
2790 snew(atc->count_thread[thread], atc->nslab);
2792 atc->count = atc->count_thread[0];
2793 snew(atc->rcount, atc->nslab);
2794 snew(atc->buf_index, atc->nslab);
2797 atc->nthread = pme->nthread;
2798 if (atc->nthread > 1)
2800 snew(atc->thread_plist, atc->nthread);
2802 snew(atc->spline, atc->nthread);
2803 for (thread = 0; thread < atc->nthread; thread++)
2805 if (atc->nthread > 1)
2807 snew(atc->thread_plist[thread].n, atc->nthread+2*GMX_CACHE_SEP);
2808 atc->thread_plist[thread].n += GMX_CACHE_SEP;
2810 snew(atc->spline[thread].thread_one, pme->nthread);
2811 atc->spline[thread].thread_one[thread] = 1;
2816 init_overlap_comm(pme_overlap_t * ol,
2826 int lbnd, rbnd, maxlr, b, i;
2829 pme_grid_comm_t *pgc;
2831 int fft_start, fft_end, send_index1, recv_index1;
2835 ol->mpi_comm = comm;
2838 ol->nnodes = nnodes;
2839 ol->nodeid = nodeid;
2841 /* Linear translation of the PME grid won't affect reciprocal space
2842 * calculations, so to optimize we only interpolate "upwards",
2843 * which also means we only have to consider overlap in one direction.
2844 * I.e., particles on this node might also be spread to grid indices
2845 * that belong to higher nodes (modulo nnodes)
2848 snew(ol->s2g0, ol->nnodes+1);
2849 snew(ol->s2g1, ol->nnodes);
2852 fprintf(debug, "PME slab boundaries:");
2854 for (i = 0; i < nnodes; i++)
2856 /* s2g0 the local interpolation grid start.
2857 * s2g1 the local interpolation grid end.
2858 * Because grid overlap communication only goes forward,
2859 * the grid the slabs for fft's should be rounded down.
2861 ol->s2g0[i] = ( i *ndata + 0 )/nnodes;
2862 ol->s2g1[i] = ((i+1)*ndata + nnodes-1)/nnodes + norder - 1;
2866 fprintf(debug, " %3d %3d", ol->s2g0[i], ol->s2g1[i]);
2869 ol->s2g0[nnodes] = ndata;
2872 fprintf(debug, "\n");
2875 /* Determine with how many nodes we need to communicate the grid overlap */
2881 for (i = 0; i < nnodes; i++)
2883 if ((i+b < nnodes && ol->s2g1[i] > ol->s2g0[i+b]) ||
2884 (i+b >= nnodes && ol->s2g1[i] > ol->s2g0[i+b-nnodes] + ndata))
2890 while (bCont && b < nnodes);
2891 ol->noverlap_nodes = b - 1;
2893 snew(ol->send_id, ol->noverlap_nodes);
2894 snew(ol->recv_id, ol->noverlap_nodes);
2895 for (b = 0; b < ol->noverlap_nodes; b++)
2897 ol->send_id[b] = (ol->nodeid + (b + 1)) % ol->nnodes;
2898 ol->recv_id[b] = (ol->nodeid - (b + 1) + ol->nnodes) % ol->nnodes;
2900 snew(ol->comm_data, ol->noverlap_nodes);
2903 for (b = 0; b < ol->noverlap_nodes; b++)
2905 pgc = &ol->comm_data[b];
2907 fft_start = ol->s2g0[ol->send_id[b]];
2908 fft_end = ol->s2g0[ol->send_id[b]+1];
2909 if (ol->send_id[b] < nodeid)
2914 send_index1 = ol->s2g1[nodeid];
2915 send_index1 = min(send_index1, fft_end);
2916 pgc->send_index0 = fft_start;
2917 pgc->send_nindex = max(0, send_index1 - pgc->send_index0);
2918 ol->send_size += pgc->send_nindex;
2920 /* We always start receiving to the first index of our slab */
2921 fft_start = ol->s2g0[ol->nodeid];
2922 fft_end = ol->s2g0[ol->nodeid+1];
2923 recv_index1 = ol->s2g1[ol->recv_id[b]];
2924 if (ol->recv_id[b] > nodeid)
2926 recv_index1 -= ndata;
2928 recv_index1 = min(recv_index1, fft_end);
2929 pgc->recv_index0 = fft_start;
2930 pgc->recv_nindex = max(0, recv_index1 - pgc->recv_index0);
2934 /* Communicate the buffer sizes to receive */
2935 for (b = 0; b < ol->noverlap_nodes; b++)
2937 MPI_Sendrecv(&ol->send_size, 1, MPI_INT, ol->send_id[b], b,
2938 &ol->comm_data[b].recv_size, 1, MPI_INT, ol->recv_id[b], b,
2939 ol->mpi_comm, &stat);
2943 /* For non-divisible grid we need pme_order iso pme_order-1 */
2944 snew(ol->sendbuf, norder*commplainsize);
2945 snew(ol->recvbuf, norder*commplainsize);
2949 make_gridindex5_to_localindex(int n, int local_start, int local_range,
2950 int **global_to_local,
2951 real **fraction_shift)
2959 for (i = 0; (i < 5*n); i++)
2961 /* Determine the global to local grid index */
2962 gtl[i] = (i - local_start + n) % n;
2963 /* For coordinates that fall within the local grid the fraction
2964 * is correct, we don't need to shift it.
2967 if (local_range < n)
2969 /* Due to rounding issues i could be 1 beyond the lower or
2970 * upper boundary of the local grid. Correct the index for this.
2971 * If we shift the index, we need to shift the fraction by
2972 * the same amount in the other direction to not affect
2974 * Note that due to this shifting the weights at the end of
2975 * the spline might change, but that will only involve values
2976 * between zero and values close to the precision of a real,
2977 * which is anyhow the accuracy of the whole mesh calculation.
2979 /* With local_range=0 we should not change i=local_start */
2980 if (i % n != local_start)
2987 else if (gtl[i] == local_range)
2989 gtl[i] = local_range - 1;
2996 *global_to_local = gtl;
2997 *fraction_shift = fsh;
3000 static pme_spline_work_t *make_pme_spline_work(int order)
3002 pme_spline_work_t *work;
3009 snew_aligned(work, 1, 16);
3011 zero_SSE = _mm_setzero_ps();
3013 /* Generate bit masks to mask out the unused grid entries,
3014 * as we only operate on order of the 8 grid entries that are
3015 * load into 2 SSE float registers.
3017 for (of = 0; of < 8-(order-1); of++)
3019 for (i = 0; i < 8; i++)
3021 tmp[i] = (i >= of && i < of+order ? 1 : 0);
3023 work->mask_SSE0[of] = _mm_loadu_ps(tmp);
3024 work->mask_SSE1[of] = _mm_loadu_ps(tmp+4);
3025 work->mask_SSE0[of] = _mm_cmpgt_ps(work->mask_SSE0[of], zero_SSE);
3026 work->mask_SSE1[of] = _mm_cmpgt_ps(work->mask_SSE1[of], zero_SSE);
3036 gmx_pme_check_grid_restrictions(FILE *fplog, char dim, int nnodes, int *nk)
3040 if (*nk % nnodes != 0)
3042 nk_new = nnodes*(*nk/nnodes + 1);
3044 if (2*nk_new >= 3*(*nk))
3046 gmx_fatal(FARGS, "The PME grid size in dim %c (%d) is not divisble by the number of nodes doing PME in dim %c (%d). The grid size would have to be increased by more than 50%% to make the grid divisible. Change the total number of nodes or the number of domain decomposition cells in x or the PME grid %c dimension (and the cut-off).",
3047 dim, *nk, dim, nnodes, dim);
3052 fprintf(fplog, "\nNOTE: The PME grid size in dim %c (%d) is not divisble by the number of nodes doing PME in dim %c (%d). Increasing the PME grid size in dim %c to %d. This will increase the accuracy and will not decrease the performance significantly on this number of nodes. For optimal performance change the total number of nodes or the number of domain decomposition cells in x or the PME grid %c dimension (and the cut-off).\n\n",
3053 dim, *nk, dim, nnodes, dim, nk_new, dim);
3060 int gmx_pme_init(gmx_pme_t * pmedata,
3066 gmx_bool bFreeEnergy,
3067 gmx_bool bReproducible,
3070 gmx_pme_t pme = NULL;
3072 int use_threads, sum_use_threads;
3077 fprintf(debug, "Creating PME data structures.\n");
3081 pme->redist_init = FALSE;
3082 pme->sum_qgrid_tmp = NULL;
3083 pme->sum_qgrid_dd_tmp = NULL;
3084 pme->buf_nalloc = 0;
3085 pme->redist_buf_nalloc = 0;
3088 pme->bPPnode = TRUE;
3090 pme->nnodes_major = nnodes_major;
3091 pme->nnodes_minor = nnodes_minor;
3094 if (nnodes_major*nnodes_minor > 1)
3096 pme->mpi_comm = cr->mpi_comm_mygroup;
3098 MPI_Comm_rank(pme->mpi_comm, &pme->nodeid);
3099 MPI_Comm_size(pme->mpi_comm, &pme->nnodes);
3100 if (pme->nnodes != nnodes_major*nnodes_minor)
3102 gmx_incons("PME node count mismatch");
3107 pme->mpi_comm = MPI_COMM_NULL;
3111 if (pme->nnodes == 1)
3114 pme->mpi_comm_d[0] = MPI_COMM_NULL;
3115 pme->mpi_comm_d[1] = MPI_COMM_NULL;
3117 pme->ndecompdim = 0;
3118 pme->nodeid_major = 0;
3119 pme->nodeid_minor = 0;
3121 pme->mpi_comm_d[0] = pme->mpi_comm_d[1] = MPI_COMM_NULL;
3126 if (nnodes_minor == 1)
3129 pme->mpi_comm_d[0] = pme->mpi_comm;
3130 pme->mpi_comm_d[1] = MPI_COMM_NULL;
3132 pme->ndecompdim = 1;
3133 pme->nodeid_major = pme->nodeid;
3134 pme->nodeid_minor = 0;
3137 else if (nnodes_major == 1)
3140 pme->mpi_comm_d[0] = MPI_COMM_NULL;
3141 pme->mpi_comm_d[1] = pme->mpi_comm;
3143 pme->ndecompdim = 1;
3144 pme->nodeid_major = 0;
3145 pme->nodeid_minor = pme->nodeid;
3149 if (pme->nnodes % nnodes_major != 0)
3151 gmx_incons("For 2D PME decomposition, #PME nodes must be divisible by the number of nodes in the major dimension");
3153 pme->ndecompdim = 2;
3156 MPI_Comm_split(pme->mpi_comm, pme->nodeid % nnodes_minor,
3157 pme->nodeid, &pme->mpi_comm_d[0]); /* My communicator along major dimension */
3158 MPI_Comm_split(pme->mpi_comm, pme->nodeid/nnodes_minor,
3159 pme->nodeid, &pme->mpi_comm_d[1]); /* My communicator along minor dimension */
3161 MPI_Comm_rank(pme->mpi_comm_d[0], &pme->nodeid_major);
3162 MPI_Comm_size(pme->mpi_comm_d[0], &pme->nnodes_major);
3163 MPI_Comm_rank(pme->mpi_comm_d[1], &pme->nodeid_minor);
3164 MPI_Comm_size(pme->mpi_comm_d[1], &pme->nnodes_minor);
3167 pme->bPPnode = (cr->duty & DUTY_PP);
3170 pme->nthread = nthread;
3172 /* Check if any of the PME MPI ranks uses threads */
3173 use_threads = (pme->nthread > 1 ? 1 : 0);
3175 if (pme->nnodes > 1)
3177 MPI_Allreduce(&use_threads, &sum_use_threads, 1, MPI_INT,
3178 MPI_SUM, pme->mpi_comm);
3183 sum_use_threads = use_threads;
3185 pme->bUseThreads = (sum_use_threads > 0);
3187 if (ir->ePBC == epbcSCREW)
3189 gmx_fatal(FARGS, "pme does not (yet) work with pbc = screw");
3192 pme->bFEP = ((ir->efep != efepNO) && bFreeEnergy);
3196 pme->bP3M = (ir->coulombtype == eelP3M_AD || getenv("GMX_PME_P3M") != NULL);
3197 pme->pme_order = ir->pme_order;
3198 pme->epsilon_r = ir->epsilon_r;
3200 if (pme->pme_order > PME_ORDER_MAX)
3202 gmx_fatal(FARGS, "pme_order (%d) is larger than the maximum allowed value (%d). Modify and recompile the code if you really need such a high order.",
3203 pme->pme_order, PME_ORDER_MAX);
3206 /* Currently pme.c supports only the fft5d FFT code.
3207 * Therefore the grid always needs to be divisible by nnodes.
3208 * When the old 1D code is also supported again, change this check.
3210 * This check should be done before calling gmx_pme_init
3211 * and fplog should be passed iso stderr.
3213 if (pme->ndecompdim >= 2)
3215 if (pme->ndecompdim >= 1)
3218 gmx_pme_check_grid_restrictions(pme->nodeid==0 ? stderr : NULL,
3219 'x',nnodes_major,&pme->nkx);
3220 gmx_pme_check_grid_restrictions(pme->nodeid==0 ? stderr : NULL,
3221 'y',nnodes_minor,&pme->nky);
3225 if (pme->nkx <= pme->pme_order*(pme->nnodes_major > 1 ? 2 : 1) ||
3226 pme->nky <= pme->pme_order*(pme->nnodes_minor > 1 ? 2 : 1) ||
3227 pme->nkz <= pme->pme_order)
3229 gmx_fatal(FARGS, "The PME grid sizes need to be larger than pme_order (%d) and for dimensions with domain decomposition larger than 2*pme_order", pme->pme_order);
3232 if (pme->nnodes > 1)
3237 MPI_Type_contiguous(DIM, mpi_type, &(pme->rvec_mpi));
3238 MPI_Type_commit(&(pme->rvec_mpi));
3241 /* Note that the charge spreading and force gathering, which usually
3242 * takes about the same amount of time as FFT+solve_pme,
3243 * is always fully load balanced
3244 * (unless the charge distribution is inhomogeneous).
3247 imbal = pme_load_imbalance(pme);
3248 if (imbal >= 1.2 && pme->nodeid_major == 0 && pme->nodeid_minor == 0)
3252 "NOTE: The load imbalance in PME FFT and solve is %d%%.\n"
3253 " For optimal PME load balancing\n"
3254 " PME grid_x (%d) and grid_y (%d) should be divisible by #PME_nodes_x (%d)\n"
3255 " and PME grid_y (%d) and grid_z (%d) should be divisible by #PME_nodes_y (%d)\n"
3257 (int)((imbal-1)*100 + 0.5),
3258 pme->nkx, pme->nky, pme->nnodes_major,
3259 pme->nky, pme->nkz, pme->nnodes_minor);
3263 /* For non-divisible grid we need pme_order iso pme_order-1 */
3264 /* In sum_qgrid_dd x overlap is copied in place: take padding into account.
3265 * y is always copied through a buffer: we don't need padding in z,
3266 * but we do need the overlap in x because of the communication order.
3268 init_overlap_comm(&pme->overlap[0], pme->pme_order,
3272 pme->nnodes_major, pme->nodeid_major,
3274 (div_round_up(pme->nky, pme->nnodes_minor)+pme->pme_order)*(pme->nkz+pme->pme_order-1));
3276 /* Along overlap dim 1 we can send in multiple pulses in sum_fftgrid_dd.
3277 * We do this with an offset buffer of equal size, so we need to allocate
3278 * extra for the offset. That's what the (+1)*pme->nkz is for.
3280 init_overlap_comm(&pme->overlap[1], pme->pme_order,
3284 pme->nnodes_minor, pme->nodeid_minor,
3286 (div_round_up(pme->nkx, pme->nnodes_major)+pme->pme_order+1)*pme->nkz);
3288 /* Check for a limitation of the (current) sum_fftgrid_dd code.
3289 * We only allow multiple communication pulses in dim 1, not in dim 0.
3291 if (pme->bUseThreads && (pme->overlap[0].noverlap_nodes > 1 ||
3292 pme->nkx < pme->nnodes_major*pme->pme_order))
3294 gmx_fatal(FARGS, "The number of PME grid lines per node along x is %g. But when using OpenMP threads, the number of grid lines per node along x and should be >= pme_order (%d). To resolve this issue, use less nodes along x (and possibly more along y and/or z) by specifying -dd manually.",
3295 pme->nkx/(double)pme->nnodes_major, pme->pme_order);
3298 snew(pme->bsp_mod[XX], pme->nkx);
3299 snew(pme->bsp_mod[YY], pme->nky);
3300 snew(pme->bsp_mod[ZZ], pme->nkz);
3302 /* The required size of the interpolation grid, including overlap.
3303 * The allocated size (pmegrid_n?) might be slightly larger.
3305 pme->pmegrid_nx = pme->overlap[0].s2g1[pme->nodeid_major] -
3306 pme->overlap[0].s2g0[pme->nodeid_major];
3307 pme->pmegrid_ny = pme->overlap[1].s2g1[pme->nodeid_minor] -
3308 pme->overlap[1].s2g0[pme->nodeid_minor];
3309 pme->pmegrid_nz_base = pme->nkz;
3310 pme->pmegrid_nz = pme->pmegrid_nz_base + pme->pme_order - 1;
3311 set_grid_alignment(&pme->pmegrid_nz, pme->pme_order);
3313 pme->pmegrid_start_ix = pme->overlap[0].s2g0[pme->nodeid_major];
3314 pme->pmegrid_start_iy = pme->overlap[1].s2g0[pme->nodeid_minor];
3315 pme->pmegrid_start_iz = 0;
3317 make_gridindex5_to_localindex(pme->nkx,
3318 pme->pmegrid_start_ix,
3319 pme->pmegrid_nx - (pme->pme_order-1),
3320 &pme->nnx, &pme->fshx);
3321 make_gridindex5_to_localindex(pme->nky,
3322 pme->pmegrid_start_iy,
3323 pme->pmegrid_ny - (pme->pme_order-1),
3324 &pme->nny, &pme->fshy);
3325 make_gridindex5_to_localindex(pme->nkz,
3326 pme->pmegrid_start_iz,
3327 pme->pmegrid_nz_base,
3328 &pme->nnz, &pme->fshz);
3330 pmegrids_init(&pme->pmegridA,
3331 pme->pmegrid_nx, pme->pmegrid_ny, pme->pmegrid_nz,
3332 pme->pmegrid_nz_base,
3336 pme->overlap[0].s2g1[pme->nodeid_major]-pme->overlap[0].s2g0[pme->nodeid_major+1],
3337 pme->overlap[1].s2g1[pme->nodeid_minor]-pme->overlap[1].s2g0[pme->nodeid_minor+1]);
3339 pme->spline_work = make_pme_spline_work(pme->pme_order);
3341 ndata[0] = pme->nkx;
3342 ndata[1] = pme->nky;
3343 ndata[2] = pme->nkz;
3345 /* This routine will allocate the grid data to fit the FFTs */
3346 gmx_parallel_3dfft_init(&pme->pfft_setupA, ndata,
3347 &pme->fftgridA, &pme->cfftgridA,
3349 pme->overlap[0].s2g0, pme->overlap[1].s2g0,
3350 bReproducible, pme->nthread);
3354 pmegrids_init(&pme->pmegridB,
3355 pme->pmegrid_nx, pme->pmegrid_ny, pme->pmegrid_nz,
3356 pme->pmegrid_nz_base,
3360 pme->nkx % pme->nnodes_major != 0,
3361 pme->nky % pme->nnodes_minor != 0);
3363 gmx_parallel_3dfft_init(&pme->pfft_setupB, ndata,
3364 &pme->fftgridB, &pme->cfftgridB,
3366 pme->overlap[0].s2g0, pme->overlap[1].s2g0,
3367 bReproducible, pme->nthread);
3371 pme->pmegridB.grid.grid = NULL;
3372 pme->fftgridB = NULL;
3373 pme->cfftgridB = NULL;
3378 /* Use plain SPME B-spline interpolation */
3379 make_bspline_moduli(pme->bsp_mod, pme->nkx, pme->nky, pme->nkz, pme->pme_order);
3383 /* Use the P3M grid-optimized influence function */
3384 make_p3m_bspline_moduli(pme->bsp_mod, pme->nkx, pme->nky, pme->nkz, pme->pme_order);
3387 /* Use atc[0] for spreading */
3388 init_atomcomm(pme, &pme->atc[0], cr, nnodes_major > 1 ? 0 : 1, TRUE);
3389 if (pme->ndecompdim >= 2)
3391 init_atomcomm(pme, &pme->atc[1], cr, 1, FALSE);
3394 if (pme->nnodes == 1)
3396 pme->atc[0].n = homenr;
3397 pme_realloc_atomcomm_things(&pme->atc[0]);
3403 /* Use fft5d, order after FFT is y major, z, x minor */
3405 snew(pme->work, pme->nthread);
3406 for (thread = 0; thread < pme->nthread; thread++)
3408 realloc_work(&pme->work[thread], pme->nkx);
3417 static void reuse_pmegrids(const pmegrids_t *old, pmegrids_t *new)
3421 for (d = 0; d < DIM; d++)
3423 if (new->grid.n[d] > old->grid.n[d])
3429 sfree_aligned(new->grid.grid);
3430 new->grid.grid = old->grid.grid;
3432 if (new->grid_th != NULL && new->nthread == old->nthread)
3434 sfree_aligned(new->grid_all);
3435 for (t = 0; t < new->nthread; t++)
3437 new->grid_th[t].grid = old->grid_th[t].grid;
3442 int gmx_pme_reinit(gmx_pme_t * pmedata,
3445 const t_inputrec * ir,
3453 irc.nkx = grid_size[XX];
3454 irc.nky = grid_size[YY];
3455 irc.nkz = grid_size[ZZ];
3457 if (pme_src->nnodes == 1)
3459 homenr = pme_src->atc[0].n;
3466 ret = gmx_pme_init(pmedata, cr, pme_src->nnodes_major, pme_src->nnodes_minor,
3467 &irc, homenr, pme_src->bFEP, FALSE, pme_src->nthread);
3471 /* We can easily reuse the allocated pme grids in pme_src */
3472 reuse_pmegrids(&pme_src->pmegridA, &(*pmedata)->pmegridA);
3473 /* We would like to reuse the fft grids, but that's harder */
3480 static void copy_local_grid(gmx_pme_t pme,
3481 pmegrids_t *pmegrids, int thread, real *fftgrid)
3483 ivec local_fft_ndata, local_fft_offset, local_fft_size;
3487 int offx, offy, offz, x, y, z, i0, i0t;
3492 gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
3496 fft_my = local_fft_size[YY];
3497 fft_mz = local_fft_size[ZZ];
3499 pmegrid = &pmegrids->grid_th[thread];
3501 nsx = pmegrid->s[XX];
3502 nsy = pmegrid->s[YY];
3503 nsz = pmegrid->s[ZZ];
3505 for (d = 0; d < DIM; d++)
3507 nf[d] = min(pmegrid->n[d] - (pmegrid->order - 1),
3508 local_fft_ndata[d] - pmegrid->offset[d]);
3511 offx = pmegrid->offset[XX];
3512 offy = pmegrid->offset[YY];
3513 offz = pmegrid->offset[ZZ];
3515 /* Directly copy the non-overlapping parts of the local grids.
3516 * This also initializes the full grid.
3518 grid_th = pmegrid->grid;
3519 for (x = 0; x < nf[XX]; x++)
3521 for (y = 0; y < nf[YY]; y++)
3523 i0 = ((offx + x)*fft_my + (offy + y))*fft_mz + offz;
3524 i0t = (x*nsy + y)*nsz;
3525 for (z = 0; z < nf[ZZ]; z++)
3527 fftgrid[i0+z] = grid_th[i0t+z];
3534 reduce_threadgrid_overlap(gmx_pme_t pme,
3535 const pmegrids_t *pmegrids, int thread,
3536 real *fftgrid, real *commbuf_x, real *commbuf_y)
3538 ivec local_fft_ndata, local_fft_offset, local_fft_size;
3539 int fft_nx, fft_ny, fft_nz;
3544 int offx, offy, offz, x, y, z, i0, i0t;
3545 int sx, sy, sz, fx, fy, fz, tx1, ty1, tz1, ox, oy, oz;
3546 gmx_bool bClearBufX, bClearBufY, bClearBufXY, bClearBuf;
3547 gmx_bool bCommX, bCommY;
3550 const pmegrid_t *pmegrid, *pmegrid_g, *pmegrid_f;
3551 const real *grid_th;
3552 real *commbuf = NULL;
3554 gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
3558 fft_nx = local_fft_ndata[XX];
3559 fft_ny = local_fft_ndata[YY];
3560 fft_nz = local_fft_ndata[ZZ];
3562 fft_my = local_fft_size[YY];
3563 fft_mz = local_fft_size[ZZ];
3565 /* This routine is called when all thread have finished spreading.
3566 * Here each thread sums grid contributions calculated by other threads
3567 * to the thread local grid volume.
3568 * To minimize the number of grid copying operations,
3569 * this routines sums immediately from the pmegrid to the fftgrid.
3572 /* Determine which part of the full node grid we should operate on,
3573 * this is our thread local part of the full grid.
3575 pmegrid = &pmegrids->grid_th[thread];
3577 for (d = 0; d < DIM; d++)
3579 ne[d] = min(pmegrid->offset[d]+pmegrid->n[d]-(pmegrid->order-1),
3580 local_fft_ndata[d]);
3583 offx = pmegrid->offset[XX];
3584 offy = pmegrid->offset[YY];
3585 offz = pmegrid->offset[ZZ];
3592 /* Now loop over all the thread data blocks that contribute
3593 * to the grid region we (our thread) are operating on.
3595 /* Note that ffy_nx/y is equal to the number of grid points
3596 * between the first point of our node grid and the one of the next node.
3598 for (sx = 0; sx >= -pmegrids->nthread_comm[XX]; sx--)
3600 fx = pmegrid->ci[XX] + sx;
3605 fx += pmegrids->nc[XX];
3607 bCommX = (pme->nnodes_major > 1);
3609 pmegrid_g = &pmegrids->grid_th[fx*pmegrids->nc[YY]*pmegrids->nc[ZZ]];
3610 ox += pmegrid_g->offset[XX];
3613 tx1 = min(ox + pmegrid_g->n[XX], ne[XX]);
3617 tx1 = min(ox + pmegrid_g->n[XX], pme->pme_order);
3620 for (sy = 0; sy >= -pmegrids->nthread_comm[YY]; sy--)
3622 fy = pmegrid->ci[YY] + sy;
3627 fy += pmegrids->nc[YY];
3629 bCommY = (pme->nnodes_minor > 1);
3631 pmegrid_g = &pmegrids->grid_th[fy*pmegrids->nc[ZZ]];
3632 oy += pmegrid_g->offset[YY];
3635 ty1 = min(oy + pmegrid_g->n[YY], ne[YY]);
3639 ty1 = min(oy + pmegrid_g->n[YY], pme->pme_order);
3642 for (sz = 0; sz >= -pmegrids->nthread_comm[ZZ]; sz--)
3644 fz = pmegrid->ci[ZZ] + sz;
3648 fz += pmegrids->nc[ZZ];
3651 pmegrid_g = &pmegrids->grid_th[fz];
3652 oz += pmegrid_g->offset[ZZ];
3653 tz1 = min(oz + pmegrid_g->n[ZZ], ne[ZZ]);
3655 if (sx == 0 && sy == 0 && sz == 0)
3657 /* We have already added our local contribution
3658 * before calling this routine, so skip it here.
3663 thread_f = (fx*pmegrids->nc[YY] + fy)*pmegrids->nc[ZZ] + fz;
3665 pmegrid_f = &pmegrids->grid_th[thread_f];
3667 grid_th = pmegrid_f->grid;
3669 nsx = pmegrid_f->s[XX];
3670 nsy = pmegrid_f->s[YY];
3671 nsz = pmegrid_f->s[ZZ];
3673 #ifdef DEBUG_PME_REDUCE
3674 printf("n%d t%d add %d %2d %2d %2d %2d %2d %2d %2d-%2d %2d-%2d, %2d-%2d %2d-%2d, %2d-%2d %2d-%2d\n",
3675 pme->nodeid, thread, thread_f,
3676 pme->pmegrid_start_ix,
3677 pme->pmegrid_start_iy,
3678 pme->pmegrid_start_iz,
3680 offx-ox, tx1-ox, offx, tx1,
3681 offy-oy, ty1-oy, offy, ty1,
3682 offz-oz, tz1-oz, offz, tz1);
3685 if (!(bCommX || bCommY))
3687 /* Copy from the thread local grid to the node grid */
3688 for (x = offx; x < tx1; x++)
3690 for (y = offy; y < ty1; y++)
3692 i0 = (x*fft_my + y)*fft_mz;
3693 i0t = ((x - ox)*nsy + (y - oy))*nsz - oz;
3694 for (z = offz; z < tz1; z++)
3696 fftgrid[i0+z] += grid_th[i0t+z];
3703 /* The order of this conditional decides
3704 * where the corner volume gets stored with x+y decomp.
3708 commbuf = commbuf_y;
3709 buf_my = ty1 - offy;
3712 /* We index commbuf modulo the local grid size */
3713 commbuf += buf_my*fft_nx*fft_nz;
3715 bClearBuf = bClearBufXY;
3716 bClearBufXY = FALSE;
3720 bClearBuf = bClearBufY;
3726 commbuf = commbuf_x;
3728 bClearBuf = bClearBufX;
3732 /* Copy to the communication buffer */
3733 for (x = offx; x < tx1; x++)
3735 for (y = offy; y < ty1; y++)
3737 i0 = (x*buf_my + y)*fft_nz;
3738 i0t = ((x - ox)*nsy + (y - oy))*nsz - oz;
3742 /* First access of commbuf, initialize it */
3743 for (z = offz; z < tz1; z++)
3745 commbuf[i0+z] = grid_th[i0t+z];
3750 for (z = offz; z < tz1; z++)
3752 commbuf[i0+z] += grid_th[i0t+z];
3764 static void sum_fftgrid_dd(gmx_pme_t pme, real *fftgrid)
3766 ivec local_fft_ndata, local_fft_offset, local_fft_size;
3767 pme_overlap_t *overlap;
3768 int send_index0, send_nindex;
3773 int send_size_y, recv_size_y;
3774 int ipulse, send_id, recv_id, datasize, gridsize, size_yx;
3775 real *sendptr, *recvptr;
3776 int x, y, z, indg, indb;
3778 /* Note that this routine is only used for forward communication.
3779 * Since the force gathering, unlike the charge spreading,
3780 * can be trivially parallelized over the particles,
3781 * the backwards process is much simpler and can use the "old"
3782 * communication setup.
3785 gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
3790 if (pme->nnodes_minor > 1)
3792 /* Major dimension */
3793 overlap = &pme->overlap[1];
3795 if (pme->nnodes_major > 1)
3797 size_yx = pme->overlap[0].comm_data[0].send_nindex;
3803 datasize = (local_fft_ndata[XX] + size_yx)*local_fft_ndata[ZZ];
3805 send_size_y = overlap->send_size;
3807 for (ipulse = 0; ipulse < overlap->noverlap_nodes; ipulse++)
3809 send_id = overlap->send_id[ipulse];
3810 recv_id = overlap->recv_id[ipulse];
3812 overlap->comm_data[ipulse].send_index0 -
3813 overlap->comm_data[0].send_index0;
3814 send_nindex = overlap->comm_data[ipulse].send_nindex;
3815 /* We don't use recv_index0, as we always receive starting at 0 */
3816 recv_nindex = overlap->comm_data[ipulse].recv_nindex;
3817 recv_size_y = overlap->comm_data[ipulse].recv_size;
3819 sendptr = overlap->sendbuf + send_index0*local_fft_ndata[ZZ];
3820 recvptr = overlap->recvbuf;
3823 MPI_Sendrecv(sendptr, send_size_y*datasize, GMX_MPI_REAL,
3825 recvptr, recv_size_y*datasize, GMX_MPI_REAL,
3827 overlap->mpi_comm, &stat);
3830 for (x = 0; x < local_fft_ndata[XX]; x++)
3832 for (y = 0; y < recv_nindex; y++)
3834 indg = (x*local_fft_size[YY] + y)*local_fft_size[ZZ];
3835 indb = (x*recv_size_y + y)*local_fft_ndata[ZZ];
3836 for (z = 0; z < local_fft_ndata[ZZ]; z++)
3838 fftgrid[indg+z] += recvptr[indb+z];
3843 if (pme->nnodes_major > 1)
3845 /* Copy from the received buffer to the send buffer for dim 0 */
3846 sendptr = pme->overlap[0].sendbuf;
3847 for (x = 0; x < size_yx; x++)
3849 for (y = 0; y < recv_nindex; y++)
3851 indg = (x*local_fft_ndata[YY] + y)*local_fft_ndata[ZZ];
3852 indb = ((local_fft_ndata[XX] + x)*recv_size_y + y)*local_fft_ndata[ZZ];
3853 for (z = 0; z < local_fft_ndata[ZZ]; z++)
3855 sendptr[indg+z] += recvptr[indb+z];
3863 /* We only support a single pulse here.
3864 * This is not a severe limitation, as this code is only used
3865 * with OpenMP and with OpenMP the (PME) domains can be larger.
3867 if (pme->nnodes_major > 1)
3869 /* Major dimension */
3870 overlap = &pme->overlap[0];
3872 datasize = local_fft_ndata[YY]*local_fft_ndata[ZZ];
3873 gridsize = local_fft_size[YY] *local_fft_size[ZZ];
3877 send_id = overlap->send_id[ipulse];
3878 recv_id = overlap->recv_id[ipulse];
3879 send_nindex = overlap->comm_data[ipulse].send_nindex;
3880 /* We don't use recv_index0, as we always receive starting at 0 */
3881 recv_nindex = overlap->comm_data[ipulse].recv_nindex;
3883 sendptr = overlap->sendbuf;
3884 recvptr = overlap->recvbuf;
3888 fprintf(debug, "PME fftgrid comm %2d x %2d x %2d\n",
3889 send_nindex, local_fft_ndata[YY], local_fft_ndata[ZZ]);
3893 MPI_Sendrecv(sendptr, send_nindex*datasize, GMX_MPI_REAL,
3895 recvptr, recv_nindex*datasize, GMX_MPI_REAL,
3897 overlap->mpi_comm, &stat);
3900 for (x = 0; x < recv_nindex; x++)
3902 for (y = 0; y < local_fft_ndata[YY]; y++)
3904 indg = (x*local_fft_size[YY] + y)*local_fft_size[ZZ];
3905 indb = (x*local_fft_ndata[YY] + y)*local_fft_ndata[ZZ];
3906 for (z = 0; z < local_fft_ndata[ZZ]; z++)
3908 fftgrid[indg+z] += recvptr[indb+z];
3916 static void spread_on_grid(gmx_pme_t pme,
3917 pme_atomcomm_t *atc, pmegrids_t *grids,
3918 gmx_bool bCalcSplines, gmx_bool bSpread,
3921 int nthread, thread;
3922 #ifdef PME_TIME_THREADS
3923 gmx_cycles_t c1, c2, c3, ct1a, ct1b, ct1c;
3924 static double cs1 = 0, cs2 = 0, cs3 = 0;
3925 static double cs1a[6] = {0, 0, 0, 0, 0, 0};
3929 nthread = pme->nthread;
3930 assert(nthread > 0);
3932 #ifdef PME_TIME_THREADS
3933 c1 = omp_cyc_start();
3937 #pragma omp parallel for num_threads(nthread) schedule(static)
3938 for (thread = 0; thread < nthread; thread++)
3942 start = atc->n* thread /nthread;
3943 end = atc->n*(thread+1)/nthread;
3945 /* Compute fftgrid index for all atoms,
3946 * with help of some extra variables.
3948 calc_interpolation_idx(pme, atc, start, end, thread);
3951 #ifdef PME_TIME_THREADS
3952 c1 = omp_cyc_end(c1);
3956 #ifdef PME_TIME_THREADS
3957 c2 = omp_cyc_start();
3959 #pragma omp parallel for num_threads(nthread) schedule(static)
3960 for (thread = 0; thread < nthread; thread++)
3962 splinedata_t *spline;
3963 pmegrid_t *grid = NULL;
3965 /* make local bsplines */
3966 if (grids == NULL || !pme->bUseThreads)
3968 spline = &atc->spline[0];
3974 grid = &grids->grid;
3979 spline = &atc->spline[thread];
3981 if (grids->nthread == 1)
3983 /* One thread, we operate on all charges */
3988 /* Get the indices our thread should operate on */
3989 make_thread_local_ind(atc, thread, spline);
3992 grid = &grids->grid_th[thread];
3997 make_bsplines(spline->theta, spline->dtheta, pme->pme_order,
3998 atc->fractx, spline->n, spline->ind, atc->q, pme->bFEP);
4003 /* put local atoms on grid. */
4004 #ifdef PME_TIME_SPREAD
4005 ct1a = omp_cyc_start();
4007 spread_q_bsplines_thread(grid, atc, spline, pme->spline_work);
4009 if (pme->bUseThreads)
4011 copy_local_grid(pme, grids, thread, fftgrid);
4013 #ifdef PME_TIME_SPREAD
4014 ct1a = omp_cyc_end(ct1a);
4015 cs1a[thread] += (double)ct1a;
4019 #ifdef PME_TIME_THREADS
4020 c2 = omp_cyc_end(c2);
4024 if (bSpread && pme->bUseThreads)
4026 #ifdef PME_TIME_THREADS
4027 c3 = omp_cyc_start();
4029 #pragma omp parallel for num_threads(grids->nthread) schedule(static)
4030 for (thread = 0; thread < grids->nthread; thread++)
4032 reduce_threadgrid_overlap(pme, grids, thread,
4034 pme->overlap[0].sendbuf,
4035 pme->overlap[1].sendbuf);
4037 #ifdef PME_TIME_THREADS
4038 c3 = omp_cyc_end(c3);
4042 if (pme->nnodes > 1)
4044 /* Communicate the overlapping part of the fftgrid.
4045 * For this communication call we need to check pme->bUseThreads
4046 * to have all ranks communicate here, regardless of pme->nthread.
4048 sum_fftgrid_dd(pme, fftgrid);
4052 #ifdef PME_TIME_THREADS
4056 printf("idx %.2f spread %.2f red %.2f",
4057 cs1*1e-9, cs2*1e-9, cs3*1e-9);
4058 #ifdef PME_TIME_SPREAD
4059 for (thread = 0; thread < nthread; thread++)
4061 printf(" %.2f", cs1a[thread]*1e-9);
4070 static void dump_grid(FILE *fp,
4071 int sx, int sy, int sz, int nx, int ny, int nz,
4072 int my, int mz, const real *g)
4076 for (x = 0; x < nx; x++)
4078 for (y = 0; y < ny; y++)
4080 for (z = 0; z < nz; z++)
4082 fprintf(fp, "%2d %2d %2d %6.3f\n",
4083 sx+x, sy+y, sz+z, g[(x*my + y)*mz + z]);
4089 static void dump_local_fftgrid(gmx_pme_t pme, const real *fftgrid)
4091 ivec local_fft_ndata, local_fft_offset, local_fft_size;
4093 gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
4099 pme->pmegrid_start_ix,
4100 pme->pmegrid_start_iy,
4101 pme->pmegrid_start_iz,
4102 pme->pmegrid_nx-pme->pme_order+1,
4103 pme->pmegrid_ny-pme->pme_order+1,
4104 pme->pmegrid_nz-pme->pme_order+1,
4111 void gmx_pme_calc_energy(gmx_pme_t pme, int n, rvec *x, real *q, real *V)
4113 pme_atomcomm_t *atc;
4116 if (pme->nnodes > 1)
4118 gmx_incons("gmx_pme_calc_energy called in parallel");
4122 gmx_incons("gmx_pme_calc_energy with free energy");
4125 atc = &pme->atc_energy;
4127 if (atc->spline == NULL)
4129 snew(atc->spline, atc->nthread);
4132 atc->bSpread = TRUE;
4133 atc->pme_order = pme->pme_order;
4135 pme_realloc_atomcomm_things(atc);
4139 /* We only use the A-charges grid */
4140 grid = &pme->pmegridA;
4142 /* Only calculate the spline coefficients, don't actually spread */
4143 spread_on_grid(pme, atc, NULL, TRUE, FALSE, pme->fftgridA);
4145 *V = gather_energy_bsplines(pme, grid->grid.grid, atc);
4149 static void reset_pmeonly_counters(t_commrec *cr, gmx_wallcycle_t wcycle,
4150 t_nrnb *nrnb, t_inputrec *ir,
4151 gmx_large_int_t step)
4153 /* Reset all the counters related to performance over the run */
4154 wallcycle_stop(wcycle, ewcRUN);
4155 wallcycle_reset_all(wcycle);
4157 if (ir->nsteps >= 0)
4159 /* ir->nsteps is not used here, but we update it for consistency */
4160 ir->nsteps -= step - ir->init_step;
4162 ir->init_step = step;
4163 wallcycle_start(wcycle, ewcRUN);
4167 static void gmx_pmeonly_switch(int *npmedata, gmx_pme_t **pmedata,
4169 t_commrec *cr, t_inputrec *ir,
4173 gmx_pme_t pme = NULL;
4176 while (ind < *npmedata)
4178 pme = (*pmedata)[ind];
4179 if (pme->nkx == grid_size[XX] &&
4180 pme->nky == grid_size[YY] &&
4181 pme->nkz == grid_size[ZZ])
4192 srenew(*pmedata, *npmedata);
4194 /* Generate a new PME data structure, copying part of the old pointers */
4195 gmx_pme_reinit(&((*pmedata)[ind]), cr, pme, ir, grid_size);
4197 *pme_ret = (*pmedata)[ind];
4201 int gmx_pmeonly(gmx_pme_t pme,
4202 t_commrec *cr, t_nrnb *nrnb,
4203 gmx_wallcycle_t wcycle,
4204 real ewaldcoeff, gmx_bool bGatherOnly,
4209 gmx_pme_pp_t pme_pp;
4213 rvec *x_pp = NULL, *f_pp = NULL;
4214 real *chargeA = NULL, *chargeB = NULL;
4216 int maxshift_x = 0, maxshift_y = 0;
4217 real energy, dvdlambda;
4222 gmx_large_int_t step, step_rel;
4225 /* This data will only use with PME tuning, i.e. switching PME grids */
4227 snew(pmedata, npmedata);
4230 pme_pp = gmx_pme_pp_init(cr);
4235 do /****** this is a quasi-loop over time steps! */
4237 /* The reason for having a loop here is PME grid tuning/switching */
4240 /* Domain decomposition */
4241 ret = gmx_pme_recv_q_x(pme_pp,
4243 &chargeA, &chargeB, box, &x_pp, &f_pp,
4244 &maxshift_x, &maxshift_y,
4245 &pme->bFEP, &lambda,
4248 grid_switch, &ewaldcoeff);
4250 if (ret == pmerecvqxSWITCHGRID)
4252 /* Switch the PME grid to grid_switch */
4253 gmx_pmeonly_switch(&npmedata, &pmedata, grid_switch, cr, ir, &pme);
4256 if (ret == pmerecvqxRESETCOUNTERS)
4258 /* Reset the cycle and flop counters */
4259 reset_pmeonly_counters(cr, wcycle, nrnb, ir, step);
4262 while (ret == pmerecvqxSWITCHGRID || ret == pmerecvqxRESETCOUNTERS);
4264 if (ret == pmerecvqxFINISH)
4266 /* We should stop: break out of the loop */
4270 step_rel = step - ir->init_step;
4274 wallcycle_start(wcycle, ewcRUN);
4277 wallcycle_start(wcycle, ewcPMEMESH);
4281 gmx_pme_do(pme, 0, natoms, x_pp, f_pp, chargeA, chargeB, box,
4282 cr, maxshift_x, maxshift_y, nrnb, wcycle, vir, ewaldcoeff,
4283 &energy, lambda, &dvdlambda,
4284 GMX_PME_DO_ALL_F | (bEnerVir ? GMX_PME_CALC_ENER_VIR : 0));
4286 cycles = wallcycle_stop(wcycle, ewcPMEMESH);
4288 gmx_pme_send_force_vir_ener(pme_pp,
4289 f_pp, vir, energy, dvdlambda,
4293 } /***** end of quasi-loop, we stop with the break above */
4299 int gmx_pme_do(gmx_pme_t pme,
4300 int start, int homenr,
4302 real *chargeA, real *chargeB,
4303 matrix box, t_commrec *cr,
4304 int maxshift_x, int maxshift_y,
4305 t_nrnb *nrnb, gmx_wallcycle_t wcycle,
4306 matrix vir, real ewaldcoeff,
4307 real *energy, real lambda,
4308 real *dvdlambda, int flags)
4310 int q, d, i, j, ntot, npme;
4313 pme_atomcomm_t *atc = NULL;
4314 pmegrids_t *pmegrid = NULL;
4318 real *charge = NULL, *q_d;
4322 gmx_parallel_3dfft_t pfft_setup;
4324 t_complex * cfftgrid;
4326 const gmx_bool bCalcEnerVir = flags & GMX_PME_CALC_ENER_VIR;
4327 const gmx_bool bCalcF = flags & GMX_PME_CALC_F;
4329 assert(pme->nnodes > 0);
4330 assert(pme->nnodes == 1 || pme->ndecompdim > 0);
4332 if (pme->nnodes > 1)
4336 if (atc->npd > atc->pd_nalloc)
4338 atc->pd_nalloc = over_alloc_dd(atc->npd);
4339 srenew(atc->pd, atc->pd_nalloc);
4341 atc->maxshift = (atc->dimind == 0 ? maxshift_x : maxshift_y);
4345 /* This could be necessary for TPI */
4346 pme->atc[0].n = homenr;
4349 for (q = 0; q < (pme->bFEP ? 2 : 1); q++)
4353 pmegrid = &pme->pmegridA;
4354 fftgrid = pme->fftgridA;
4355 cfftgrid = pme->cfftgridA;
4356 pfft_setup = pme->pfft_setupA;
4357 charge = chargeA+start;
4361 pmegrid = &pme->pmegridB;
4362 fftgrid = pme->fftgridB;
4363 cfftgrid = pme->cfftgridB;
4364 pfft_setup = pme->pfft_setupB;
4365 charge = chargeB+start;
4367 grid = pmegrid->grid.grid;
4368 /* Unpack structure */
4371 fprintf(debug, "PME: nnodes = %d, nodeid = %d\n",
4372 cr->nnodes, cr->nodeid);
4373 fprintf(debug, "Grid = %p\n", (void*)grid);
4376 gmx_fatal(FARGS, "No grid!");
4381 m_inv_ur0(box, pme->recipbox);
4383 if (pme->nnodes == 1)
4386 if (DOMAINDECOMP(cr))
4389 pme_realloc_atomcomm_things(atc);
4397 wallcycle_start(wcycle, ewcPME_REDISTXF);
4398 for (d = pme->ndecompdim-1; d >= 0; d--)
4400 if (d == pme->ndecompdim-1)
4408 n_d = pme->atc[d+1].n;
4414 if (atc->npd > atc->pd_nalloc)
4416 atc->pd_nalloc = over_alloc_dd(atc->npd);
4417 srenew(atc->pd, atc->pd_nalloc);
4419 atc->maxshift = (atc->dimind == 0 ? maxshift_x : maxshift_y);
4420 pme_calc_pidx_wrapper(n_d, pme->recipbox, x_d, atc);
4423 GMX_BARRIER(cr->mpi_comm_mygroup);
4424 /* Redistribute x (only once) and qA or qB */
4425 if (DOMAINDECOMP(cr))
4427 dd_pmeredist_x_q(pme, n_d, q == 0, x_d, q_d, atc);
4431 pmeredist_pd(pme, TRUE, n_d, q == 0, x_d, q_d, atc);
4436 wallcycle_stop(wcycle, ewcPME_REDISTXF);
4441 fprintf(debug, "Node= %6d, pme local particles=%6d\n",
4442 cr->nodeid, atc->n);
4445 if (flags & GMX_PME_SPREAD_Q)
4447 wallcycle_start(wcycle, ewcPME_SPREADGATHER);
4449 /* Spread the charges on a grid */
4450 GMX_MPE_LOG(ev_spread_on_grid_start);
4452 /* Spread the charges on a grid */
4453 spread_on_grid(pme, &pme->atc[0], pmegrid, q == 0, TRUE, fftgrid);
4454 GMX_MPE_LOG(ev_spread_on_grid_finish);
4458 inc_nrnb(nrnb, eNR_WEIGHTS, DIM*atc->n);
4460 inc_nrnb(nrnb, eNR_SPREADQBSP,
4461 pme->pme_order*pme->pme_order*pme->pme_order*atc->n);
4463 if (!pme->bUseThreads)
4465 wrap_periodic_pmegrid(pme, grid);
4467 /* sum contributions to local grid from other nodes */
4469 if (pme->nnodes > 1)
4471 GMX_BARRIER(cr->mpi_comm_mygroup);
4472 gmx_sum_qgrid_dd(pme, grid, GMX_SUM_QGRID_FORWARD);
4477 copy_pmegrid_to_fftgrid(pme, grid, fftgrid);
4480 wallcycle_stop(wcycle, ewcPME_SPREADGATHER);
4483 dump_local_fftgrid(pme,fftgrid);
4488 /* Here we start a large thread parallel region */
4489 #pragma omp parallel num_threads(pme->nthread) private(thread)
4491 thread = gmx_omp_get_thread_num();
4492 if (flags & GMX_PME_SOLVE)
4499 GMX_BARRIER(cr->mpi_comm_mygroup);
4500 GMX_MPE_LOG(ev_gmxfft3d_start);
4501 wallcycle_start(wcycle, ewcPME_FFT);
4503 gmx_parallel_3dfft_execute(pfft_setup, GMX_FFT_REAL_TO_COMPLEX,
4504 fftgrid, cfftgrid, thread, wcycle);
4507 wallcycle_stop(wcycle, ewcPME_FFT);
4508 GMX_MPE_LOG(ev_gmxfft3d_finish);
4512 /* solve in k-space for our local cells */
4515 GMX_BARRIER(cr->mpi_comm_mygroup);
4516 GMX_MPE_LOG(ev_solve_pme_start);
4517 wallcycle_start(wcycle, ewcPME_SOLVE);
4520 solve_pme_yzx(pme, cfftgrid, ewaldcoeff,
4521 box[XX][XX]*box[YY][YY]*box[ZZ][ZZ],
4523 pme->nthread, thread);
4526 wallcycle_stop(wcycle, ewcPME_SOLVE);
4528 GMX_MPE_LOG(ev_solve_pme_finish);
4529 inc_nrnb(nrnb, eNR_SOLVEPME, loop_count);
4538 GMX_BARRIER(cr->mpi_comm_mygroup);
4539 GMX_MPE_LOG(ev_gmxfft3d_start);
4541 wallcycle_start(wcycle, ewcPME_FFT);
4543 gmx_parallel_3dfft_execute(pfft_setup, GMX_FFT_COMPLEX_TO_REAL,
4544 cfftgrid, fftgrid, thread, wcycle);
4547 wallcycle_stop(wcycle, ewcPME_FFT);
4550 GMX_MPE_LOG(ev_gmxfft3d_finish);
4552 if (pme->nodeid == 0)
4554 ntot = pme->nkx*pme->nky*pme->nkz;
4555 npme = ntot*log((real)ntot)/log(2.0);
4556 inc_nrnb(nrnb, eNR_FFT, 2*npme);
4559 wallcycle_start(wcycle, ewcPME_SPREADGATHER);
4562 copy_fftgrid_to_pmegrid(pme, fftgrid, grid, pme->nthread, thread);
4565 /* End of thread parallel section.
4566 * With MPI we have to synchronize here before gmx_sum_qgrid_dd.
4571 /* distribute local grid to all nodes */
4573 if (pme->nnodes > 1)
4575 GMX_BARRIER(cr->mpi_comm_mygroup);
4576 gmx_sum_qgrid_dd(pme, grid, GMX_SUM_QGRID_BACKWARD);
4581 unwrap_periodic_pmegrid(pme, grid);
4583 /* interpolate forces for our local atoms */
4584 GMX_BARRIER(cr->mpi_comm_mygroup);
4585 GMX_MPE_LOG(ev_gather_f_bsplines_start);
4589 /* If we are running without parallelization,
4590 * atc->f is the actual force array, not a buffer,
4591 * therefore we should not clear it.
4593 bClearF = (q == 0 && PAR(cr));
4594 #pragma omp parallel for num_threads(pme->nthread) schedule(static)
4595 for (thread = 0; thread < pme->nthread; thread++)
4597 gather_f_bsplines(pme, grid, bClearF, atc,
4598 &atc->spline[thread],
4599 pme->bFEP ? (q == 0 ? 1.0-lambda : lambda) : 1.0);
4604 GMX_MPE_LOG(ev_gather_f_bsplines_finish);
4606 inc_nrnb(nrnb, eNR_GATHERFBSP,
4607 pme->pme_order*pme->pme_order*pme->pme_order*pme->atc[0].n);
4608 wallcycle_stop(wcycle, ewcPME_SPREADGATHER);
4613 /* This should only be called on the master thread
4614 * and after the threads have synchronized.
4616 get_pme_ener_vir(pme, pme->nthread, &energy_AB[q], vir_AB[q]);
4620 if (bCalcF && pme->nnodes > 1)
4622 wallcycle_start(wcycle, ewcPME_REDISTXF);
4623 for (d = 0; d < pme->ndecompdim; d++)
4626 if (d == pme->ndecompdim - 1)
4633 n_d = pme->atc[d+1].n;
4634 f_d = pme->atc[d+1].f;
4636 GMX_BARRIER(cr->mpi_comm_mygroup);
4637 if (DOMAINDECOMP(cr))
4639 dd_pmeredist_f(pme, atc, n_d, f_d,
4640 d == pme->ndecompdim-1 && pme->bPPnode);
4644 pmeredist_pd(pme, FALSE, n_d, TRUE, f_d, NULL, atc);
4648 wallcycle_stop(wcycle, ewcPME_REDISTXF);
4656 *energy = energy_AB[0];
4657 m_add(vir, vir_AB[0], vir);
4661 *energy = (1.0-lambda)*energy_AB[0] + lambda*energy_AB[1];
4662 *dvdlambda += energy_AB[1] - energy_AB[0];
4663 for (i = 0; i < DIM; i++)
4665 for (j = 0; j < DIM; j++)
4667 vir[i][j] += (1.0-lambda)*vir_AB[0][i][j] +
4668 lambda*vir_AB[1][i][j];
4680 fprintf(debug, "PME mesh energy: %g\n", *energy);