1 /* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
4 * This source code is part of
8 * GROningen MAchine for Chemical Simulations
11 * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
12 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
13 * Copyright (c) 2001-2004, The GROMACS development team,
14 * check out http://www.gromacs.org for more information.
16 * This program is free software; you can redistribute it and/or
17 * modify it under the terms of the GNU General Public License
18 * as published by the Free Software Foundation; either version 2
19 * of the License, or (at your option) any later version.
21 * If you want to redistribute modifications, please consider that
22 * scientific software is very special. Version control is crucial -
23 * bugs must be traceable. We will be happy to consider code for
24 * inclusion in the official distribution, but derived work must not
25 * be called official GROMACS. Details are found in the README & COPYING
26 * files - if they are missing, get the official version at www.gromacs.org.
28 * To help us fund GROMACS development, we humbly ask that you cite
29 * the papers on the package - you can find them in the top README file.
31 * For more info, check our website at http://www.gromacs.org
34 * GROwing Monsters And Cloning Shrimps
36 /* IMPORTANT FOR DEVELOPERS:
38 * Triclinic pme stuff isn't entirely trivial, and we've experienced
39 * some bugs during development (many of them due to me). To avoid
40 * this in the future, please check the following things if you make
41 * changes in this file:
43 * 1. You should obtain identical (at least to the PME precision)
44 * energies, forces, and virial for
45 * a rectangular box and a triclinic one where the z (or y) axis is
46 * tilted a whole box side. For instance you could use these boxes:
48 * rectangular triclinic
53 * 2. You should check the energy conservation in a triclinic box.
55 * It might seem like overkill, but better safe than sorry.
77 #include "gmxcomplex.h"
81 #include "gmx_fatal.h"
87 #include "gmx_wallcycle.h"
88 #include "gmx_parallel_3dfft.h"
90 #include "gmx_cyclecounter.h"
94 /* Single precision, with SSE2 or higher available */
95 #if defined(GMX_X86_SSE2) && !defined(GMX_DOUBLE)
97 #include "gmx_x86_simd_single.h"
100 /* Some old AMD processors could have problems with unaligned loads+stores */
102 #define PME_SSE_UNALIGNED
107 /* #define PRT_FORCE */
108 /* conditions for on the fly time-measurement */
109 /* #define TAKETIME (step > 1 && timesteps < 10) */
110 #define TAKETIME FALSE
112 /* #define PME_TIME_THREADS */
115 #define mpi_type MPI_DOUBLE
117 #define mpi_type MPI_FLOAT
120 /* GMX_CACHE_SEP should be a multiple of 16 to preserve alignment */
121 #define GMX_CACHE_SEP 64
123 /* We only define a maximum to be able to use local arrays without allocation.
124 * An order larger than 12 should never be needed, even for test cases.
125 * If needed it can be changed here.
127 #define PME_ORDER_MAX 12
129 /* Internal datastructures */
135 int recv_size; /* Receive buffer width, used with OpenMP */
146 int *send_id, *recv_id;
147 int send_size; /* Send buffer width, used with OpenMP */
148 pme_grid_comm_t *comm_data;
154 int *n; /* Cumulative counts of the number of particles per thread */
155 int nalloc; /* Allocation size of i */
156 int *i; /* Particle indices ordered on thread index (n) */
170 int dimind; /* The index of the dimension, 0=x, 1=y */
177 int *node_dest; /* The nodes to send x and q to with DD */
178 int *node_src; /* The nodes to receive x and q from with DD */
179 int *buf_index; /* Index for commnode into the buffers */
186 int *count; /* The number of atoms to send to each node */
188 int *rcount; /* The number of atoms to receive */
195 gmx_bool bSpread; /* These coordinates are used for spreading */
198 rvec *fractx; /* Fractional coordinate relative to the
199 * lower cell boundary
202 int *thread_idx; /* Which thread should spread which charge */
203 thread_plist_t *thread_plist;
204 splinedata_t *spline;
211 ivec ci; /* The spatial location of this grid */
212 ivec n; /* The used size of *grid, including order-1 */
213 ivec offset; /* The grid offset from the full node grid */
214 int order; /* PME spreading order */
215 ivec s; /* The allocated size of *grid, s >= n */
216 real *grid; /* The grid local thread, size n */
220 pmegrid_t grid; /* The full node grid (non thread-local) */
221 int nthread; /* The number of threads operating on this grid */
222 ivec nc; /* The local spatial decomposition over the threads */
223 pmegrid_t *grid_th; /* Array of grids for each thread */
224 real *grid_all; /* Allocated array for the grids in *grid_th */
225 int **g2t; /* The grid to thread index */
226 ivec nthread_comm; /* The number of threads to communicate with */
232 /* Masks for SSE aligned spreading and gathering */
233 __m128 mask_SSE0[6], mask_SSE1[6];
235 int dummy; /* C89 requires that struct has at least one member */
240 /* work data for solve_pme */
/* Main PME working-data structure: node/thread decomposition, real-space
 * spreading grids, FFT grids and setups, atom-communication state and
 * scratch buffers for one smooth-PME calculation.
 * NOTE(review): this dump is elided — several members and the closing
 * brace of the typedef are not visible here.
 */
256 typedef struct gmx_pme {
257 int ndecompdim; /* The number of decomposition dimensions */
258 int nodeid; /* Our nodeid in mpi->mpi_comm */
261 int nnodes; /* The number of nodes doing PME */
266 MPI_Comm mpi_comm_d[2]; /* Indexed on dimension, 0=x, 1=y */
268 MPI_Datatype rvec_mpi; /* the pme vector's MPI type */
271 int nthread; /* The number of threads doing PME */
273 gmx_bool bPPnode; /* Node also does particle-particle forces */
274 gmx_bool bFEP; /* Compute Free energy contribution */
275 int nkx, nky, nkz; /* Grid dimensions */
276 gmx_bool bP3M; /* Do P3M: optimize the influence function */
280 pmegrids_t pmegridA; /* Grids on which we do spreading/interpolation, includes overlap */
282 /* The PME charge spreading grid sizes/strides, includes pme_order-1 */
283 int pmegrid_nx, pmegrid_ny, pmegrid_nz;
284 /* pmegrid_nz might be larger than strictly necessary to ensure
285 * memory alignment, pmegrid_nz_base gives the real base size.
288 /* The local PME grid starting indices */
289 int pmegrid_start_ix, pmegrid_start_iy, pmegrid_start_iz;
291 /* Work data for spreading and gathering */
292 pme_spline_work_t *spline_work;
294 real *fftgridA; /* Grids for FFT. With 1D FFT decomposition this can be a pointer */
295 real *fftgridB; /* inside the interpolation grid, but separate for 2D PME decomp. */
296 int fftgrid_nx, fftgrid_ny, fftgrid_nz;
298 t_complex *cfftgridA; /* Grids for complex FFT data */
299 t_complex *cfftgridB;
300 int cfftgrid_nx, cfftgrid_ny, cfftgrid_nz;
302 gmx_parallel_3dfft_t pfft_setupA;
303 gmx_parallel_3dfft_t pfft_setupB;
305 int *nnx, *nny, *nnz;
306 real *fshx, *fshy, *fshz;
308 pme_atomcomm_t atc[2]; /* Indexed on decomposition index */
312 pme_overlap_t overlap[2]; /* Indexed on dimension, 0=x, 1=y */
314 pme_atomcomm_t atc_energy; /* Only for gmx_pme_calc_energy */
316 rvec *bufv; /* Communication buffer */
317 real *bufr; /* Communication buffer */
318 int buf_nalloc; /* The communication buffer size */
320 /* thread local work data for solve_pme */
323 /* Work data for PME_redist */
324 gmx_bool redist_init;
332 int redist_buf_nalloc;
334 /* Work data for sum_qgrid */
335 real * sum_qgrid_tmp;
336 real * sum_qgrid_dd_tmp;
/* Map atoms [start,end) onto the PME grid: computes the integer grid cell
 * (atc->idx) and fractional offset within the cell (atc->fractx) from the
 * triclinic reciprocal box, and, when spreading with multiple threads,
 * records which thread owns each charge and builds a per-thread particle
 * index list sorted on thread.
 * NOTE(review): this dump is elided — braces and some statements are missing.
 */
340 static void calc_interpolation_idx(gmx_pme_t pme, pme_atomcomm_t *atc,
341 int start, int end, int thread)
344 int *idxptr, tix, tiy, tiz;
345 real *xptr, *fptr, tx, ty, tz;
346 real rxx, ryx, ryy, rzx, rzy, rzz;
348 int start_ix, start_iy, start_iz;
349 int *g2tx, *g2ty, *g2tz;
351 int *thread_idx = NULL;
352 thread_plist_t *tpl = NULL;
360 start_ix = pme->pmegrid_start_ix;
361 start_iy = pme->pmegrid_start_iy;
362 start_iz = pme->pmegrid_start_iz;
/* Cache the lower-triangular reciprocal-box elements used below */
364 rxx = pme->recipbox[XX][XX];
365 ryx = pme->recipbox[YY][XX];
366 ryy = pme->recipbox[YY][YY];
367 rzx = pme->recipbox[ZZ][XX];
368 rzy = pme->recipbox[ZZ][YY];
369 rzz = pme->recipbox[ZZ][ZZ];
/* Grid-index -> spreading-thread lookup tables, one per dimension */
371 g2tx = pme->pmegridA.g2t[XX];
372 g2ty = pme->pmegridA.g2t[YY];
373 g2tz = pme->pmegridA.g2t[ZZ];
375 bThreads = (atc->nthread > 1);
378 thread_idx = atc->thread_idx;
380 tpl = &atc->thread_plist[thread];
382 for (i = 0; i < atc->nthread; i++)
388 for (i = start; i < end; i++)
391 idxptr = atc->idx[i];
392 fptr = atc->fractx[i];
394 /* Fractional coordinates along box vectors, add 2.0 to make 100% sure we are positive for triclinic boxes */
395 tx = nx * ( xptr[XX] * rxx + xptr[YY] * ryx + xptr[ZZ] * rzx + 2.0 );
396 ty = ny * ( xptr[YY] * ryy + xptr[ZZ] * rzy + 2.0 );
397 tz = nz * ( xptr[ZZ] * rzz + 2.0 );
403 /* Because decomposition only occurs in x and y,
404 * we never have a fraction correction in z.
406 fptr[XX] = tx - tix + pme->fshx[tix];
407 fptr[YY] = ty - tiy + pme->fshy[tiy];
/* nnx/nny/nnz wrap the shifted index back into the local grid */
410 idxptr[XX] = pme->nnx[tix];
411 idxptr[YY] = pme->nny[tiy];
412 idxptr[ZZ] = pme->nnz[tiz];
415 range_check(idxptr[XX], 0, pme->pmegrid_nx);
416 range_check(idxptr[YY], 0, pme->pmegrid_ny);
417 range_check(idxptr[ZZ], 0, pme->pmegrid_nz);
422 thread_i = g2tx[idxptr[XX]] + g2ty[idxptr[YY]] + g2tz[idxptr[ZZ]];
423 thread_idx[i] = thread_i;
430 /* Make a list of particle indices sorted on thread */
432 /* Get the cumulative count */
433 for (i = 1; i < atc->nthread; i++)
435 tpl_n[i] += tpl_n[i-1];
437 /* The current implementation distributes particles equally
438 * over the threads, so we could actually allocate for that
439 * in pme_realloc_atomcomm_things.
441 if (tpl_n[atc->nthread-1] > tpl->nalloc)
443 tpl->nalloc = over_alloc_large(tpl_n[atc->nthread-1]);
444 srenew(tpl->i, tpl->nalloc);
446 /* Set tpl_n to the cumulative start */
447 for (i = atc->nthread-1; i >= 1; i--)
449 tpl_n[i] = tpl_n[i-1];
453 /* Fill our thread local array with indices sorted on thread */
454 for (i = start; i < end; i++)
456 tpl->i[tpl_n[atc->thread_idx[i]]++] = i;
458 /* Now tpl_n contains the cumulative count again */
/* Gather this thread's slice of particle indices from every thread's
 * per-thread list (built by calc_interpolation_idx) into one contiguous
 * spline->ind array. tpl->n holds cumulative counts per destination thread,
 * so [n[thread-1], n[thread]) is our slice of thread t's list.
 * NOTE(review): elided dump — braces and some statements are missing.
 */
462 static void make_thread_local_ind(pme_atomcomm_t *atc,
463 int thread, splinedata_t *spline)
465 int n, t, i, start, end;
468 /* Combine the indices made by each thread into one index */
472 for (t = 0; t < atc->nthread; t++)
474 tpl = &atc->thread_plist[t];
475 /* Copy our part (start - end) from the list of thread t */
478 start = tpl->n[thread-1];
480 end = tpl->n[thread];
481 for (i = start; i < end; i++)
483 spline->ind[n++] = tpl->i[i];
/* Assign each atom in [start,end) to a PME slab (node) along the
 * decomposition dimension of atc (0=x, 1=y) and accumulate per-slab atom
 * counts into count[]. Slabs are equally sized; the grid spacing is not used.
 * NOTE(review): elided dump — braces and some statements are missing.
 */
491 static void pme_calc_pidx(int start, int end,
492 matrix recipbox, rvec x[],
493 pme_atomcomm_t *atc, int *count)
498 real rxx, ryx, rzx, ryy, rzy;
501 /* Calculate PME task index (pidx) for each grid index.
502 * Here we always assign equally sized slabs to each node
503 * for load balancing reasons (the PME grid spacing is not used).
509 /* Reset the count */
510 for (i = 0; i < nslab; i++)
515 if (atc->dimind == 0)
517 rxx = recipbox[XX][XX];
518 ryx = recipbox[YY][XX];
519 rzx = recipbox[ZZ][XX];
520 /* Calculate the node index in x-dimension */
521 for (i = start; i < end; i++)
524 /* Fractional coordinates along box vectors */
525 s = nslab*(xptr[XX]*rxx + xptr[YY]*ryx + xptr[ZZ]*rzx);
/* Adding 2*nslab before the modulo keeps the argument positive */
526 si = (int)(s + 2*nslab) % nslab;
533 ryy = recipbox[YY][YY];
534 rzy = recipbox[ZZ][YY];
535 /* Calculate the node index in y-dimension */
536 for (i = start; i < end; i++)
539 /* Fractional coordinates along box vectors */
540 s = nslab*(xptr[YY]*ryy + xptr[ZZ]*rzy);
541 si = (int)(s + 2*nslab) % nslab;
/* OpenMP wrapper around pme_calc_pidx: each thread handles an equal chunk
 * of atoms and counts into its own atc->count_thread[thread] array, then
 * the per-thread counts are reduced serially into count_thread[0]
 * (nslab is small, so parallel reduction is not worthwhile).
 * NOTE(review): elided dump — braces and some statements are missing.
 */
548 static void pme_calc_pidx_wrapper(int natoms, matrix recipbox, rvec x[],
551 int nthread, thread, slab;
553 nthread = atc->nthread;
555 #pragma omp parallel for num_threads(nthread) schedule(static)
556 for (thread = 0; thread < nthread; thread++)
558 pme_calc_pidx(natoms* thread /nthread,
559 natoms*(thread+1)/nthread,
560 recipbox, x, atc, atc->count_thread[thread]);
562 /* Non-parallel reduction, since nslab is small */
564 for (thread = 1; thread < nthread; thread++)
566 for (slab = 0; slab < atc->nslab; slab++)
568 atc->count_thread[0][slab] += atc->count_thread[thread][slab];
/* (Re)allocate the three per-dimension spline arrays to nalloc elements.
 * The z array is allocated with `padding` extra elements on each side and
 * th[ZZ] points past the front padding, so aligned SSE loads/stores around
 * the array bounds stay inside allocated (zeroed) memory.
 * NOTE(review): elided dump — braces and some statements are missing.
 */
573 static void realloc_splinevec(splinevec th, real **ptr_z, int nalloc)
575 const int padding = 4;
578 srenew(th[XX], nalloc);
579 srenew(th[YY], nalloc);
580 /* In z we add padding, this is only required for the aligned SSE code */
581 srenew(*ptr_z, nalloc+2*padding);
582 th[ZZ] = *ptr_z + padding;
/* Zero the trailing padding so reads beyond nalloc are harmless */
584 for (i = 0; i < padding; i++)
587 (*ptr_z)[padding+nalloc+i] = 0;
/* Resize one thread's spline data to match atc->nalloc atoms: the index
 * array (initialized to identity so the single-threaded path needs no
 * sorting) and the theta/dtheta spline vectors, pme_order values per atom.
 * NOTE(review): elided dump — braces and some statements are missing.
 */
591 static void pme_realloc_splinedata(splinedata_t *spline, pme_atomcomm_t *atc)
595 srenew(spline->ind, atc->nalloc);
596 /* Initialize the index to identity so it works without threads */
597 for (i = 0; i < atc->nalloc; i++)
602 realloc_splinevec(spline->theta, &spline->ptr_theta_z,
603 atc->pme_order*atc->nalloc);
604 realloc_splinevec(spline->dtheta, &spline->ptr_dtheta_z,
605 atc->pme_order*atc->nalloc);
/* Grow all per-atom arrays of an atom-communication struct when the atom
 * count atc->n exceeds the current allocation (always allocating at least
 * one element so atc->x is never NULL, which could upset MPI routines).
 * Newly added force entries are cleared; spline data is resized per thread.
 * NOTE(review): elided dump — braces and some statements are missing.
 */
608 static void pme_realloc_atomcomm_things(pme_atomcomm_t *atc)
610 int nalloc_old, i, j, nalloc_tpl;
612 /* We have to avoid a NULL pointer for atc->x to avoid
613 * possible fatal errors in MPI routines.
615 if (atc->n > atc->nalloc || atc->nalloc == 0)
617 nalloc_old = atc->nalloc;
618 atc->nalloc = over_alloc_dd(max(atc->n, 1));
622 srenew(atc->x, atc->nalloc);
623 srenew(atc->q, atc->nalloc);
624 srenew(atc->f, atc->nalloc);
/* Clear only the newly allocated tail of the force array */
625 for (i = nalloc_old; i < atc->nalloc; i++)
627 clear_rvec(atc->f[i]);
632 srenew(atc->fractx, atc->nalloc);
633 srenew(atc->idx, atc->nalloc);
635 if (atc->nthread > 1)
637 srenew(atc->thread_idx, atc->nalloc);
640 for (i = 0; i < atc->nthread; i++)
642 pme_realloc_splinedata(&atc->spline[i], atc);
/* Redistribute particle data between PP and PME nodes with MPI_Alltoall(v)
 * for particle-decomposition runs. Forward (forw==TRUE): scatter
 * coordinates (if bXF) and charges from this node to the PME slabs that own
 * them, per pme->idxa. Backward: gather forces from the slabs and
 * accumulate them into x_f. Count/displacement arrays are allocated lazily
 * on first use and pme->redist_buf is grown on demand.
 * NOTE(review): elided dump — braces and some statements are missing.
 */
648 static void pmeredist_pd(gmx_pme_t pme, gmx_bool forw,
649 int n, gmx_bool bXF, rvec *x_f, real *charge,
651 /* Redistribute particle data for PME calculation */
652 /* domain decomposition by x coordinate */
657 if (FALSE == pme->redist_init)
659 snew(pme->scounts, atc->nslab);
660 snew(pme->rcounts, atc->nslab);
661 snew(pme->sdispls, atc->nslab);
662 snew(pme->rdispls, atc->nslab);
663 snew(pme->sidx, atc->nslab);
664 pme->redist_init = TRUE;
666 if (n > pme->redist_buf_nalloc)
668 pme->redist_buf_nalloc = over_alloc_dd(n);
669 srenew(pme->redist_buf, pme->redist_buf_nalloc*DIM);
677 /* forward, redistribution from pp to pme */
679 /* Calculate send counts and exchange them with other nodes */
680 for (i = 0; (i < atc->nslab); i++)
684 for (i = 0; (i < n); i++)
686 pme->scounts[pme->idxa[i]]++;
688 MPI_Alltoall( pme->scounts, 1, MPI_INT, pme->rcounts, 1, MPI_INT, atc->mpi_comm);
690 /* Calculate send and receive displacements and index into send
695 for (i = 1; i < atc->nslab; i++)
697 pme->sdispls[i] = pme->sdispls[i-1]+pme->scounts[i-1];
698 pme->rdispls[i] = pme->rdispls[i-1]+pme->rcounts[i-1];
699 pme->sidx[i] = pme->sdispls[i];
701 /* Total # of particles to be received */
702 atc->n = pme->rdispls[atc->nslab-1] + pme->rcounts[atc->nslab-1];
704 pme_realloc_atomcomm_things(atc);
706 /* Copy particle coordinates into send buffer and exchange*/
707 for (i = 0; (i < n); i++)
709 ii = DIM*pme->sidx[pme->idxa[i]];
710 pme->sidx[pme->idxa[i]]++;
711 pme->redist_buf[ii+XX] = x_f[i][XX];
712 pme->redist_buf[ii+YY] = x_f[i][YY];
713 pme->redist_buf[ii+ZZ] = x_f[i][ZZ];
715 MPI_Alltoallv(pme->redist_buf, pme->scounts, pme->sdispls,
716 pme->rvec_mpi, atc->x, pme->rcounts, pme->rdispls,
717 pme->rvec_mpi, atc->mpi_comm);
721 /* Copy charge into send buffer and exchange*/
/* sidx was advanced above, so reset it to the send displacements */
722 for (i = 0; i < atc->nslab; i++)
724 pme->sidx[i] = pme->sdispls[i];
726 for (i = 0; (i < n); i++)
728 ii = pme->sidx[pme->idxa[i]];
729 pme->sidx[pme->idxa[i]]++;
730 pme->redist_buf[ii] = charge[i];
732 MPI_Alltoallv(pme->redist_buf, pme->scounts, pme->sdispls, mpi_type,
733 atc->q, pme->rcounts, pme->rdispls, mpi_type,
736 else /* backward, redistribution from pme to pp */
738 MPI_Alltoallv(atc->f, pme->rcounts, pme->rdispls, pme->rvec_mpi,
739 pme->redist_buf, pme->scounts, pme->sdispls,
740 pme->rvec_mpi, atc->mpi_comm);
742 /* Copy data from receive buffer */
743 for (i = 0; i < atc->nslab; i++)
745 pme->sidx[i] = pme->sdispls[i];
747 for (i = 0; (i < n); i++)
749 ii = DIM*pme->sidx[pme->idxa[i]];
/* Forces are accumulated (+=), not overwritten, into the PP arrays */
750 x_f[i][XX] += pme->redist_buf[ii+XX];
751 x_f[i][YY] += pme->redist_buf[ii+YY];
752 x_f[i][ZZ] += pme->redist_buf[ii+ZZ];
753 pme->sidx[pme->idxa[i]]++;
/* Point-to-point exchange of raw bytes with the neighbor at the given
 * shift in the atom-communication setup. Forward sends to node_dest and
 * receives from node_src; bBackward swaps the roles. Degenerates to a
 * plain MPI_Send or MPI_Recv when one of the byte counts is zero.
 * NOTE(review): elided dump — braces and some statements are missing.
 */
759 static void pme_dd_sendrecv(pme_atomcomm_t *atc,
760 gmx_bool bBackward, int shift,
761 void *buf_s, int nbyte_s,
762 void *buf_r, int nbyte_r)
768 if (bBackward == FALSE)
770 dest = atc->node_dest[shift];
771 src = atc->node_src[shift];
775 dest = atc->node_src[shift];
776 src = atc->node_dest[shift];
779 if (nbyte_s > 0 && nbyte_r > 0)
781 MPI_Sendrecv(buf_s, nbyte_s, MPI_BYTE,
783 buf_r, nbyte_r, MPI_BYTE,
785 atc->mpi_comm, &stat);
787 else if (nbyte_s > 0)
789 MPI_Send(buf_s, nbyte_s, MPI_BYTE,
793 else if (nbyte_r > 0)
795 MPI_Recv(buf_r, nbyte_r, MPI_BYTE,
797 atc->mpi_comm, &stat);
/* Domain-decomposition redistribution of coordinates (if bX) and charges
 * to the PME nodes that own them along one decomposition dimension.
 * Uses the per-slab counts in atc->count: atoms for our own slab are copied
 * directly into atc->x/atc->q, the rest are packed into pme->bufv/bufr and
 * exchanged pairwise with each communicating neighbor via pme_dd_sendrecv
 * (first the counts, then the payload). Issues a fatal error when atoms
 * fall outside the communicated slab range.
 * NOTE(review): elided dump — braces and some statements are missing.
 */
802 static void dd_pmeredist_x_q(gmx_pme_t pme,
803 int n, gmx_bool bX, rvec *x, real *charge,
806 int *commnode, *buf_index;
807 int nnodes_comm, i, nsend, local_pos, buf_pos, node, scount, rcount;
809 commnode = atc->node_dest;
810 buf_index = atc->buf_index;
/* We communicate with at most 2*maxshift neighbors (or all other slabs) */
812 nnodes_comm = min(2*atc->maxshift, atc->nslab-1);
815 for (i = 0; i < nnodes_comm; i++)
817 buf_index[commnode[i]] = nsend;
818 nsend += atc->count[commnode[i]];
822 if (atc->count[atc->nodeid] + nsend != n)
824 gmx_fatal(FARGS, "%d particles communicated to PME node %d are more than 2/3 times the cut-off out of the domain decomposition cell of their charge group in dimension %c.\n"
825 "This usually means that your system is not well equilibrated.",
826 n - (atc->count[atc->nodeid] + nsend),
827 pme->nodeid, 'x'+atc->dimind);
830 if (nsend > pme->buf_nalloc)
832 pme->buf_nalloc = over_alloc_dd(nsend);
833 srenew(pme->bufv, pme->buf_nalloc);
834 srenew(pme->bufr, pme->buf_nalloc);
/* First exchange the counts so each side knows how much it will receive */
837 atc->n = atc->count[atc->nodeid];
838 for (i = 0; i < nnodes_comm; i++)
840 scount = atc->count[commnode[i]];
841 /* Communicate the count */
844 fprintf(debug, "dimind %d PME node %d send to node %d: %d\n",
845 atc->dimind, atc->nodeid, commnode[i], scount);
847 pme_dd_sendrecv(atc, FALSE, i,
848 &scount, sizeof(int),
849 &atc->rcount[i], sizeof(int));
850 atc->n += atc->rcount[i];
853 pme_realloc_atomcomm_things(atc);
/* Pack: local atoms go straight into atc arrays, others into send buffers */
857 for (i = 0; i < n; i++)
860 if (node == atc->nodeid)
862 /* Copy direct to the receive buffer */
865 copy_rvec(x[i], atc->x[local_pos]);
867 atc->q[local_pos] = charge[i];
872 /* Copy to the send buffer */
875 copy_rvec(x[i], pme->bufv[buf_index[node]]);
877 pme->bufr[buf_index[node]] = charge[i];
/* Then exchange the actual coordinate and charge payloads */
883 for (i = 0; i < nnodes_comm; i++)
885 scount = atc->count[commnode[i]];
886 rcount = atc->rcount[i];
887 if (scount > 0 || rcount > 0)
891 /* Communicate the coordinates */
892 pme_dd_sendrecv(atc, FALSE, i,
893 pme->bufv[buf_pos], scount*sizeof(rvec),
894 atc->x[local_pos], rcount*sizeof(rvec));
896 /* Communicate the charges */
897 pme_dd_sendrecv(atc, FALSE, i,
898 pme->bufr+buf_pos, scount*sizeof(real),
899 atc->q+local_pos, rcount*sizeof(real));
901 local_pos += atc->rcount[i];
/* Reverse of dd_pmeredist_x_q: return the PME forces to the originating
 * nodes. Sends back our computed forces (atc->f) with the send/receive
 * roles swapped (bBackward=TRUE in pme_dd_sendrecv), then either adds the
 * received forces into f[] or copies them, depending on which of the two
 * unpacking loops runs (selection elided in this dump).
 * NOTE(review): elided dump — braces and some statements are missing.
 */
906 static void dd_pmeredist_f(gmx_pme_t pme, pme_atomcomm_t *atc,
910 int *commnode, *buf_index;
911 int nnodes_comm, local_pos, buf_pos, i, scount, rcount, node;
913 commnode = atc->node_dest;
914 buf_index = atc->buf_index;
916 nnodes_comm = min(2*atc->maxshift, atc->nslab-1);
918 local_pos = atc->count[atc->nodeid];
920 for (i = 0; i < nnodes_comm; i++)
/* Note: send/receive counts are swapped relative to the forward pass */
922 scount = atc->rcount[i];
923 rcount = atc->count[commnode[i]];
924 if (scount > 0 || rcount > 0)
926 /* Communicate the forces */
927 pme_dd_sendrecv(atc, TRUE, i,
928 atc->f[local_pos], scount*sizeof(rvec),
929 pme->bufv[buf_pos], rcount*sizeof(rvec));
932 buf_index[commnode[i]] = buf_pos;
939 for (i = 0; i < n; i++)
942 if (node == atc->nodeid)
944 /* Add from the local force array */
945 rvec_inc(f[i], atc->f[local_pos]);
950 /* Add from the receive buffer */
951 rvec_inc(f[i], pme->bufv[buf_index[node]]);
958 for (i = 0; i < n; i++)
961 if (node == atc->nodeid)
963 /* Copy from the local force array */
964 copy_rvec(atc->f[local_pos], f[i]);
969 /* Copy from the receive buffer */
970 copy_rvec(pme->bufv[buf_index[node]], f[i]);
/* Exchange the charge-grid overlap regions between neighboring PME nodes.
 * GMX_SUM_QGRID_FORWARD sums received overlap into the local grid (after
 * spreading); the backward direction sends grid data out so neighbors can
 * copy it (before gathering), with send/recv roles swapped. The minor (y)
 * dimension is done first and needs pack/unpack into contiguous buffers;
 * the major (x) dimension is contiguous in memory so it can send/receive
 * in place (up to pmegrid_nz rather than nkz, since nothing is copied).
 * NOTE(review): elided dump — braces and some statements are missing.
 */
979 gmx_sum_qgrid_dd(gmx_pme_t pme, real *grid, int direction)
981 pme_overlap_t *overlap;
982 int send_index0, send_nindex;
983 int recv_index0, recv_nindex;
985 int i, j, k, ix, iy, iz, icnt;
986 int ipulse, send_id, recv_id, datasize;
988 real *sendptr, *recvptr;
990 /* Start with minor-rank communication. This is a bit of a pain since it is not contiguous */
991 overlap = &pme->overlap[1];
993 for (ipulse = 0; ipulse < overlap->noverlap_nodes; ipulse++)
995 /* Since we have already (un)wrapped the overlap in the z-dimension,
996 * we only have to communicate 0 to nkz (not pmegrid_nz).
998 if (direction == GMX_SUM_QGRID_FORWARD)
1000 send_id = overlap->send_id[ipulse];
1001 recv_id = overlap->recv_id[ipulse];
1002 send_index0 = overlap->comm_data[ipulse].send_index0;
1003 send_nindex = overlap->comm_data[ipulse].send_nindex;
1004 recv_index0 = overlap->comm_data[ipulse].recv_index0;
1005 recv_nindex = overlap->comm_data[ipulse].recv_nindex;
1009 send_id = overlap->recv_id[ipulse];
1010 recv_id = overlap->send_id[ipulse];
1011 send_index0 = overlap->comm_data[ipulse].recv_index0;
1012 send_nindex = overlap->comm_data[ipulse].recv_nindex;
1013 recv_index0 = overlap->comm_data[ipulse].send_index0;
1014 recv_nindex = overlap->comm_data[ipulse].send_nindex;
1017 /* Copy data to contiguous send buffer */
1020 fprintf(debug, "PME send node %d %d -> %d grid start %d Communicating %d to %d\n",
1021 pme->nodeid, overlap->nodeid, send_id,
1022 pme->pmegrid_start_iy,
1023 send_index0-pme->pmegrid_start_iy,
1024 send_index0-pme->pmegrid_start_iy+send_nindex);
1027 for (i = 0; i < pme->pmegrid_nx; i++)
1030 for (j = 0; j < send_nindex; j++)
1032 iy = j + send_index0 - pme->pmegrid_start_iy;
1033 for (k = 0; k < pme->nkz; k++)
1036 overlap->sendbuf[icnt++] = grid[ix*(pme->pmegrid_ny*pme->pmegrid_nz)+iy*(pme->pmegrid_nz)+iz];
1041 datasize = pme->pmegrid_nx * pme->nkz;
1043 MPI_Sendrecv(overlap->sendbuf, send_nindex*datasize, GMX_MPI_REAL,
1045 overlap->recvbuf, recv_nindex*datasize, GMX_MPI_REAL,
1047 overlap->mpi_comm, &stat);
1049 /* Get data from contiguous recv buffer */
1052 fprintf(debug, "PME recv node %d %d <- %d grid start %d Communicating %d to %d\n",
1053 pme->nodeid, overlap->nodeid, recv_id,
1054 pme->pmegrid_start_iy,
1055 recv_index0-pme->pmegrid_start_iy,
1056 recv_index0-pme->pmegrid_start_iy+recv_nindex);
1059 for (i = 0; i < pme->pmegrid_nx; i++)
1062 for (j = 0; j < recv_nindex; j++)
1064 iy = j + recv_index0 - pme->pmegrid_start_iy;
1065 for (k = 0; k < pme->nkz; k++)
/* Forward: accumulate the neighbor's overlap; backward: overwrite */
1068 if (direction == GMX_SUM_QGRID_FORWARD)
1070 grid[ix*(pme->pmegrid_ny*pme->pmegrid_nz)+iy*(pme->pmegrid_nz)+iz] += overlap->recvbuf[icnt++];
1074 grid[ix*(pme->pmegrid_ny*pme->pmegrid_nz)+iy*(pme->pmegrid_nz)+iz] = overlap->recvbuf[icnt++];
1081 /* Major dimension is easier, no copying required,
1082 * but we might have to sum to separate array.
1083 * Since we don't copy, we have to communicate up to pmegrid_nz,
1084 * not nkz as for the minor direction.
1086 overlap = &pme->overlap[0];
1088 for (ipulse = 0; ipulse < overlap->noverlap_nodes; ipulse++)
1090 if (direction == GMX_SUM_QGRID_FORWARD)
1092 send_id = overlap->send_id[ipulse];
1093 recv_id = overlap->recv_id[ipulse];
1094 send_index0 = overlap->comm_data[ipulse].send_index0;
1095 send_nindex = overlap->comm_data[ipulse].send_nindex;
1096 recv_index0 = overlap->comm_data[ipulse].recv_index0;
1097 recv_nindex = overlap->comm_data[ipulse].recv_nindex;
1098 recvptr = overlap->recvbuf;
1102 send_id = overlap->recv_id[ipulse];
1103 recv_id = overlap->send_id[ipulse];
1104 send_index0 = overlap->comm_data[ipulse].recv_index0;
1105 send_nindex = overlap->comm_data[ipulse].recv_nindex;
1106 recv_index0 = overlap->comm_data[ipulse].send_index0;
1107 recv_nindex = overlap->comm_data[ipulse].send_nindex;
/* Backward: receive directly into the grid, no separate buffer needed */
1108 recvptr = grid + (recv_index0-pme->pmegrid_start_ix)*(pme->pmegrid_ny*pme->pmegrid_nz);
1111 sendptr = grid + (send_index0-pme->pmegrid_start_ix)*(pme->pmegrid_ny*pme->pmegrid_nz);
1112 datasize = pme->pmegrid_ny * pme->pmegrid_nz;
1116 fprintf(debug, "PME send node %d %d -> %d grid start %d Communicating %d to %d\n",
1117 pme->nodeid, overlap->nodeid, send_id,
1118 pme->pmegrid_start_ix,
1119 send_index0-pme->pmegrid_start_ix,
1120 send_index0-pme->pmegrid_start_ix+send_nindex);
1121 fprintf(debug, "PME recv node %d %d <- %d grid start %d Communicating %d to %d\n",
1122 pme->nodeid, overlap->nodeid, recv_id,
1123 pme->pmegrid_start_ix,
1124 recv_index0-pme->pmegrid_start_ix,
1125 recv_index0-pme->pmegrid_start_ix+recv_nindex);
1128 MPI_Sendrecv(sendptr, send_nindex*datasize, GMX_MPI_REAL,
1130 recvptr, recv_nindex*datasize, GMX_MPI_REAL,
1132 overlap->mpi_comm, &stat);
1134 /* ADD data from contiguous recv buffer */
1135 if (direction == GMX_SUM_QGRID_FORWARD)
1137 p = grid + (recv_index0-pme->pmegrid_start_ix)*(pme->pmegrid_ny*pme->pmegrid_nz);
1138 for (i = 0; i < recv_nindex*datasize; i++)
1140 p[i] += overlap->recvbuf[i];
/* Copy the interior of the (possibly padded) PME spreading grid into the
 * FFT grid, element by element with each grid's own z/y strides. The two
 * grids share the same lower-left origin; the PME grid is larger due to
 * the spreading overlap. The debug branch dumps nonzero grid points to
 * pmegrid<node>.pdb/.txt for visual inspection.
 * NOTE(review): elided dump — braces and some statements are missing.
 */
1149 copy_pmegrid_to_fftgrid(gmx_pme_t pme, real *pmegrid, real *fftgrid)
1151 ivec local_fft_ndata, local_fft_offset, local_fft_size;
1152 ivec local_pme_size;
1156 /* Dimensions should be identical for A/B grid, so we just use A here */
1157 gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
1162 local_pme_size[0] = pme->pmegrid_nx;
1163 local_pme_size[1] = pme->pmegrid_ny;
1164 local_pme_size[2] = pme->pmegrid_nz;
1166 /* The fftgrid is always 'justified' to the lower-left corner of the PME grid,
1167 the offset is identical, and the PME grid always has more data (due to overlap)
1172 char fn[STRLEN], format[STRLEN];
1174 sprintf(fn, "pmegrid%d.pdb", pme->nodeid);
1175 fp = ffopen(fn, "w");
1176 sprintf(fn, "pmegrid%d.txt", pme->nodeid);
1177 fp2 = ffopen(fn, "w");
1178 sprintf(format, "%s%s\n", pdbformat, "%6.2f%6.2f");
1181 for (ix = 0; ix < local_fft_ndata[XX]; ix++)
1183 for (iy = 0; iy < local_fft_ndata[YY]; iy++)
1185 for (iz = 0; iz < local_fft_ndata[ZZ]; iz++)
1187 pmeidx = ix*(local_pme_size[YY]*local_pme_size[ZZ])+iy*(local_pme_size[ZZ])+iz;
1188 fftidx = ix*(local_fft_size[YY]*local_fft_size[ZZ])+iy*(local_fft_size[ZZ])+iz;
1189 fftgrid[fftidx] = pmegrid[pmeidx];
1191 val = 100*pmegrid[pmeidx];
1192 if (pmegrid[pmeidx] != 0)
1194 fprintf(fp, format, "ATOM", pmeidx, "CA", "GLY", ' ', pmeidx, ' ',
1195 5.0*ix, 5.0*iy, 5.0*iz, 1.0, val);
1197 if (pmegrid[pmeidx] != 0)
1199 fprintf(fp2, "%-12s %5d %5d %5d %12.5e\n",
1201 pme->pmegrid_start_ix + ix,
1202 pme->pmegrid_start_iy + iy,
1203 pme->pmegrid_start_iz + iz,
/* Read the cycle counter at the start of a timed region (see PME_TIME_THREADS). */
1219 static gmx_cycles_t omp_cyc_start()
1221 return gmx_cycles_read();
/* Return cycles elapsed since a matching omp_cyc_start() value c. */
1224 static gmx_cycles_t omp_cyc_end(gmx_cycles_t c)
1226 return gmx_cycles_read() - c;
/* Copy the FFT grid back into the PME interpolation grid (inverse of
 * copy_pmegrid_to_fftgrid), parallelized over threads: each thread handles
 * an equal share of the x*y column range and copies whole z rows.
 * Optional cycle timing is compiled in with PME_TIME_THREADS.
 * NOTE(review): elided dump — braces and some statements are missing.
 */
1231 copy_fftgrid_to_pmegrid(gmx_pme_t pme, const real *fftgrid, real *pmegrid,
1232 int nthread, int thread)
1234 ivec local_fft_ndata, local_fft_offset, local_fft_size;
1235 ivec local_pme_size;
1236 int ixy0, ixy1, ixy, ix, iy, iz;
1238 #ifdef PME_TIME_THREADS
1240 static double cs1 = 0;
1244 #ifdef PME_TIME_THREADS
1245 c1 = omp_cyc_start();
1247 /* Dimensions should be identical for A/B grid, so we just use A here */
1248 gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
1253 local_pme_size[0] = pme->pmegrid_nx;
1254 local_pme_size[1] = pme->pmegrid_ny;
1255 local_pme_size[2] = pme->pmegrid_nz;
1257 /* The fftgrid is always 'justified' to the lower-left corner of the PME grid,
1258 the offset is identical, and the PME grid always has more data (due to overlap)
/* Static partition of the flattened x*y index range over the threads */
1260 ixy0 = ((thread )*local_fft_ndata[XX]*local_fft_ndata[YY])/nthread;
1261 ixy1 = ((thread+1)*local_fft_ndata[XX]*local_fft_ndata[YY])/nthread;
1263 for (ixy = ixy0; ixy < ixy1; ixy++)
1265 ix = ixy/local_fft_ndata[YY];
1266 iy = ixy - ix*local_fft_ndata[YY];
1268 pmeidx = (ix*local_pme_size[YY] + iy)*local_pme_size[ZZ];
1269 fftidx = (ix*local_fft_size[YY] + iy)*local_fft_size[ZZ];
1270 for (iz = 0; iz < local_fft_ndata[ZZ]; iz++)
1272 pmegrid[pmeidx+iz] = fftgrid[fftidx+iz];
1276 #ifdef PME_TIME_THREADS
1277 c1 = omp_cyc_end(c1);
1282 printf("copy %.2f\n", cs1*1e-9);
/* After spreading, fold the periodic overlap regions (pme_order-1 wide)
 * back onto the primary grid: always in z, and in y/x only when there is
 * a single node in that dimension (otherwise gmx_sum_qgrid_dd handles it
 * via MPI). Each overlap cell is added onto its periodic image.
 * NOTE(review): elided dump — braces and some statements are missing.
 */
1291 wrap_periodic_pmegrid(gmx_pme_t pme, real *pmegrid)
1293 int nx, ny, nz, pnx, pny, pnz, ny_x, overlap, ix, iy, iz;
1299 pnx = pme->pmegrid_nx;
1300 pny = pme->pmegrid_ny;
1301 pnz = pme->pmegrid_nz;
1303 overlap = pme->pme_order - 1;
1305 /* Add periodic overlap in z */
1306 for (ix = 0; ix < pme->pmegrid_nx; ix++)
1308 for (iy = 0; iy < pme->pmegrid_ny; iy++)
1310 for (iz = 0; iz < overlap; iz++)
1312 pmegrid[(ix*pny+iy)*pnz+iz] +=
1313 pmegrid[(ix*pny+iy)*pnz+nz+iz];
1318 if (pme->nnodes_minor == 1)
1320 for (ix = 0; ix < pme->pmegrid_nx; ix++)
1322 for (iy = 0; iy < overlap; iy++)
1324 for (iz = 0; iz < nz; iz++)
1326 pmegrid[(ix*pny+iy)*pnz+iz] +=
1327 pmegrid[(ix*pny+ny+iy)*pnz+iz];
1333 if (pme->nnodes_major == 1)
/* y was already wrapped above when nnodes_minor==1, so only ny is needed */
1335 ny_x = (pme->nnodes_minor == 1 ? ny : pme->pmegrid_ny);
1337 for (ix = 0; ix < overlap; ix++)
1339 for (iy = 0; iy < ny_x; iy++)
1341 for (iz = 0; iz < nz; iz++)
1343 pmegrid[(ix*pny+iy)*pnz+iz] +=
1344 pmegrid[((nx+ix)*pny+iy)*pnz+iz];
/* Inverse of wrap_periodic_pmegrid, used before force gathering: copy the
 * primary-cell values out into the periodic overlap regions (x first when
 * this node owns the full major dimension, then y, then always z). The y
 * and z copies are parallelized over pme->nthread with OpenMP.
 * NOTE(review): elided dump — braces and some statements are missing.
 */
1353 unwrap_periodic_pmegrid(gmx_pme_t pme, real *pmegrid)
1355 int nx, ny, nz, pnx, pny, pnz, ny_x, overlap, ix;
1361 pnx = pme->pmegrid_nx;
1362 pny = pme->pmegrid_ny;
1363 pnz = pme->pmegrid_nz;
1365 overlap = pme->pme_order - 1;
1367 if (pme->nnodes_major == 1)
1369 ny_x = (pme->nnodes_minor == 1 ? ny : pme->pmegrid_ny);
1371 for (ix = 0; ix < overlap; ix++)
1375 for (iy = 0; iy < ny_x; iy++)
1377 for (iz = 0; iz < nz; iz++)
1379 pmegrid[((nx+ix)*pny+iy)*pnz+iz] =
1380 pmegrid[(ix*pny+iy)*pnz+iz];
1386 if (pme->nnodes_minor == 1)
1388 #pragma omp parallel for num_threads(pme->nthread) schedule(static)
1389 for (ix = 0; ix < pme->pmegrid_nx; ix++)
1393 for (iy = 0; iy < overlap; iy++)
1395 for (iz = 0; iz < nz; iz++)
1397 pmegrid[(ix*pny+ny+iy)*pnz+iz] =
1398 pmegrid[(ix*pny+iy)*pnz+iz];
1404 /* Copy periodic overlap in z */
1405 #pragma omp parallel for num_threads(pme->nthread) schedule(static)
1406 for (ix = 0; ix < pme->pmegrid_nx; ix++)
1410 for (iy = 0; iy < pme->pmegrid_ny; iy++)
1412 for (iz = 0; iz < overlap; iz++)
1414 pmegrid[(ix*pny+iy)*pnz+nz+iz] =
1415 pmegrid[(ix*pny+iy)*pnz+iz];
/* Clear the grid blocks that a charge spread at flag-block (fx,fy,fz)
 * touches, using the per-block `flag` array to clear each block only once.
 * A spread of width `order` covers 2 + (order-2)/FLBS(Z) flag blocks per
 * dimension. FLBS/FLBSZ are the flag-block sizes (defined in elided lines).
 * NOTE(review): elided dump — braces and some statements are missing.
 */
1421 static void clear_grid(int nx, int ny, int nz, real *grid,
1423 int fx, int fy, int fz,
1427 int fsx, fsy, fsz, gx, gy, gz, g0x, g0y, x, y, z;
1430 nc = 2 + (order - 2)/FLBS;
1431 ncz = 2 + (order - 2)/FLBSZ;
1433 for (fsx = fx; fsx < fx+nc; fsx++)
1435 for (fsy = fy; fsy < fy+nc; fsy++)
1437 for (fsz = fz; fsz < fz+ncz; fsz++)
1439 flind = (fsx*fs[YY] + fsy)*fs[ZZ] + fsz;
1440 if (flag[flind] == 0)
1445 g0x = (gx*ny + gy)*nz + gz;
1446 for (x = 0; x < FLBS; x++)
1449 for (y = 0; y < FLBS; y++)
1451 for (z = 0; z < FLBSZ; z++)
/* DO_BSPLINE(order): triple loop adding one charge's tensor-product
 * B-spline contribution qn * thx[i]*thy[j]*thz[k] onto the grid at
 * (i0+ithx, j0+ithy, k0+ithz). qn, thx/thy/thz, i0/j0/k0, pny/pnz and
 * grid are expected in scope at the expansion site
 * (spread_q_bsplines_thread). Closing braces are elided in this dump.
 */
1467 /* This has to be a macro to enable full compiler optimization with xlC (and probably others too) */
1468 #define DO_BSPLINE(order) \
1469 for (ithx = 0; (ithx < order); ithx++) \
1471 index_x = (i0+ithx)*pny*pnz; \
1472 valx = qn*thx[ithx]; \
1474 for (ithy = 0; (ithy < order); ithy++) \
1476 valxy = valx*thy[ithy]; \
1477 index_xy = index_x+(j0+ithy)*pnz; \
1479 for (ithz = 0; (ithz < order); ithz++) \
1481 index_xyz = index_xy+(k0+ithz); \
1482 grid[index_xyz] += valxy*thz[ithz]; \
/* Spread the charges of the atoms listed in spline->ind onto this
 * (possibly thread-local) pmegrid: zero the grid, then for each charge add
 * its B-spline weights around grid cell (i0,j0,k0) via DO_BSPLINE or the
 * SSE kernels in pme_sse_single.h (aligned vs. unaligned variants selected
 * by PME_SSE_UNALIGNED). Offsets make atom indices grid-local.
 * NOTE(review): elided dump — braces and some statements are missing.
 */
1488 static void spread_q_bsplines_thread(pmegrid_t *pmegrid,
1489 pme_atomcomm_t *atc, splinedata_t *spline,
1490 pme_spline_work_t *work)
1493 /* spread charges from home atoms to local grid */
1496 int b, i, nn, n, ithx, ithy, ithz, i0, j0, k0;
1498 int order, norder, index_x, index_xy, index_xyz;
1499 real valx, valxy, qn;
1500 real *thx, *thy, *thz;
1501 int localsize, bndsize;
1502 int pnx, pny, pnz, ndatatot;
1503 int offx, offy, offz;
1505 pnx = pmegrid->s[XX];
1506 pny = pmegrid->s[YY];
1507 pnz = pmegrid->s[ZZ];
1509 offx = pmegrid->offset[XX];
1510 offy = pmegrid->offset[YY];
1511 offz = pmegrid->offset[ZZ];
/* Zero the whole (sub)grid before accumulating charges */
1513 ndatatot = pnx*pny*pnz;
1514 grid = pmegrid->grid;
1515 for (i = 0; i < ndatatot; i++)
1520 order = pmegrid->order;
1522 for (nn = 0; nn < spline->n; nn++)
1524 n = spline->ind[nn];
1529 idxptr = atc->idx[n];
1532 i0 = idxptr[XX] - offx;
1533 j0 = idxptr[YY] - offy;
1534 k0 = idxptr[ZZ] - offz;
/* norder indexes this atom's block of pme_order spline weights */
1536 thx = spline->theta[XX] + norder;
1537 thy = spline->theta[YY] + norder;
1538 thz = spline->theta[ZZ] + norder;
1544 #ifdef PME_SSE_UNALIGNED
1545 #define PME_SPREAD_SSE_ORDER4
1547 #define PME_SPREAD_SSE_ALIGNED
1550 #include "pme_sse_single.h"
1557 #define PME_SPREAD_SSE_ALIGNED
1559 #include "pme_sse_single.h"
/* Round the z grid dimension up for SSE: without unaligned loads the
 * z-size must be a multiple of 4 floats so each z-line is 16-byte aligned.
 */
1572 static void set_grid_alignment(int *pmegrid_nz, int pme_order)
1576 #ifndef PME_SSE_UNALIGNED
1581 /* Round nz up to a multiple of 4 to ensure alignment */
1582 *pmegrid_nz = ((*pmegrid_nz + 3) & ~3);
/* Pad the total allocation size so aligned SSE loads/stores near the end
 * of the grid stay within the allocated buffer. */
1587 static void set_gridsize_alignment(int *gridsize, int pme_order)
1590 #ifndef PME_SSE_UNALIGNED
1593 /* Add extra elements to ensured aligned operations do not go
1594 * beyond the allocated grid size.
1595 * Note that for pme_order=5, the pme grid z-size alignment
1596 * ensures that we will not go beyond the grid size.
/* Initialize one (thread-local or full) PME grid covering interpolation
 * range [x0,x1) x [y0,y1) x [z0,z1), extended by pme_order-1 points for the
 * spline stencil, and allocate its 16-byte-aligned storage.
 */
1604 static void pmegrid_init(pmegrid_t *grid,
1605 int cx, int cy, int cz,
1606 int x0, int y0, int z0,
1607 int x1, int y1, int z1,
1608 gmx_bool set_alignment,
1617 grid->offset[XX] = x0;
1618 grid->offset[YY] = y0;
1619 grid->offset[ZZ] = z0;
/* Used grid size = interpolation range plus spline overlap */
1620 grid->n[XX] = x1 - x0 + pme_order - 1;
1621 grid->n[YY] = y1 - y0 + pme_order - 1;
1622 grid->n[ZZ] = z1 - z0 + pme_order - 1;
/* s[] is the allocated (possibly z-padded) size, starting from n[] */
1623 copy_ivec(grid->n, grid->s);
1626 set_grid_alignment(&nz, pme_order);
1631 else if (nz != grid->s[ZZ])
1633 gmx_incons("pmegrid_init call with an unaligned z size");
1636 grid->order = pme_order;
1639 gridsize = grid->s[XX]*grid->s[YY]*grid->s[ZZ];
1640 set_gridsize_alignment(&gridsize, pme_order);
/* 16-byte alignment for SSE access */
1641 snew_aligned(grid->grid, gridsize, 16);
/* Integer ceiling division: the smallest integer >= enumerator/denominator.
 * Intended for non-negative counts with a positive denominator.
 */
static int div_round_up(int enumerator, int denominator)
{
    return (enumerator + (denominator - 1))/denominator;
}
/* Choose an nthread = nsx*nsy*nsz subdivision of the local grid over
 * threads that minimizes the grid volume (including overlap 'ovl') each
 * thread touches; ties are broken by preferring fewer cuts in z, then y.
 * The GMX_PME_THREAD_DIVISION env var can force a specific division.
 */
1654 static void make_subgrid_division(const ivec n, int ovl, int nthread,
1657 int gsize_opt, gsize;
/* Enumerate all factorizations nsx*nsy*nsz == nthread */
1662 for (nsx = 1; nsx <= nthread; nsx++)
1664 if (nthread % nsx == 0)
1666 for (nsy = 1; nsy <= nthread; nsy++)
1668 if (nsx*nsy <= nthread && nthread % (nsx*nsy) == 0)
1670 nsz = nthread/(nsx*nsy);
1672 /* Determine the number of grid points per thread */
1674 (div_round_up(n[XX], nsx) + ovl)*
1675 (div_round_up(n[YY], nsy) + ovl)*
1676 (div_round_up(n[ZZ], nsz) + ovl);
1678 /* Minimize the number of grids points per thread
1679 * and, secondarily, the number of cuts in minor dimensions.
1681 if (gsize_opt == -1 ||
1682 gsize < gsize_opt ||
1683 (gsize == gsize_opt &&
1684 (nsz < nsub[ZZ] || (nsz == nsub[ZZ] && nsy < nsub[YY]))))
/* Optional manual override for debugging/tuning */
1696 env = getenv("GMX_PME_THREAD_DIVISION");
1699 sscanf(env, "%d %d %d", &nsub[XX], &nsub[YY], &nsub[ZZ]);
1702 if (nsub[XX]*nsub[YY]*nsub[ZZ] != nthread)
1704 gmx_fatal(FARGS, "PME grid thread division (%d x %d x %d) does not match the total number of threads (%d)", nsub[XX], nsub[YY], nsub[ZZ], nthread);
/* Set up the full local PME grid plus, when nthread > 1, per-thread
 * sub-grids in one cache-separated aligned allocation, the grid-index ->
 * thread lookup tables g2t, and the per-dimension thread communication
 * ranges needed to reduce overlapping sub-grid regions.
 */
1708 static void pmegrids_init(pmegrids_t *grids,
1709 int nx, int ny, int nz, int nz_base,
1715 ivec n, n_base, g0, g1;
1716 int t, x, y, z, d, i, tfac;
1717 int max_comm_lines = -1;
/* n[] is the interpolation range; nx/ny/nz include the order-1 overlap */
1719 n[XX] = nx - (pme_order - 1);
1720 n[YY] = ny - (pme_order - 1);
1721 n[ZZ] = nz - (pme_order - 1);
1723 copy_ivec(n, n_base);
1724 n_base[ZZ] = nz_base;
/* The full (non-threaded) grid */
1726 pmegrid_init(&grids->grid, 0, 0, 0, 0, 0, 0, n[XX], n[YY], n[ZZ], FALSE, pme_order,
1729 grids->nthread = nthread;
1731 make_subgrid_division(n_base, pme_order-1, grids->nthread, grids->nc);
1733 if (grids->nthread > 1)
/* Maximum per-thread sub-grid size, identical for all threads */
1738 for (d = 0; d < DIM; d++)
1740 nst[d] = div_round_up(n[d], grids->nc[d]) + pme_order - 1;
1742 set_grid_alignment(&nst[ZZ], pme_order);
1746 fprintf(debug, "pmegrid thread local division: %d x %d x %d\n",
1747 grids->nc[XX], grids->nc[YY], grids->nc[ZZ]);
1748 fprintf(debug, "pmegrid %d %d %d max thread pmegrid %d %d %d\n",
1750 nst[XX], nst[YY], nst[ZZ]);
1753 snew(grids->grid_th, grids->nthread);
/* One big allocation with GMX_CACHE_SEP padding between thread grids
 * to avoid false sharing */
1755 gridsize = nst[XX]*nst[YY]*nst[ZZ];
1756 set_gridsize_alignment(&gridsize, pme_order);
1757 snew_aligned(grids->grid_all,
1758 grids->nthread*gridsize+(grids->nthread+1)*GMX_CACHE_SEP,
/* Assign each thread its sub-range of the grid */
1761 for (x = 0; x < grids->nc[XX]; x++)
1763 for (y = 0; y < grids->nc[YY]; y++)
1765 for (z = 0; z < grids->nc[ZZ]; z++)
1767 pmegrid_init(&grids->grid_th[t],
1769 (n[XX]*(x ))/grids->nc[XX],
1770 (n[YY]*(y ))/grids->nc[YY],
1771 (n[ZZ]*(z ))/grids->nc[ZZ],
1772 (n[XX]*(x+1))/grids->nc[XX],
1773 (n[YY]*(y+1))/grids->nc[YY],
1774 (n[ZZ]*(z+1))/grids->nc[ZZ],
1777 grids->grid_all+GMX_CACHE_SEP+t*(gridsize+GMX_CACHE_SEP));
/* g2t[d][i]: contribution of grid line i in dim d to the owning thread
 * index; tfac folds the three dimensions into a single thread id */
1784 snew(grids->g2t, DIM);
1786 for (d = DIM-1; d >= 0; d--)
1788 snew(grids->g2t[d], n[d]);
1790 for (i = 0; i < n[d]; i++)
1792 /* The second check should match the parameters
1793 * of the pmegrid_init call above.
1795 while (t + 1 < grids->nc[d] && i >= (n[d]*(t+1))/grids->nc[d])
1799 grids->g2t[d][i] = t*tfac;
1802 tfac *= grids->nc[d];
/* Overlap (in grid lines) a thread may need from neighbors in dim d */
1806 case XX: max_comm_lines = overlap_x; break;
1807 case YY: max_comm_lines = overlap_y; break;
1808 case ZZ: max_comm_lines = pme_order - 1; break;
1810 grids->nthread_comm[d] = 0;
1811 while ((n[d]*grids->nthread_comm[d])/grids->nc[d] < max_comm_lines &&
1812 grids->nthread_comm[d] < grids->nc[d])
1814 grids->nthread_comm[d]++;
1818 fprintf(debug, "pmegrid thread grid communication range in %c: %d\n",
1819 'x'+d, grids->nthread_comm[d]);
1821 /* It should be possible to make grids->nthread_comm[d]==grids->nc[d]
1822 * work, but this is not a problematic restriction.
1824 if (grids->nc[d] > 1 && grids->nthread_comm[d] > grids->nc[d])
1826 gmx_fatal(FARGS, "Too many threads for PME (%d) compared to the number of grid lines, reduce the number of threads doing PME", grids->nthread);
/* Free the full grid and, when threading was used, all per-thread grids.
 * NOTE(review): grid_th[t].grid points into the shared grid_all buffer in
 * pmegrids_init -- confirm the per-thread sfree path matches allocation.
 */
1832 static void pmegrids_destroy(pmegrids_t *grids)
1836 if (grids->grid.grid != NULL)
1838 sfree(grids->grid.grid);
1840 if (grids->nthread > 0)
1842 for (t = 0; t < grids->nthread; t++)
1844 sfree(grids->grid_th[t].grid);
1846 sfree(grids->grid_th);
/* Grow the per-thread solve work arrays to hold at least nkx elements.
 * The SSE-touched arrays (denom, tmp1, eterm) are freed and re-allocated
 * aligned instead of srenew'ed, with 3 elements of padding.
 * NOTE(review): work->nalloc is presumably updated to >= nkx in an elided
 * statement before the srenew calls -- confirm.
 */
1852 static void realloc_work(pme_work_t *work, int nkx)
1854 if (nkx > work->nalloc)
1857 srenew(work->mhx, work->nalloc);
1858 srenew(work->mhy, work->nalloc);
1859 srenew(work->mhz, work->nalloc);
1860 srenew(work->m2, work->nalloc);
1861 /* Allocate an aligned pointer for SSE operations, including 3 extra
1862 * elements at the end since SSE operates on 4 elements at a time.
1864 sfree_aligned(work->denom);
1865 sfree_aligned(work->tmp1);
1866 sfree_aligned(work->eterm);
1867 snew_aligned(work->denom, work->nalloc+3, 16);
1868 snew_aligned(work->tmp1, work->nalloc+3, 16);
1869 snew_aligned(work->eterm, work->nalloc+3, 16);
1870 srenew(work->m2inv, work->nalloc);
/* Release the solve work arrays; the aligned buffers must be freed with
 * sfree_aligned to match their snew_aligned allocation in realloc_work. */
1875 static void free_work(pme_work_t *work)
1881 sfree_aligned(work->denom);
1882 sfree_aligned(work->tmp1);
1883 sfree_aligned(work->eterm);
1889 /* Calculate exponentials through SSE in float precision */
/* e[kx] = f * exp(r[kx]) / d[kx], vectorized 4-wide over [0, end).
 * The reciprocal uses _mm_rcp_ps refined with one Newton-Raphson step.
 * Note: 'start' is unused here; the loop begins at 0 -- presumably safe
 * because the padded leading elements are ignored by the caller (confirm).
 */
1890 inline static void calc_exponentials(int start, int end, real f, real *d_aligned, real *r_aligned, real *e_aligned)
1893 const __m128 two = _mm_set_ps(2.0f, 2.0f, 2.0f, 2.0f);
1896 __m128 tmp_d1, d_inv, tmp_r, tmp_e;
1898 f_sse = _mm_load1_ps(&f);
1899 for (kx = 0; kx < end; kx += 4)
1901 tmp_d1 = _mm_load_ps(d_aligned+kx);
/* One Newton-Raphson iteration on the rcp estimate: x1 = x0*(2 - d*x0) */
1902 lu = _mm_rcp_ps(tmp_d1);
1903 d_inv = _mm_mul_ps(lu, _mm_sub_ps(two, _mm_mul_ps(lu, tmp_d1)));
1904 tmp_r = _mm_load_ps(r_aligned+kx);
1905 tmp_r = gmx_mm_exp_ps(tmp_r);
1906 tmp_e = _mm_mul_ps(f_sse, d_inv);
1907 tmp_e = _mm_mul_ps(tmp_e, tmp_r);
1908 _mm_store_ps(e_aligned+kx, tmp_e);
/* Scalar fallback of calc_exponentials: e[kx] = f * exp-term / d-term over
 * [start, end). NOTE(review): the bodies of the first two loops (presumably
 * d[kx] = 1/d[kx] and r[kx] = exp(r[kx])) are elided from this view.
 */
1913 inline static void calc_exponentials(int start, int end, real f, real *d, real *r, real *e)
1916 for (kx = start; kx < end; kx++)
1920 for (kx = start; kx < end; kx++)
1924 for (kx = start; kx < end; kx++)
1926 e[kx] = f*r[kx]*d[kx];
/* Reciprocal-space part of PME: for this thread's slice of (ky,kz) lines,
 * scale the complex FFT grid by the Ewald influence function and, when
 * requested, accumulate energy and virial. Returns the number of grid
 * lines processed (load-balancing count). Grid layout is y-major,
 * z-middle, x-minor/contiguous.
 */
1932 static int solve_pme_yzx(gmx_pme_t pme, t_complex *grid,
1933 real ewaldcoeff, real vol,
1935 int nthread, int thread)
1937 /* do recip sum over local cells in grid */
1938 /* y major, z middle, x minor or continuous */
1940 int kx, ky, kz, maxkx, maxky, maxkz;
1941 int nx, ny, nz, iyz0, iyz1, iyz, iy, iz, kxstart, kxend;
1943 real factor = M_PI*M_PI/(ewaldcoeff*ewaldcoeff);
1944 real ets2, struct2, vfactor, ets2vf;
1945 real d1, d2, energy = 0;
1947 real virxx = 0, virxy = 0, virxz = 0, viryy = 0, viryz = 0, virzz = 0;
1948 real rxx, ryx, ryy, rzx, rzy, rzz;
1950 real *mhx, *mhy, *mhz, *m2, *denom, *tmp1, *eterm, *m2inv;
1951 real mhxk, mhyk, mhzk, m2k;
1954 ivec local_ndata, local_offset, local_size;
1957 elfac = ONE_4PI_EPS0/pme->epsilon_r;
1963 /* Dimensions should be identical for A/B grid, so we just use A here */
1964 gmx_parallel_3dfft_complex_limits(pme->pfft_setupA,
/* Reciprocal box vectors; lower-triangular for GROMACS boxes */
1970 rxx = pme->recipbox[XX][XX];
1971 ryx = pme->recipbox[YY][XX];
1972 ryy = pme->recipbox[YY][YY];
1973 rzx = pme->recipbox[ZZ][XX];
1974 rzy = pme->recipbox[ZZ][YY];
1975 rzz = pme->recipbox[ZZ][ZZ];
1981 work = &pme->work[thread];
1986 denom = work->denom;
1988 eterm = work->eterm;
1989 m2inv = work->m2inv;
/* Static partition of the (ky,kz) lines over threads */
1991 iyz0 = local_ndata[YY]*local_ndata[ZZ]* thread /nthread;
1992 iyz1 = local_ndata[YY]*local_ndata[ZZ]*(thread+1)/nthread;
1994 for (iyz = iyz0; iyz < iyz1; iyz++)
1996 iy = iyz/local_ndata[ZZ];
1997 iz = iyz - iy*local_ndata[ZZ];
1999 ky = iy + local_offset[YY];
2010 by = M_PI*vol*pme->bsp_mod[YY][ky];
2012 kz = iz + local_offset[ZZ];
2016 bz = pme->bsp_mod[ZZ][kz];
2018 /* 0.5 correction for corner points */
2020 if (kz == 0 || kz == (nz+1)/2)
2025 p0 = grid + iy*local_size[ZZ]*local_size[XX] + iz*local_size[XX];
2027 /* We should skip the k-space point (0,0,0) */
2028 if (local_offset[XX] > 0 || ky > 0 || kz > 0)
2030 kxstart = local_offset[XX];
2034 kxstart = local_offset[XX] + 1;
2037 kxend = local_offset[XX] + local_ndata[XX];
2041 /* More expensive inner loop, especially because of the storage
2042 * of the mh elements in array's.
2043 * Because x is the minor grid index, all mh elements
2044 * depend on kx for triclinic unit cells.
2047 /* Two explicit loops to avoid a conditional inside the loop */
2048 for (kx = kxstart; kx < maxkx; kx++)
2053 mhyk = mx * ryx + my * ryy;
2054 mhzk = mx * rzx + my * rzy + mz * rzz;
2055 m2k = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
2060 denom[kx] = m2k*bz*by*pme->bsp_mod[XX][kx];
2061 tmp1[kx] = -factor*m2k;
/* Second half: kx indices folded to negative frequencies */
2064 for (kx = maxkx; kx < kxend; kx++)
2069 mhyk = mx * ryx + my * ryy;
2070 mhzk = mx * rzx + my * rzy + mz * rzz;
2071 m2k = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
2076 denom[kx] = m2k*bz*by*pme->bsp_mod[XX][kx];
2077 tmp1[kx] = -factor*m2k;
2080 for (kx = kxstart; kx < kxend; kx++)
2082 m2inv[kx] = 1.0/m2[kx];
/* eterm[kx] = elfac * exp(-factor*m2) / denom */
2085 calc_exponentials(kxstart, kxend, elfac, denom, tmp1, eterm);
/* Apply the influence function and collect |S(m)|^2 for energy/virial */
2087 for (kx = kxstart; kx < kxend; kx++, p0++)
2092 p0->re = d1*eterm[kx];
2093 p0->im = d2*eterm[kx];
2095 struct2 = 2.0*(d1*d1+d2*d2);
2097 tmp1[kx] = eterm[kx]*struct2;
2100 for (kx = kxstart; kx < kxend; kx++)
2102 ets2 = corner_fac*tmp1[kx];
2103 vfactor = (factor*m2[kx] + 1.0)*2.0*m2inv[kx];
2106 ets2vf = ets2*vfactor;
2107 virxx += ets2vf*mhx[kx]*mhx[kx] - ets2;
2108 virxy += ets2vf*mhx[kx]*mhy[kx];
2109 virxz += ets2vf*mhx[kx]*mhz[kx];
2110 viryy += ets2vf*mhy[kx]*mhy[kx] - ets2;
2111 viryz += ets2vf*mhy[kx]*mhz[kx];
2112 virzz += ets2vf*mhz[kx]*mhz[kx] - ets2;
2117 /* We don't need to calculate the energy and the virial.
2118 * In this case the triclinic overhead is small.
2121 /* Two explicit loops to avoid a conditional inside the loop */
2123 for (kx = kxstart; kx < maxkx; kx++)
2128 mhyk = mx * ryx + my * ryy;
2129 mhzk = mx * rzx + my * rzy + mz * rzz;
2130 m2k = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
2131 denom[kx] = m2k*bz*by*pme->bsp_mod[XX][kx];
2132 tmp1[kx] = -factor*m2k;
2135 for (kx = maxkx; kx < kxend; kx++)
2140 mhyk = mx * ryx + my * ryy;
2141 mhzk = mx * rzx + my * rzy + mz * rzz;
2142 m2k = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
2143 denom[kx] = m2k*bz*by*pme->bsp_mod[XX][kx];
2144 tmp1[kx] = -factor*m2k;
2147 calc_exponentials(kxstart, kxend, elfac, denom, tmp1, eterm);
2149 for (kx = kxstart; kx < kxend; kx++, p0++)
2154 p0->re = d1*eterm[kx];
2155 p0->im = d2*eterm[kx];
2162 /* Update virial with local values.
2163 * The virial is symmetric by definition.
2164 * this virial seems ok for isotropic scaling, but I'm
2165 * experiencing problems on semiisotropic membranes.
2166 * IS THAT COMMENT STILL VALID??? (DvdS, 2001/02/07).
2168 work->vir[XX][XX] = 0.25*virxx;
2169 work->vir[YY][YY] = 0.25*viryy;
2170 work->vir[ZZ][ZZ] = 0.25*virzz;
2171 work->vir[XX][YY] = work->vir[YY][XX] = 0.25*virxy;
2172 work->vir[XX][ZZ] = work->vir[ZZ][XX] = 0.25*virxz;
2173 work->vir[YY][ZZ] = work->vir[ZZ][YY] = 0.25*viryz;
2175 /* This energy should be corrected for a charged system */
2176 work->energy = 0.5*energy;
2179 /* Return the loop count */
2180 return local_ndata[YY]*local_ndata[XX];
/* Reduce per-thread solve results: sum the mesh energy and virial that
 * each thread stored in pme->work[thread] into a single output. */
2183 static void get_pme_ener_vir(const gmx_pme_t pme, int nthread,
2184 real *mesh_energy, matrix vir)
2186 /* This function sums output over threads
2187 * and should therefore only be called after thread synchronization.
2191 *mesh_energy = pme->work[0].energy;
2192 copy_mat(pme->work[0].vir, vir);
2194 for (thread = 1; thread < nthread; thread++)
2196 *mesh_energy += pme->work[thread].energy;
2197 m_add(vir, pme->work[thread].vir, vir);
/* DO_FSPLINE(order): gather the force contribution for one atom by
 * summing grid values weighted by the B-spline values (thx/thy/thz) and
 * their derivatives (dthx/dthy/dthz) over an order^3 stencil at (i0,j0,k0).
 * Accumulates into fx/fy/fz via intermediates (fxy1/fz1) expected in scope
 * at the expansion site.
 */
2201 #define DO_FSPLINE(order) \
2202 for (ithx = 0; (ithx < order); ithx++) \
2204 index_x = (i0+ithx)*pny*pnz; \
2208 for (ithy = 0; (ithy < order); ithy++) \
2210 index_xy = index_x+(j0+ithy)*pnz; \
2215 for (ithz = 0; (ithz < order); ithz++) \
2217 gval = grid[index_xy+(k0+ithz)]; \
2218 fxy1 += thz[ithz]*gval; \
2219 fz1 += dthz[ithz]*gval; \
/* Interpolate forces for this spline set's atoms from the potential grid:
 * the gradient of the B-spline interpolation, transformed by the
 * reciprocal box, scaled by -q*scale and accumulated into atc->f.
 */
2228 static void gather_f_bsplines(gmx_pme_t pme, real *grid,
2229 gmx_bool bClearF, pme_atomcomm_t *atc,
2230 splinedata_t *spline,
2233 /* sum forces for local particles */
2234 int nn, n, ithx, ithy, ithz, i0, j0, k0;
2235 int index_x, index_xy;
2236 int nx, ny, nz, pnx, pny, pnz;
2238 real tx, ty, dx, dy, qn;
2239 real fx, fy, fz, gval;
2241 real *thx, *thy, *thz, *dthx, *dthy, *dthz;
2243 real rxx, ryx, ryy, rzx, rzy, rzz;
2246 pme_spline_work_t *work;
2248 work = pme->spline_work;
2250 order = pme->pme_order;
2251 thx = spline->theta[XX];
2252 thy = spline->theta[YY];
2253 thz = spline->theta[ZZ];
2254 dthx = spline->dtheta[XX];
2255 dthy = spline->dtheta[YY];
2256 dthz = spline->dtheta[ZZ];
/* Padded local grid dimensions */
2260 pnx = pme->pmegrid_nx;
2261 pny = pme->pmegrid_ny;
2262 pnz = pme->pmegrid_nz;
2264 rxx = pme->recipbox[XX][XX];
2265 ryx = pme->recipbox[YY][XX];
2266 ryy = pme->recipbox[YY][YY];
2267 rzx = pme->recipbox[ZZ][XX];
2268 rzy = pme->recipbox[ZZ][YY];
2269 rzz = pme->recipbox[ZZ][ZZ];
2271 for (nn = 0; nn < spline->n; nn++)
2273 n = spline->ind[nn];
2274 qn = scale*atc->q[n];
2287 idxptr = atc->idx[n];
2294 /* Pointer arithmetic alert, next six statements */
2295 thx = spline->theta[XX] + norder;
2296 thy = spline->theta[YY] + norder;
2297 thz = spline->theta[ZZ] + norder;
2298 dthx = spline->dtheta[XX] + norder;
2299 dthy = spline->dtheta[YY] + norder;
2300 dthz = spline->dtheta[ZZ] + norder;
/* SSE fast paths for the force gather, mirroring the spread kernels */
2306 #ifdef PME_SSE_UNALIGNED
2307 #define PME_GATHER_F_SSE_ORDER4
2309 #define PME_GATHER_F_SSE_ALIGNED
2312 #include "pme_sse_single.h"
2319 #define PME_GATHER_F_SSE_ALIGNED
2321 #include "pme_sse_single.h"
/* Chain rule: fractional-coordinate gradients times reciprocal box
 * (lower triangular), with -q to convert potential gradient to force */
2331 atc->f[n][XX] += -qn*( fx*nx*rxx );
2332 atc->f[n][YY] += -qn*( fx*nx*ryx + fy*ny*ryy );
2333 atc->f[n][ZZ] += -qn*( fx*nx*rzx + fy*ny*rzy + fz*nz*rzz );
2336 /* Since the energy and not forces are interpolated
2337 * the net force might not be exactly zero.
2338 * This can be solved by also interpolating F, but
2339 * that comes at a cost.
2340 * A better hack is to remove the net force every
2341 * step, but that must be done at a higher level
2342 * since this routine doesn't see all atoms if running
2343 * in parallel. Don't know how important it is? EL 990726
/* Interpolate the potential energy for all atoms of this atom-comm from
 * the grid using B-spline weights (no forces). Returns the accumulated
 * energy; used e.g. for free-energy foreign lambda evaluations.
 */
2348 static real gather_energy_bsplines(gmx_pme_t pme, real *grid,
2349 pme_atomcomm_t *atc)
2351 splinedata_t *spline;
2352 int n, ithx, ithy, ithz, i0, j0, k0;
2353 int index_x, index_xy;
2355 real energy, pot, tx, ty, qn, gval;
2356 real *thx, *thy, *thz;
2360 spline = &atc->spline[0];
2362 order = pme->pme_order;
2365 for (n = 0; (n < atc->n); n++)
2371 idxptr = atc->idx[n];
2378 /* Pointer arithmetic alert, next three statements */
2379 thx = spline->theta[XX] + norder;
2380 thy = spline->theta[YY] + norder;
2381 thz = spline->theta[ZZ] + norder;
/* Tensor-product accumulation of the potential at this atom */
2384 for (ithx = 0; (ithx < order); ithx++)
2386 index_x = (i0+ithx)*pme->pmegrid_ny*pme->pmegrid_nz;
2389 for (ithy = 0; (ithy < order); ithy++)
2391 index_xy = index_x+(j0+ithy)*pme->pmegrid_nz;
2394 for (ithz = 0; (ithz < order); ithz++)
2396 gval = grid[index_xy+(k0+ithz)];
2397 pot += tx*ty*thz[ithz]*gval;
2410 /* Macro to force loop unrolling by fixing order.
2411 * This gives a significant performance gain.
/* CALC_SPLINE(order): build the cardinal B-spline weights (data) and their
 * derivatives (ddata) of the given order for atom i's fractional offset dr
 * in each dimension, via the standard B-spline recursion, and store them
 * into theta/dtheta. Expects i, dr, theta, dtheta and the loop variables
 * in scope at the expansion site.
 */
2413 #define CALC_SPLINE(order) \
2417 real data[PME_ORDER_MAX]; \
2418 real ddata[PME_ORDER_MAX]; \
2420 for (j = 0; (j < DIM); j++) \
2424 /* dr is relative offset from lower cell limit */ \
2425 data[order-1] = 0; \
2429 for (k = 3; (k < order); k++) \
2431 div = 1.0/(k - 1.0); \
2432 data[k-1] = div*dr*data[k-2]; \
2433 for (l = 1; (l < (k-1)); l++) \
2435 data[k-l-1] = div*((dr+l)*data[k-l-2]+(k-l-dr)* \
2438 data[0] = div*(1-dr)*data[0]; \
2440 /* differentiate */ \
2441 ddata[0] = -data[0]; \
2442 for (k = 1; (k < order); k++) \
2444 ddata[k] = data[k-1] - data[k]; \
2447 div = 1.0/(order - 1); \
2448 data[order-1] = div*dr*data[order-2]; \
2449 for (l = 1; (l < (order-1)); l++) \
2451 data[order-l-1] = div*((dr+l)*data[order-l-2]+ \
2452 (order-l-dr)*data[order-l-1]); \
2454 data[0] = div*(1 - dr)*data[0]; \
2456 for (k = 0; k < order; k++) \
2458 theta[j][i*order+k] = data[k]; \
2459 dtheta[j][i*order+k] = ddata[k]; \
/* Build B-spline interpolation coefficients (theta) and derivatives
 * (dtheta) of the given order for the nr listed atoms at fractional
 * coordinates fractx. Atoms with zero charge are skipped unless running
 * free-energy, where the charge check is disabled. Orders 4 and 5 use
 * fixed-order CALC_SPLINE expansions for full loop unrolling.
 */
2464 void make_bsplines(splinevec theta, splinevec dtheta, int order,
2465 rvec fractx[], int nr, int ind[], real charge[],
2466 gmx_bool bFreeEnergy)
2468 /* construct splines for local atoms */
2472 for (i = 0; i < nr; i++)
2474 /* With free energy we do not use the charge check.
2475 * In most cases this will be more efficient than calling make_bsplines
2476 * twice, since usually more than half the particles have charges.
2479 if (bFreeEnergy || charge[ii] != 0.0)
2484 case 4: CALC_SPLINE(4); break;
2485 case 5: CALC_SPLINE(5); break;
2486 default: CALC_SPLINE(order); break;
/* mod[i] = |DFT(data)[i]|^2 for an ndata-point sequence, computed by
 * direct summation (O(ndata^2); ndata is a grid dimension, so small).
 * A second pass smooths entries, presumably where the modulus is near
 * zero (the guarding condition is elided from this view -- confirm).
 */
2493 void make_dft_mod(real *mod, real *data, int ndata)
2498 for (i = 0; i < ndata; i++)
2501 for (j = 0; j < ndata; j++)
2503 arg = (2.0*M_PI*i*j)/ndata;
2504 sc += data[j]*cos(arg);
2505 ss += data[j]*sin(arg);
2507 mod[i] = sc*sc+ss*ss;
2509 for (i = 0; i < ndata; i++)
2513 mod[i] = (mod[i-1]+mod[i+1])*0.5;
/* Compute the squared DFT moduli of the order-'order' cardinal B-spline
 * for each grid dimension (bsp_mod[XX/YY/ZZ]), used as the normalization
 * in the PME influence function. The spline values are built by the same
 * recursion as CALC_SPLINE evaluated at integer knots (dr = 0).
 */
2519 static void make_bspline_moduli(splinevec bsp_mod,
2520 int nx, int ny, int nz, int order)
2522 int nmax = max(nx, max(ny, nz));
2523 real *data, *ddata, *bsp_data;
2529 snew(bsp_data, nmax);
/* B-spline recursion up to the requested order */
2535 for (k = 3; k < order; k++)
2539 for (l = 1; l < (k-1); l++)
2541 data[k-l-1] = div*(l*data[k-l-2]+(k-l)*data[k-l-1]);
2543 data[0] = div*data[0];
2546 ddata[0] = -data[0];
2547 for (k = 1; k < order; k++)
2549 ddata[k] = data[k-1]-data[k];
2551 div = 1.0/(order-1);
2553 for (l = 1; l < (order-1); l++)
2555 data[order-l-1] = div*(l*data[order-l-2]+(order-l)*data[order-l-1]);
2557 data[0] = div*data[0];
/* Place spline values at knots 1..order of the (zero-padded) sequence */
2559 for (i = 0; i < nmax; i++)
2563 for (i = 1; i <= order; i++)
2565 bsp_data[i] = data[i-1];
2568 make_dft_mod(bsp_mod[XX], bsp_data, nx);
2569 make_dft_mod(bsp_mod[YY], bsp_data, ny);
2570 make_dft_mod(bsp_mod[ZZ], bsp_data, nz);
2578 /* Return the P3M optimal influence function */
/* Closed-form truncated series in z^2 for assignment orders 2..8;
 * the caller passes z = sin(zai), see make_p3m_bspline_moduli_dim.
 * NOTE(review): the switch(order) and z2/z4 setup are elided here.
 */
2579 static double do_p3m_influence(double z, int order)
2586 /* The formula and most constants can be found in:
2587 * Ballenegger et al., JCTC 8, 936 (2012)
2592 return 1.0 - 2.0*z2/3.0;
2595 return 1.0 - z2 + 2.0*z4/15.0;
2598 return 1.0 - 4.0*z2/3.0 + 2.0*z4/5.0 + 4.0*z2*z4/315.0;
2601 return 1.0 - 5.0*z2/3.0 + 7.0*z4/9.0 - 17.0*z2*z4/189.0 + 2.0*z4*z4/2835.0;
2604 return 1.0 - 2.0*z2 + 19.0*z4/15.0 - 256.0*z2*z4/945.0 + 62.0*z4*z4/4725.0 + 4.0*z2*z4*z4/155925.0;
2607 return 1.0 - 7.0*z2/3.0 + 28.0*z4/15.0 - 16.0*z2*z4/27.0 + 26.0*z4*z4/405.0 - 2.0*z2*z4*z4/1485.0 + 4.0*z4*z4*z4/6081075.0;
2609 return 1.0 - 8.0*z2/3.0 + 116.0*z4/45.0 - 344.0*z2*z4/315.0 + 914.0*z4*z4/4725.0 - 248.0*z4*z4*z2/22275.0 + 21844.0*z4*z4*z4/212837625.0 - 8.0*z4*z4*z4*z2/638512875.0;
2616 /* Calculate the P3M B-spline moduli for one dimension */
/* Fill bsp_mod[0..n) with the analytically-derived P3M moduli: negative
 * frequencies (i < 0) are stored wrapped at bsp_mod[n+i], positive at
 * bsp_mod[i]. zai/sinzai setup and maxk come from elided lines -- confirm.
 */
2617 static void make_p3m_bspline_moduli_dim(real *bsp_mod, int n, int order)
2619 double zarg, zai, sinzai, infl;
2624 gmx_fatal(FARGS, "The current P3M code only supports orders up to 8");
2631 for (i = -maxk; i < 0; i++)
2635 infl = do_p3m_influence(sinzai, order);
2636 bsp_mod[n+i] = infl*infl*pow(sinzai/zai, -2.0*order);
2639 for (i = 1; i < maxk; i++)
2643 infl = do_p3m_influence(sinzai, order);
2644 bsp_mod[i] = infl*infl*pow(sinzai/zai, -2.0*order);
2648 /* Calculate the P3M B-spline moduli */
/* Apply the per-dimension P3M moduli calculation to all three axes. */
2649 static void make_p3m_bspline_moduli(splinevec bsp_mod,
2650 int nx, int ny, int nz, int order)
2652 make_p3m_bspline_moduli_dim(bsp_mod[XX], nx, order);
2653 make_p3m_bspline_moduli_dim(bsp_mod[YY], ny, order);
2654 make_p3m_bspline_moduli_dim(bsp_mod[ZZ], nz, order);
/* Build the send/receive node orderings for slab coordinate redistribution:
 * pair ranks at increasing slab distance i, forward (fw) and backward (bw)
 * with wrap-around, so communication alternates between the two directions.
 */
2658 static void setup_coordinate_communication(pme_atomcomm_t *atc)
2666 for (i = 1; i <= nslab/2; i++)
2668 fw = (atc->nodeid + i) % nslab;
2669 bw = (atc->nodeid - i + nslab) % nslab;
2672 atc->node_dest[n] = fw;
2673 atc->node_src[n] = bw;
2678 atc->node_dest[n] = bw;
2679 atc->node_src[n] = fw;
/* Free all PME data structures: index tables, A (and, when allocated, B)
 * grids with their FFT setups, and the per-thread solve work arrays. */
2685 int gmx_pme_destroy(FILE *log, gmx_pme_t *pmedata)
2691 fprintf(log, "Destroying PME data structures.\n");
2694 sfree((*pmedata)->nnx);
2695 sfree((*pmedata)->nny);
2696 sfree((*pmedata)->nnz);
2698 pmegrids_destroy(&(*pmedata)->pmegridA);
2700 sfree((*pmedata)->fftgridA);
2701 sfree((*pmedata)->cfftgridA);
2702 gmx_parallel_3dfft_destroy((*pmedata)->pfft_setupA);
/* The B grid only exists for free-energy (perturbed-charge) runs */
2704 if ((*pmedata)->pmegridB.grid.grid != NULL)
2706 pmegrids_destroy(&(*pmedata)->pmegridB);
2707 sfree((*pmedata)->fftgridB);
2708 sfree((*pmedata)->cfftgridB);
2709 gmx_parallel_3dfft_destroy((*pmedata)->pfft_setupB);
2711 for (thread = 0; thread < (*pmedata)->nthread; thread++)
2713 free_work(&(*pmedata)->work[thread]);
2715 sfree((*pmedata)->work);
/* Round n up to the nearest multiple of f.
 * Intended for non-negative n with a positive f.
 */
static int mult_up(int n, int f)
{
    int nblocks = (n + f - 1)/f;

    return nblocks*f;
}
/* Estimate the FFT+solve load imbalance from the grid dimensions and the
 * major/minor PME node counts: ratio of the padded (rounded-up) work over
 * the ideal perfectly-divisible work. Values >> 1 mean poor divisibility. */
2729 static double pme_load_imbalance(gmx_pme_t pme)
2734 nma = pme->nnodes_major;
2735 nmi = pme->nnodes_minor;
/* Work in the three transposed grid layouts used by the 2D-decomposed FFT */
2737 n1 = mult_up(pme->nkx, nma)*mult_up(pme->nky, nmi)*pme->nkz;
2738 n2 = mult_up(pme->nkx, nma)*mult_up(pme->nkz, nmi)*pme->nky;
2739 n3 = mult_up(pme->nky, nma)*mult_up(pme->nkz, nmi)*pme->nkx;
2741 /* pme_solve is roughly double the cost of an fft */
2743 return (n1 + n2 + 3*n3)/(double)(6*pme->nkx*pme->nky*pme->nkz);
/* Initialize the atom communication structure for one decomposition
 * dimension: MPI slab info, per-slab send/recv orderings, per-thread atom
 * counts, and per-thread spline data with cache-separated thread lists.
 */
2746 static void init_atomcomm(gmx_pme_t pme, pme_atomcomm_t *atc, t_commrec *cr,
2747 int dimind, gmx_bool bSpread)
2749 int nk, k, s, thread;
2751 atc->dimind = dimind;
2756 if (pme->nnodes > 1)
2758 atc->mpi_comm = pme->mpi_comm_d[dimind];
2759 MPI_Comm_size(atc->mpi_comm, &atc->nslab);
2760 MPI_Comm_rank(atc->mpi_comm, &atc->nodeid);
2764 fprintf(debug, "For PME atom communication in dimind %d: nslab %d rank %d\n", atc->dimind, atc->nslab, atc->nodeid);
2768 atc->bSpread = bSpread;
2769 atc->pme_order = pme->pme_order;
2773 /* These three allocations are not required for particle decomp. */
2774 snew(atc->node_dest, atc->nslab);
2775 snew(atc->node_src, atc->nslab);
2776 setup_coordinate_communication(atc);
/* Per-thread slab counters; thread 0's array doubles as atc->count */
2778 snew(atc->count_thread, pme->nthread);
2779 for (thread = 0; thread < pme->nthread; thread++)
2781 snew(atc->count_thread[thread], atc->nslab);
2783 atc->count = atc->count_thread[0];
2784 snew(atc->rcount, atc->nslab);
2785 snew(atc->buf_index, atc->nslab);
2788 atc->nthread = pme->nthread;
2789 if (atc->nthread > 1)
2791 snew(atc->thread_plist, atc->nthread);
2793 snew(atc->spline, atc->nthread);
2794 for (thread = 0; thread < atc->nthread; thread++)
2796 if (atc->nthread > 1)
/* GMX_CACHE_SEP padding on both sides to avoid false sharing */
2798 snew(atc->thread_plist[thread].n, atc->nthread+2*GMX_CACHE_SEP);
2799 atc->thread_plist[thread].n += GMX_CACHE_SEP;
2801 snew(atc->spline[thread].thread_one, pme->nthread);
2802 atc->spline[thread].thread_one[thread] = 1;
/* Set up grid overlap communication for one decomposition dimension:
 * compute the slab boundaries (s2g0/s2g1), the number of overlap
 * neighbors, per-neighbor send/recv index ranges, exchange the receive
 * sizes over MPI, and allocate the communication buffers.
 */
2807 init_overlap_comm(pme_overlap_t * ol,
2817 int lbnd, rbnd, maxlr, b, i;
2820 pme_grid_comm_t *pgc;
2822 int fft_start, fft_end, send_index1, recv_index1;
2826 ol->mpi_comm = comm;
2829 ol->nnodes = nnodes;
2830 ol->nodeid = nodeid;
2832 /* Linear translation of the PME grid won't affect reciprocal space
2833 * calculations, so to optimize we only interpolate "upwards",
2834 * which also means we only have to consider overlap in one direction.
2835 * I.e., particles on this node might also be spread to grid indices
2836 * that belong to higher nodes (modulo nnodes)
2839 snew(ol->s2g0, ol->nnodes+1);
2840 snew(ol->s2g1, ol->nnodes);
2843 fprintf(debug, "PME slab boundaries:");
2845 for (i = 0; i < nnodes; i++)
2847 /* s2g0 the local interpolation grid start.
2848 * s2g1 the local interpolation grid end.
2849 * Because grid overlap communication only goes forward,
2850 * the grid the slabs for fft's should be rounded down.
2852 ol->s2g0[i] = ( i *ndata + 0 )/nnodes;
2853 ol->s2g1[i] = ((i+1)*ndata + nnodes-1)/nnodes + norder - 1;
2857 fprintf(debug, " %3d %3d", ol->s2g0[i], ol->s2g1[i]);
2860 ol->s2g0[nnodes] = ndata;
2863 fprintf(debug, "\n");
2866 /* Determine with how many nodes we need to communicate the grid overlap */
2872 for (i = 0; i < nnodes; i++)
2874 if ((i+b < nnodes && ol->s2g1[i] > ol->s2g0[i+b]) ||
2875 (i+b >= nnodes && ol->s2g1[i] > ol->s2g0[i+b-nnodes] + ndata))
2881 while (bCont && b < nnodes);
2882 ol->noverlap_nodes = b - 1;
2884 snew(ol->send_id, ol->noverlap_nodes);
2885 snew(ol->recv_id, ol->noverlap_nodes);
2886 for (b = 0; b < ol->noverlap_nodes; b++)
2888 ol->send_id[b] = (ol->nodeid + (b + 1)) % ol->nnodes;
2889 ol->recv_id[b] = (ol->nodeid - (b + 1) + ol->nnodes) % ol->nnodes;
2891 snew(ol->comm_data, ol->noverlap_nodes);
2894 for (b = 0; b < ol->noverlap_nodes; b++)
2896 pgc = &ol->comm_data[b];
/* Send to the range of the receiving node's FFT slab */
2898 fft_start = ol->s2g0[ol->send_id[b]];
2899 fft_end = ol->s2g0[ol->send_id[b]+1];
2900 if (ol->send_id[b] < nodeid)
2905 send_index1 = ol->s2g1[nodeid];
2906 send_index1 = min(send_index1, fft_end);
2907 pgc->send_index0 = fft_start;
2908 pgc->send_nindex = max(0, send_index1 - pgc->send_index0);
2909 ol->send_size += pgc->send_nindex;
2911 /* We always start receiving to the first index of our slab */
2912 fft_start = ol->s2g0[ol->nodeid];
2913 fft_end = ol->s2g0[ol->nodeid+1];
2914 recv_index1 = ol->s2g1[ol->recv_id[b]];
2915 if (ol->recv_id[b] > nodeid)
2917 recv_index1 -= ndata;
2919 recv_index1 = min(recv_index1, fft_end);
2920 pgc->recv_index0 = fft_start;
2921 pgc->recv_nindex = max(0, recv_index1 - pgc->recv_index0);
2925 /* Communicate the buffer sizes to receive */
2926 for (b = 0; b < ol->noverlap_nodes; b++)
2928 MPI_Sendrecv(&ol->send_size, 1, MPI_INT, ol->send_id[b], b,
2929 &ol->comm_data[b].recv_size, 1, MPI_INT, ol->recv_id[b], b,
2930 ol->mpi_comm, &stat);
2934 /* For non-divisible grid we need pme_order iso pme_order-1 */
2935 snew(ol->sendbuf, norder*commplainsize);
2936 snew(ol->recvbuf, norder*commplainsize);
/* Build a 5n-long lookup table mapping (shifted) global grid indices to
 * local indices, plus a matching fraction-shift table that compensates
 * for the one-cell index corrections applied at the local grid edges.
 */
2940 make_gridindex5_to_localindex(int n, int local_start, int local_range,
2941 int **global_to_local,
2942 real **fraction_shift)
2950 for (i = 0; (i < 5*n); i++)
2952 /* Determine the global to local grid index */
2953 gtl[i] = (i - local_start + n) % n;
2954 /* For coordinates that fall within the local grid the fraction
2955 * is correct, we don't need to shift it.
2958 if (local_range < n)
2960 /* Due to rounding issues i could be 1 beyond the lower or
2961 * upper boundary of the local grid. Correct the index for this.
2962 * If we shift the index, we need to shift the fraction by
2963 * the same amount in the other direction to not affect
2965 * Note that due to this shifting the weights at the end of
2966 * the spline might change, but that will only involve values
2967 * between zero and values close to the precision of a real,
2968 * which is anyhow the accuracy of the whole mesh calculation.
2970 /* With local_range=0 we should not change i=local_start */
2971 if (i % n != local_start)
2978 else if (gtl[i] == local_range)
2980 gtl[i] = local_range - 1;
2987 *global_to_local = gtl;
2988 *fraction_shift = fsh;
/* Allocate the aligned spline work structure and precompute the SSE bit
 * masks that select 'order' valid elements out of the two 4-float
 * registers (8 grid entries) loaded per inner iteration, for each of the
 * possible alignment offsets 'of'. */
2991 static pme_spline_work_t *make_pme_spline_work(int order)
2993 pme_spline_work_t *work;
3000 snew_aligned(work, 1, 16);
3002 zero_SSE = _mm_setzero_ps();
3004 /* Generate bit masks to mask out the unused grid entries,
3005 * as we only operate on order of the 8 grid entries that are
3006 * load into 2 SSE float registers.
3008 for (of = 0; of < 8-(order-1); of++)
3010 for (i = 0; i < 8; i++)
3012 tmp[i] = (i >= of && i < of+order ? 1 : 0);
/* Convert the 0/1 floats into all-zeros/all-ones comparison masks */
3014 work->mask_SSE0[of] = _mm_loadu_ps(tmp);
3015 work->mask_SSE1[of] = _mm_loadu_ps(tmp+4);
3016 work->mask_SSE0[of] = _mm_cmpgt_ps(work->mask_SSE0[of], zero_SSE);
3017 work->mask_SSE1[of] = _mm_cmpgt_ps(work->mask_SSE1[of], zero_SSE);
/* Ensure grid dimension *nk is divisible by the number of PME nodes in
 * that dimension: round it up to the next multiple, aborting if that
 * would grow the grid by 50% or more, and note the change to fplog. */
3027 gmx_pme_check_grid_restrictions(FILE *fplog, char dim, int nnodes, int *nk)
3031 if (*nk % nnodes != 0)
3033 nk_new = nnodes*(*nk/nnodes + 1);
3035 if (2*nk_new >= 3*(*nk))
3037 gmx_fatal(FARGS, "The PME grid size in dim %c (%d) is not divisble by the number of nodes doing PME in dim %c (%d). The grid size would have to be increased by more than 50%% to make the grid divisible. Change the total number of nodes or the number of domain decomposition cells in x or the PME grid %c dimension (and the cut-off).",
3038 dim, *nk, dim, nnodes, dim);
3043 fprintf(fplog, "\nNOTE: The PME grid size in dim %c (%d) is not divisble by the number of nodes doing PME in dim %c (%d). Increasing the PME grid size in dim %c to %d. This will increase the accuracy and will not decrease the performance significantly on this number of nodes. For optimal performance change the total number of nodes or the number of domain decomposition cells in x or the PME grid %c dimension (and the cut-off).\n\n",
3044 dim, *nk, dim, nnodes, dim, nk_new, dim);
3051 int gmx_pme_init(gmx_pme_t * pmedata,
3057 gmx_bool bFreeEnergy,
3058 gmx_bool bReproducible,
3061 gmx_pme_t pme = NULL;
3063 pme_atomcomm_t *atc;
3068 fprintf(debug, "Creating PME data structures.\n");
3072 pme->redist_init = FALSE;
3073 pme->sum_qgrid_tmp = NULL;
3074 pme->sum_qgrid_dd_tmp = NULL;
3075 pme->buf_nalloc = 0;
3076 pme->redist_buf_nalloc = 0;
3079 pme->bPPnode = TRUE;
3081 pme->nnodes_major = nnodes_major;
3082 pme->nnodes_minor = nnodes_minor;
3085 if (nnodes_major*nnodes_minor > 1)
3087 pme->mpi_comm = cr->mpi_comm_mygroup;
3089 MPI_Comm_rank(pme->mpi_comm, &pme->nodeid);
3090 MPI_Comm_size(pme->mpi_comm, &pme->nnodes);
3091 if (pme->nnodes != nnodes_major*nnodes_minor)
3093 gmx_incons("PME node count mismatch");
3098 pme->mpi_comm = MPI_COMM_NULL;
3102 if (pme->nnodes == 1)
3105 pme->mpi_comm_d[0] = MPI_COMM_NULL;
3106 pme->mpi_comm_d[1] = MPI_COMM_NULL;
3108 pme->ndecompdim = 0;
3109 pme->nodeid_major = 0;
3110 pme->nodeid_minor = 0;
3112 pme->mpi_comm_d[0] = pme->mpi_comm_d[1] = MPI_COMM_NULL;
3117 if (nnodes_minor == 1)
3120 pme->mpi_comm_d[0] = pme->mpi_comm;
3121 pme->mpi_comm_d[1] = MPI_COMM_NULL;
3123 pme->ndecompdim = 1;
3124 pme->nodeid_major = pme->nodeid;
3125 pme->nodeid_minor = 0;
3128 else if (nnodes_major == 1)
3131 pme->mpi_comm_d[0] = MPI_COMM_NULL;
3132 pme->mpi_comm_d[1] = pme->mpi_comm;
3134 pme->ndecompdim = 1;
3135 pme->nodeid_major = 0;
3136 pme->nodeid_minor = pme->nodeid;
3140 if (pme->nnodes % nnodes_major != 0)
3142 gmx_incons("For 2D PME decomposition, #PME nodes must be divisible by the number of nodes in the major dimension");
3144 pme->ndecompdim = 2;
3147 MPI_Comm_split(pme->mpi_comm, pme->nodeid % nnodes_minor,
3148 pme->nodeid, &pme->mpi_comm_d[0]); /* My communicator along major dimension */
3149 MPI_Comm_split(pme->mpi_comm, pme->nodeid/nnodes_minor,
3150 pme->nodeid, &pme->mpi_comm_d[1]); /* My communicator along minor dimension */
3152 MPI_Comm_rank(pme->mpi_comm_d[0], &pme->nodeid_major);
3153 MPI_Comm_size(pme->mpi_comm_d[0], &pme->nnodes_major);
3154 MPI_Comm_rank(pme->mpi_comm_d[1], &pme->nodeid_minor);
3155 MPI_Comm_size(pme->mpi_comm_d[1], &pme->nnodes_minor);
3158 pme->bPPnode = (cr->duty & DUTY_PP);
3161 pme->nthread = nthread;
3163 if (ir->ePBC == epbcSCREW)
3165 gmx_fatal(FARGS, "pme does not (yet) work with pbc = screw");
3168 pme->bFEP = ((ir->efep != efepNO) && bFreeEnergy);
3172 pme->bP3M = (ir->coulombtype == eelP3M_AD || getenv("GMX_PME_P3M") != NULL);
3173 pme->pme_order = ir->pme_order;
3174 pme->epsilon_r = ir->epsilon_r;
3176 if (pme->pme_order > PME_ORDER_MAX)
3178 gmx_fatal(FARGS, "pme_order (%d) is larger than the maximum allowed value (%d). Modify and recompile the code if you really need such a high order.",
3179 pme->pme_order, PME_ORDER_MAX);
3182 /* Currently pme.c supports only the fft5d FFT code.
3183 * Therefore the grid always needs to be divisible by nnodes.
3184 * When the old 1D code is also supported again, change this check.
3186 * This check should be done before calling gmx_pme_init
3187 * and fplog should be passed iso stderr.
3189 if (pme->ndecompdim >= 2)
3191 if (pme->ndecompdim >= 1)
3194 gmx_pme_check_grid_restrictions(pme->nodeid==0 ? stderr : NULL,
3195 'x',nnodes_major,&pme->nkx);
3196 gmx_pme_check_grid_restrictions(pme->nodeid==0 ? stderr : NULL,
3197 'y',nnodes_minor,&pme->nky);
3201 if (pme->nkx <= pme->pme_order*(pme->nnodes_major > 1 ? 2 : 1) ||
3202 pme->nky <= pme->pme_order*(pme->nnodes_minor > 1 ? 2 : 1) ||
3203 pme->nkz <= pme->pme_order)
3205 gmx_fatal(FARGS, "The PME grid sizes need to be larger than pme_order (%d) and for dimensions with domain decomposition larger than 2*pme_order", pme->pme_order);
3208 if (pme->nnodes > 1)
3213 MPI_Type_contiguous(DIM, mpi_type, &(pme->rvec_mpi));
3214 MPI_Type_commit(&(pme->rvec_mpi));
3217 /* Note that the charge spreading and force gathering, which usually
3218 * takes about the same amount of time as FFT+solve_pme,
3219 * is always fully load balanced
3220 * (unless the charge distribution is inhomogeneous).
3223 imbal = pme_load_imbalance(pme);
3224 if (imbal >= 1.2 && pme->nodeid_major == 0 && pme->nodeid_minor == 0)
3228 "NOTE: The load imbalance in PME FFT and solve is %d%%.\n"
3229 " For optimal PME load balancing\n"
3230 " PME grid_x (%d) and grid_y (%d) should be divisible by #PME_nodes_x (%d)\n"
3231 " and PME grid_y (%d) and grid_z (%d) should be divisible by #PME_nodes_y (%d)\n"
3233 (int)((imbal-1)*100 + 0.5),
3234 pme->nkx, pme->nky, pme->nnodes_major,
3235 pme->nky, pme->nkz, pme->nnodes_minor);
3239 /* For non-divisible grid we need pme_order iso pme_order-1 */
3240 /* In sum_qgrid_dd x overlap is copied in place: take padding into account.
3241 * y is always copied through a buffer: we don't need padding in z,
3242 * but we do need the overlap in x because of the communication order.
3244 init_overlap_comm(&pme->overlap[0], pme->pme_order,
3248 pme->nnodes_major, pme->nodeid_major,
3250 (div_round_up(pme->nky, pme->nnodes_minor)+pme->pme_order)*(pme->nkz+pme->pme_order-1));
3252 /* Along overlap dim 1 we can send in multiple pulses in sum_fftgrid_dd.
3253 * We do this with an offset buffer of equal size, so we need to allocate
3254 * extra for the offset. That's what the (+1)*pme->nkz is for.
3256 init_overlap_comm(&pme->overlap[1], pme->pme_order,
3260 pme->nnodes_minor, pme->nodeid_minor,
3262 (div_round_up(pme->nkx, pme->nnodes_major)+pme->pme_order+1)*pme->nkz);
3264 /* Check for a limitation of the (current) sum_fftgrid_dd code.
3265 * We only allow multiple communication pulses in dim 1, not in dim 0.
3267 if (pme->nthread > 1 && (pme->overlap[0].noverlap_nodes > 1 ||
3268 pme->nkx < pme->nnodes_major*pme->pme_order))
3270 gmx_fatal(FARGS, "The number of PME grid lines per node along x is %g. But when using OpenMP threads, the number of grid lines per node along x and should be >= pme_order (%d). To resolve this issue, use less nodes along x (and possibly more along y and/or z) by specifying -dd manually.",
3271 pme->nkx/(double)pme->nnodes_major, pme->pme_order);
3274 snew(pme->bsp_mod[XX], pme->nkx);
3275 snew(pme->bsp_mod[YY], pme->nky);
3276 snew(pme->bsp_mod[ZZ], pme->nkz);
3278 /* The required size of the interpolation grid, including overlap.
3279 * The allocated size (pmegrid_n?) might be slightly larger.
3281 pme->pmegrid_nx = pme->overlap[0].s2g1[pme->nodeid_major] -
3282 pme->overlap[0].s2g0[pme->nodeid_major];
3283 pme->pmegrid_ny = pme->overlap[1].s2g1[pme->nodeid_minor] -
3284 pme->overlap[1].s2g0[pme->nodeid_minor];
3285 pme->pmegrid_nz_base = pme->nkz;
3286 pme->pmegrid_nz = pme->pmegrid_nz_base + pme->pme_order - 1;
3287 set_grid_alignment(&pme->pmegrid_nz, pme->pme_order);
3289 pme->pmegrid_start_ix = pme->overlap[0].s2g0[pme->nodeid_major];
3290 pme->pmegrid_start_iy = pme->overlap[1].s2g0[pme->nodeid_minor];
3291 pme->pmegrid_start_iz = 0;
3293 make_gridindex5_to_localindex(pme->nkx,
3294 pme->pmegrid_start_ix,
3295 pme->pmegrid_nx - (pme->pme_order-1),
3296 &pme->nnx, &pme->fshx);
3297 make_gridindex5_to_localindex(pme->nky,
3298 pme->pmegrid_start_iy,
3299 pme->pmegrid_ny - (pme->pme_order-1),
3300 &pme->nny, &pme->fshy);
3301 make_gridindex5_to_localindex(pme->nkz,
3302 pme->pmegrid_start_iz,
3303 pme->pmegrid_nz_base,
3304 &pme->nnz, &pme->fshz);
3306 pmegrids_init(&pme->pmegridA,
3307 pme->pmegrid_nx, pme->pmegrid_ny, pme->pmegrid_nz,
3308 pme->pmegrid_nz_base,
3311 pme->overlap[0].s2g1[pme->nodeid_major]-pme->overlap[0].s2g0[pme->nodeid_major+1],
3312 pme->overlap[1].s2g1[pme->nodeid_minor]-pme->overlap[1].s2g0[pme->nodeid_minor+1]);
3314 pme->spline_work = make_pme_spline_work(pme->pme_order);
3316 ndata[0] = pme->nkx;
3317 ndata[1] = pme->nky;
3318 ndata[2] = pme->nkz;
3320 /* This routine will allocate the grid data to fit the FFTs */
3321 gmx_parallel_3dfft_init(&pme->pfft_setupA, ndata,
3322 &pme->fftgridA, &pme->cfftgridA,
3324 pme->overlap[0].s2g0, pme->overlap[1].s2g0,
3325 bReproducible, pme->nthread);
3329 pmegrids_init(&pme->pmegridB,
3330 pme->pmegrid_nx, pme->pmegrid_ny, pme->pmegrid_nz,
3331 pme->pmegrid_nz_base,
3334 pme->nkx % pme->nnodes_major != 0,
3335 pme->nky % pme->nnodes_minor != 0);
3337 gmx_parallel_3dfft_init(&pme->pfft_setupB, ndata,
3338 &pme->fftgridB, &pme->cfftgridB,
3340 pme->overlap[0].s2g0, pme->overlap[1].s2g0,
3341 bReproducible, pme->nthread);
3345 pme->pmegridB.grid.grid = NULL;
3346 pme->fftgridB = NULL;
3347 pme->cfftgridB = NULL;
3352 /* Use plain SPME B-spline interpolation */
3353 make_bspline_moduli(pme->bsp_mod, pme->nkx, pme->nky, pme->nkz, pme->pme_order);
3357 /* Use the P3M grid-optimized influence function */
3358 make_p3m_bspline_moduli(pme->bsp_mod, pme->nkx, pme->nky, pme->nkz, pme->pme_order);
3361 /* Use atc[0] for spreading */
3362 init_atomcomm(pme, &pme->atc[0], cr, nnodes_major > 1 ? 0 : 1, TRUE);
3363 if (pme->ndecompdim >= 2)
3365 init_atomcomm(pme, &pme->atc[1], cr, 1, FALSE);
3368 if (pme->nnodes == 1)
3370 pme->atc[0].n = homenr;
3371 pme_realloc_atomcomm_things(&pme->atc[0]);
3377 /* Use fft5d, order after FFT is y major, z, x minor */
3379 snew(pme->work, pme->nthread);
3380 for (thread = 0; thread < pme->nthread; thread++)
3382 realloc_work(&pme->work[thread], pme->nkx);
/* Hand the allocated grid storage of 'old' over to the freshly
 * initialized 'new' pmegrids, so a PME grid switch does not have to
 * reallocate. NOTE(review): the bail-out for the case where a new grid
 * dimension exceeds the old allocation is on lines not visible in this
 * chunk — presumably it returns without reusing; confirm in full file.
 */
3391 static void reuse_pmegrids(const pmegrids_t *old, pmegrids_t *new)
3395 for (d = 0; d < DIM; d++)
3397 if (new->grid.n[d] > old->grid.n[d])
/* The old full-grid allocation is large enough: free the new one and
 * take over the old pointer. */
3403 sfree_aligned(new->grid.grid);
3404 new->grid.grid = old->grid.grid;
/* Thread-local subgrids can only be taken over when the thread count
 * (and therefore the thread decomposition) is unchanged. */
3406 if (new->nthread > 1 && new->nthread == old->nthread)
3408 sfree_aligned(new->grid_all);
3409 for (t = 0; t < new->nthread; t++)
3411 new->grid_th[t].grid = old->grid_th[t].grid;
/* Reinitialize a PME data structure for a new grid size (PME tuning).
 * Copies the source inputrec, overrides only the grid dimensions, and
 * calls gmx_pme_init with the node/thread layout of pme_src, then
 * reuses pme_src's real-space grid allocations where possible.
 * Returns the status code of gmx_pme_init.
 */
3416 int gmx_pme_reinit(gmx_pme_t * pmedata,
3419 const t_inputrec * ir,
/* Override only the grid size in the copied inputrec. */
3427 irc.nkx = grid_size[XX];
3428 irc.nky = grid_size[YY];
3429 irc.nkz = grid_size[ZZ];
/* With a single PME node the local atom count is known here;
 * otherwise it is determined later by redistribution. */
3431 if (pme_src->nnodes == 1)
3433 homenr = pme_src->atc[0].n;
3440 ret = gmx_pme_init(pmedata, cr, pme_src->nnodes_major, pme_src->nnodes_minor,
3441 &irc, homenr, pme_src->bFEP, FALSE, pme_src->nthread);
3445 /* We can easily reuse the allocated pme grids in pme_src */
3446 reuse_pmegrids(&pme_src->pmegridA, &(*pmedata)->pmegridA);
3447 /* We would like to reuse the fft grids, but that's harder */
/* Copy the non-overlapping interior of one thread's local spreading
 * grid into the node-level FFT grid. The overlap regions (order-1
 * points per dimension) are summed later by reduce_threadgrid_overlap.
 */
3454 static void copy_local_grid(gmx_pme_t pme,
3455 pmegrids_t *pmegrids, int thread, real *fftgrid)
3457 ivec local_fft_ndata, local_fft_offset, local_fft_size;
3461 int offx, offy, offz, x, y, z, i0, i0t;
3466 gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
/* Strides (allocated sizes) of the FFT grid in y and z. */
3470 fft_my = local_fft_size[YY];
3471 fft_mz = local_fft_size[ZZ];
3473 pmegrid = &pmegrids->grid_th[thread];
/* Strides of the thread-local grid. */
3475 nsx = pmegrid->s[XX];
3476 nsy = pmegrid->s[YY];
3477 nsz = pmegrid->s[ZZ];
/* Interior extent per dimension: the thread grid minus the spreading
 * overlap (order-1), clipped to the local FFT grid. */
3479 for (d = 0; d < DIM; d++)
3481 nf[d] = min(pmegrid->n[d] - (pmegrid->order - 1),
3482 local_fft_ndata[d] - pmegrid->offset[d]);
3485 offx = pmegrid->offset[XX];
3486 offy = pmegrid->offset[YY];
3487 offz = pmegrid->offset[ZZ];
3489 /* Directly copy the non-overlapping parts of the local grids.
3490 * This also initializes the full grid.
3492 grid_th = pmegrid->grid;
3493 for (x = 0; x < nf[XX]; x++)
3495 for (y = 0; y < nf[YY]; y++)
/* i0: linear index into the FFT grid; i0t: into the thread grid. */
3497 i0 = ((offx + x)*fft_my + (offy + y))*fft_mz + offz;
3498 i0t = (x*nsy + y)*nsz;
3499 for (z = 0; z < nf[ZZ]; z++)
3501 fftgrid[i0+z] = grid_th[i0t+z];
/* Sum the overlap regions of neighboring threads' spreading grids into
 * this thread's part of the node FFT grid, or — when the overlap
 * crosses a node boundary in x and/or y — into the MPI communication
 * buffers commbuf_x / commbuf_y for sum_fftgrid_dd to exchange.
 * Called after all threads have finished spreading.
 */
3508 reduce_threadgrid_overlap(gmx_pme_t pme,
3509 const pmegrids_t *pmegrids, int thread,
3510 real *fftgrid, real *commbuf_x, real *commbuf_y)
3512 ivec local_fft_ndata, local_fft_offset, local_fft_size;
3513 int fft_nx, fft_ny, fft_nz;
3518 int offx, offy, offz, x, y, z, i0, i0t;
3519 int sx, sy, sz, fx, fy, fz, tx1, ty1, tz1, ox, oy, oz;
3520 gmx_bool bClearBufX, bClearBufY, bClearBufXY, bClearBuf;
3521 gmx_bool bCommX, bCommY;
3524 const pmegrid_t *pmegrid, *pmegrid_g, *pmegrid_f;
3525 const real *grid_th;
3526 real *commbuf = NULL;
3528 gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
/* Logical extents of the local FFT grid. */
3532 fft_nx = local_fft_ndata[XX];
3533 fft_ny = local_fft_ndata[YY];
3534 fft_nz = local_fft_ndata[ZZ];
/* Allocated strides of the local FFT grid. */
3536 fft_my = local_fft_size[YY];
3537 fft_mz = local_fft_size[ZZ];
3539 /* This routine is called when all threads have finished spreading.
3540 * Here each thread sums grid contributions calculated by other threads
3541 * to the thread local grid volume.
3542 * To minimize the number of grid copying operations,
3543 * this routine sums immediately from the pmegrid to the fftgrid.
3546 /* Determine which part of the full node grid we should operate on,
3547 * this is our thread local part of the full grid.
3549 pmegrid = &pmegrids->grid_th[thread];
3551 for (d = 0; d < DIM; d++)
3553 ne[d] = min(pmegrid->offset[d]+pmegrid->n[d]-(pmegrid->order-1),
3554 local_fft_ndata[d]);
3557 offx = pmegrid->offset[XX];
3558 offy = pmegrid->offset[YY];
3559 offz = pmegrid->offset[ZZ];
3566 /* Now loop over all the thread data blocks that contribute
3567 * to the grid region we (our thread) are operating on.
3569 /* Note that fft_nx/y is equal to the number of grid points
3570 * between the first point of our node grid and the one of the next node.
/* sx/sy/sz step backwards over the contributing neighbor thread blocks. */
3572 for (sx = 0; sx >= -pmegrids->nthread_comm[XX]; sx--)
3574 fx = pmegrid->ci[XX] + sx;
/* Wrap-around in x: the contribution comes from a thread block at the
 * far edge; with node decomposition in x it must be communicated. */
3579 fx += pmegrids->nc[XX];
3581 bCommX = (pme->nnodes_major > 1);
3583 pmegrid_g = &pmegrids->grid_th[fx*pmegrids->nc[YY]*pmegrids->nc[ZZ]];
3584 ox += pmegrid_g->offset[XX];
3587 tx1 = min(ox + pmegrid_g->n[XX], ne[XX]);
3591 tx1 = min(ox + pmegrid_g->n[XX], pme->pme_order);
3594 for (sy = 0; sy >= -pmegrids->nthread_comm[YY]; sy--)
3596 fy = pmegrid->ci[YY] + sy;
/* Wrap-around in y, analogous to x above. */
3601 fy += pmegrids->nc[YY];
3603 bCommY = (pme->nnodes_minor > 1);
3605 pmegrid_g = &pmegrids->grid_th[fy*pmegrids->nc[ZZ]];
3606 oy += pmegrid_g->offset[YY];
3609 ty1 = min(oy + pmegrid_g->n[YY], ne[YY]);
3613 ty1 = min(oy + pmegrid_g->n[YY], pme->pme_order);
3616 for (sz = 0; sz >= -pmegrids->nthread_comm[ZZ]; sz--)
3618 fz = pmegrid->ci[ZZ] + sz;
/* z has no node decomposition, only periodic thread wrap-around. */
3622 fz += pmegrids->nc[ZZ];
3625 pmegrid_g = &pmegrids->grid_th[fz];
3626 oz += pmegrid_g->offset[ZZ];
3627 tz1 = min(oz + pmegrid_g->n[ZZ], ne[ZZ]);
3629 if (sx == 0 && sy == 0 && sz == 0)
3631 /* We have already added our local contribution
3632 * before calling this routine, so skip it here.
3637 thread_f = (fx*pmegrids->nc[YY] + fy)*pmegrids->nc[ZZ] + fz;
3639 pmegrid_f = &pmegrids->grid_th[thread_f];
3641 grid_th = pmegrid_f->grid;
/* Strides of the contributing thread's grid. */
3643 nsx = pmegrid_f->s[XX];
3644 nsy = pmegrid_f->s[YY];
3645 nsz = pmegrid_f->s[ZZ];
3647 #ifdef DEBUG_PME_REDUCE
3648 printf("n%d t%d add %d %2d %2d %2d %2d %2d %2d %2d-%2d %2d-%2d, %2d-%2d %2d-%2d, %2d-%2d %2d-%2d\n",
3649 pme->nodeid, thread, thread_f,
3650 pme->pmegrid_start_ix,
3651 pme->pmegrid_start_iy,
3652 pme->pmegrid_start_iz,
3654 offx-ox, tx1-ox, offx, tx1,
3655 offy-oy, ty1-oy, offy, ty1,
3656 offz-oz, tz1-oz, offz, tz1);
3659 if (!(bCommX || bCommY))
3661 /* Copy from the thread local grid to the node grid */
3662 for (x = offx; x < tx1; x++)
3664 for (y = offy; y < ty1; y++)
3666 i0 = (x*fft_my + y)*fft_mz;
3667 i0t = ((x - ox)*nsy + (y - oy))*nsz - oz;
3668 for (z = offz; z < tz1; z++)
3670 fftgrid[i0+z] += grid_th[i0t+z];
/* Overlap crosses a node boundary: accumulate into an MPI send buffer
 * instead of the local FFT grid. */
3677 /* The order of this conditional decides
3678 * where the corner volume gets stored with x+y decomp.
3682 commbuf = commbuf_y;
3683 buf_my = ty1 - offy;
3686 /* We index commbuf modulo the local grid size */
3687 commbuf += buf_my*fft_nx*fft_nz;
/* The corner (x+y) volume uses a separate offset region of the
 * y-buffer, with its own first-use flag. */
3689 bClearBuf = bClearBufXY;
3690 bClearBufXY = FALSE;
3694 bClearBuf = bClearBufY;
3700 commbuf = commbuf_x;
3702 bClearBuf = bClearBufX;
3706 /* Copy to the communication buffer */
3707 for (x = offx; x < tx1; x++)
3709 for (y = offy; y < ty1; y++)
3711 i0 = (x*buf_my + y)*fft_nz;
3712 i0t = ((x - ox)*nsy + (y - oy))*nsz - oz;
3716 /* First access of commbuf, initialize it */
3717 for (z = offz; z < tz1; z++)
3719 commbuf[i0+z] = grid_th[i0t+z];
3724 for (z = offz; z < tz1; z++)
3726 commbuf[i0+z] += grid_th[i0t+z];
/* Sum the FFT-grid overlap regions between PME nodes with MPI.
 * First the minor (y) dimension is exchanged (possibly in multiple
 * pulses), then the major (x) dimension (single pulse only). With a
 * 2D decomposition the received y-overlap that belongs to the x
 * neighbor is folded into the x send buffer before the x exchange.
 */
3738 static void sum_fftgrid_dd(gmx_pme_t pme, real *fftgrid)
3740 ivec local_fft_ndata, local_fft_offset, local_fft_size;
3741 pme_overlap_t *overlap;
3742 int send_index0, send_nindex;
3747 int send_size_y, recv_size_y;
3748 int ipulse, send_id, recv_id, datasize, gridsize, size_yx;
3749 real *sendptr, *recvptr;
3750 int x, y, z, indg, indb;
3752 /* Note that this routine is only used for forward communication.
3753 * Since the force gathering, unlike the charge spreading,
3754 * can be trivially parallelized over the particles,
3755 * the backwards process is much simpler and can use the "old"
3756 * communication setup.
3759 gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
3764 if (pme->nnodes_minor > 1)
3766 /* Minor dimension */
3767 overlap = &pme->overlap[1];
/* With 2D decomposition the y send buffer also carries size_yx extra
 * x-rows destined for the x neighbor (the corner volume). */
3769 if (pme->nnodes_major > 1)
3771 size_yx = pme->overlap[0].comm_data[0].send_nindex;
3777 datasize = (local_fft_ndata[XX] + size_yx)*local_fft_ndata[ZZ];
3779 send_size_y = overlap->send_size;
3781 for (ipulse = 0; ipulse < overlap->noverlap_nodes; ipulse++)
3783 send_id = overlap->send_id[ipulse];
3784 recv_id = overlap->recv_id[ipulse];
/* Offset of this pulse's data within the send buffer. */
3786 overlap->comm_data[ipulse].send_index0 -
3787 overlap->comm_data[0].send_index0;
3788 send_nindex = overlap->comm_data[ipulse].send_nindex;
3789 /* We don't use recv_index0, as we always receive starting at 0 */
3790 recv_nindex = overlap->comm_data[ipulse].recv_nindex;
3791 recv_size_y = overlap->comm_data[ipulse].recv_size;
3793 sendptr = overlap->sendbuf + send_index0*local_fft_ndata[ZZ];
3794 recvptr = overlap->recvbuf;
3797 MPI_Sendrecv(sendptr, send_size_y*datasize, GMX_MPI_REAL,
3799 recvptr, recv_size_y*datasize, GMX_MPI_REAL,
3801 overlap->mpi_comm, &stat);
/* Add the received y-overlap to our local FFT grid. */
3804 for (x = 0; x < local_fft_ndata[XX]; x++)
3806 for (y = 0; y < recv_nindex; y++)
3808 indg = (x*local_fft_size[YY] + y)*local_fft_size[ZZ];
3809 indb = (x*recv_size_y + y)*local_fft_ndata[ZZ];
3810 for (z = 0; z < local_fft_ndata[ZZ]; z++)
3812 fftgrid[indg+z] += recvptr[indb+z];
3817 if (pme->nnodes_major > 1)
3819 /* Copy from the received buffer to the send buffer for dim 0 */
3820 sendptr = pme->overlap[0].sendbuf;
3821 for (x = 0; x < size_yx; x++)
3823 for (y = 0; y < recv_nindex; y++)
/* The extra x-rows sit beyond local_fft_ndata[XX] in the recv buffer. */
3825 indg = (x*local_fft_ndata[YY] + y)*local_fft_ndata[ZZ];
3826 indb = ((local_fft_ndata[XX] + x)*recv_size_y + y)*local_fft_ndata[ZZ];
3827 for (z = 0; z < local_fft_ndata[ZZ]; z++)
3829 sendptr[indg+z] += recvptr[indb+z];
3837 /* We only support a single pulse here.
3838 * This is not a severe limitation, as this code is only used
3839 * with OpenMP and with OpenMP the (PME) domains can be larger.
3841 if (pme->nnodes_major > 1)
3843 /* Major dimension */
3844 overlap = &pme->overlap[0];
3846 datasize = local_fft_ndata[YY]*local_fft_ndata[ZZ];
3847 gridsize = local_fft_size[YY] *local_fft_size[ZZ];
3851 send_id = overlap->send_id[ipulse];
3852 recv_id = overlap->recv_id[ipulse];
3853 send_nindex = overlap->comm_data[ipulse].send_nindex;
3854 /* We don't use recv_index0, as we always receive starting at 0 */
3855 recv_nindex = overlap->comm_data[ipulse].recv_nindex;
3857 sendptr = overlap->sendbuf;
3858 recvptr = overlap->recvbuf;
3862 fprintf(debug, "PME fftgrid comm %2d x %2d x %2d\n",
3863 send_nindex, local_fft_ndata[YY], local_fft_ndata[ZZ]);
3867 MPI_Sendrecv(sendptr, send_nindex*datasize, GMX_MPI_REAL,
3869 recvptr, recv_nindex*datasize, GMX_MPI_REAL,
3871 overlap->mpi_comm, &stat);
/* Add the received x-overlap to our local FFT grid. */
3874 for (x = 0; x < recv_nindex; x++)
3876 for (y = 0; y < local_fft_ndata[YY]; y++)
3878 indg = (x*local_fft_size[YY] + y)*local_fft_size[ZZ];
3879 indb = (x*local_fft_ndata[YY] + y)*local_fft_ndata[ZZ];
3880 for (z = 0; z < local_fft_ndata[ZZ]; z++)
3882 fftgrid[indg+z] += recvptr[indb+z];
/* Spread the charges of the atoms in 'atc' onto the PME grid, thread
 * parallel: (1) compute interpolation indices, (2) compute B-splines
 * and spread per thread, (3) reduce thread-grid overlaps into fftgrid,
 * (4) sum node overlaps via MPI. bCalcSplines/bSpread select which
 * stages run. PME_TIME_THREADS/PME_TIME_SPREAD enable cycle timing.
 */
3890 static void spread_on_grid(gmx_pme_t pme,
3891 pme_atomcomm_t *atc, pmegrids_t *grids,
3892 gmx_bool bCalcSplines, gmx_bool bSpread,
3895 int nthread, thread;
3896 #ifdef PME_TIME_THREADS
3897 gmx_cycles_t c1, c2, c3, ct1a, ct1b, ct1c;
3898 static double cs1 = 0, cs2 = 0, cs3 = 0;
3899 static double cs1a[6] = {0, 0, 0, 0, 0, 0};
3903 nthread = pme->nthread;
3904 assert(nthread > 0);
3906 #ifdef PME_TIME_THREADS
3907 c1 = omp_cyc_start();
3911 #pragma omp parallel for num_threads(nthread) schedule(static)
3912 for (thread = 0; thread < nthread; thread++)
/* Static partition of the atom range over the threads. */
3916 start = atc->n* thread /nthread;
3917 end = atc->n*(thread+1)/nthread;
3919 /* Compute fftgrid index for all atoms,
3920 * with help of some extra variables.
3922 calc_interpolation_idx(pme, atc, start, end, thread);
3925 #ifdef PME_TIME_THREADS
3926 c1 = omp_cyc_end(c1);
3930 #ifdef PME_TIME_THREADS
3931 c2 = omp_cyc_start();
3933 #pragma omp parallel for num_threads(nthread) schedule(static)
3934 for (thread = 0; thread < nthread; thread++)
3936 splinedata_t *spline;
3939 /* make local bsplines */
/* Single-thread (or no grids): spread all atoms into the full grid. */
3940 if (grids == NULL || grids->nthread == 1)
3942 spline = &atc->spline[0];
3946 grid = &grids->grid;
/* Multi-thread: each thread spreads its own atom subset into its own
 * thread-local grid. */
3950 spline = &atc->spline[thread];
3952 make_thread_local_ind(atc, thread, spline);
3954 grid = &grids->grid_th[thread];
3959 make_bsplines(spline->theta, spline->dtheta, pme->pme_order,
3960 atc->fractx, spline->n, spline->ind, atc->q, pme->bFEP);
3965 /* put local atoms on grid. */
3966 #ifdef PME_TIME_SPREAD
3967 ct1a = omp_cyc_start();
3969 spread_q_bsplines_thread(grid, atc, spline, pme->spline_work);
3971 if (grids->nthread > 1)
3973 copy_local_grid(pme, grids, thread, fftgrid);
3975 #ifdef PME_TIME_SPREAD
3976 ct1a = omp_cyc_end(ct1a);
3977 cs1a[thread] += (double)ct1a;
3981 #ifdef PME_TIME_THREADS
3982 c2 = omp_cyc_end(c2);
/* Sum the overlap regions of the thread-local grids into fftgrid. */
3986 if (bSpread && grids->nthread > 1)
3988 #ifdef PME_TIME_THREADS
3989 c3 = omp_cyc_start();
3991 #pragma omp parallel for num_threads(grids->nthread) schedule(static)
3992 for (thread = 0; thread < grids->nthread; thread++)
3994 reduce_threadgrid_overlap(pme, grids, thread,
3996 pme->overlap[0].sendbuf,
3997 pme->overlap[1].sendbuf);
3999 #ifdef PME_TIME_THREADS
4000 c3 = omp_cyc_end(c3);
4004 if (pme->nnodes > 1)
4006 /* Communicate the overlapping part of the fftgrid */
4007 sum_fftgrid_dd(pme, fftgrid);
4011 #ifdef PME_TIME_THREADS
4015 printf("idx %.2f spread %.2f red %.2f",
4016 cs1*1e-9, cs2*1e-9, cs3*1e-9);
4017 #ifdef PME_TIME_SPREAD
4018 for (thread = 0; thread < nthread; thread++)
4020 printf(" %.2f", cs1a[thread]*1e-9);
/* Debug helper: print an nx*ny*nz sub-grid of g to fp, one line per
 * point, as "ix iy iz value" with global indices offset by sx/sy/sz.
 * my/mz are the allocated y/z strides of g.
 */
4029 static void dump_grid(FILE *fp,
4030 int sx, int sy, int sz, int nx, int ny, int nz,
4031 int my, int mz, const real *g)
4035 for (x = 0; x < nx; x++)
4037 for (y = 0; y < ny; y++)
4039 for (z = 0; z < nz; z++)
4041 fprintf(fp, "%2d %2d %2d %6.3f\n",
4042 sx+x, sy+y, sz+z, g[(x*my + y)*mz + z]);
/* Debug helper: dump this node's part of the real-space FFT grid
 * (without the pme_order-1 spreading overlap) via dump_grid.
 */
4048 static void dump_local_fftgrid(gmx_pme_t pme, const real *fftgrid)
4050 ivec local_fft_ndata, local_fft_offset, local_fft_size;
4052 gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
4058 pme->pmegrid_start_ix,
4059 pme->pmegrid_start_iy,
4060 pme->pmegrid_start_iz,
4061 pme->pmegrid_nx-pme->pme_order+1,
4062 pme->pmegrid_ny-pme->pme_order+1,
4063 pme->pmegrid_nz-pme->pme_order+1,
/* Calculate the PME grid energy V for n atoms at positions x with
 * charges q, without computing forces (used e.g. for TPI).
 * Only valid serially (nnodes == 1) and without free energy; both are
 * enforced with gmx_incons. Uses the dedicated atc_energy atom
 * communication struct and the A-charge grid only.
 */
4070 void gmx_pme_calc_energy(gmx_pme_t pme, int n, rvec *x, real *q, real *V)
4072 pme_atomcomm_t *atc;
4075 if (pme->nnodes > 1)
4077 gmx_incons("gmx_pme_calc_energy called in parallel");
4081 gmx_incons("gmx_pme_calc_energy with free energy");
4084 atc = &pme->atc_energy;
/* Lazily set up the spline storage on first use. */
4086 if (atc->spline == NULL)
4088 snew(atc->spline, atc->nthread);
4091 atc->bSpread = TRUE;
4092 atc->pme_order = pme->pme_order;
4094 pme_realloc_atomcomm_things(atc);
4098 /* We only use the A-charges grid */
4099 grid = &pme->pmegridA;
/* Compute splines only (bSpread = FALSE): the energy is gathered
 * directly from the B-spline coefficients below. */
4101 spread_on_grid(pme, atc, NULL, TRUE, FALSE, pme->fftgridA);
4103 *V = gather_energy_bsplines(pme, grid->grid.grid, atc);
/* Reset cycle/flop counters on a PME-only node mid-run (triggered by a
 * pmerecvqxRESETCOUNTERS message), so performance statistics cover only
 * the remainder of the run starting at 'step'.
 */
4107 static void reset_pmeonly_counters(t_commrec *cr, gmx_wallcycle_t wcycle,
4108 t_nrnb *nrnb, t_inputrec *ir,
4109 gmx_large_int_t step)
4111 /* Reset all the counters related to performance over the run */
4112 wallcycle_stop(wcycle, ewcRUN);
4113 wallcycle_reset_all(wcycle);
4115 if (ir->nsteps >= 0)
4117 /* ir->nsteps is not used here, but we update it for consistency */
4118 ir->nsteps -= step - ir->init_step;
4120 ir->init_step = step;
4121 wallcycle_start(wcycle, ewcRUN);
/* Switch the active PME structure to the one matching grid_size
 * (PME grid tuning on a PME-only node). Searches the cache *pmedata;
 * on a miss, grows the cache and creates a new structure with
 * gmx_pme_reinit, reusing allocations of the previous one.
 * Returns the selected structure in *pme_ret.
 */
4125 static void gmx_pmeonly_switch(int *npmedata, gmx_pme_t **pmedata,
4127 t_commrec *cr, t_inputrec *ir,
4131 gmx_pme_t pme = NULL;
/* Linear search of the cached PME structures for a matching grid. */
4134 while (ind < *npmedata)
4136 pme = (*pmedata)[ind];
4137 if (pme->nkx == grid_size[XX] &&
4138 pme->nky == grid_size[YY] &&
4139 pme->nkz == grid_size[ZZ])
/* Not cached: extend the array and initialize a new entry. */
4150 srenew(*pmedata, *npmedata);
4152 /* Generate a new PME data structure, copying part of the old pointers */
4153 gmx_pme_reinit(&((*pmedata)[ind]), cr, pme, ir, grid_size);
4155 *pme_ret = (*pmedata)[ind];
/* Main loop of a dedicated PME-only node: repeatedly receive charges
 * and coordinates from the PP nodes, handle grid-switch and
 * counter-reset requests, run gmx_pme_do, and send forces, virial and
 * energy back — until a finish message arrives.
 */
4159 int gmx_pmeonly(gmx_pme_t pme,
4160 t_commrec *cr, t_nrnb *nrnb,
4161 gmx_wallcycle_t wcycle,
4162 real ewaldcoeff, gmx_bool bGatherOnly,
4167 gmx_pme_pp_t pme_pp;
4171 rvec *x_pp = NULL, *f_pp = NULL;
4172 real *chargeA = NULL, *chargeB = NULL;
4174 int maxshift_x = 0, maxshift_y = 0;
4175 real energy, dvdlambda;
4180 gmx_large_int_t step, step_rel;
4183 /* This data will only use with PME tuning, i.e. switching PME grids */
4185 snew(pmedata, npmedata);
4188 pme_pp = gmx_pme_pp_init(cr);
4193 do /****** this is a quasi-loop over time steps! */
4195 /* The reason for having a loop here is PME grid tuning/switching */
4198 /* Domain decomposition */
4199 ret = gmx_pme_recv_q_x(pme_pp,
4201 &chargeA, &chargeB, box, &x_pp, &f_pp,
4202 &maxshift_x, &maxshift_y,
4203 &pme->bFEP, &lambda,
4206 grid_switch, &ewaldcoeff);
4208 if (ret == pmerecvqxSWITCHGRID)
4210 /* Switch the PME grid to grid_switch */
4211 gmx_pmeonly_switch(&npmedata, &pmedata, grid_switch, cr, ir, &pme);
4214 if (ret == pmerecvqxRESETCOUNTERS)
4216 /* Reset the cycle and flop counters */
4217 reset_pmeonly_counters(cr, wcycle, nrnb, ir, step);
/* Keep receiving until we get actual step data (or a finish). */
4220 while (ret == pmerecvqxSWITCHGRID || ret == pmerecvqxRESETCOUNTERS);
4222 if (ret == pmerecvqxFINISH)
4224 /* We should stop: break out of the loop */
4228 step_rel = step - ir->init_step;
4232 wallcycle_start(wcycle, ewcRUN);
4235 wallcycle_start(wcycle, ewcPMEMESH);
4239 gmx_pme_do(pme, 0, natoms, x_pp, f_pp, chargeA, chargeB, box,
4240 cr, maxshift_x, maxshift_y, nrnb, wcycle, vir, ewaldcoeff,
4241 &energy, lambda, &dvdlambda,
4242 GMX_PME_DO_ALL_F | (bEnerVir ? GMX_PME_CALC_ENER_VIR : 0));
4244 cycles = wallcycle_stop(wcycle, ewcPMEMESH);
4246 gmx_pme_send_force_vir_ener(pme_pp,
4247 f_pp, vir, energy, dvdlambda,
4251 } /***** end of quasi-loop, we stop with the break above */
4257 int gmx_pme_do(gmx_pme_t pme,
4258 int start, int homenr,
4260 real *chargeA, real *chargeB,
4261 matrix box, t_commrec *cr,
4262 int maxshift_x, int maxshift_y,
4263 t_nrnb *nrnb, gmx_wallcycle_t wcycle,
4264 matrix vir, real ewaldcoeff,
4265 real *energy, real lambda,
4266 real *dvdlambda, int flags)
4268 int q, d, i, j, ntot, npme;
4271 pme_atomcomm_t *atc = NULL;
4272 pmegrids_t *pmegrid = NULL;
4276 real *charge = NULL, *q_d;
4280 gmx_parallel_3dfft_t pfft_setup;
4282 t_complex * cfftgrid;
4284 const gmx_bool bCalcEnerVir = flags & GMX_PME_CALC_ENER_VIR;
4285 const gmx_bool bCalcF = flags & GMX_PME_CALC_F;
4287 assert(pme->nnodes > 0);
4288 assert(pme->nnodes == 1 || pme->ndecompdim > 0);
4290 if (pme->nnodes > 1)
4294 if (atc->npd > atc->pd_nalloc)
4296 atc->pd_nalloc = over_alloc_dd(atc->npd);
4297 srenew(atc->pd, atc->pd_nalloc);
4299 atc->maxshift = (atc->dimind == 0 ? maxshift_x : maxshift_y);
4303 /* This could be necessary for TPI */
4304 pme->atc[0].n = homenr;
4307 for (q = 0; q < (pme->bFEP ? 2 : 1); q++)
4311 pmegrid = &pme->pmegridA;
4312 fftgrid = pme->fftgridA;
4313 cfftgrid = pme->cfftgridA;
4314 pfft_setup = pme->pfft_setupA;
4315 charge = chargeA+start;
4319 pmegrid = &pme->pmegridB;
4320 fftgrid = pme->fftgridB;
4321 cfftgrid = pme->cfftgridB;
4322 pfft_setup = pme->pfft_setupB;
4323 charge = chargeB+start;
4325 grid = pmegrid->grid.grid;
4326 /* Unpack structure */
4329 fprintf(debug, "PME: nnodes = %d, nodeid = %d\n",
4330 cr->nnodes, cr->nodeid);
4331 fprintf(debug, "Grid = %p\n", (void*)grid);
4334 gmx_fatal(FARGS, "No grid!");
4339 m_inv_ur0(box, pme->recipbox);
4341 if (pme->nnodes == 1)
4344 if (DOMAINDECOMP(cr))
4347 pme_realloc_atomcomm_things(atc);
4355 wallcycle_start(wcycle, ewcPME_REDISTXF);
4356 for (d = pme->ndecompdim-1; d >= 0; d--)
4358 if (d == pme->ndecompdim-1)
4366 n_d = pme->atc[d+1].n;
4372 if (atc->npd > atc->pd_nalloc)
4374 atc->pd_nalloc = over_alloc_dd(atc->npd);
4375 srenew(atc->pd, atc->pd_nalloc);
4377 atc->maxshift = (atc->dimind == 0 ? maxshift_x : maxshift_y);
4378 pme_calc_pidx_wrapper(n_d, pme->recipbox, x_d, atc);
4381 /* Redistribute x (only once) and qA or qB */
4382 if (DOMAINDECOMP(cr))
4384 dd_pmeredist_x_q(pme, n_d, q == 0, x_d, q_d, atc);
4388 pmeredist_pd(pme, TRUE, n_d, q == 0, x_d, q_d, atc);
4393 wallcycle_stop(wcycle, ewcPME_REDISTXF);
4398 fprintf(debug, "Node= %6d, pme local particles=%6d\n",
4399 cr->nodeid, atc->n);
4402 if (flags & GMX_PME_SPREAD_Q)
4404 wallcycle_start(wcycle, ewcPME_SPREADGATHER);
4406 /* Spread the charges on a grid */
4407 spread_on_grid(pme, &pme->atc[0], pmegrid, q == 0, TRUE, fftgrid);
4411 inc_nrnb(nrnb, eNR_WEIGHTS, DIM*atc->n);
4413 inc_nrnb(nrnb, eNR_SPREADQBSP,
4414 pme->pme_order*pme->pme_order*pme->pme_order*atc->n);
4416 if (pme->nthread == 1)
4418 wrap_periodic_pmegrid(pme, grid);
4420 /* sum contributions to local grid from other nodes */
4422 if (pme->nnodes > 1)
4424 gmx_sum_qgrid_dd(pme, grid, GMX_SUM_QGRID_FORWARD);
4429 copy_pmegrid_to_fftgrid(pme, grid, fftgrid);
4432 wallcycle_stop(wcycle, ewcPME_SPREADGATHER);
4435 dump_local_fftgrid(pme,fftgrid);
4440 /* Here we start a large thread parallel region */
4441 #pragma omp parallel num_threads(pme->nthread) private(thread)
4443 thread = gmx_omp_get_thread_num();
4444 if (flags & GMX_PME_SOLVE)
4451 wallcycle_start(wcycle, ewcPME_FFT);
4453 gmx_parallel_3dfft_execute(pfft_setup, GMX_FFT_REAL_TO_COMPLEX,
4454 fftgrid, cfftgrid, thread, wcycle);
4457 wallcycle_stop(wcycle, ewcPME_FFT);
4461 /* solve in k-space for our local cells */
4464 wallcycle_start(wcycle, ewcPME_SOLVE);
4467 solve_pme_yzx(pme, cfftgrid, ewaldcoeff,
4468 box[XX][XX]*box[YY][YY]*box[ZZ][ZZ],
4470 pme->nthread, thread);
4473 wallcycle_stop(wcycle, ewcPME_SOLVE);
4475 inc_nrnb(nrnb, eNR_SOLVEPME, loop_count);
4485 wallcycle_start(wcycle, ewcPME_FFT);
4487 gmx_parallel_3dfft_execute(pfft_setup, GMX_FFT_COMPLEX_TO_REAL,
4488 cfftgrid, fftgrid, thread, wcycle);
4491 wallcycle_stop(wcycle, ewcPME_FFT);
4495 if (pme->nodeid == 0)
4497 ntot = pme->nkx*pme->nky*pme->nkz;
4498 npme = ntot*log((real)ntot)/log(2.0);
4499 inc_nrnb(nrnb, eNR_FFT, 2*npme);
4502 wallcycle_start(wcycle, ewcPME_SPREADGATHER);
4505 copy_fftgrid_to_pmegrid(pme, fftgrid, grid, pme->nthread, thread);
4508 /* End of thread parallel section.
4509 * With MPI we have to synchronize here before gmx_sum_qgrid_dd.
4514 /* distribute local grid to all nodes */
4516 if (pme->nnodes > 1)
4518 gmx_sum_qgrid_dd(pme, grid, GMX_SUM_QGRID_BACKWARD);
4523 unwrap_periodic_pmegrid(pme, grid);
4525 /* interpolate forces for our local atoms */
4529 /* If we are running without parallelization,
4530 * atc->f is the actual force array, not a buffer,
4531 * therefore we should not clear it.
4533 bClearF = (q == 0 && PAR(cr));
4534 #pragma omp parallel for num_threads(pme->nthread) schedule(static)
4535 for (thread = 0; thread < pme->nthread; thread++)
4537 gather_f_bsplines(pme, grid, bClearF, atc,
4538 &atc->spline[thread],
4539 pme->bFEP ? (q == 0 ? 1.0-lambda : lambda) : 1.0);
4544 inc_nrnb(nrnb, eNR_GATHERFBSP,
4545 pme->pme_order*pme->pme_order*pme->pme_order*pme->atc[0].n);
4546 wallcycle_stop(wcycle, ewcPME_SPREADGATHER);
4551 /* This should only be called on the master thread
4552 * and after the threads have synchronized.
4554 get_pme_ener_vir(pme, pme->nthread, &energy_AB[q], vir_AB[q]);
4558 if (bCalcF && pme->nnodes > 1)
4560 wallcycle_start(wcycle, ewcPME_REDISTXF);
4561 for (d = 0; d < pme->ndecompdim; d++)
4564 if (d == pme->ndecompdim - 1)
4571 n_d = pme->atc[d+1].n;
4572 f_d = pme->atc[d+1].f;
4574 if (DOMAINDECOMP(cr))
4576 dd_pmeredist_f(pme, atc, n_d, f_d,
4577 d == pme->ndecompdim-1 && pme->bPPnode);
4581 pmeredist_pd(pme, FALSE, n_d, TRUE, f_d, NULL, atc);
4585 wallcycle_stop(wcycle, ewcPME_REDISTXF);
4593 *energy = energy_AB[0];
4594 m_add(vir, vir_AB[0], vir);
4598 *energy = (1.0-lambda)*energy_AB[0] + lambda*energy_AB[1];
4599 *dvdlambda += energy_AB[1] - energy_AB[0];
4600 for (i = 0; i < DIM; i++)
4602 for (j = 0; j < DIM; j++)
4604 vir[i][j] += (1.0-lambda)*vir_AB[0][i][j] +
4605 lambda*vir_AB[1][i][j];
4617 fprintf(debug, "PME mesh energy: %g\n", *energy);