1 /* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
4 * This source code is part of
8 * GROningen MAchine for Chemical Simulations
11 * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
12 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
13 * Copyright (c) 2001-2004, The GROMACS development team,
14 * check out http://www.gromacs.org for more information.
16 * This program is free software; you can redistribute it and/or
17 * modify it under the terms of the GNU General Public License
18 * as published by the Free Software Foundation; either version 2
19 * of the License, or (at your option) any later version.
21 * If you want to redistribute modifications, please consider that
22 * scientific software is very special. Version control is crucial -
23 * bugs must be traceable. We will be happy to consider code for
24 * inclusion in the official distribution, but derived work must not
25 * be called official GROMACS. Details are found in the README & COPYING
26 * files - if they are missing, get the official version at www.gromacs.org.
28 * To help us fund GROMACS development, we humbly ask that you cite
29 * the papers on the package - you can find them in the top README file.
31 * For more info, check our website at http://www.gromacs.org
34 * GROwing Monsters And Cloning Shrimps
36 /* IMPORTANT FOR DEVELOPERS:
38 * Triclinic pme stuff isn't entirely trivial, and we've experienced
39 * some bugs during development (many of them due to me). To avoid
40 * this in the future, please check the following things if you make
41 * changes in this file:
43 * 1. You should obtain identical (at least to the PME precision)
44 * energies, forces, and virial for
45 * a rectangular box and a triclinic one where the z (or y) axis is
46 * tilted a whole box side. For instance you could use these boxes:
48 * rectangular triclinic
53 * 2. You should check the energy conservation in a triclinic box.
55 * It might seem an overkill, but better safe than sorry.
63 #include "gromacs/utility/gmxmpi.h"
72 #include "gmxcomplex.h"
76 #include "gmx_fatal.h"
82 #include "gmx_wallcycle.h"
83 #include "gmx_parallel_3dfft.h"
85 #include "gmx_cyclecounter.h"
89 /* Single precision, with SSE2 or higher available */
90 #if defined(GMX_X86_SSE2) && !defined(GMX_DOUBLE)
92 #include "gmx_x86_simd_single.h"
95 /* Some old AMD processors could have problems with unaligned loads+stores */
97 #define PME_SSE_UNALIGNED
102 /* #define PRT_FORCE */
103 /* conditions for on the fly time-measurement */
104 /* #define TAKETIME (step > 1 && timesteps < 10) */
105 #define TAKETIME FALSE
107 /* #define PME_TIME_THREADS */
110 #define mpi_type MPI_DOUBLE
112 #define mpi_type MPI_FLOAT
115 /* GMX_CACHE_SEP should be a multiple of 16 to preserve alignment */
116 #define GMX_CACHE_SEP 64
118 /* We only define a maximum to be able to use local arrays without allocation.
119 * An order larger than 12 should never be needed, even for test cases.
120 * If needed it can be changed here.
122 #define PME_ORDER_MAX 12
124 /* Internal datastructures */
130 int recv_size; /* Receive buffer width, used with OpenMP */
141 int *send_id, *recv_id;
142 int send_size; /* Send buffer width, used with OpenMP */
143 pme_grid_comm_t *comm_data;
149 int *n; /* Cumulative counts of the number of particles per thread */
150 int nalloc; /* Allocation size of i */
151 int *i; /* Particle indices ordered on thread index (n) */
165 int dimind; /* The index of the dimension, 0=x, 1=y */
172 int *node_dest; /* The nodes to send x and q to with DD */
173 int *node_src; /* The nodes to receive x and q from with DD */
174 int *buf_index; /* Index for commnode into the buffers */
181 int *count; /* The number of atoms to send to each node */
183 int *rcount; /* The number of atoms to receive */
190 gmx_bool bSpread; /* These coordinates are used for spreading */
193 rvec *fractx; /* Fractional coordinate relative to the
194 * lower cell boundary
197 int *thread_idx; /* Which thread should spread which charge */
198 thread_plist_t *thread_plist;
199 splinedata_t *spline;
206 ivec ci; /* The spatial location of this grid */
207 ivec n; /* The used size of *grid, including order-1 */
208 ivec offset; /* The grid offset from the full node grid */
209 int order; /* PME spreading order */
210 ivec s; /* The allocated size of *grid, s >= n */
211 real *grid; /* The grid local thread, size n */
215 pmegrid_t grid; /* The full node grid (non thread-local) */
216 int nthread; /* The number of threads operating on this grid */
217 ivec nc; /* The local spatial decomposition over the threads */
218 pmegrid_t *grid_th; /* Array of grids for each thread */
219 real *grid_all; /* Allocated array for the grids in *grid_th */
220 int **g2t; /* The grid to thread index */
221 ivec nthread_comm; /* The number of threads to communicate with */
227 /* Masks for SSE aligned spreading and gathering */
228 __m128 mask_SSE0[6], mask_SSE1[6];
230 int dummy; /* C89 requires that struct has at least one member */
235 /* work data for solve_pme */
251 typedef struct gmx_pme {
252 int ndecompdim; /* The number of decomposition dimensions */
253 int nodeid; /* Our nodeid in mpi->mpi_comm */
256 int nnodes; /* The number of nodes doing PME */
261 MPI_Comm mpi_comm_d[2]; /* Indexed on dimension, 0=x, 1=y */
263 MPI_Datatype rvec_mpi; /* the pme vector's MPI type */
266 int nthread; /* The number of threads doing PME */
268 gmx_bool bPPnode; /* Node also does particle-particle forces */
269 gmx_bool bFEP; /* Compute Free energy contribution */
270 int nkx, nky, nkz; /* Grid dimensions */
271 gmx_bool bP3M; /* Do P3M: optimize the influence function */
275 pmegrids_t pmegridA; /* Grids on which we do spreading/interpolation, includes overlap */
277 /* The PME charge spreading grid sizes/strides, includes pme_order-1 */
278 int pmegrid_nx, pmegrid_ny, pmegrid_nz;
279 /* pmegrid_nz might be larger than strictly necessary to ensure
280 * memory alignment, pmegrid_nz_base gives the real base size.
283 /* The local PME grid starting indices */
284 int pmegrid_start_ix, pmegrid_start_iy, pmegrid_start_iz;
286 /* Work data for spreading and gathering */
287 pme_spline_work_t *spline_work;
289 real *fftgridA; /* Grids for FFT. With 1D FFT decomposition this can be a pointer */
290 real *fftgridB; /* inside the interpolation grid, but separate for 2D PME decomp. */
291 int fftgrid_nx, fftgrid_ny, fftgrid_nz;
293 t_complex *cfftgridA; /* Grids for complex FFT data */
294 t_complex *cfftgridB;
295 int cfftgrid_nx, cfftgrid_ny, cfftgrid_nz;
297 gmx_parallel_3dfft_t pfft_setupA;
298 gmx_parallel_3dfft_t pfft_setupB;
300 int *nnx, *nny, *nnz;
301 real *fshx, *fshy, *fshz;
303 pme_atomcomm_t atc[2]; /* Indexed on decomposition index */
307 pme_overlap_t overlap[2]; /* Indexed on dimension, 0=x, 1=y */
309 pme_atomcomm_t atc_energy; /* Only for gmx_pme_calc_energy */
311 rvec *bufv; /* Communication buffer */
312 real *bufr; /* Communication buffer */
313 int buf_nalloc; /* The communication buffer size */
315 /* thread local work data for solve_pme */
318 /* Work data for PME_redist */
319 gmx_bool redist_init;
327 int redist_buf_nalloc;
329 /* Work data for sum_qgrid */
330 real * sum_qgrid_tmp;
331 real * sum_qgrid_dd_tmp;
335 static void calc_interpolation_idx(gmx_pme_t pme, pme_atomcomm_t *atc,
336 int start, int end, int thread)
339 int *idxptr, tix, tiy, tiz;
340 real *xptr, *fptr, tx, ty, tz;
341 real rxx, ryx, ryy, rzx, rzy, rzz;
343 int start_ix, start_iy, start_iz;
344 int *g2tx, *g2ty, *g2tz;
346 int *thread_idx = NULL;
347 thread_plist_t *tpl = NULL;
355 start_ix = pme->pmegrid_start_ix;
356 start_iy = pme->pmegrid_start_iy;
357 start_iz = pme->pmegrid_start_iz;
359 rxx = pme->recipbox[XX][XX];
360 ryx = pme->recipbox[YY][XX];
361 ryy = pme->recipbox[YY][YY];
362 rzx = pme->recipbox[ZZ][XX];
363 rzy = pme->recipbox[ZZ][YY];
364 rzz = pme->recipbox[ZZ][ZZ];
366 g2tx = pme->pmegridA.g2t[XX];
367 g2ty = pme->pmegridA.g2t[YY];
368 g2tz = pme->pmegridA.g2t[ZZ];
370 bThreads = (atc->nthread > 1);
373 thread_idx = atc->thread_idx;
375 tpl = &atc->thread_plist[thread];
377 for (i = 0; i < atc->nthread; i++)
383 for (i = start; i < end; i++)
386 idxptr = atc->idx[i];
387 fptr = atc->fractx[i];
389 /* Fractional coordinates along box vectors, add 2.0 to make 100% sure we are positive for triclinic boxes */
390 tx = nx * ( xptr[XX] * rxx + xptr[YY] * ryx + xptr[ZZ] * rzx + 2.0 );
391 ty = ny * ( xptr[YY] * ryy + xptr[ZZ] * rzy + 2.0 );
392 tz = nz * ( xptr[ZZ] * rzz + 2.0 );
398 /* Because decomposition only occurs in x and y,
399 * we never have a fraction correction in z.
401 fptr[XX] = tx - tix + pme->fshx[tix];
402 fptr[YY] = ty - tiy + pme->fshy[tiy];
405 idxptr[XX] = pme->nnx[tix];
406 idxptr[YY] = pme->nny[tiy];
407 idxptr[ZZ] = pme->nnz[tiz];
410 range_check(idxptr[XX], 0, pme->pmegrid_nx);
411 range_check(idxptr[YY], 0, pme->pmegrid_ny);
412 range_check(idxptr[ZZ], 0, pme->pmegrid_nz);
417 thread_i = g2tx[idxptr[XX]] + g2ty[idxptr[YY]] + g2tz[idxptr[ZZ]];
418 thread_idx[i] = thread_i;
425 /* Make a list of particle indices sorted on thread */
427 /* Get the cumulative count */
428 for (i = 1; i < atc->nthread; i++)
430 tpl_n[i] += tpl_n[i-1];
432 /* The current implementation distributes particles equally
433 * over the threads, so we could actually allocate for that
434 * in pme_realloc_atomcomm_things.
436 if (tpl_n[atc->nthread-1] > tpl->nalloc)
438 tpl->nalloc = over_alloc_large(tpl_n[atc->nthread-1]);
439 srenew(tpl->i, tpl->nalloc);
441 /* Set tpl_n to the cumulative start */
442 for (i = atc->nthread-1; i >= 1; i--)
444 tpl_n[i] = tpl_n[i-1];
448 /* Fill our thread local array with indices sorted on thread */
449 for (i = start; i < end; i++)
451 tpl->i[tpl_n[atc->thread_idx[i]]++] = i;
453 /* Now tpl_n contains the cummulative count again */
457 static void make_thread_local_ind(pme_atomcomm_t *atc,
458 int thread, splinedata_t *spline)
460 int n, t, i, start, end;
463 /* Combine the indices made by each thread into one index */
467 for (t = 0; t < atc->nthread; t++)
469 tpl = &atc->thread_plist[t];
470 /* Copy our part (start - end) from the list of thread t */
473 start = tpl->n[thread-1];
475 end = tpl->n[thread];
476 for (i = start; i < end; i++)
478 spline->ind[n++] = tpl->i[i];
486 static void pme_calc_pidx(int start, int end,
487 matrix recipbox, rvec x[],
488 pme_atomcomm_t *atc, int *count)
493 real rxx, ryx, rzx, ryy, rzy;
496 /* Calculate PME task index (pidx) for each grid index.
497 * Here we always assign equally sized slabs to each node
498 * for load balancing reasons (the PME grid spacing is not used).
504 /* Reset the count */
505 for (i = 0; i < nslab; i++)
510 if (atc->dimind == 0)
512 rxx = recipbox[XX][XX];
513 ryx = recipbox[YY][XX];
514 rzx = recipbox[ZZ][XX];
515 /* Calculate the node index in x-dimension */
516 for (i = start; i < end; i++)
519 /* Fractional coordinates along box vectors */
520 s = nslab*(xptr[XX]*rxx + xptr[YY]*ryx + xptr[ZZ]*rzx);
521 si = (int)(s + 2*nslab) % nslab;
528 ryy = recipbox[YY][YY];
529 rzy = recipbox[ZZ][YY];
530 /* Calculate the node index in y-dimension */
531 for (i = start; i < end; i++)
534 /* Fractional coordinates along box vectors */
535 s = nslab*(xptr[YY]*ryy + xptr[ZZ]*rzy);
536 si = (int)(s + 2*nslab) % nslab;
543 static void pme_calc_pidx_wrapper(int natoms, matrix recipbox, rvec x[],
546 int nthread, thread, slab;
548 nthread = atc->nthread;
550 #pragma omp parallel for num_threads(nthread) schedule(static)
551 for (thread = 0; thread < nthread; thread++)
553 pme_calc_pidx(natoms* thread /nthread,
554 natoms*(thread+1)/nthread,
555 recipbox, x, atc, atc->count_thread[thread]);
557 /* Non-parallel reduction, since nslab is small */
559 for (thread = 1; thread < nthread; thread++)
561 for (slab = 0; slab < atc->nslab; slab++)
563 atc->count_thread[0][slab] += atc->count_thread[thread][slab];
568 static void realloc_splinevec(splinevec th, real **ptr_z, int nalloc)
570 const int padding = 4;
573 srenew(th[XX], nalloc);
574 srenew(th[YY], nalloc);
575 /* In z we add padding, this is only required for the aligned SSE code */
576 srenew(*ptr_z, nalloc+2*padding);
577 th[ZZ] = *ptr_z + padding;
579 for (i = 0; i < padding; i++)
582 (*ptr_z)[padding+nalloc+i] = 0;
586 static void pme_realloc_splinedata(splinedata_t *spline, pme_atomcomm_t *atc)
590 srenew(spline->ind, atc->nalloc);
591 /* Initialize the index to identity so it works without threads */
592 for (i = 0; i < atc->nalloc; i++)
597 realloc_splinevec(spline->theta, &spline->ptr_theta_z,
598 atc->pme_order*atc->nalloc);
599 realloc_splinevec(spline->dtheta, &spline->ptr_dtheta_z,
600 atc->pme_order*atc->nalloc);
603 static void pme_realloc_atomcomm_things(pme_atomcomm_t *atc)
605 int nalloc_old, i, j, nalloc_tpl;
607 /* We have to avoid a NULL pointer for atc->x to avoid
608 * possible fatal errors in MPI routines.
610 if (atc->n > atc->nalloc || atc->nalloc == 0)
612 nalloc_old = atc->nalloc;
613 atc->nalloc = over_alloc_dd(max(atc->n, 1));
617 srenew(atc->x, atc->nalloc);
618 srenew(atc->q, atc->nalloc);
619 srenew(atc->f, atc->nalloc);
620 for (i = nalloc_old; i < atc->nalloc; i++)
622 clear_rvec(atc->f[i]);
627 srenew(atc->fractx, atc->nalloc);
628 srenew(atc->idx, atc->nalloc);
630 if (atc->nthread > 1)
632 srenew(atc->thread_idx, atc->nalloc);
635 for (i = 0; i < atc->nthread; i++)
637 pme_realloc_splinedata(&atc->spline[i], atc);
643 static void pmeredist_pd(gmx_pme_t pme, gmx_bool forw,
644 int n, gmx_bool bXF, rvec *x_f, real *charge,
646 /* Redistribute particle data for PME calculation */
647 /* domain decomposition by x coordinate */
652 if (FALSE == pme->redist_init)
654 snew(pme->scounts, atc->nslab);
655 snew(pme->rcounts, atc->nslab);
656 snew(pme->sdispls, atc->nslab);
657 snew(pme->rdispls, atc->nslab);
658 snew(pme->sidx, atc->nslab);
659 pme->redist_init = TRUE;
661 if (n > pme->redist_buf_nalloc)
663 pme->redist_buf_nalloc = over_alloc_dd(n);
664 srenew(pme->redist_buf, pme->redist_buf_nalloc*DIM);
672 /* forward, redistribution from pp to pme */
674 /* Calculate send counts and exchange them with other nodes */
675 for (i = 0; (i < atc->nslab); i++)
679 for (i = 0; (i < n); i++)
681 pme->scounts[pme->idxa[i]]++;
683 MPI_Alltoall( pme->scounts, 1, MPI_INT, pme->rcounts, 1, MPI_INT, atc->mpi_comm);
685 /* Calculate send and receive displacements and index into send
690 for (i = 1; i < atc->nslab; i++)
692 pme->sdispls[i] = pme->sdispls[i-1]+pme->scounts[i-1];
693 pme->rdispls[i] = pme->rdispls[i-1]+pme->rcounts[i-1];
694 pme->sidx[i] = pme->sdispls[i];
696 /* Total # of particles to be received */
697 atc->n = pme->rdispls[atc->nslab-1] + pme->rcounts[atc->nslab-1];
699 pme_realloc_atomcomm_things(atc);
701 /* Copy particle coordinates into send buffer and exchange*/
702 for (i = 0; (i < n); i++)
704 ii = DIM*pme->sidx[pme->idxa[i]];
705 pme->sidx[pme->idxa[i]]++;
706 pme->redist_buf[ii+XX] = x_f[i][XX];
707 pme->redist_buf[ii+YY] = x_f[i][YY];
708 pme->redist_buf[ii+ZZ] = x_f[i][ZZ];
710 MPI_Alltoallv(pme->redist_buf, pme->scounts, pme->sdispls,
711 pme->rvec_mpi, atc->x, pme->rcounts, pme->rdispls,
712 pme->rvec_mpi, atc->mpi_comm);
716 /* Copy charge into send buffer and exchange*/
717 for (i = 0; i < atc->nslab; i++)
719 pme->sidx[i] = pme->sdispls[i];
721 for (i = 0; (i < n); i++)
723 ii = pme->sidx[pme->idxa[i]];
724 pme->sidx[pme->idxa[i]]++;
725 pme->redist_buf[ii] = charge[i];
727 MPI_Alltoallv(pme->redist_buf, pme->scounts, pme->sdispls, mpi_type,
728 atc->q, pme->rcounts, pme->rdispls, mpi_type,
731 else /* backward, redistribution from pme to pp */
733 MPI_Alltoallv(atc->f, pme->rcounts, pme->rdispls, pme->rvec_mpi,
734 pme->redist_buf, pme->scounts, pme->sdispls,
735 pme->rvec_mpi, atc->mpi_comm);
737 /* Copy data from receive buffer */
738 for (i = 0; i < atc->nslab; i++)
740 pme->sidx[i] = pme->sdispls[i];
742 for (i = 0; (i < n); i++)
744 ii = DIM*pme->sidx[pme->idxa[i]];
745 x_f[i][XX] += pme->redist_buf[ii+XX];
746 x_f[i][YY] += pme->redist_buf[ii+YY];
747 x_f[i][ZZ] += pme->redist_buf[ii+ZZ];
748 pme->sidx[pme->idxa[i]]++;
754 static void pme_dd_sendrecv(pme_atomcomm_t *atc,
755 gmx_bool bBackward, int shift,
756 void *buf_s, int nbyte_s,
757 void *buf_r, int nbyte_r)
763 if (bBackward == FALSE)
765 dest = atc->node_dest[shift];
766 src = atc->node_src[shift];
770 dest = atc->node_src[shift];
771 src = atc->node_dest[shift];
774 if (nbyte_s > 0 && nbyte_r > 0)
776 MPI_Sendrecv(buf_s, nbyte_s, MPI_BYTE,
778 buf_r, nbyte_r, MPI_BYTE,
780 atc->mpi_comm, &stat);
782 else if (nbyte_s > 0)
784 MPI_Send(buf_s, nbyte_s, MPI_BYTE,
788 else if (nbyte_r > 0)
790 MPI_Recv(buf_r, nbyte_r, MPI_BYTE,
792 atc->mpi_comm, &stat);
797 static void dd_pmeredist_x_q(gmx_pme_t pme,
798 int n, gmx_bool bX, rvec *x, real *charge,
801 int *commnode, *buf_index;
802 int nnodes_comm, i, nsend, local_pos, buf_pos, node, scount, rcount;
804 commnode = atc->node_dest;
805 buf_index = atc->buf_index;
807 nnodes_comm = min(2*atc->maxshift, atc->nslab-1);
810 for (i = 0; i < nnodes_comm; i++)
812 buf_index[commnode[i]] = nsend;
813 nsend += atc->count[commnode[i]];
817 if (atc->count[atc->nodeid] + nsend != n)
819 gmx_fatal(FARGS, "%d particles communicated to PME node %d are more than 2/3 times the cut-off out of the domain decomposition cell of their charge group in dimension %c.\n"
820 "This usually means that your system is not well equilibrated.",
821 n - (atc->count[atc->nodeid] + nsend),
822 pme->nodeid, 'x'+atc->dimind);
825 if (nsend > pme->buf_nalloc)
827 pme->buf_nalloc = over_alloc_dd(nsend);
828 srenew(pme->bufv, pme->buf_nalloc);
829 srenew(pme->bufr, pme->buf_nalloc);
832 atc->n = atc->count[atc->nodeid];
833 for (i = 0; i < nnodes_comm; i++)
835 scount = atc->count[commnode[i]];
836 /* Communicate the count */
839 fprintf(debug, "dimind %d PME node %d send to node %d: %d\n",
840 atc->dimind, atc->nodeid, commnode[i], scount);
842 pme_dd_sendrecv(atc, FALSE, i,
843 &scount, sizeof(int),
844 &atc->rcount[i], sizeof(int));
845 atc->n += atc->rcount[i];
848 pme_realloc_atomcomm_things(atc);
852 for (i = 0; i < n; i++)
855 if (node == atc->nodeid)
857 /* Copy direct to the receive buffer */
860 copy_rvec(x[i], atc->x[local_pos]);
862 atc->q[local_pos] = charge[i];
867 /* Copy to the send buffer */
870 copy_rvec(x[i], pme->bufv[buf_index[node]]);
872 pme->bufr[buf_index[node]] = charge[i];
878 for (i = 0; i < nnodes_comm; i++)
880 scount = atc->count[commnode[i]];
881 rcount = atc->rcount[i];
882 if (scount > 0 || rcount > 0)
886 /* Communicate the coordinates */
887 pme_dd_sendrecv(atc, FALSE, i,
888 pme->bufv[buf_pos], scount*sizeof(rvec),
889 atc->x[local_pos], rcount*sizeof(rvec));
891 /* Communicate the charges */
892 pme_dd_sendrecv(atc, FALSE, i,
893 pme->bufr+buf_pos, scount*sizeof(real),
894 atc->q+local_pos, rcount*sizeof(real));
896 local_pos += atc->rcount[i];
901 static void dd_pmeredist_f(gmx_pme_t pme, pme_atomcomm_t *atc,
905 int *commnode, *buf_index;
906 int nnodes_comm, local_pos, buf_pos, i, scount, rcount, node;
908 commnode = atc->node_dest;
909 buf_index = atc->buf_index;
911 nnodes_comm = min(2*atc->maxshift, atc->nslab-1);
913 local_pos = atc->count[atc->nodeid];
915 for (i = 0; i < nnodes_comm; i++)
917 scount = atc->rcount[i];
918 rcount = atc->count[commnode[i]];
919 if (scount > 0 || rcount > 0)
921 /* Communicate the forces */
922 pme_dd_sendrecv(atc, TRUE, i,
923 atc->f[local_pos], scount*sizeof(rvec),
924 pme->bufv[buf_pos], rcount*sizeof(rvec));
927 buf_index[commnode[i]] = buf_pos;
934 for (i = 0; i < n; i++)
937 if (node == atc->nodeid)
939 /* Add from the local force array */
940 rvec_inc(f[i], atc->f[local_pos]);
945 /* Add from the receive buffer */
946 rvec_inc(f[i], pme->bufv[buf_index[node]]);
953 for (i = 0; i < n; i++)
956 if (node == atc->nodeid)
958 /* Copy from the local force array */
959 copy_rvec(atc->f[local_pos], f[i]);
964 /* Copy from the receive buffer */
965 copy_rvec(pme->bufv[buf_index[node]], f[i]);
974 gmx_sum_qgrid_dd(gmx_pme_t pme, real *grid, int direction)
976 pme_overlap_t *overlap;
977 int send_index0, send_nindex;
978 int recv_index0, recv_nindex;
980 int i, j, k, ix, iy, iz, icnt;
981 int ipulse, send_id, recv_id, datasize;
983 real *sendptr, *recvptr;
985 /* Start with minor-rank communication. This is a bit of a pain since it is not contiguous */
986 overlap = &pme->overlap[1];
988 for (ipulse = 0; ipulse < overlap->noverlap_nodes; ipulse++)
990 /* Since we have already (un)wrapped the overlap in the z-dimension,
991 * we only have to communicate 0 to nkz (not pmegrid_nz).
993 if (direction == GMX_SUM_QGRID_FORWARD)
995 send_id = overlap->send_id[ipulse];
996 recv_id = overlap->recv_id[ipulse];
997 send_index0 = overlap->comm_data[ipulse].send_index0;
998 send_nindex = overlap->comm_data[ipulse].send_nindex;
999 recv_index0 = overlap->comm_data[ipulse].recv_index0;
1000 recv_nindex = overlap->comm_data[ipulse].recv_nindex;
1004 send_id = overlap->recv_id[ipulse];
1005 recv_id = overlap->send_id[ipulse];
1006 send_index0 = overlap->comm_data[ipulse].recv_index0;
1007 send_nindex = overlap->comm_data[ipulse].recv_nindex;
1008 recv_index0 = overlap->comm_data[ipulse].send_index0;
1009 recv_nindex = overlap->comm_data[ipulse].send_nindex;
1012 /* Copy data to contiguous send buffer */
1015 fprintf(debug, "PME send node %d %d -> %d grid start %d Communicating %d to %d\n",
1016 pme->nodeid, overlap->nodeid, send_id,
1017 pme->pmegrid_start_iy,
1018 send_index0-pme->pmegrid_start_iy,
1019 send_index0-pme->pmegrid_start_iy+send_nindex);
1022 for (i = 0; i < pme->pmegrid_nx; i++)
1025 for (j = 0; j < send_nindex; j++)
1027 iy = j + send_index0 - pme->pmegrid_start_iy;
1028 for (k = 0; k < pme->nkz; k++)
1031 overlap->sendbuf[icnt++] = grid[ix*(pme->pmegrid_ny*pme->pmegrid_nz)+iy*(pme->pmegrid_nz)+iz];
1036 datasize = pme->pmegrid_nx * pme->nkz;
1038 MPI_Sendrecv(overlap->sendbuf, send_nindex*datasize, GMX_MPI_REAL,
1040 overlap->recvbuf, recv_nindex*datasize, GMX_MPI_REAL,
1042 overlap->mpi_comm, &stat);
1044 /* Get data from contiguous recv buffer */
1047 fprintf(debug, "PME recv node %d %d <- %d grid start %d Communicating %d to %d\n",
1048 pme->nodeid, overlap->nodeid, recv_id,
1049 pme->pmegrid_start_iy,
1050 recv_index0-pme->pmegrid_start_iy,
1051 recv_index0-pme->pmegrid_start_iy+recv_nindex);
1054 for (i = 0; i < pme->pmegrid_nx; i++)
1057 for (j = 0; j < recv_nindex; j++)
1059 iy = j + recv_index0 - pme->pmegrid_start_iy;
1060 for (k = 0; k < pme->nkz; k++)
1063 if (direction == GMX_SUM_QGRID_FORWARD)
1065 grid[ix*(pme->pmegrid_ny*pme->pmegrid_nz)+iy*(pme->pmegrid_nz)+iz] += overlap->recvbuf[icnt++];
1069 grid[ix*(pme->pmegrid_ny*pme->pmegrid_nz)+iy*(pme->pmegrid_nz)+iz] = overlap->recvbuf[icnt++];
1076 /* Major dimension is easier, no copying required,
1077 * but we might have to sum to separate array.
1078 * Since we don't copy, we have to communicate up to pmegrid_nz,
1079 * not nkz as for the minor direction.
1081 overlap = &pme->overlap[0];
1083 for (ipulse = 0; ipulse < overlap->noverlap_nodes; ipulse++)
1085 if (direction == GMX_SUM_QGRID_FORWARD)
1087 send_id = overlap->send_id[ipulse];
1088 recv_id = overlap->recv_id[ipulse];
1089 send_index0 = overlap->comm_data[ipulse].send_index0;
1090 send_nindex = overlap->comm_data[ipulse].send_nindex;
1091 recv_index0 = overlap->comm_data[ipulse].recv_index0;
1092 recv_nindex = overlap->comm_data[ipulse].recv_nindex;
1093 recvptr = overlap->recvbuf;
1097 send_id = overlap->recv_id[ipulse];
1098 recv_id = overlap->send_id[ipulse];
1099 send_index0 = overlap->comm_data[ipulse].recv_index0;
1100 send_nindex = overlap->comm_data[ipulse].recv_nindex;
1101 recv_index0 = overlap->comm_data[ipulse].send_index0;
1102 recv_nindex = overlap->comm_data[ipulse].send_nindex;
1103 recvptr = grid + (recv_index0-pme->pmegrid_start_ix)*(pme->pmegrid_ny*pme->pmegrid_nz);
1106 sendptr = grid + (send_index0-pme->pmegrid_start_ix)*(pme->pmegrid_ny*pme->pmegrid_nz);
1107 datasize = pme->pmegrid_ny * pme->pmegrid_nz;
1111 fprintf(debug, "PME send node %d %d -> %d grid start %d Communicating %d to %d\n",
1112 pme->nodeid, overlap->nodeid, send_id,
1113 pme->pmegrid_start_ix,
1114 send_index0-pme->pmegrid_start_ix,
1115 send_index0-pme->pmegrid_start_ix+send_nindex);
1116 fprintf(debug, "PME recv node %d %d <- %d grid start %d Communicating %d to %d\n",
1117 pme->nodeid, overlap->nodeid, recv_id,
1118 pme->pmegrid_start_ix,
1119 recv_index0-pme->pmegrid_start_ix,
1120 recv_index0-pme->pmegrid_start_ix+recv_nindex);
1123 MPI_Sendrecv(sendptr, send_nindex*datasize, GMX_MPI_REAL,
1125 recvptr, recv_nindex*datasize, GMX_MPI_REAL,
1127 overlap->mpi_comm, &stat);
1129 /* ADD data from contiguous recv buffer */
1130 if (direction == GMX_SUM_QGRID_FORWARD)
1132 p = grid + (recv_index0-pme->pmegrid_start_ix)*(pme->pmegrid_ny*pme->pmegrid_nz);
1133 for (i = 0; i < recv_nindex*datasize; i++)
1135 p[i] += overlap->recvbuf[i];
1144 copy_pmegrid_to_fftgrid(gmx_pme_t pme, real *pmegrid, real *fftgrid)
1146 ivec local_fft_ndata, local_fft_offset, local_fft_size;
1147 ivec local_pme_size;
1151 /* Dimensions should be identical for A/B grid, so we just use A here */
1152 gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
1157 local_pme_size[0] = pme->pmegrid_nx;
1158 local_pme_size[1] = pme->pmegrid_ny;
1159 local_pme_size[2] = pme->pmegrid_nz;
1161 /* The fftgrid is always 'justified' to the lower-left corner of the PME grid,
1162 the offset is identical, and the PME grid always has more data (due to overlap)
1167 char fn[STRLEN], format[STRLEN];
1169 sprintf(fn, "pmegrid%d.pdb", pme->nodeid);
1170 fp = ffopen(fn, "w");
1171 sprintf(fn, "pmegrid%d.txt", pme->nodeid);
1172 fp2 = ffopen(fn, "w");
1173 sprintf(format, "%s%s\n", pdbformat, "%6.2f%6.2f");
1176 for (ix = 0; ix < local_fft_ndata[XX]; ix++)
1178 for (iy = 0; iy < local_fft_ndata[YY]; iy++)
1180 for (iz = 0; iz < local_fft_ndata[ZZ]; iz++)
1182 pmeidx = ix*(local_pme_size[YY]*local_pme_size[ZZ])+iy*(local_pme_size[ZZ])+iz;
1183 fftidx = ix*(local_fft_size[YY]*local_fft_size[ZZ])+iy*(local_fft_size[ZZ])+iz;
1184 fftgrid[fftidx] = pmegrid[pmeidx];
1186 val = 100*pmegrid[pmeidx];
1187 if (pmegrid[pmeidx] != 0)
1189 fprintf(fp, format, "ATOM", pmeidx, "CA", "GLY", ' ', pmeidx, ' ',
1190 5.0*ix, 5.0*iy, 5.0*iz, 1.0, val);
1192 if (pmegrid[pmeidx] != 0)
1194 fprintf(fp2, "%-12s %5d %5d %5d %12.5e\n",
1196 pme->pmegrid_start_ix + ix,
1197 pme->pmegrid_start_iy + iy,
1198 pme->pmegrid_start_iz + iz,
1214 static gmx_cycles_t omp_cyc_start()
1216 return gmx_cycles_read();
1219 static gmx_cycles_t omp_cyc_end(gmx_cycles_t c)
1221 return gmx_cycles_read() - c;
1226 copy_fftgrid_to_pmegrid(gmx_pme_t pme, const real *fftgrid, real *pmegrid,
1227 int nthread, int thread)
1229 ivec local_fft_ndata, local_fft_offset, local_fft_size;
1230 ivec local_pme_size;
1231 int ixy0, ixy1, ixy, ix, iy, iz;
1233 #ifdef PME_TIME_THREADS
1235 static double cs1 = 0;
1239 #ifdef PME_TIME_THREADS
1240 c1 = omp_cyc_start();
1242 /* Dimensions should be identical for A/B grid, so we just use A here */
1243 gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
1248 local_pme_size[0] = pme->pmegrid_nx;
1249 local_pme_size[1] = pme->pmegrid_ny;
1250 local_pme_size[2] = pme->pmegrid_nz;
1252 /* The fftgrid is always 'justified' to the lower-left corner of the PME grid,
1253 the offset is identical, and the PME grid always has more data (due to overlap)
1255 ixy0 = ((thread )*local_fft_ndata[XX]*local_fft_ndata[YY])/nthread;
1256 ixy1 = ((thread+1)*local_fft_ndata[XX]*local_fft_ndata[YY])/nthread;
1258 for (ixy = ixy0; ixy < ixy1; ixy++)
1260 ix = ixy/local_fft_ndata[YY];
1261 iy = ixy - ix*local_fft_ndata[YY];
1263 pmeidx = (ix*local_pme_size[YY] + iy)*local_pme_size[ZZ];
1264 fftidx = (ix*local_fft_size[YY] + iy)*local_fft_size[ZZ];
1265 for (iz = 0; iz < local_fft_ndata[ZZ]; iz++)
1267 pmegrid[pmeidx+iz] = fftgrid[fftidx+iz];
1271 #ifdef PME_TIME_THREADS
1272 c1 = omp_cyc_end(c1);
1277 printf("copy %.2f\n", cs1*1e-9);
1286 wrap_periodic_pmegrid(gmx_pme_t pme, real *pmegrid)
1288 int nx, ny, nz, pnx, pny, pnz, ny_x, overlap, ix, iy, iz;
1294 pnx = pme->pmegrid_nx;
1295 pny = pme->pmegrid_ny;
1296 pnz = pme->pmegrid_nz;
1298 overlap = pme->pme_order - 1;
1300 /* Add periodic overlap in z */
1301 for (ix = 0; ix < pme->pmegrid_nx; ix++)
1303 for (iy = 0; iy < pme->pmegrid_ny; iy++)
1305 for (iz = 0; iz < overlap; iz++)
1307 pmegrid[(ix*pny+iy)*pnz+iz] +=
1308 pmegrid[(ix*pny+iy)*pnz+nz+iz];
1313 if (pme->nnodes_minor == 1)
1315 for (ix = 0; ix < pme->pmegrid_nx; ix++)
1317 for (iy = 0; iy < overlap; iy++)
1319 for (iz = 0; iz < nz; iz++)
1321 pmegrid[(ix*pny+iy)*pnz+iz] +=
1322 pmegrid[(ix*pny+ny+iy)*pnz+iz];
1328 if (pme->nnodes_major == 1)
1330 ny_x = (pme->nnodes_minor == 1 ? ny : pme->pmegrid_ny);
1332 for (ix = 0; ix < overlap; ix++)
1334 for (iy = 0; iy < ny_x; iy++)
1336 for (iz = 0; iz < nz; iz++)
1338 pmegrid[(ix*pny+iy)*pnz+iz] +=
1339 pmegrid[((nx+ix)*pny+iy)*pnz+iz];
1348 unwrap_periodic_pmegrid(gmx_pme_t pme, real *pmegrid)
1350 int nx, ny, nz, pnx, pny, pnz, ny_x, overlap, ix;
1356 pnx = pme->pmegrid_nx;
1357 pny = pme->pmegrid_ny;
1358 pnz = pme->pmegrid_nz;
1360 overlap = pme->pme_order - 1;
1362 if (pme->nnodes_major == 1)
1364 ny_x = (pme->nnodes_minor == 1 ? ny : pme->pmegrid_ny);
1366 for (ix = 0; ix < overlap; ix++)
1370 for (iy = 0; iy < ny_x; iy++)
1372 for (iz = 0; iz < nz; iz++)
1374 pmegrid[((nx+ix)*pny+iy)*pnz+iz] =
1375 pmegrid[(ix*pny+iy)*pnz+iz];
1381 if (pme->nnodes_minor == 1)
1383 #pragma omp parallel for num_threads(pme->nthread) schedule(static)
1384 for (ix = 0; ix < pme->pmegrid_nx; ix++)
1388 for (iy = 0; iy < overlap; iy++)
1390 for (iz = 0; iz < nz; iz++)
1392 pmegrid[(ix*pny+ny+iy)*pnz+iz] =
1393 pmegrid[(ix*pny+iy)*pnz+iz];
1399 /* Copy periodic overlap in z */
1400 #pragma omp parallel for num_threads(pme->nthread) schedule(static)
1401 for (ix = 0; ix < pme->pmegrid_nx; ix++)
1405 for (iy = 0; iy < pme->pmegrid_ny; iy++)
1407 for (iz = 0; iz < overlap; iz++)
1409 pmegrid[(ix*pny+iy)*pnz+nz+iz] =
1410 pmegrid[(ix*pny+iy)*pnz+iz];
1416 static void clear_grid(int nx, int ny, int nz, real *grid,
1418 int fx, int fy, int fz,
1422 int fsx, fsy, fsz, gx, gy, gz, g0x, g0y, x, y, z;
1425 nc = 2 + (order - 2)/FLBS;
1426 ncz = 2 + (order - 2)/FLBSZ;
1428 for (fsx = fx; fsx < fx+nc; fsx++)
1430 for (fsy = fy; fsy < fy+nc; fsy++)
1432 for (fsz = fz; fsz < fz+ncz; fsz++)
1434 flind = (fsx*fs[YY] + fsy)*fs[ZZ] + fsz;
1435 if (flag[flind] == 0)
1440 g0x = (gx*ny + gy)*nz + gz;
1441 for (x = 0; x < FLBS; x++)
1444 for (y = 0; y < FLBS; y++)
1446 for (z = 0; z < FLBSZ; z++)
1462 /* This has to be a macro to enable full compiler optimization with xlC (and probably others too) */
1463 #define DO_BSPLINE(order) \
1464 for (ithx = 0; (ithx < order); ithx++) \
1466 index_x = (i0+ithx)*pny*pnz; \
1467 valx = qn*thx[ithx]; \
1469 for (ithy = 0; (ithy < order); ithy++) \
1471 valxy = valx*thy[ithy]; \
1472 index_xy = index_x+(j0+ithy)*pnz; \
1474 for (ithz = 0; (ithz < order); ithz++) \
1476 index_xyz = index_xy+(k0+ithz); \
1477 grid[index_xyz] += valxy*thz[ithz]; \
1483 static void spread_q_bsplines_thread(pmegrid_t *pmegrid,
1484 pme_atomcomm_t *atc, splinedata_t *spline,
1485 pme_spline_work_t *work)
1488 /* spread charges from home atoms to local grid */
1491 int b, i, nn, n, ithx, ithy, ithz, i0, j0, k0;
1493 int order, norder, index_x, index_xy, index_xyz;
1494 real valx, valxy, qn;
1495 real *thx, *thy, *thz;
1496 int localsize, bndsize;
1497 int pnx, pny, pnz, ndatatot;
1498 int offx, offy, offz;
1500 pnx = pmegrid->s[XX];
1501 pny = pmegrid->s[YY];
1502 pnz = pmegrid->s[ZZ];
1504 offx = pmegrid->offset[XX];
1505 offy = pmegrid->offset[YY];
1506 offz = pmegrid->offset[ZZ];
1508 ndatatot = pnx*pny*pnz;
1509 grid = pmegrid->grid;
1510 for (i = 0; i < ndatatot; i++)
1515 order = pmegrid->order;
1517 for (nn = 0; nn < spline->n; nn++)
1519 n = spline->ind[nn];
1524 idxptr = atc->idx[n];
1527 i0 = idxptr[XX] - offx;
1528 j0 = idxptr[YY] - offy;
1529 k0 = idxptr[ZZ] - offz;
1531 thx = spline->theta[XX] + norder;
1532 thy = spline->theta[YY] + norder;
1533 thz = spline->theta[ZZ] + norder;
1539 #ifdef PME_SSE_UNALIGNED
1540 #define PME_SPREAD_SSE_ORDER4
1542 #define PME_SPREAD_SSE_ALIGNED
1545 #include "pme_sse_single.h"
1552 #define PME_SPREAD_SSE_ALIGNED
1554 #include "pme_sse_single.h"
1567 static void set_grid_alignment(int *pmegrid_nz, int pme_order)
1571 #ifndef PME_SSE_UNALIGNED
1576 /* Round nz up to a multiple of 4 to ensure alignment */
1577 *pmegrid_nz = ((*pmegrid_nz + 3) & ~3);
1582 static void set_gridsize_alignment(int *gridsize, int pme_order)
1585 #ifndef PME_SSE_UNALIGNED
1588 /* Add extra elements to ensured aligned operations do not go
1589 * beyond the allocated grid size.
1590 * Note that for pme_order=5, the pme grid z-size alignment
1591 * ensures that we will not go beyond the grid size.
1599 static void pmegrid_init(pmegrid_t *grid,
1600 int cx, int cy, int cz,
1601 int x0, int y0, int z0,
1602 int x1, int y1, int z1,
1603 gmx_bool set_alignment,
1612 grid->offset[XX] = x0;
1613 grid->offset[YY] = y0;
1614 grid->offset[ZZ] = z0;
1615 grid->n[XX] = x1 - x0 + pme_order - 1;
1616 grid->n[YY] = y1 - y0 + pme_order - 1;
1617 grid->n[ZZ] = z1 - z0 + pme_order - 1;
1618 copy_ivec(grid->n, grid->s);
1621 set_grid_alignment(&nz, pme_order);
1626 else if (nz != grid->s[ZZ])
1628 gmx_incons("pmegrid_init call with an unaligned z size");
1631 grid->order = pme_order;
1634 gridsize = grid->s[XX]*grid->s[YY]*grid->s[ZZ];
1635 set_gridsize_alignment(&gridsize, pme_order);
1636 snew_aligned(grid->grid, gridsize, 16);
1644 static int div_round_up(int enumerator, int denominator)
1646 return (enumerator + denominator - 1)/denominator;
1649 static void make_subgrid_division(const ivec n, int ovl, int nthread,
1652 int gsize_opt, gsize;
1657 for (nsx = 1; nsx <= nthread; nsx++)
1659 if (nthread % nsx == 0)
1661 for (nsy = 1; nsy <= nthread; nsy++)
1663 if (nsx*nsy <= nthread && nthread % (nsx*nsy) == 0)
1665 nsz = nthread/(nsx*nsy);
1667 /* Determine the number of grid points per thread */
1669 (div_round_up(n[XX], nsx) + ovl)*
1670 (div_round_up(n[YY], nsy) + ovl)*
1671 (div_round_up(n[ZZ], nsz) + ovl);
1673 /* Minimize the number of grids points per thread
1674 * and, secondarily, the number of cuts in minor dimensions.
1676 if (gsize_opt == -1 ||
1677 gsize < gsize_opt ||
1678 (gsize == gsize_opt &&
1679 (nsz < nsub[ZZ] || (nsz == nsub[ZZ] && nsy < nsub[YY]))))
1691 env = getenv("GMX_PME_THREAD_DIVISION");
1694 sscanf(env, "%d %d %d", &nsub[XX], &nsub[YY], &nsub[ZZ]);
1697 if (nsub[XX]*nsub[YY]*nsub[ZZ] != nthread)
1699 gmx_fatal(FARGS, "PME grid thread division (%d x %d x %d) does not match the total number of threads (%d)", nsub[XX], nsub[YY], nsub[ZZ], nthread);
1703 static void pmegrids_init(pmegrids_t *grids,
1704 int nx, int ny, int nz, int nz_base,
1710 ivec n, n_base, g0, g1;
1711 int t, x, y, z, d, i, tfac;
1712 int max_comm_lines = -1;
1714 n[XX] = nx - (pme_order - 1);
1715 n[YY] = ny - (pme_order - 1);
1716 n[ZZ] = nz - (pme_order - 1);
1718 copy_ivec(n, n_base);
1719 n_base[ZZ] = nz_base;
1721 pmegrid_init(&grids->grid, 0, 0, 0, 0, 0, 0, n[XX], n[YY], n[ZZ], FALSE, pme_order,
1724 grids->nthread = nthread;
1726 make_subgrid_division(n_base, pme_order-1, grids->nthread, grids->nc);
1728 if (grids->nthread > 1)
1733 for (d = 0; d < DIM; d++)
1735 nst[d] = div_round_up(n[d], grids->nc[d]) + pme_order - 1;
1737 set_grid_alignment(&nst[ZZ], pme_order);
1741 fprintf(debug, "pmegrid thread local division: %d x %d x %d\n",
1742 grids->nc[XX], grids->nc[YY], grids->nc[ZZ]);
1743 fprintf(debug, "pmegrid %d %d %d max thread pmegrid %d %d %d\n",
1745 nst[XX], nst[YY], nst[ZZ]);
1748 snew(grids->grid_th, grids->nthread);
1750 gridsize = nst[XX]*nst[YY]*nst[ZZ];
1751 set_gridsize_alignment(&gridsize, pme_order);
1752 snew_aligned(grids->grid_all,
1753 grids->nthread*gridsize+(grids->nthread+1)*GMX_CACHE_SEP,
1756 for (x = 0; x < grids->nc[XX]; x++)
1758 for (y = 0; y < grids->nc[YY]; y++)
1760 for (z = 0; z < grids->nc[ZZ]; z++)
1762 pmegrid_init(&grids->grid_th[t],
1764 (n[XX]*(x ))/grids->nc[XX],
1765 (n[YY]*(y ))/grids->nc[YY],
1766 (n[ZZ]*(z ))/grids->nc[ZZ],
1767 (n[XX]*(x+1))/grids->nc[XX],
1768 (n[YY]*(y+1))/grids->nc[YY],
1769 (n[ZZ]*(z+1))/grids->nc[ZZ],
1772 grids->grid_all+GMX_CACHE_SEP+t*(gridsize+GMX_CACHE_SEP));
1779 snew(grids->g2t, DIM);
1781 for (d = DIM-1; d >= 0; d--)
1783 snew(grids->g2t[d], n[d]);
1785 for (i = 0; i < n[d]; i++)
1787 /* The second check should match the parameters
1788 * of the pmegrid_init call above.
1790 while (t + 1 < grids->nc[d] && i >= (n[d]*(t+1))/grids->nc[d])
1794 grids->g2t[d][i] = t*tfac;
1797 tfac *= grids->nc[d];
1801 case XX: max_comm_lines = overlap_x; break;
1802 case YY: max_comm_lines = overlap_y; break;
1803 case ZZ: max_comm_lines = pme_order - 1; break;
1805 grids->nthread_comm[d] = 0;
1806 while ((n[d]*grids->nthread_comm[d])/grids->nc[d] < max_comm_lines &&
1807 grids->nthread_comm[d] < grids->nc[d])
1809 grids->nthread_comm[d]++;
1813 fprintf(debug, "pmegrid thread grid communication range in %c: %d\n",
1814 'x'+d, grids->nthread_comm[d]);
1816 /* It should be possible to make grids->nthread_comm[d]==grids->nc[d]
1817 * work, but this is not a problematic restriction.
1819 if (grids->nc[d] > 1 && grids->nthread_comm[d] > grids->nc[d])
1821 gmx_fatal(FARGS, "Too many threads for PME (%d) compared to the number of grid lines, reduce the number of threads doing PME", grids->nthread);
1827 static void pmegrids_destroy(pmegrids_t *grids)
1831 if (grids->grid.grid != NULL)
1833 sfree(grids->grid.grid);
1835 if (grids->nthread > 0)
1837 for (t = 0; t < grids->nthread; t++)
1839 sfree(grids->grid_th[t].grid);
1841 sfree(grids->grid_th);
1847 static void realloc_work(pme_work_t *work, int nkx)
1849 if (nkx > work->nalloc)
1852 srenew(work->mhx, work->nalloc);
1853 srenew(work->mhy, work->nalloc);
1854 srenew(work->mhz, work->nalloc);
1855 srenew(work->m2, work->nalloc);
1856 /* Allocate an aligned pointer for SSE operations, including 3 extra
1857 * elements at the end since SSE operates on 4 elements at a time.
1859 sfree_aligned(work->denom);
1860 sfree_aligned(work->tmp1);
1861 sfree_aligned(work->eterm);
1862 snew_aligned(work->denom, work->nalloc+3, 16);
1863 snew_aligned(work->tmp1, work->nalloc+3, 16);
1864 snew_aligned(work->eterm, work->nalloc+3, 16);
1865 srenew(work->m2inv, work->nalloc);
1870 static void free_work(pme_work_t *work)
1876 sfree_aligned(work->denom);
1877 sfree_aligned(work->tmp1);
1878 sfree_aligned(work->eterm);
1884 /* Calculate exponentials through SSE in float precision */
1885 inline static void calc_exponentials(int start, int end, real f, real *d_aligned, real *r_aligned, real *e_aligned)
1888 const __m128 two = _mm_set_ps(2.0f, 2.0f, 2.0f, 2.0f);
1891 __m128 tmp_d1, d_inv, tmp_r, tmp_e;
1893 f_sse = _mm_load1_ps(&f);
1894 for (kx = 0; kx < end; kx += 4)
1896 tmp_d1 = _mm_load_ps(d_aligned+kx);
1897 lu = _mm_rcp_ps(tmp_d1);
1898 d_inv = _mm_mul_ps(lu, _mm_sub_ps(two, _mm_mul_ps(lu, tmp_d1)));
1899 tmp_r = _mm_load_ps(r_aligned+kx);
1900 tmp_r = gmx_mm_exp_ps(tmp_r);
1901 tmp_e = _mm_mul_ps(f_sse, d_inv);
1902 tmp_e = _mm_mul_ps(tmp_e, tmp_r);
1903 _mm_store_ps(e_aligned+kx, tmp_e);
1908 inline static void calc_exponentials(int start, int end, real f, real *d, real *r, real *e)
1911 for (kx = start; kx < end; kx++)
1915 for (kx = start; kx < end; kx++)
1919 for (kx = start; kx < end; kx++)
1921 e[kx] = f*r[kx]*d[kx];
1927 static int solve_pme_yzx(gmx_pme_t pme, t_complex *grid,
1928 real ewaldcoeff, real vol,
1930 int nthread, int thread)
1932 /* do recip sum over local cells in grid */
1933 /* y major, z middle, x minor or continuous */
1935 int kx, ky, kz, maxkx, maxky, maxkz;
1936 int nx, ny, nz, iyz0, iyz1, iyz, iy, iz, kxstart, kxend;
1938 real factor = M_PI*M_PI/(ewaldcoeff*ewaldcoeff);
1939 real ets2, struct2, vfactor, ets2vf;
1940 real d1, d2, energy = 0;
1942 real virxx = 0, virxy = 0, virxz = 0, viryy = 0, viryz = 0, virzz = 0;
1943 real rxx, ryx, ryy, rzx, rzy, rzz;
1945 real *mhx, *mhy, *mhz, *m2, *denom, *tmp1, *eterm, *m2inv;
1946 real mhxk, mhyk, mhzk, m2k;
1949 ivec local_ndata, local_offset, local_size;
1952 elfac = ONE_4PI_EPS0/pme->epsilon_r;
1958 /* Dimensions should be identical for A/B grid, so we just use A here */
1959 gmx_parallel_3dfft_complex_limits(pme->pfft_setupA,
1965 rxx = pme->recipbox[XX][XX];
1966 ryx = pme->recipbox[YY][XX];
1967 ryy = pme->recipbox[YY][YY];
1968 rzx = pme->recipbox[ZZ][XX];
1969 rzy = pme->recipbox[ZZ][YY];
1970 rzz = pme->recipbox[ZZ][ZZ];
1976 work = &pme->work[thread];
1981 denom = work->denom;
1983 eterm = work->eterm;
1984 m2inv = work->m2inv;
1986 iyz0 = local_ndata[YY]*local_ndata[ZZ]* thread /nthread;
1987 iyz1 = local_ndata[YY]*local_ndata[ZZ]*(thread+1)/nthread;
1989 for (iyz = iyz0; iyz < iyz1; iyz++)
1991 iy = iyz/local_ndata[ZZ];
1992 iz = iyz - iy*local_ndata[ZZ];
1994 ky = iy + local_offset[YY];
2005 by = M_PI*vol*pme->bsp_mod[YY][ky];
2007 kz = iz + local_offset[ZZ];
2011 bz = pme->bsp_mod[ZZ][kz];
2013 /* 0.5 correction for corner points */
2015 if (kz == 0 || kz == (nz+1)/2)
2020 p0 = grid + iy*local_size[ZZ]*local_size[XX] + iz*local_size[XX];
2022 /* We should skip the k-space point (0,0,0) */
2023 if (local_offset[XX] > 0 || ky > 0 || kz > 0)
2025 kxstart = local_offset[XX];
2029 kxstart = local_offset[XX] + 1;
2032 kxend = local_offset[XX] + local_ndata[XX];
2036 /* More expensive inner loop, especially because of the storage
2037 * of the mh elements in array's.
2038 * Because x is the minor grid index, all mh elements
2039 * depend on kx for triclinic unit cells.
2042 /* Two explicit loops to avoid a conditional inside the loop */
2043 for (kx = kxstart; kx < maxkx; kx++)
2048 mhyk = mx * ryx + my * ryy;
2049 mhzk = mx * rzx + my * rzy + mz * rzz;
2050 m2k = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
2055 denom[kx] = m2k*bz*by*pme->bsp_mod[XX][kx];
2056 tmp1[kx] = -factor*m2k;
2059 for (kx = maxkx; kx < kxend; kx++)
2064 mhyk = mx * ryx + my * ryy;
2065 mhzk = mx * rzx + my * rzy + mz * rzz;
2066 m2k = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
2071 denom[kx] = m2k*bz*by*pme->bsp_mod[XX][kx];
2072 tmp1[kx] = -factor*m2k;
2075 for (kx = kxstart; kx < kxend; kx++)
2077 m2inv[kx] = 1.0/m2[kx];
2080 calc_exponentials(kxstart, kxend, elfac, denom, tmp1, eterm);
2082 for (kx = kxstart; kx < kxend; kx++, p0++)
2087 p0->re = d1*eterm[kx];
2088 p0->im = d2*eterm[kx];
2090 struct2 = 2.0*(d1*d1+d2*d2);
2092 tmp1[kx] = eterm[kx]*struct2;
2095 for (kx = kxstart; kx < kxend; kx++)
2097 ets2 = corner_fac*tmp1[kx];
2098 vfactor = (factor*m2[kx] + 1.0)*2.0*m2inv[kx];
2101 ets2vf = ets2*vfactor;
2102 virxx += ets2vf*mhx[kx]*mhx[kx] - ets2;
2103 virxy += ets2vf*mhx[kx]*mhy[kx];
2104 virxz += ets2vf*mhx[kx]*mhz[kx];
2105 viryy += ets2vf*mhy[kx]*mhy[kx] - ets2;
2106 viryz += ets2vf*mhy[kx]*mhz[kx];
2107 virzz += ets2vf*mhz[kx]*mhz[kx] - ets2;
2112 /* We don't need to calculate the energy and the virial.
2113 * In this case the triclinic overhead is small.
2116 /* Two explicit loops to avoid a conditional inside the loop */
2118 for (kx = kxstart; kx < maxkx; kx++)
2123 mhyk = mx * ryx + my * ryy;
2124 mhzk = mx * rzx + my * rzy + mz * rzz;
2125 m2k = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
2126 denom[kx] = m2k*bz*by*pme->bsp_mod[XX][kx];
2127 tmp1[kx] = -factor*m2k;
2130 for (kx = maxkx; kx < kxend; kx++)
2135 mhyk = mx * ryx + my * ryy;
2136 mhzk = mx * rzx + my * rzy + mz * rzz;
2137 m2k = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
2138 denom[kx] = m2k*bz*by*pme->bsp_mod[XX][kx];
2139 tmp1[kx] = -factor*m2k;
2142 calc_exponentials(kxstart, kxend, elfac, denom, tmp1, eterm);
2144 for (kx = kxstart; kx < kxend; kx++, p0++)
2149 p0->re = d1*eterm[kx];
2150 p0->im = d2*eterm[kx];
2157 /* Update virial with local values.
2158 * The virial is symmetric by definition.
2159 * this virial seems ok for isotropic scaling, but I'm
2160 * experiencing problems on semiisotropic membranes.
2161 * IS THAT COMMENT STILL VALID??? (DvdS, 2001/02/07).
2163 work->vir[XX][XX] = 0.25*virxx;
2164 work->vir[YY][YY] = 0.25*viryy;
2165 work->vir[ZZ][ZZ] = 0.25*virzz;
2166 work->vir[XX][YY] = work->vir[YY][XX] = 0.25*virxy;
2167 work->vir[XX][ZZ] = work->vir[ZZ][XX] = 0.25*virxz;
2168 work->vir[YY][ZZ] = work->vir[ZZ][YY] = 0.25*viryz;
2170 /* This energy should be corrected for a charged system */
2171 work->energy = 0.5*energy;
2174 /* Return the loop count */
2175 return local_ndata[YY]*local_ndata[XX];
2178 static void get_pme_ener_vir(const gmx_pme_t pme, int nthread,
2179 real *mesh_energy, matrix vir)
2181 /* This function sums output over threads
2182 * and should therefore only be called after thread synchronization.
2186 *mesh_energy = pme->work[0].energy;
2187 copy_mat(pme->work[0].vir, vir);
2189 for (thread = 1; thread < nthread; thread++)
2191 *mesh_energy += pme->work[thread].energy;
2192 m_add(vir, pme->work[thread].vir, vir);
2196 #define DO_FSPLINE(order) \
2197 for (ithx = 0; (ithx < order); ithx++) \
2199 index_x = (i0+ithx)*pny*pnz; \
2203 for (ithy = 0; (ithy < order); ithy++) \
2205 index_xy = index_x+(j0+ithy)*pnz; \
2210 for (ithz = 0; (ithz < order); ithz++) \
2212 gval = grid[index_xy+(k0+ithz)]; \
2213 fxy1 += thz[ithz]*gval; \
2214 fz1 += dthz[ithz]*gval; \
2223 static void gather_f_bsplines(gmx_pme_t pme, real *grid,
2224 gmx_bool bClearF, pme_atomcomm_t *atc,
2225 splinedata_t *spline,
2228 /* sum forces for local particles */
2229 int nn, n, ithx, ithy, ithz, i0, j0, k0;
2230 int index_x, index_xy;
2231 int nx, ny, nz, pnx, pny, pnz;
2233 real tx, ty, dx, dy, qn;
2234 real fx, fy, fz, gval;
2236 real *thx, *thy, *thz, *dthx, *dthy, *dthz;
2238 real rxx, ryx, ryy, rzx, rzy, rzz;
2241 pme_spline_work_t *work;
2243 work = pme->spline_work;
2245 order = pme->pme_order;
2246 thx = spline->theta[XX];
2247 thy = spline->theta[YY];
2248 thz = spline->theta[ZZ];
2249 dthx = spline->dtheta[XX];
2250 dthy = spline->dtheta[YY];
2251 dthz = spline->dtheta[ZZ];
2255 pnx = pme->pmegrid_nx;
2256 pny = pme->pmegrid_ny;
2257 pnz = pme->pmegrid_nz;
2259 rxx = pme->recipbox[XX][XX];
2260 ryx = pme->recipbox[YY][XX];
2261 ryy = pme->recipbox[YY][YY];
2262 rzx = pme->recipbox[ZZ][XX];
2263 rzy = pme->recipbox[ZZ][YY];
2264 rzz = pme->recipbox[ZZ][ZZ];
2266 for (nn = 0; nn < spline->n; nn++)
2268 n = spline->ind[nn];
2269 qn = scale*atc->q[n];
2282 idxptr = atc->idx[n];
2289 /* Pointer arithmetic alert, next six statements */
2290 thx = spline->theta[XX] + norder;
2291 thy = spline->theta[YY] + norder;
2292 thz = spline->theta[ZZ] + norder;
2293 dthx = spline->dtheta[XX] + norder;
2294 dthy = spline->dtheta[YY] + norder;
2295 dthz = spline->dtheta[ZZ] + norder;
2301 #ifdef PME_SSE_UNALIGNED
2302 #define PME_GATHER_F_SSE_ORDER4
2304 #define PME_GATHER_F_SSE_ALIGNED
2307 #include "pme_sse_single.h"
2314 #define PME_GATHER_F_SSE_ALIGNED
2316 #include "pme_sse_single.h"
2326 atc->f[n][XX] += -qn*( fx*nx*rxx );
2327 atc->f[n][YY] += -qn*( fx*nx*ryx + fy*ny*ryy );
2328 atc->f[n][ZZ] += -qn*( fx*nx*rzx + fy*ny*rzy + fz*nz*rzz );
2331 /* Since the energy and not forces are interpolated
2332 * the net force might not be exactly zero.
2333 * This can be solved by also interpolating F, but
2334 * that comes at a cost.
2335 * A better hack is to remove the net force every
2336 * step, but that must be done at a higher level
2337 * since this routine doesn't see all atoms if running
2338 * in parallel. Don't know how important it is? EL 990726
2343 static real gather_energy_bsplines(gmx_pme_t pme, real *grid,
2344 pme_atomcomm_t *atc)
2346 splinedata_t *spline;
2347 int n, ithx, ithy, ithz, i0, j0, k0;
2348 int index_x, index_xy;
2350 real energy, pot, tx, ty, qn, gval;
2351 real *thx, *thy, *thz;
2355 spline = &atc->spline[0];
2357 order = pme->pme_order;
2360 for (n = 0; (n < atc->n); n++)
2366 idxptr = atc->idx[n];
2373 /* Pointer arithmetic alert, next three statements */
2374 thx = spline->theta[XX] + norder;
2375 thy = spline->theta[YY] + norder;
2376 thz = spline->theta[ZZ] + norder;
2379 for (ithx = 0; (ithx < order); ithx++)
2381 index_x = (i0+ithx)*pme->pmegrid_ny*pme->pmegrid_nz;
2384 for (ithy = 0; (ithy < order); ithy++)
2386 index_xy = index_x+(j0+ithy)*pme->pmegrid_nz;
2389 for (ithz = 0; (ithz < order); ithz++)
2391 gval = grid[index_xy+(k0+ithz)];
2392 pot += tx*ty*thz[ithz]*gval;
2405 /* Macro to force loop unrolling by fixing order.
2406 * This gives a significant performance gain.
2408 #define CALC_SPLINE(order) \
2412 real data[PME_ORDER_MAX]; \
2413 real ddata[PME_ORDER_MAX]; \
2415 for (j = 0; (j < DIM); j++) \
2419 /* dr is relative offset from lower cell limit */ \
2420 data[order-1] = 0; \
2424 for (k = 3; (k < order); k++) \
2426 div = 1.0/(k - 1.0); \
2427 data[k-1] = div*dr*data[k-2]; \
2428 for (l = 1; (l < (k-1)); l++) \
2430 data[k-l-1] = div*((dr+l)*data[k-l-2]+(k-l-dr)* \
2433 data[0] = div*(1-dr)*data[0]; \
2435 /* differentiate */ \
2436 ddata[0] = -data[0]; \
2437 for (k = 1; (k < order); k++) \
2439 ddata[k] = data[k-1] - data[k]; \
2442 div = 1.0/(order - 1); \
2443 data[order-1] = div*dr*data[order-2]; \
2444 for (l = 1; (l < (order-1)); l++) \
2446 data[order-l-1] = div*((dr+l)*data[order-l-2]+ \
2447 (order-l-dr)*data[order-l-1]); \
2449 data[0] = div*(1 - dr)*data[0]; \
2451 for (k = 0; k < order; k++) \
2453 theta[j][i*order+k] = data[k]; \
2454 dtheta[j][i*order+k] = ddata[k]; \
2459 void make_bsplines(splinevec theta, splinevec dtheta, int order,
2460 rvec fractx[], int nr, int ind[], real charge[],
2461 gmx_bool bFreeEnergy)
2463 /* construct splines for local atoms */
2467 for (i = 0; i < nr; i++)
2469 /* With free energy we do not use the charge check.
2470 * In most cases this will be more efficient than calling make_bsplines
2471 * twice, since usually more than half the particles have charges.
2474 if (bFreeEnergy || charge[ii] != 0.0)
2479 case 4: CALC_SPLINE(4); break;
2480 case 5: CALC_SPLINE(5); break;
2481 default: CALC_SPLINE(order); break;
2488 void make_dft_mod(real *mod, real *data, int ndata)
2493 for (i = 0; i < ndata; i++)
2496 for (j = 0; j < ndata; j++)
2498 arg = (2.0*M_PI*i*j)/ndata;
2499 sc += data[j]*cos(arg);
2500 ss += data[j]*sin(arg);
2502 mod[i] = sc*sc+ss*ss;
2504 for (i = 0; i < ndata; i++)
2508 mod[i] = (mod[i-1]+mod[i+1])*0.5;
2514 static void make_bspline_moduli(splinevec bsp_mod,
2515 int nx, int ny, int nz, int order)
2517 int nmax = max(nx, max(ny, nz));
2518 real *data, *ddata, *bsp_data;
2524 snew(bsp_data, nmax);
2530 for (k = 3; k < order; k++)
2534 for (l = 1; l < (k-1); l++)
2536 data[k-l-1] = div*(l*data[k-l-2]+(k-l)*data[k-l-1]);
2538 data[0] = div*data[0];
2541 ddata[0] = -data[0];
2542 for (k = 1; k < order; k++)
2544 ddata[k] = data[k-1]-data[k];
2546 div = 1.0/(order-1);
2548 for (l = 1; l < (order-1); l++)
2550 data[order-l-1] = div*(l*data[order-l-2]+(order-l)*data[order-l-1]);
2552 data[0] = div*data[0];
2554 for (i = 0; i < nmax; i++)
2558 for (i = 1; i <= order; i++)
2560 bsp_data[i] = data[i-1];
2563 make_dft_mod(bsp_mod[XX], bsp_data, nx);
2564 make_dft_mod(bsp_mod[YY], bsp_data, ny);
2565 make_dft_mod(bsp_mod[ZZ], bsp_data, nz);
2573 /* Return the P3M optimal influence function */
2574 static double do_p3m_influence(double z, int order)
2581 /* The formula and most constants can be found in:
2582 * Ballenegger et al., JCTC 8, 936 (2012)
2587 return 1.0 - 2.0*z2/3.0;
2590 return 1.0 - z2 + 2.0*z4/15.0;
2593 return 1.0 - 4.0*z2/3.0 + 2.0*z4/5.0 + 4.0*z2*z4/315.0;
2596 return 1.0 - 5.0*z2/3.0 + 7.0*z4/9.0 - 17.0*z2*z4/189.0 + 2.0*z4*z4/2835.0;
2599 return 1.0 - 2.0*z2 + 19.0*z4/15.0 - 256.0*z2*z4/945.0 + 62.0*z4*z4/4725.0 + 4.0*z2*z4*z4/155925.0;
2602 return 1.0 - 7.0*z2/3.0 + 28.0*z4/15.0 - 16.0*z2*z4/27.0 + 26.0*z4*z4/405.0 - 2.0*z2*z4*z4/1485.0 + 4.0*z4*z4*z4/6081075.0;
2604 return 1.0 - 8.0*z2/3.0 + 116.0*z4/45.0 - 344.0*z2*z4/315.0 + 914.0*z4*z4/4725.0 - 248.0*z4*z4*z2/22275.0 + 21844.0*z4*z4*z4/212837625.0 - 8.0*z4*z4*z4*z2/638512875.0;
2611 /* Calculate the P3M B-spline moduli for one dimension */
2612 static void make_p3m_bspline_moduli_dim(real *bsp_mod, int n, int order)
2614 double zarg, zai, sinzai, infl;
2619 gmx_fatal(FARGS, "The current P3M code only supports orders up to 8");
2626 for (i = -maxk; i < 0; i++)
2630 infl = do_p3m_influence(sinzai, order);
2631 bsp_mod[n+i] = infl*infl*pow(sinzai/zai, -2.0*order);
2634 for (i = 1; i < maxk; i++)
2638 infl = do_p3m_influence(sinzai, order);
2639 bsp_mod[i] = infl*infl*pow(sinzai/zai, -2.0*order);
2643 /* Calculate the P3M B-spline moduli */
2644 static void make_p3m_bspline_moduli(splinevec bsp_mod,
2645 int nx, int ny, int nz, int order)
2647 make_p3m_bspline_moduli_dim(bsp_mod[XX], nx, order);
2648 make_p3m_bspline_moduli_dim(bsp_mod[YY], ny, order);
2649 make_p3m_bspline_moduli_dim(bsp_mod[ZZ], nz, order);
2653 static void setup_coordinate_communication(pme_atomcomm_t *atc)
2661 for (i = 1; i <= nslab/2; i++)
2663 fw = (atc->nodeid + i) % nslab;
2664 bw = (atc->nodeid - i + nslab) % nslab;
2667 atc->node_dest[n] = fw;
2668 atc->node_src[n] = bw;
2673 atc->node_dest[n] = bw;
2674 atc->node_src[n] = fw;
2680 int gmx_pme_destroy(FILE *log, gmx_pme_t *pmedata)
2686 fprintf(log, "Destroying PME data structures.\n");
2689 sfree((*pmedata)->nnx);
2690 sfree((*pmedata)->nny);
2691 sfree((*pmedata)->nnz);
2693 pmegrids_destroy(&(*pmedata)->pmegridA);
2695 sfree((*pmedata)->fftgridA);
2696 sfree((*pmedata)->cfftgridA);
2697 gmx_parallel_3dfft_destroy((*pmedata)->pfft_setupA);
2699 if ((*pmedata)->pmegridB.grid.grid != NULL)
2701 pmegrids_destroy(&(*pmedata)->pmegridB);
2702 sfree((*pmedata)->fftgridB);
2703 sfree((*pmedata)->cfftgridB);
2704 gmx_parallel_3dfft_destroy((*pmedata)->pfft_setupB);
2706 for (thread = 0; thread < (*pmedata)->nthread; thread++)
2708 free_work(&(*pmedata)->work[thread]);
2710 sfree((*pmedata)->work);
2718 static int mult_up(int n, int f)
2720 return ((n + f - 1)/f)*f;
2724 static double pme_load_imbalance(gmx_pme_t pme)
2729 nma = pme->nnodes_major;
2730 nmi = pme->nnodes_minor;
2732 n1 = mult_up(pme->nkx, nma)*mult_up(pme->nky, nmi)*pme->nkz;
2733 n2 = mult_up(pme->nkx, nma)*mult_up(pme->nkz, nmi)*pme->nky;
2734 n3 = mult_up(pme->nky, nma)*mult_up(pme->nkz, nmi)*pme->nkx;
2736 /* pme_solve is roughly double the cost of an fft */
2738 return (n1 + n2 + 3*n3)/(double)(6*pme->nkx*pme->nky*pme->nkz);
2741 static void init_atomcomm(gmx_pme_t pme, pme_atomcomm_t *atc, t_commrec *cr,
2742 int dimind, gmx_bool bSpread)
2744 int nk, k, s, thread;
2746 atc->dimind = dimind;
2751 if (pme->nnodes > 1)
2753 atc->mpi_comm = pme->mpi_comm_d[dimind];
2754 MPI_Comm_size(atc->mpi_comm, &atc->nslab);
2755 MPI_Comm_rank(atc->mpi_comm, &atc->nodeid);
2759 fprintf(debug, "For PME atom communication in dimind %d: nslab %d rank %d\n", atc->dimind, atc->nslab, atc->nodeid);
2763 atc->bSpread = bSpread;
2764 atc->pme_order = pme->pme_order;
2768 /* These three allocations are not required for particle decomp. */
2769 snew(atc->node_dest, atc->nslab);
2770 snew(atc->node_src, atc->nslab);
2771 setup_coordinate_communication(atc);
2773 snew(atc->count_thread, pme->nthread);
2774 for (thread = 0; thread < pme->nthread; thread++)
2776 snew(atc->count_thread[thread], atc->nslab);
2778 atc->count = atc->count_thread[0];
2779 snew(atc->rcount, atc->nslab);
2780 snew(atc->buf_index, atc->nslab);
2783 atc->nthread = pme->nthread;
2784 if (atc->nthread > 1)
2786 snew(atc->thread_plist, atc->nthread);
2788 snew(atc->spline, atc->nthread);
2789 for (thread = 0; thread < atc->nthread; thread++)
2791 if (atc->nthread > 1)
2793 snew(atc->thread_plist[thread].n, atc->nthread+2*GMX_CACHE_SEP);
2794 atc->thread_plist[thread].n += GMX_CACHE_SEP;
2796 snew(atc->spline[thread].thread_one, pme->nthread);
2797 atc->spline[thread].thread_one[thread] = 1;
2802 init_overlap_comm(pme_overlap_t * ol,
2812 int lbnd, rbnd, maxlr, b, i;
2815 pme_grid_comm_t *pgc;
2817 int fft_start, fft_end, send_index1, recv_index1;
2821 ol->mpi_comm = comm;
2824 ol->nnodes = nnodes;
2825 ol->nodeid = nodeid;
2827 /* Linear translation of the PME grid won't affect reciprocal space
2828 * calculations, so to optimize we only interpolate "upwards",
2829 * which also means we only have to consider overlap in one direction.
2830 * I.e., particles on this node might also be spread to grid indices
2831 * that belong to higher nodes (modulo nnodes)
2834 snew(ol->s2g0, ol->nnodes+1);
2835 snew(ol->s2g1, ol->nnodes);
2838 fprintf(debug, "PME slab boundaries:");
2840 for (i = 0; i < nnodes; i++)
2842 /* s2g0 the local interpolation grid start.
2843 * s2g1 the local interpolation grid end.
2844 * Because grid overlap communication only goes forward,
2845 * the grid the slabs for fft's should be rounded down.
2847 ol->s2g0[i] = ( i *ndata + 0 )/nnodes;
2848 ol->s2g1[i] = ((i+1)*ndata + nnodes-1)/nnodes + norder - 1;
2852 fprintf(debug, " %3d %3d", ol->s2g0[i], ol->s2g1[i]);
2855 ol->s2g0[nnodes] = ndata;
2858 fprintf(debug, "\n");
2861 /* Determine with how many nodes we need to communicate the grid overlap */
2867 for (i = 0; i < nnodes; i++)
2869 if ((i+b < nnodes && ol->s2g1[i] > ol->s2g0[i+b]) ||
2870 (i+b >= nnodes && ol->s2g1[i] > ol->s2g0[i+b-nnodes] + ndata))
2876 while (bCont && b < nnodes);
2877 ol->noverlap_nodes = b - 1;
2879 snew(ol->send_id, ol->noverlap_nodes);
2880 snew(ol->recv_id, ol->noverlap_nodes);
2881 for (b = 0; b < ol->noverlap_nodes; b++)
2883 ol->send_id[b] = (ol->nodeid + (b + 1)) % ol->nnodes;
2884 ol->recv_id[b] = (ol->nodeid - (b + 1) + ol->nnodes) % ol->nnodes;
2886 snew(ol->comm_data, ol->noverlap_nodes);
2889 for (b = 0; b < ol->noverlap_nodes; b++)
2891 pgc = &ol->comm_data[b];
2893 fft_start = ol->s2g0[ol->send_id[b]];
2894 fft_end = ol->s2g0[ol->send_id[b]+1];
2895 if (ol->send_id[b] < nodeid)
2900 send_index1 = ol->s2g1[nodeid];
2901 send_index1 = min(send_index1, fft_end);
2902 pgc->send_index0 = fft_start;
2903 pgc->send_nindex = max(0, send_index1 - pgc->send_index0);
2904 ol->send_size += pgc->send_nindex;
2906 /* We always start receiving to the first index of our slab */
2907 fft_start = ol->s2g0[ol->nodeid];
2908 fft_end = ol->s2g0[ol->nodeid+1];
2909 recv_index1 = ol->s2g1[ol->recv_id[b]];
2910 if (ol->recv_id[b] > nodeid)
2912 recv_index1 -= ndata;
2914 recv_index1 = min(recv_index1, fft_end);
2915 pgc->recv_index0 = fft_start;
2916 pgc->recv_nindex = max(0, recv_index1 - pgc->recv_index0);
2920 /* Communicate the buffer sizes to receive */
2921 for (b = 0; b < ol->noverlap_nodes; b++)
2923 MPI_Sendrecv(&ol->send_size, 1, MPI_INT, ol->send_id[b], b,
2924 &ol->comm_data[b].recv_size, 1, MPI_INT, ol->recv_id[b], b,
2925 ol->mpi_comm, &stat);
2929 /* For non-divisible grid we need pme_order iso pme_order-1 */
2930 snew(ol->sendbuf, norder*commplainsize);
2931 snew(ol->recvbuf, norder*commplainsize);
2935 make_gridindex5_to_localindex(int n, int local_start, int local_range,
2936 int **global_to_local,
2937 real **fraction_shift)
2945 for (i = 0; (i < 5*n); i++)
2947 /* Determine the global to local grid index */
2948 gtl[i] = (i - local_start + n) % n;
2949 /* For coordinates that fall within the local grid the fraction
2950 * is correct, we don't need to shift it.
2953 if (local_range < n)
2955 /* Due to rounding issues i could be 1 beyond the lower or
2956 * upper boundary of the local grid. Correct the index for this.
2957 * If we shift the index, we need to shift the fraction by
2958 * the same amount in the other direction to not affect
2960 * Note that due to this shifting the weights at the end of
2961 * the spline might change, but that will only involve values
2962 * between zero and values close to the precision of a real,
2963 * which is anyhow the accuracy of the whole mesh calculation.
2965 /* With local_range=0 we should not change i=local_start */
2966 if (i % n != local_start)
2973 else if (gtl[i] == local_range)
2975 gtl[i] = local_range - 1;
2982 *global_to_local = gtl;
2983 *fraction_shift = fsh;
2986 static pme_spline_work_t *make_pme_spline_work(int order)
2988 pme_spline_work_t *work;
2995 snew_aligned(work, 1, 16);
2997 zero_SSE = _mm_setzero_ps();
2999 /* Generate bit masks to mask out the unused grid entries,
3000 * as we only operate on order of the 8 grid entries that are
3001 * load into 2 SSE float registers.
3003 for (of = 0; of < 8-(order-1); of++)
3005 for (i = 0; i < 8; i++)
3007 tmp[i] = (i >= of && i < of+order ? 1 : 0);
3009 work->mask_SSE0[of] = _mm_loadu_ps(tmp);
3010 work->mask_SSE1[of] = _mm_loadu_ps(tmp+4);
3011 work->mask_SSE0[of] = _mm_cmpgt_ps(work->mask_SSE0[of], zero_SSE);
3012 work->mask_SSE1[of] = _mm_cmpgt_ps(work->mask_SSE1[of], zero_SSE);
3022 gmx_pme_check_grid_restrictions(FILE *fplog, char dim, int nnodes, int *nk)
3026 if (*nk % nnodes != 0)
3028 nk_new = nnodes*(*nk/nnodes + 1);
3030 if (2*nk_new >= 3*(*nk))
3032 gmx_fatal(FARGS, "The PME grid size in dim %c (%d) is not divisble by the number of nodes doing PME in dim %c (%d). The grid size would have to be increased by more than 50%% to make the grid divisible. Change the total number of nodes or the number of domain decomposition cells in x or the PME grid %c dimension (and the cut-off).",
3033 dim, *nk, dim, nnodes, dim);
3038 fprintf(fplog, "\nNOTE: The PME grid size in dim %c (%d) is not divisble by the number of nodes doing PME in dim %c (%d). Increasing the PME grid size in dim %c to %d. This will increase the accuracy and will not decrease the performance significantly on this number of nodes. For optimal performance change the total number of nodes or the number of domain decomposition cells in x or the PME grid %c dimension (and the cut-off).\n\n",
3039 dim, *nk, dim, nnodes, dim, nk_new, dim);
3046 int gmx_pme_init(gmx_pme_t * pmedata,
3052 gmx_bool bFreeEnergy,
3053 gmx_bool bReproducible,
3056 gmx_pme_t pme = NULL;
3058 pme_atomcomm_t *atc;
3063 fprintf(debug, "Creating PME data structures.\n");
3067 pme->redist_init = FALSE;
3068 pme->sum_qgrid_tmp = NULL;
3069 pme->sum_qgrid_dd_tmp = NULL;
3070 pme->buf_nalloc = 0;
3071 pme->redist_buf_nalloc = 0;
3074 pme->bPPnode = TRUE;
3076 pme->nnodes_major = nnodes_major;
3077 pme->nnodes_minor = nnodes_minor;
3080 if (nnodes_major*nnodes_minor > 1)
3082 pme->mpi_comm = cr->mpi_comm_mygroup;
3084 MPI_Comm_rank(pme->mpi_comm, &pme->nodeid);
3085 MPI_Comm_size(pme->mpi_comm, &pme->nnodes);
3086 if (pme->nnodes != nnodes_major*nnodes_minor)
3088 gmx_incons("PME node count mismatch");
3093 pme->mpi_comm = MPI_COMM_NULL;
3097 if (pme->nnodes == 1)
3100 pme->mpi_comm_d[0] = MPI_COMM_NULL;
3101 pme->mpi_comm_d[1] = MPI_COMM_NULL;
3103 pme->ndecompdim = 0;
3104 pme->nodeid_major = 0;
3105 pme->nodeid_minor = 0;
3107 pme->mpi_comm_d[0] = pme->mpi_comm_d[1] = MPI_COMM_NULL;
3112 if (nnodes_minor == 1)
3115 pme->mpi_comm_d[0] = pme->mpi_comm;
3116 pme->mpi_comm_d[1] = MPI_COMM_NULL;
3118 pme->ndecompdim = 1;
3119 pme->nodeid_major = pme->nodeid;
3120 pme->nodeid_minor = 0;
3123 else if (nnodes_major == 1)
3126 pme->mpi_comm_d[0] = MPI_COMM_NULL;
3127 pme->mpi_comm_d[1] = pme->mpi_comm;
3129 pme->ndecompdim = 1;
3130 pme->nodeid_major = 0;
3131 pme->nodeid_minor = pme->nodeid;
3135 if (pme->nnodes % nnodes_major != 0)
3137 gmx_incons("For 2D PME decomposition, #PME nodes must be divisible by the number of nodes in the major dimension");
3139 pme->ndecompdim = 2;
3142 MPI_Comm_split(pme->mpi_comm, pme->nodeid % nnodes_minor,
3143 pme->nodeid, &pme->mpi_comm_d[0]); /* My communicator along major dimension */
3144 MPI_Comm_split(pme->mpi_comm, pme->nodeid/nnodes_minor,
3145 pme->nodeid, &pme->mpi_comm_d[1]); /* My communicator along minor dimension */
3147 MPI_Comm_rank(pme->mpi_comm_d[0], &pme->nodeid_major);
3148 MPI_Comm_size(pme->mpi_comm_d[0], &pme->nnodes_major);
3149 MPI_Comm_rank(pme->mpi_comm_d[1], &pme->nodeid_minor);
3150 MPI_Comm_size(pme->mpi_comm_d[1], &pme->nnodes_minor);
3153 pme->bPPnode = (cr->duty & DUTY_PP);
3156 pme->nthread = nthread;
3158 if (ir->ePBC == epbcSCREW)
3160 gmx_fatal(FARGS, "pme does not (yet) work with pbc = screw");
3163 pme->bFEP = ((ir->efep != efepNO) && bFreeEnergy);
3167 pme->bP3M = (ir->coulombtype == eelP3M_AD || getenv("GMX_PME_P3M") != NULL);
3168 pme->pme_order = ir->pme_order;
3169 pme->epsilon_r = ir->epsilon_r;
3171 if (pme->pme_order > PME_ORDER_MAX)
3173 gmx_fatal(FARGS, "pme_order (%d) is larger than the maximum allowed value (%d). Modify and recompile the code if you really need such a high order.",
3174 pme->pme_order, PME_ORDER_MAX);
3177 /* Currently pme.c supports only the fft5d FFT code.
3178 * Therefore the grid always needs to be divisible by nnodes.
3179 * When the old 1D code is also supported again, change this check.
3181 * This check should be done before calling gmx_pme_init
3182 * and fplog should be passed iso stderr.
3184 if (pme->ndecompdim >= 2)
3186 if (pme->ndecompdim >= 1)
3189 gmx_pme_check_grid_restrictions(pme->nodeid==0 ? stderr : NULL,
3190 'x',nnodes_major,&pme->nkx);
3191 gmx_pme_check_grid_restrictions(pme->nodeid==0 ? stderr : NULL,
3192 'y',nnodes_minor,&pme->nky);
3196 if (pme->nkx <= pme->pme_order*(pme->nnodes_major > 1 ? 2 : 1) ||
3197 pme->nky <= pme->pme_order*(pme->nnodes_minor > 1 ? 2 : 1) ||
3198 pme->nkz <= pme->pme_order)
3200 gmx_fatal(FARGS, "The PME grid sizes need to be larger than pme_order (%d) and for dimensions with domain decomposition larger than 2*pme_order", pme->pme_order);
3203 if (pme->nnodes > 1)
3208 MPI_Type_contiguous(DIM, mpi_type, &(pme->rvec_mpi));
3209 MPI_Type_commit(&(pme->rvec_mpi));
3212 /* Note that the charge spreading and force gathering, which usually
3213 * takes about the same amount of time as FFT+solve_pme,
3214 * is always fully load balanced
3215 * (unless the charge distribution is inhomogeneous).
3218 imbal = pme_load_imbalance(pme);
3219 if (imbal >= 1.2 && pme->nodeid_major == 0 && pme->nodeid_minor == 0)
3223 "NOTE: The load imbalance in PME FFT and solve is %d%%.\n"
3224 " For optimal PME load balancing\n"
3225 " PME grid_x (%d) and grid_y (%d) should be divisible by #PME_nodes_x (%d)\n"
3226 " and PME grid_y (%d) and grid_z (%d) should be divisible by #PME_nodes_y (%d)\n"
3228 (int)((imbal-1)*100 + 0.5),
3229 pme->nkx, pme->nky, pme->nnodes_major,
3230 pme->nky, pme->nkz, pme->nnodes_minor);
3234 /* For non-divisible grid we need pme_order iso pme_order-1 */
3235 /* In sum_qgrid_dd x overlap is copied in place: take padding into account.
3236 * y is always copied through a buffer: we don't need padding in z,
3237 * but we do need the overlap in x because of the communication order.
3239 init_overlap_comm(&pme->overlap[0], pme->pme_order,
3243 pme->nnodes_major, pme->nodeid_major,
3245 (div_round_up(pme->nky, pme->nnodes_minor)+pme->pme_order)*(pme->nkz+pme->pme_order-1));
3247 /* Along overlap dim 1 we can send in multiple pulses in sum_fftgrid_dd.
3248 * We do this with an offset buffer of equal size, so we need to allocate
3249 * extra for the offset. That's what the (+1)*pme->nkz is for.
3251 init_overlap_comm(&pme->overlap[1], pme->pme_order,
3255 pme->nnodes_minor, pme->nodeid_minor,
3257 (div_round_up(pme->nkx, pme->nnodes_major)+pme->pme_order+1)*pme->nkz);
3259 /* Check for a limitation of the (current) sum_fftgrid_dd code.
3260 * We only allow multiple communication pulses in dim 1, not in dim 0.
3262 if (pme->nthread > 1 && (pme->overlap[0].noverlap_nodes > 1 ||
3263 pme->nkx < pme->nnodes_major*pme->pme_order))
3265 gmx_fatal(FARGS, "The number of PME grid lines per node along x is %g. But when using OpenMP threads, the number of grid lines per node along x and should be >= pme_order (%d). To resolve this issue, use less nodes along x (and possibly more along y and/or z) by specifying -dd manually.",
3266 pme->nkx/(double)pme->nnodes_major, pme->pme_order);
3269 snew(pme->bsp_mod[XX], pme->nkx);
3270 snew(pme->bsp_mod[YY], pme->nky);
3271 snew(pme->bsp_mod[ZZ], pme->nkz);
3273 /* The required size of the interpolation grid, including overlap.
3274 * The allocated size (pmegrid_n?) might be slightly larger.
3276 pme->pmegrid_nx = pme->overlap[0].s2g1[pme->nodeid_major] -
3277 pme->overlap[0].s2g0[pme->nodeid_major];
3278 pme->pmegrid_ny = pme->overlap[1].s2g1[pme->nodeid_minor] -
3279 pme->overlap[1].s2g0[pme->nodeid_minor];
3280 pme->pmegrid_nz_base = pme->nkz;
3281 pme->pmegrid_nz = pme->pmegrid_nz_base + pme->pme_order - 1;
3282 set_grid_alignment(&pme->pmegrid_nz, pme->pme_order);
3284 pme->pmegrid_start_ix = pme->overlap[0].s2g0[pme->nodeid_major];
3285 pme->pmegrid_start_iy = pme->overlap[1].s2g0[pme->nodeid_minor];
3286 pme->pmegrid_start_iz = 0;
3288 make_gridindex5_to_localindex(pme->nkx,
3289 pme->pmegrid_start_ix,
3290 pme->pmegrid_nx - (pme->pme_order-1),
3291 &pme->nnx, &pme->fshx);
3292 make_gridindex5_to_localindex(pme->nky,
3293 pme->pmegrid_start_iy,
3294 pme->pmegrid_ny - (pme->pme_order-1),
3295 &pme->nny, &pme->fshy);
3296 make_gridindex5_to_localindex(pme->nkz,
3297 pme->pmegrid_start_iz,
3298 pme->pmegrid_nz_base,
3299 &pme->nnz, &pme->fshz);
3301 pmegrids_init(&pme->pmegridA,
3302 pme->pmegrid_nx, pme->pmegrid_ny, pme->pmegrid_nz,
3303 pme->pmegrid_nz_base,
3306 pme->overlap[0].s2g1[pme->nodeid_major]-pme->overlap[0].s2g0[pme->nodeid_major+1],
3307 pme->overlap[1].s2g1[pme->nodeid_minor]-pme->overlap[1].s2g0[pme->nodeid_minor+1]);
3309 pme->spline_work = make_pme_spline_work(pme->pme_order);
3311 ndata[0] = pme->nkx;
3312 ndata[1] = pme->nky;
3313 ndata[2] = pme->nkz;
3315 /* This routine will allocate the grid data to fit the FFTs */
3316 gmx_parallel_3dfft_init(&pme->pfft_setupA, ndata,
3317 &pme->fftgridA, &pme->cfftgridA,
3319 pme->overlap[0].s2g0, pme->overlap[1].s2g0,
3320 bReproducible, pme->nthread);
3324 pmegrids_init(&pme->pmegridB,
3325 pme->pmegrid_nx, pme->pmegrid_ny, pme->pmegrid_nz,
3326 pme->pmegrid_nz_base,
3329 pme->nkx % pme->nnodes_major != 0,
3330 pme->nky % pme->nnodes_minor != 0);
3332 gmx_parallel_3dfft_init(&pme->pfft_setupB, ndata,
3333 &pme->fftgridB, &pme->cfftgridB,
3335 pme->overlap[0].s2g0, pme->overlap[1].s2g0,
3336 bReproducible, pme->nthread);
3340 pme->pmegridB.grid.grid = NULL;
3341 pme->fftgridB = NULL;
3342 pme->cfftgridB = NULL;
3347 /* Use plain SPME B-spline interpolation */
3348 make_bspline_moduli(pme->bsp_mod, pme->nkx, pme->nky, pme->nkz, pme->pme_order);
3352 /* Use the P3M grid-optimized influence function */
3353 make_p3m_bspline_moduli(pme->bsp_mod, pme->nkx, pme->nky, pme->nkz, pme->pme_order);
3356 /* Use atc[0] for spreading */
3357 init_atomcomm(pme, &pme->atc[0], cr, nnodes_major > 1 ? 0 : 1, TRUE);
3358 if (pme->ndecompdim >= 2)
3360 init_atomcomm(pme, &pme->atc[1], cr, 1, FALSE);
3363 if (pme->nnodes == 1)
3365 pme->atc[0].n = homenr;
3366 pme_realloc_atomcomm_things(&pme->atc[0]);
3372 /* Use fft5d, order after FFT is y major, z, x minor */
3374 snew(pme->work, pme->nthread);
3375 for (thread = 0; thread < pme->nthread; thread++)
3377 realloc_work(&pme->work[thread], pme->nkx);
3386 static void reuse_pmegrids(const pmegrids_t *old, pmegrids_t *new)
3390 for (d = 0; d < DIM; d++)
3392 if (new->grid.n[d] > old->grid.n[d])
3398 sfree_aligned(new->grid.grid);
3399 new->grid.grid = old->grid.grid;
3401 if (new->nthread > 1 && new->nthread == old->nthread)
3403 sfree_aligned(new->grid_all);
3404 for (t = 0; t < new->nthread; t++)
3406 new->grid_th[t].grid = old->grid_th[t].grid;
3411 int gmx_pme_reinit(gmx_pme_t * pmedata,
3414 const t_inputrec * ir,
3422 irc.nkx = grid_size[XX];
3423 irc.nky = grid_size[YY];
3424 irc.nkz = grid_size[ZZ];
3426 if (pme_src->nnodes == 1)
3428 homenr = pme_src->atc[0].n;
3435 ret = gmx_pme_init(pmedata, cr, pme_src->nnodes_major, pme_src->nnodes_minor,
3436 &irc, homenr, pme_src->bFEP, FALSE, pme_src->nthread);
3440 /* We can easily reuse the allocated pme grids in pme_src */
3441 reuse_pmegrids(&pme_src->pmegridA, &(*pmedata)->pmegridA);
3442 /* We would like to reuse the fft grids, but that's harder */
3449 static void copy_local_grid(gmx_pme_t pme,
3450 pmegrids_t *pmegrids, int thread, real *fftgrid)
3452 ivec local_fft_ndata, local_fft_offset, local_fft_size;
3456 int offx, offy, offz, x, y, z, i0, i0t;
3461 gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
3465 fft_my = local_fft_size[YY];
3466 fft_mz = local_fft_size[ZZ];
3468 pmegrid = &pmegrids->grid_th[thread];
3470 nsx = pmegrid->s[XX];
3471 nsy = pmegrid->s[YY];
3472 nsz = pmegrid->s[ZZ];
3474 for (d = 0; d < DIM; d++)
3476 nf[d] = min(pmegrid->n[d] - (pmegrid->order - 1),
3477 local_fft_ndata[d] - pmegrid->offset[d]);
3480 offx = pmegrid->offset[XX];
3481 offy = pmegrid->offset[YY];
3482 offz = pmegrid->offset[ZZ];
3484 /* Directly copy the non-overlapping parts of the local grids.
3485 * This also initializes the full grid.
3487 grid_th = pmegrid->grid;
3488 for (x = 0; x < nf[XX]; x++)
3490 for (y = 0; y < nf[YY]; y++)
3492 i0 = ((offx + x)*fft_my + (offy + y))*fft_mz + offz;
3493 i0t = (x*nsy + y)*nsz;
3494 for (z = 0; z < nf[ZZ]; z++)
3496 fftgrid[i0+z] = grid_th[i0t+z];
3503 reduce_threadgrid_overlap(gmx_pme_t pme,
3504 const pmegrids_t *pmegrids, int thread,
3505 real *fftgrid, real *commbuf_x, real *commbuf_y)
3507 ivec local_fft_ndata, local_fft_offset, local_fft_size;
3508 int fft_nx, fft_ny, fft_nz;
3513 int offx, offy, offz, x, y, z, i0, i0t;
3514 int sx, sy, sz, fx, fy, fz, tx1, ty1, tz1, ox, oy, oz;
3515 gmx_bool bClearBufX, bClearBufY, bClearBufXY, bClearBuf;
3516 gmx_bool bCommX, bCommY;
3519 const pmegrid_t *pmegrid, *pmegrid_g, *pmegrid_f;
3520 const real *grid_th;
3521 real *commbuf = NULL;
3523 gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
3527 fft_nx = local_fft_ndata[XX];
3528 fft_ny = local_fft_ndata[YY];
3529 fft_nz = local_fft_ndata[ZZ];
3531 fft_my = local_fft_size[YY];
3532 fft_mz = local_fft_size[ZZ];
3534 /* This routine is called when all thread have finished spreading.
3535 * Here each thread sums grid contributions calculated by other threads
3536 * to the thread local grid volume.
3537 * To minimize the number of grid copying operations,
3538 * this routines sums immediately from the pmegrid to the fftgrid.
3541 /* Determine which part of the full node grid we should operate on,
3542 * this is our thread local part of the full grid.
3544 pmegrid = &pmegrids->grid_th[thread];
3546 for (d = 0; d < DIM; d++)
3548 ne[d] = min(pmegrid->offset[d]+pmegrid->n[d]-(pmegrid->order-1),
3549 local_fft_ndata[d]);
3552 offx = pmegrid->offset[XX];
3553 offy = pmegrid->offset[YY];
3554 offz = pmegrid->offset[ZZ];
3561 /* Now loop over all the thread data blocks that contribute
3562 * to the grid region we (our thread) are operating on.
3564 /* Note that ffy_nx/y is equal to the number of grid points
3565 * between the first point of our node grid and the one of the next node.
3567 for (sx = 0; sx >= -pmegrids->nthread_comm[XX]; sx--)
3569 fx = pmegrid->ci[XX] + sx;
3574 fx += pmegrids->nc[XX];
3576 bCommX = (pme->nnodes_major > 1);
3578 pmegrid_g = &pmegrids->grid_th[fx*pmegrids->nc[YY]*pmegrids->nc[ZZ]];
3579 ox += pmegrid_g->offset[XX];
3582 tx1 = min(ox + pmegrid_g->n[XX], ne[XX]);
3586 tx1 = min(ox + pmegrid_g->n[XX], pme->pme_order);
3589 for (sy = 0; sy >= -pmegrids->nthread_comm[YY]; sy--)
3591 fy = pmegrid->ci[YY] + sy;
3596 fy += pmegrids->nc[YY];
3598 bCommY = (pme->nnodes_minor > 1);
3600 pmegrid_g = &pmegrids->grid_th[fy*pmegrids->nc[ZZ]];
3601 oy += pmegrid_g->offset[YY];
3604 ty1 = min(oy + pmegrid_g->n[YY], ne[YY]);
3608 ty1 = min(oy + pmegrid_g->n[YY], pme->pme_order);
3611 for (sz = 0; sz >= -pmegrids->nthread_comm[ZZ]; sz--)
3613 fz = pmegrid->ci[ZZ] + sz;
3617 fz += pmegrids->nc[ZZ];
3620 pmegrid_g = &pmegrids->grid_th[fz];
3621 oz += pmegrid_g->offset[ZZ];
3622 tz1 = min(oz + pmegrid_g->n[ZZ], ne[ZZ]);
3624 if (sx == 0 && sy == 0 && sz == 0)
3626 /* We have already added our local contribution
3627 * before calling this routine, so skip it here.
3632 thread_f = (fx*pmegrids->nc[YY] + fy)*pmegrids->nc[ZZ] + fz;
3634 pmegrid_f = &pmegrids->grid_th[thread_f];
3636 grid_th = pmegrid_f->grid;
3638 nsx = pmegrid_f->s[XX];
3639 nsy = pmegrid_f->s[YY];
3640 nsz = pmegrid_f->s[ZZ];
3642 #ifdef DEBUG_PME_REDUCE
3643 printf("n%d t%d add %d %2d %2d %2d %2d %2d %2d %2d-%2d %2d-%2d, %2d-%2d %2d-%2d, %2d-%2d %2d-%2d\n",
3644 pme->nodeid, thread, thread_f,
3645 pme->pmegrid_start_ix,
3646 pme->pmegrid_start_iy,
3647 pme->pmegrid_start_iz,
3649 offx-ox, tx1-ox, offx, tx1,
3650 offy-oy, ty1-oy, offy, ty1,
3651 offz-oz, tz1-oz, offz, tz1);
3654 if (!(bCommX || bCommY))
3656 /* Copy from the thread local grid to the node grid */
3657 for (x = offx; x < tx1; x++)
3659 for (y = offy; y < ty1; y++)
3661 i0 = (x*fft_my + y)*fft_mz;
3662 i0t = ((x - ox)*nsy + (y - oy))*nsz - oz;
3663 for (z = offz; z < tz1; z++)
3665 fftgrid[i0+z] += grid_th[i0t+z];
3672 /* The order of this conditional decides
3673 * where the corner volume gets stored with x+y decomp.
3677 commbuf = commbuf_y;
3678 buf_my = ty1 - offy;
3681 /* We index commbuf modulo the local grid size */
3682 commbuf += buf_my*fft_nx*fft_nz;
3684 bClearBuf = bClearBufXY;
3685 bClearBufXY = FALSE;
3689 bClearBuf = bClearBufY;
3695 commbuf = commbuf_x;
3697 bClearBuf = bClearBufX;
3701 /* Copy to the communication buffer */
3702 for (x = offx; x < tx1; x++)
3704 for (y = offy; y < ty1; y++)
3706 i0 = (x*buf_my + y)*fft_nz;
3707 i0t = ((x - ox)*nsy + (y - oy))*nsz - oz;
3711 /* First access of commbuf, initialize it */
3712 for (z = offz; z < tz1; z++)
3714 commbuf[i0+z] = grid_th[i0t+z];
3719 for (z = offz; z < tz1; z++)
3721 commbuf[i0+z] += grid_th[i0t+z];
3733 static void sum_fftgrid_dd(gmx_pme_t pme, real *fftgrid)
3735 ivec local_fft_ndata, local_fft_offset, local_fft_size;
3736 pme_overlap_t *overlap;
3737 int send_index0, send_nindex;
3742 int send_size_y, recv_size_y;
3743 int ipulse, send_id, recv_id, datasize, gridsize, size_yx;
3744 real *sendptr, *recvptr;
3745 int x, y, z, indg, indb;
3747 /* Note that this routine is only used for forward communication.
3748 * Since the force gathering, unlike the charge spreading,
3749 * can be trivially parallelized over the particles,
3750 * the backwards process is much simpler and can use the "old"
3751 * communication setup.
3754 gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
3759 if (pme->nnodes_minor > 1)
3761 /* Major dimension */
3762 overlap = &pme->overlap[1];
3764 if (pme->nnodes_major > 1)
3766 size_yx = pme->overlap[0].comm_data[0].send_nindex;
3772 datasize = (local_fft_ndata[XX] + size_yx)*local_fft_ndata[ZZ];
3774 send_size_y = overlap->send_size;
3776 for (ipulse = 0; ipulse < overlap->noverlap_nodes; ipulse++)
3778 send_id = overlap->send_id[ipulse];
3779 recv_id = overlap->recv_id[ipulse];
3781 overlap->comm_data[ipulse].send_index0 -
3782 overlap->comm_data[0].send_index0;
3783 send_nindex = overlap->comm_data[ipulse].send_nindex;
3784 /* We don't use recv_index0, as we always receive starting at 0 */
3785 recv_nindex = overlap->comm_data[ipulse].recv_nindex;
3786 recv_size_y = overlap->comm_data[ipulse].recv_size;
3788 sendptr = overlap->sendbuf + send_index0*local_fft_ndata[ZZ];
3789 recvptr = overlap->recvbuf;
3792 MPI_Sendrecv(sendptr, send_size_y*datasize, GMX_MPI_REAL,
3794 recvptr, recv_size_y*datasize, GMX_MPI_REAL,
3796 overlap->mpi_comm, &stat);
3799 for (x = 0; x < local_fft_ndata[XX]; x++)
3801 for (y = 0; y < recv_nindex; y++)
3803 indg = (x*local_fft_size[YY] + y)*local_fft_size[ZZ];
3804 indb = (x*recv_size_y + y)*local_fft_ndata[ZZ];
3805 for (z = 0; z < local_fft_ndata[ZZ]; z++)
3807 fftgrid[indg+z] += recvptr[indb+z];
3812 if (pme->nnodes_major > 1)
3814 /* Copy from the received buffer to the send buffer for dim 0 */
3815 sendptr = pme->overlap[0].sendbuf;
3816 for (x = 0; x < size_yx; x++)
3818 for (y = 0; y < recv_nindex; y++)
3820 indg = (x*local_fft_ndata[YY] + y)*local_fft_ndata[ZZ];
3821 indb = ((local_fft_ndata[XX] + x)*recv_size_y + y)*local_fft_ndata[ZZ];
3822 for (z = 0; z < local_fft_ndata[ZZ]; z++)
3824 sendptr[indg+z] += recvptr[indb+z];
3832 /* We only support a single pulse here.
3833 * This is not a severe limitation, as this code is only used
3834 * with OpenMP and with OpenMP the (PME) domains can be larger.
3836 if (pme->nnodes_major > 1)
3838 /* Major dimension */
3839 overlap = &pme->overlap[0];
3841 datasize = local_fft_ndata[YY]*local_fft_ndata[ZZ];
3842 gridsize = local_fft_size[YY] *local_fft_size[ZZ];
3846 send_id = overlap->send_id[ipulse];
3847 recv_id = overlap->recv_id[ipulse];
3848 send_nindex = overlap->comm_data[ipulse].send_nindex;
3849 /* We don't use recv_index0, as we always receive starting at 0 */
3850 recv_nindex = overlap->comm_data[ipulse].recv_nindex;
3852 sendptr = overlap->sendbuf;
3853 recvptr = overlap->recvbuf;
3857 fprintf(debug, "PME fftgrid comm %2d x %2d x %2d\n",
3858 send_nindex, local_fft_ndata[YY], local_fft_ndata[ZZ]);
3862 MPI_Sendrecv(sendptr, send_nindex*datasize, GMX_MPI_REAL,
3864 recvptr, recv_nindex*datasize, GMX_MPI_REAL,
3866 overlap->mpi_comm, &stat);
3869 for (x = 0; x < recv_nindex; x++)
3871 for (y = 0; y < local_fft_ndata[YY]; y++)
3873 indg = (x*local_fft_size[YY] + y)*local_fft_size[ZZ];
3874 indb = (x*local_fft_ndata[YY] + y)*local_fft_ndata[ZZ];
3875 for (z = 0; z < local_fft_ndata[ZZ]; z++)
3877 fftgrid[indg+z] += recvptr[indb+z];
3885 static void spread_on_grid(gmx_pme_t pme,
3886 pme_atomcomm_t *atc, pmegrids_t *grids,
3887 gmx_bool bCalcSplines, gmx_bool bSpread,
3890 int nthread, thread;
3891 #ifdef PME_TIME_THREADS
3892 gmx_cycles_t c1, c2, c3, ct1a, ct1b, ct1c;
3893 static double cs1 = 0, cs2 = 0, cs3 = 0;
3894 static double cs1a[6] = {0, 0, 0, 0, 0, 0};
3898 nthread = pme->nthread;
3899 assert(nthread > 0);
3901 #ifdef PME_TIME_THREADS
3902 c1 = omp_cyc_start();
3906 #pragma omp parallel for num_threads(nthread) schedule(static)
3907 for (thread = 0; thread < nthread; thread++)
3911 start = atc->n* thread /nthread;
3912 end = atc->n*(thread+1)/nthread;
3914 /* Compute fftgrid index for all atoms,
3915 * with help of some extra variables.
3917 calc_interpolation_idx(pme, atc, start, end, thread);
3920 #ifdef PME_TIME_THREADS
3921 c1 = omp_cyc_end(c1);
3925 #ifdef PME_TIME_THREADS
3926 c2 = omp_cyc_start();
3928 #pragma omp parallel for num_threads(nthread) schedule(static)
3929 for (thread = 0; thread < nthread; thread++)
3931 splinedata_t *spline;
3934 /* make local bsplines */
3935 if (grids == NULL || grids->nthread == 1)
3937 spline = &atc->spline[0];
3941 grid = &grids->grid;
3945 spline = &atc->spline[thread];
3947 make_thread_local_ind(atc, thread, spline);
3949 grid = &grids->grid_th[thread];
3954 make_bsplines(spline->theta, spline->dtheta, pme->pme_order,
3955 atc->fractx, spline->n, spline->ind, atc->q, pme->bFEP);
3960 /* put local atoms on grid. */
3961 #ifdef PME_TIME_SPREAD
3962 ct1a = omp_cyc_start();
3964 spread_q_bsplines_thread(grid, atc, spline, pme->spline_work);
3966 if (grids->nthread > 1)
3968 copy_local_grid(pme, grids, thread, fftgrid);
3970 #ifdef PME_TIME_SPREAD
3971 ct1a = omp_cyc_end(ct1a);
3972 cs1a[thread] += (double)ct1a;
3976 #ifdef PME_TIME_THREADS
3977 c2 = omp_cyc_end(c2);
3981 if (bSpread && grids->nthread > 1)
3983 #ifdef PME_TIME_THREADS
3984 c3 = omp_cyc_start();
3986 #pragma omp parallel for num_threads(grids->nthread) schedule(static)
3987 for (thread = 0; thread < grids->nthread; thread++)
3989 reduce_threadgrid_overlap(pme, grids, thread,
3991 pme->overlap[0].sendbuf,
3992 pme->overlap[1].sendbuf);
3994 #ifdef PME_TIME_THREADS
3995 c3 = omp_cyc_end(c3);
3999 if (pme->nnodes > 1)
4001 /* Communicate the overlapping part of the fftgrid */
4002 sum_fftgrid_dd(pme, fftgrid);
4006 #ifdef PME_TIME_THREADS
4010 printf("idx %.2f spread %.2f red %.2f",
4011 cs1*1e-9, cs2*1e-9, cs3*1e-9);
4012 #ifdef PME_TIME_SPREAD
4013 for (thread = 0; thread < nthread; thread++)
4015 printf(" %.2f", cs1a[thread]*1e-9);
4024 static void dump_grid(FILE *fp,
4025 int sx, int sy, int sz, int nx, int ny, int nz,
4026 int my, int mz, const real *g)
4030 for (x = 0; x < nx; x++)
4032 for (y = 0; y < ny; y++)
4034 for (z = 0; z < nz; z++)
4036 fprintf(fp, "%2d %2d %2d %6.3f\n",
4037 sx+x, sy+y, sz+z, g[(x*my + y)*mz + z]);
4043 static void dump_local_fftgrid(gmx_pme_t pme, const real *fftgrid)
4045 ivec local_fft_ndata, local_fft_offset, local_fft_size;
4047 gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
4053 pme->pmegrid_start_ix,
4054 pme->pmegrid_start_iy,
4055 pme->pmegrid_start_iz,
4056 pme->pmegrid_nx-pme->pme_order+1,
4057 pme->pmegrid_ny-pme->pme_order+1,
4058 pme->pmegrid_nz-pme->pme_order+1,
4065 void gmx_pme_calc_energy(gmx_pme_t pme, int n, rvec *x, real *q, real *V)
4067 pme_atomcomm_t *atc;
4070 if (pme->nnodes > 1)
4072 gmx_incons("gmx_pme_calc_energy called in parallel");
4076 gmx_incons("gmx_pme_calc_energy with free energy");
4079 atc = &pme->atc_energy;
4081 if (atc->spline == NULL)
4083 snew(atc->spline, atc->nthread);
4086 atc->bSpread = TRUE;
4087 atc->pme_order = pme->pme_order;
4089 pme_realloc_atomcomm_things(atc);
4093 /* We only use the A-charges grid */
4094 grid = &pme->pmegridA;
4096 spread_on_grid(pme, atc, NULL, TRUE, FALSE, pme->fftgridA);
4098 *V = gather_energy_bsplines(pme, grid->grid.grid, atc);
4102 static void reset_pmeonly_counters(t_commrec *cr, gmx_wallcycle_t wcycle,
4103 t_nrnb *nrnb, t_inputrec *ir,
4104 gmx_large_int_t step)
4106 /* Reset all the counters related to performance over the run */
4107 wallcycle_stop(wcycle, ewcRUN);
4108 wallcycle_reset_all(wcycle);
4110 if (ir->nsteps >= 0)
4112 /* ir->nsteps is not used here, but we update it for consistency */
4113 ir->nsteps -= step - ir->init_step;
4115 ir->init_step = step;
4116 wallcycle_start(wcycle, ewcRUN);
4120 static void gmx_pmeonly_switch(int *npmedata, gmx_pme_t **pmedata,
4122 t_commrec *cr, t_inputrec *ir,
4126 gmx_pme_t pme = NULL;
4129 while (ind < *npmedata)
4131 pme = (*pmedata)[ind];
4132 if (pme->nkx == grid_size[XX] &&
4133 pme->nky == grid_size[YY] &&
4134 pme->nkz == grid_size[ZZ])
4145 srenew(*pmedata, *npmedata);
4147 /* Generate a new PME data structure, copying part of the old pointers */
4148 gmx_pme_reinit(&((*pmedata)[ind]), cr, pme, ir, grid_size);
4150 *pme_ret = (*pmedata)[ind];
4154 int gmx_pmeonly(gmx_pme_t pme,
4155 t_commrec *cr, t_nrnb *nrnb,
4156 gmx_wallcycle_t wcycle,
4157 real ewaldcoeff, gmx_bool bGatherOnly,
4162 gmx_pme_pp_t pme_pp;
4166 rvec *x_pp = NULL, *f_pp = NULL;
4167 real *chargeA = NULL, *chargeB = NULL;
4169 int maxshift_x = 0, maxshift_y = 0;
4170 real energy, dvdlambda;
4175 gmx_large_int_t step, step_rel;
4178 /* This data will only use with PME tuning, i.e. switching PME grids */
4180 snew(pmedata, npmedata);
4183 pme_pp = gmx_pme_pp_init(cr);
4188 do /****** this is a quasi-loop over time steps! */
4190 /* The reason for having a loop here is PME grid tuning/switching */
4193 /* Domain decomposition */
4194 ret = gmx_pme_recv_q_x(pme_pp,
4196 &chargeA, &chargeB, box, &x_pp, &f_pp,
4197 &maxshift_x, &maxshift_y,
4198 &pme->bFEP, &lambda,
4201 grid_switch, &ewaldcoeff);
4203 if (ret == pmerecvqxSWITCHGRID)
4205 /* Switch the PME grid to grid_switch */
4206 gmx_pmeonly_switch(&npmedata, &pmedata, grid_switch, cr, ir, &pme);
4209 if (ret == pmerecvqxRESETCOUNTERS)
4211 /* Reset the cycle and flop counters */
4212 reset_pmeonly_counters(cr, wcycle, nrnb, ir, step);
4215 while (ret == pmerecvqxSWITCHGRID || ret == pmerecvqxRESETCOUNTERS);
4217 if (ret == pmerecvqxFINISH)
4219 /* We should stop: break out of the loop */
4223 step_rel = step - ir->init_step;
4227 wallcycle_start(wcycle, ewcRUN);
4230 wallcycle_start(wcycle, ewcPMEMESH);
4234 gmx_pme_do(pme, 0, natoms, x_pp, f_pp, chargeA, chargeB, box,
4235 cr, maxshift_x, maxshift_y, nrnb, wcycle, vir, ewaldcoeff,
4236 &energy, lambda, &dvdlambda,
4237 GMX_PME_DO_ALL_F | (bEnerVir ? GMX_PME_CALC_ENER_VIR : 0));
4239 cycles = wallcycle_stop(wcycle, ewcPMEMESH);
4241 gmx_pme_send_force_vir_ener(pme_pp,
4242 f_pp, vir, energy, dvdlambda,
4246 } /***** end of quasi-loop, we stop with the break above */
4252 int gmx_pme_do(gmx_pme_t pme,
4253 int start, int homenr,
4255 real *chargeA, real *chargeB,
4256 matrix box, t_commrec *cr,
4257 int maxshift_x, int maxshift_y,
4258 t_nrnb *nrnb, gmx_wallcycle_t wcycle,
4259 matrix vir, real ewaldcoeff,
4260 real *energy, real lambda,
4261 real *dvdlambda, int flags)
4263 int q, d, i, j, ntot, npme;
4266 pme_atomcomm_t *atc = NULL;
4267 pmegrids_t *pmegrid = NULL;
4271 real *charge = NULL, *q_d;
4275 gmx_parallel_3dfft_t pfft_setup;
4277 t_complex * cfftgrid;
4279 const gmx_bool bCalcEnerVir = flags & GMX_PME_CALC_ENER_VIR;
4280 const gmx_bool bCalcF = flags & GMX_PME_CALC_F;
4282 assert(pme->nnodes > 0);
4283 assert(pme->nnodes == 1 || pme->ndecompdim > 0);
4285 if (pme->nnodes > 1)
4289 if (atc->npd > atc->pd_nalloc)
4291 atc->pd_nalloc = over_alloc_dd(atc->npd);
4292 srenew(atc->pd, atc->pd_nalloc);
4294 atc->maxshift = (atc->dimind == 0 ? maxshift_x : maxshift_y);
4298 /* This could be necessary for TPI */
4299 pme->atc[0].n = homenr;
4302 for (q = 0; q < (pme->bFEP ? 2 : 1); q++)
4306 pmegrid = &pme->pmegridA;
4307 fftgrid = pme->fftgridA;
4308 cfftgrid = pme->cfftgridA;
4309 pfft_setup = pme->pfft_setupA;
4310 charge = chargeA+start;
4314 pmegrid = &pme->pmegridB;
4315 fftgrid = pme->fftgridB;
4316 cfftgrid = pme->cfftgridB;
4317 pfft_setup = pme->pfft_setupB;
4318 charge = chargeB+start;
4320 grid = pmegrid->grid.grid;
4321 /* Unpack structure */
4324 fprintf(debug, "PME: nnodes = %d, nodeid = %d\n",
4325 cr->nnodes, cr->nodeid);
4326 fprintf(debug, "Grid = %p\n", (void*)grid);
4329 gmx_fatal(FARGS, "No grid!");
4334 m_inv_ur0(box, pme->recipbox);
4336 if (pme->nnodes == 1)
4339 if (DOMAINDECOMP(cr))
4342 pme_realloc_atomcomm_things(atc);
4350 wallcycle_start(wcycle, ewcPME_REDISTXF);
4351 for (d = pme->ndecompdim-1; d >= 0; d--)
4353 if (d == pme->ndecompdim-1)
4361 n_d = pme->atc[d+1].n;
4367 if (atc->npd > atc->pd_nalloc)
4369 atc->pd_nalloc = over_alloc_dd(atc->npd);
4370 srenew(atc->pd, atc->pd_nalloc);
4372 atc->maxshift = (atc->dimind == 0 ? maxshift_x : maxshift_y);
4373 pme_calc_pidx_wrapper(n_d, pme->recipbox, x_d, atc);
4376 /* Redistribute x (only once) and qA or qB */
4377 if (DOMAINDECOMP(cr))
4379 dd_pmeredist_x_q(pme, n_d, q == 0, x_d, q_d, atc);
4383 pmeredist_pd(pme, TRUE, n_d, q == 0, x_d, q_d, atc);
4388 wallcycle_stop(wcycle, ewcPME_REDISTXF);
4393 fprintf(debug, "Node= %6d, pme local particles=%6d\n",
4394 cr->nodeid, atc->n);
4397 if (flags & GMX_PME_SPREAD_Q)
4399 wallcycle_start(wcycle, ewcPME_SPREADGATHER);
4401 /* Spread the charges on a grid */
4402 spread_on_grid(pme, &pme->atc[0], pmegrid, q == 0, TRUE, fftgrid);
4406 inc_nrnb(nrnb, eNR_WEIGHTS, DIM*atc->n);
4408 inc_nrnb(nrnb, eNR_SPREADQBSP,
4409 pme->pme_order*pme->pme_order*pme->pme_order*atc->n);
4411 if (pme->nthread == 1)
4413 wrap_periodic_pmegrid(pme, grid);
4415 /* sum contributions to local grid from other nodes */
4417 if (pme->nnodes > 1)
4419 gmx_sum_qgrid_dd(pme, grid, GMX_SUM_QGRID_FORWARD);
4424 copy_pmegrid_to_fftgrid(pme, grid, fftgrid);
4427 wallcycle_stop(wcycle, ewcPME_SPREADGATHER);
4430 dump_local_fftgrid(pme,fftgrid);
4435 /* Here we start a large thread parallel region */
4436 #pragma omp parallel num_threads(pme->nthread) private(thread)
4438 thread = gmx_omp_get_thread_num();
4439 if (flags & GMX_PME_SOLVE)
4446 wallcycle_start(wcycle, ewcPME_FFT);
4448 gmx_parallel_3dfft_execute(pfft_setup, GMX_FFT_REAL_TO_COMPLEX,
4449 fftgrid, cfftgrid, thread, wcycle);
4452 wallcycle_stop(wcycle, ewcPME_FFT);
4456 /* solve in k-space for our local cells */
4459 wallcycle_start(wcycle, ewcPME_SOLVE);
4462 solve_pme_yzx(pme, cfftgrid, ewaldcoeff,
4463 box[XX][XX]*box[YY][YY]*box[ZZ][ZZ],
4465 pme->nthread, thread);
4468 wallcycle_stop(wcycle, ewcPME_SOLVE);
4470 inc_nrnb(nrnb, eNR_SOLVEPME, loop_count);
4480 wallcycle_start(wcycle, ewcPME_FFT);
4482 gmx_parallel_3dfft_execute(pfft_setup, GMX_FFT_COMPLEX_TO_REAL,
4483 cfftgrid, fftgrid, thread, wcycle);
4486 wallcycle_stop(wcycle, ewcPME_FFT);
4490 if (pme->nodeid == 0)
4492 ntot = pme->nkx*pme->nky*pme->nkz;
4493 npme = ntot*log((real)ntot)/log(2.0);
4494 inc_nrnb(nrnb, eNR_FFT, 2*npme);
4497 wallcycle_start(wcycle, ewcPME_SPREADGATHER);
4500 copy_fftgrid_to_pmegrid(pme, fftgrid, grid, pme->nthread, thread);
4503 /* End of thread parallel section.
4504 * With MPI we have to synchronize here before gmx_sum_qgrid_dd.
4509 /* distribute local grid to all nodes */
4511 if (pme->nnodes > 1)
4513 gmx_sum_qgrid_dd(pme, grid, GMX_SUM_QGRID_BACKWARD);
4518 unwrap_periodic_pmegrid(pme, grid);
4520 /* interpolate forces for our local atoms */
4524 /* If we are running without parallelization,
4525 * atc->f is the actual force array, not a buffer,
4526 * therefore we should not clear it.
4528 bClearF = (q == 0 && PAR(cr));
4529 #pragma omp parallel for num_threads(pme->nthread) schedule(static)
4530 for (thread = 0; thread < pme->nthread; thread++)
4532 gather_f_bsplines(pme, grid, bClearF, atc,
4533 &atc->spline[thread],
4534 pme->bFEP ? (q == 0 ? 1.0-lambda : lambda) : 1.0);
4539 inc_nrnb(nrnb, eNR_GATHERFBSP,
4540 pme->pme_order*pme->pme_order*pme->pme_order*pme->atc[0].n);
4541 wallcycle_stop(wcycle, ewcPME_SPREADGATHER);
4546 /* This should only be called on the master thread
4547 * and after the threads have synchronized.
4549 get_pme_ener_vir(pme, pme->nthread, &energy_AB[q], vir_AB[q]);
4553 if (bCalcF && pme->nnodes > 1)
4555 wallcycle_start(wcycle, ewcPME_REDISTXF);
4556 for (d = 0; d < pme->ndecompdim; d++)
4559 if (d == pme->ndecompdim - 1)
4566 n_d = pme->atc[d+1].n;
4567 f_d = pme->atc[d+1].f;
4569 if (DOMAINDECOMP(cr))
4571 dd_pmeredist_f(pme, atc, n_d, f_d,
4572 d == pme->ndecompdim-1 && pme->bPPnode);
4576 pmeredist_pd(pme, FALSE, n_d, TRUE, f_d, NULL, atc);
4580 wallcycle_stop(wcycle, ewcPME_REDISTXF);
4588 *energy = energy_AB[0];
4589 m_add(vir, vir_AB[0], vir);
4593 *energy = (1.0-lambda)*energy_AB[0] + lambda*energy_AB[1];
4594 *dvdlambda += energy_AB[1] - energy_AB[0];
4595 for (i = 0; i < DIM; i++)
4597 for (j = 0; j < DIM; j++)
4599 vir[i][j] += (1.0-lambda)*vir_AB[0][i][j] +
4600 lambda*vir_AB[1][i][j];
4612 fprintf(debug, "PME mesh energy: %g\n", *energy);