2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5 * Copyright (c) 2001-2004, The GROMACS development team,
6 * check out http://www.gromacs.org for more information.
7 * Copyright (c) 2012,2013, by the GROMACS development team, led by
8 * David van der Spoel, Berk Hess, Erik Lindahl, and including many
9 * others, as listed in the AUTHORS file in the top-level source
10 * directory and at http://www.gromacs.org.
12 * GROMACS is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU Lesser General Public License
14 * as published by the Free Software Foundation; either version 2.1
15 * of the License, or (at your option) any later version.
17 * GROMACS is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 * Lesser General Public License for more details.
22 * You should have received a copy of the GNU Lesser General Public
23 * License along with GROMACS; if not, see
24 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
25 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
27 * If you want to redistribute modifications to GROMACS, please
28 * consider that scientific software is very special. Version
29 * control is crucial - bugs must be traceable. We will be happy to
30 * consider code for inclusion in the official distribution, but
31 * derived work must not be called official GROMACS. Details are found
32 * in the README & COPYING files - if they are missing, get the
33 * official version at http://www.gromacs.org.
35 * To help us fund GROMACS development, we humbly ask that you cite
36 * the research papers on the package. Check out http://www.gromacs.org.
38 /* IMPORTANT FOR DEVELOPERS:
40 * Triclinic pme stuff isn't entirely trivial, and we've experienced
41 * some bugs during development (many of them due to me). To avoid
42 * this in the future, please check the following things if you make
43 * changes in this file:
45 * 1. You should obtain identical (at least to the PME precision)
46 * energies, forces, and virial for
47 * a rectangular box and a triclinic one where the z (or y) axis is
48 * tilted a whole box side. For instance you could use these boxes:
50 * rectangular triclinic
55 * 2. You should check the energy conservation in a triclinic box.
57 * It might seem like overkill, but better safe than sorry.
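 * As an illustration only (the exact example boxes of this comment were
 * elided here, so these values are not the originals), such a pair could be
 *
 *    rectangular:  3.0  0.0  0.0      triclinic:  3.0  0.0  0.0
 *                  0.0  3.0  0.0                  0.0  3.0  0.0
 *                  0.0  0.0  3.0                  3.0  0.0  3.0
 *
 * i.e. the same box with the z axis tilted by a whole box length in x;
 * energies, forces and the virial should then agree to PME precision.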
79 #include "gmxcomplex.h"
83 #include "gmx_fatal.h"
89 #include "gmx_wallcycle.h"
90 #include "gmx_parallel_3dfft.h"
92 #include "gmx_cyclecounter.h"
95 /* Single precision, with SSE2 or higher available */
96 #if defined(GMX_X86_SSE2) && !defined(GMX_DOUBLE)
98 #include "gmx_x86_simd_single.h"
101 /* Some old AMD processors could have problems with unaligned loads+stores */
103 #define PME_SSE_UNALIGNED
107 #include "mpelogging.h"
110 /* #define PRT_FORCE */
111 /* conditions for on-the-fly time measurement */
112 /* #define TAKETIME (step > 1 && timesteps < 10) */
113 #define TAKETIME FALSE
115 /* #define PME_TIME_THREADS */
118 #define mpi_type MPI_DOUBLE
120 #define mpi_type MPI_FLOAT
123 /* GMX_CACHE_SEP should be a multiple of 16 to preserve alignment */
124 #define GMX_CACHE_SEP 64
126 /* We only define a maximum to be able to use local arrays without allocation.
127 * An order larger than 12 should never be needed, even for test cases.
128 * If needed it can be changed here.
130 #define PME_ORDER_MAX 12
132 /* Internal datastructures */
138 int recv_size; /* Receive buffer width, used with OpenMP */
149 int *send_id,*recv_id;
150 int send_size; /* Send buffer width, used with OpenMP */
151 pme_grid_comm_t *comm_data;
157 int *n; /* Cumulative counts of the number of particles per thread */
158 int nalloc; /* Allocation size of i */
159 int *i; /* Particle indices ordered on thread index (n) */
173 int dimind; /* The index of the dimension, 0=x, 1=y */
180 int *node_dest; /* The nodes to send x and q to with DD */
181 int *node_src; /* The nodes to receive x and q from with DD */
182 int *buf_index; /* Index for commnode into the buffers */
189 int *count; /* The number of atoms to send to each node */
191 int *rcount; /* The number of atoms to receive */
198 gmx_bool bSpread; /* These coordinates are used for spreading */
201 rvec *fractx; /* Fractional coordinate relative to the
202 * lower cell boundary
205 int *thread_idx; /* Which thread should spread which charge */
206 thread_plist_t *thread_plist;
207 splinedata_t *spline;
214 ivec ci; /* The spatial location of this grid */
215 ivec n; /* The used size of *grid, including order-1 */
216 ivec offset; /* The grid offset from the full node grid */
217 int order; /* PME spreading order */
218 ivec s; /* The allocated size of *grid, s >= n */
219 real *grid; /* The grid for this thread, size n */
223 pmegrid_t grid; /* The full node grid (non thread-local) */
224 int nthread; /* The number of threads operating on this grid */
225 ivec nc; /* The local spatial decomposition over the threads */
226 pmegrid_t *grid_th; /* Array of grids for each thread */
227 real *grid_all; /* Allocated array for the grids in *grid_th */
228 int **g2t; /* The grid to thread index */
229 ivec nthread_comm; /* The number of threads to communicate with */
235 /* Masks for SSE aligned spreading and gathering */
236 __m128 mask_SSE0[6],mask_SSE1[6];
238 int dummy; /* C89 requires that struct has at least one member */
243 /* work data for solve_pme */
259 typedef struct gmx_pme {
260 int ndecompdim; /* The number of decomposition dimensions */
261 int nodeid; /* Our nodeid in mpi->mpi_comm */
264 int nnodes; /* The number of nodes doing PME */
269 MPI_Comm mpi_comm_d[2]; /* Indexed on dimension, 0=x, 1=y */
271 MPI_Datatype rvec_mpi; /* the pme vector's MPI type */
274 int nthread; /* The number of threads doing PME */
276 gmx_bool bPPnode; /* Node also does particle-particle forces */
277 gmx_bool bFEP; /* Compute Free energy contribution */
278 int nkx,nky,nkz; /* Grid dimensions */
279 gmx_bool bP3M; /* Do P3M: optimize the influence function */
283 pmegrids_t pmegridA; /* Grids on which we do spreading/interpolation, includes overlap */
285 /* The PME charge spreading grid sizes/strides, includes pme_order-1 */
286 int pmegrid_nx,pmegrid_ny,pmegrid_nz;
287 /* pmegrid_nz might be larger than strictly necessary to ensure
288 * memory alignment, pmegrid_nz_base gives the real base size.
291 /* The local PME grid starting indices */
292 int pmegrid_start_ix,pmegrid_start_iy,pmegrid_start_iz;
294 /* Work data for spreading and gathering */
295 pme_spline_work_t *spline_work;
297 real *fftgridA; /* Grids for FFT. With 1D FFT decomposition this can be a pointer */
298 real *fftgridB; /* inside the interpolation grid, but separate for 2D PME decomp. */
299 int fftgrid_nx,fftgrid_ny,fftgrid_nz;
301 t_complex *cfftgridA; /* Grids for complex FFT data */
302 t_complex *cfftgridB;
303 int cfftgrid_nx,cfftgrid_ny,cfftgrid_nz;
305 gmx_parallel_3dfft_t pfft_setupA;
306 gmx_parallel_3dfft_t pfft_setupB;
309 real *fshx,*fshy,*fshz;
311 pme_atomcomm_t atc[2]; /* Indexed on decomposition index */
315 pme_overlap_t overlap[2]; /* Indexed on dimension, 0=x, 1=y */
317 pme_atomcomm_t atc_energy; /* Only for gmx_pme_calc_energy */
319 rvec *bufv; /* Communication buffer */
320 real *bufr; /* Communication buffer */
321 int buf_nalloc; /* The communication buffer size */
323 /* thread local work data for solve_pme */
326 /* Work data for PME_redist */
327 gmx_bool redist_init;
335 int redist_buf_nalloc;
337 /* Work data for sum_qgrid */
338 real * sum_qgrid_tmp;
339 real * sum_qgrid_dd_tmp;
343 static void calc_interpolation_idx(gmx_pme_t pme,pme_atomcomm_t *atc,
344 int start,int end,int thread)
347 int *idxptr,tix,tiy,tiz;
348 real *xptr,*fptr,tx,ty,tz;
349 real rxx,ryx,ryy,rzx,rzy,rzz;
351 int start_ix,start_iy,start_iz;
352 int *g2tx,*g2ty,*g2tz;
354 int *thread_idx=NULL;
355 thread_plist_t *tpl=NULL;
363 start_ix = pme->pmegrid_start_ix;
364 start_iy = pme->pmegrid_start_iy;
365 start_iz = pme->pmegrid_start_iz;
367 rxx = pme->recipbox[XX][XX];
368 ryx = pme->recipbox[YY][XX];
369 ryy = pme->recipbox[YY][YY];
370 rzx = pme->recipbox[ZZ][XX];
371 rzy = pme->recipbox[ZZ][YY];
372 rzz = pme->recipbox[ZZ][ZZ];
374 g2tx = pme->pmegridA.g2t[XX];
375 g2ty = pme->pmegridA.g2t[YY];
376 g2tz = pme->pmegridA.g2t[ZZ];
378 bThreads = (atc->nthread > 1);
381 thread_idx = atc->thread_idx;
383 tpl = &atc->thread_plist[thread];
385 for(i=0; i<atc->nthread; i++)
391 for(i=start; i<end; i++) {
393 idxptr = atc->idx[i];
394 fptr = atc->fractx[i];
396 /* Fractional coordinates along box vectors, add 2.0 to make 100% sure we are positive for triclinic boxes */
397 tx = nx * ( xptr[XX] * rxx + xptr[YY] * ryx + xptr[ZZ] * rzx + 2.0 );
398 ty = ny * ( xptr[YY] * ryy + xptr[ZZ] * rzy + 2.0 );
399 tz = nz * ( xptr[ZZ] * rzz + 2.0 );
405 /* Because decomposition only occurs in x and y,
406 * we never have a fraction correction in z.
408 fptr[XX] = tx - tix + pme->fshx[tix];
409 fptr[YY] = ty - tiy + pme->fshy[tiy];
412 idxptr[XX] = pme->nnx[tix];
413 idxptr[YY] = pme->nny[tiy];
414 idxptr[ZZ] = pme->nnz[tiz];
417 range_check(idxptr[XX],0,pme->pmegrid_nx);
418 range_check(idxptr[YY],0,pme->pmegrid_ny);
419 range_check(idxptr[ZZ],0,pme->pmegrid_nz);
424 thread_i = g2tx[idxptr[XX]] + g2ty[idxptr[YY]] + g2tz[idxptr[ZZ]];
425 thread_idx[i] = thread_i;
432 /* Make a list of particle indices sorted on thread */
434 /* Get the cumulative count */
435 for(i=1; i<atc->nthread; i++)
437 tpl_n[i] += tpl_n[i-1];
439 /* The current implementation distributes particles equally
440 * over the threads, so we could actually allocate for that
441 * in pme_realloc_atomcomm_things.
443 if (tpl_n[atc->nthread-1] > tpl->nalloc)
445 tpl->nalloc = over_alloc_large(tpl_n[atc->nthread-1]);
446 srenew(tpl->i,tpl->nalloc);
448 /* Set tpl_n to the cumulative start */
449 for(i=atc->nthread-1; i>=1; i--)
451 tpl_n[i] = tpl_n[i-1];
455 /* Fill our thread local array with indices sorted on thread */
456 for(i=start; i<end; i++)
458 tpl->i[tpl_n[atc->thread_idx[i]]++] = i;
460 /* Now tpl_n contains the cumulative count again */
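/* Added note, not part of the original code: a rough worked example of the
 * index calculation in calc_interpolation_idx() above. For a rectangular
 * box (recipbox diagonal), nx = 32 and a fractional x coordinate of 0.30:
 *   tx         = 32*(0.30 + 2.0) = 73.6
 *   tix        = (int)tx = 73
 *   fptr[XX]   = 73.6 - 73 + fshx[73]   (fshx is 0 inside the local grid)
 *   idxptr[XX] = nnx[73] = 73 - 2*32 = 9 for a single node starting at 0
 * The +2.0 shift only keeps the argument of the (int) cast positive for
 * triclinic boxes and is removed again by the nnx/nny/nnz lookup tables.
 */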
464 static void make_thread_local_ind(pme_atomcomm_t *atc,
465 int thread,splinedata_t *spline)
470 /* Combine the indices made by each thread into one index */
474 for(t=0; t<atc->nthread; t++)
476 tpl = &atc->thread_plist[t];
477 /* Copy our part (start - end) from the list of thread t */
480 start = tpl->n[thread-1];
482 end = tpl->n[thread];
483 for(i=start; i<end; i++)
485 spline->ind[n++] = tpl->i[i];
493 static void pme_calc_pidx(int start, int end,
494 matrix recipbox, rvec x[],
495 pme_atomcomm_t *atc, int *count)
500 real rxx,ryx,rzx,ryy,rzy;
503 /* Calculate PME task index (pidx) for each grid index.
504 * Here we always assign equally sized slabs to each node
505 * for load balancing reasons (the PME grid spacing is not used).
511 /* Reset the count */
512 for(i=0; i<nslab; i++)
517 if (atc->dimind == 0)
519 rxx = recipbox[XX][XX];
520 ryx = recipbox[YY][XX];
521 rzx = recipbox[ZZ][XX];
522 /* Calculate the node index in x-dimension */
523 for(i=start; i<end; i++)
526 /* Fractional coordinates along box vectors */
527 s = nslab*(xptr[XX]*rxx + xptr[YY]*ryx + xptr[ZZ]*rzx);
528 si = (int)(s + 2*nslab) % nslab;
535 ryy = recipbox[YY][YY];
536 rzy = recipbox[ZZ][YY];
537 /* Calculate the node index in y-dimension */
538 for(i=start; i<end; i++)
541 /* Fractional coordinates along box vectors */
542 s = nslab*(xptr[YY]*ryy + xptr[ZZ]*rzy);
543 si = (int)(s + 2*nslab) % nslab;
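/* Added illustration, not in the original source: with nslab = 4 slabs in y
 * and a fractional y coordinate of 0.60, s = 4*0.60 = 2.4 and
 * si = (int)(2.4 + 2*4) % 4 = 10 % 4 = 2, so the atom is counted for slab 2.
 * The +2*nslab shift keeps the argument of the (int) cast positive even when
 * the fractional coordinate is slightly negative in a triclinic box.
 */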
550 static void pme_calc_pidx_wrapper(int natoms, matrix recipbox, rvec x[],
553 int nthread,thread,slab;
555 nthread = atc->nthread;
557 #pragma omp parallel for num_threads(nthread) schedule(static)
558 for(thread=0; thread<nthread; thread++)
560 pme_calc_pidx(natoms* thread /nthread,
561 natoms*(thread+1)/nthread,
562 recipbox,x,atc,atc->count_thread[thread]);
564 /* Non-parallel reduction, since nslab is small */
566 for(thread=1; thread<nthread; thread++)
568 for(slab=0; slab<atc->nslab; slab++)
570 atc->count_thread[0][slab] += atc->count_thread[thread][slab];
575 static void realloc_splinevec(splinevec th,real **ptr_z,int nalloc)
580 srenew(th[XX],nalloc);
581 srenew(th[YY],nalloc);
582 /* In z we add padding; this is only required for the aligned SSE code */
583 srenew(*ptr_z,nalloc+2*padding);
584 th[ZZ] = *ptr_z + padding;
586 for(i=0; i<padding; i++)
589 (*ptr_z)[padding+nalloc+i] = 0;
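/* Added note, not part of the original code: the resulting layout of the
 * reallocated z spline buffer is
 *
 *   *ptr_z:  [ padding | nalloc elements, exposed as th[ZZ] | padding ]
 *
 * with both padding regions zeroed, so aligned SSE loads and stores that run
 * a few elements past either end of th[ZZ] touch only zeros.
 */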
593 static void pme_realloc_splinedata(splinedata_t *spline, pme_atomcomm_t *atc)
597 srenew(spline->ind,atc->nalloc);
598 /* Initialize the index to identity so it works without threads */
599 for(i=0; i<atc->nalloc; i++)
604 realloc_splinevec(spline->theta,&spline->ptr_theta_z,
605 atc->pme_order*atc->nalloc);
606 realloc_splinevec(spline->dtheta,&spline->ptr_dtheta_z,
607 atc->pme_order*atc->nalloc);
610 static void pme_realloc_atomcomm_things(pme_atomcomm_t *atc)
612 int nalloc_old,i,j,nalloc_tpl;
614 /* We have to avoid a NULL pointer for atc->x to avoid
615 * possible fatal errors in MPI routines.
617 if (atc->n > atc->nalloc || atc->nalloc == 0)
619 nalloc_old = atc->nalloc;
620 atc->nalloc = over_alloc_dd(max(atc->n,1));
622 if (atc->nslab > 1) {
623 srenew(atc->x,atc->nalloc);
624 srenew(atc->q,atc->nalloc);
625 srenew(atc->f,atc->nalloc);
626 for(i=nalloc_old; i<atc->nalloc; i++)
628 clear_rvec(atc->f[i]);
632 srenew(atc->fractx,atc->nalloc);
633 srenew(atc->idx ,atc->nalloc);
635 if (atc->nthread > 1)
637 srenew(atc->thread_idx,atc->nalloc);
640 for(i=0; i<atc->nthread; i++)
642 pme_realloc_splinedata(&atc->spline[i],atc);
648 static void pmeredist_pd(gmx_pme_t pme, gmx_bool forw,
649 int n, gmx_bool bXF, rvec *x_f, real *charge,
651 /* Redistribute particle data for PME calculation */
652 /* domain decomposition by x coordinate */
657 if(FALSE == pme->redist_init) {
658 snew(pme->scounts,atc->nslab);
659 snew(pme->rcounts,atc->nslab);
660 snew(pme->sdispls,atc->nslab);
661 snew(pme->rdispls,atc->nslab);
662 snew(pme->sidx,atc->nslab);
663 pme->redist_init = TRUE;
665 if (n > pme->redist_buf_nalloc) {
666 pme->redist_buf_nalloc = over_alloc_dd(n);
667 srenew(pme->redist_buf,pme->redist_buf_nalloc*DIM);
674 /* forward, redistribution from pp to pme */
676 /* Calculate send counts and exchange them with other nodes */
677 for(i=0; (i<atc->nslab); i++) pme->scounts[i]=0;
678 for(i=0; (i<n); i++) pme->scounts[pme->idxa[i]]++;
679 MPI_Alltoall( pme->scounts, 1, MPI_INT, pme->rcounts, 1, MPI_INT, atc->mpi_comm);
681 /* Calculate send and receive displacements and index into send buffer */
686 for(i=1; i<atc->nslab; i++) {
687 pme->sdispls[i]=pme->sdispls[i-1]+pme->scounts[i-1];
688 pme->rdispls[i]=pme->rdispls[i-1]+pme->rcounts[i-1];
689 pme->sidx[i]=pme->sdispls[i];
691 /* Total # of particles to be received */
692 atc->n = pme->rdispls[atc->nslab-1] + pme->rcounts[atc->nslab-1];
694 pme_realloc_atomcomm_things(atc);
696 /* Copy particle coordinates into send buffer and exchange*/
697 for(i=0; (i<n); i++) {
698 ii=DIM*pme->sidx[pme->idxa[i]];
699 pme->sidx[pme->idxa[i]]++;
700 pme->redist_buf[ii+XX]=x_f[i][XX];
701 pme->redist_buf[ii+YY]=x_f[i][YY];
702 pme->redist_buf[ii+ZZ]=x_f[i][ZZ];
704 MPI_Alltoallv(pme->redist_buf, pme->scounts, pme->sdispls,
705 pme->rvec_mpi, atc->x, pme->rcounts, pme->rdispls,
706 pme->rvec_mpi, atc->mpi_comm);
709 /* Copy charge into send buffer and exchange*/
710 for(i=0; i<atc->nslab; i++) pme->sidx[i]=pme->sdispls[i];
711 for(i=0; (i<n); i++) {
712 ii=pme->sidx[pme->idxa[i]];
713 pme->sidx[pme->idxa[i]]++;
714 pme->redist_buf[ii]=charge[i];
716 MPI_Alltoallv(pme->redist_buf, pme->scounts, pme->sdispls, mpi_type,
717 atc->q, pme->rcounts, pme->rdispls, mpi_type,
720 else { /* backward, redistribution from pme to pp */
721 MPI_Alltoallv(atc->f, pme->rcounts, pme->rdispls, pme->rvec_mpi,
722 pme->redist_buf, pme->scounts, pme->sdispls,
723 pme->rvec_mpi, atc->mpi_comm);
725 /* Copy data from receive buffer */
726 for(i=0; i<atc->nslab; i++)
727 pme->sidx[i] = pme->sdispls[i];
728 for(i=0; (i<n); i++) {
729 ii = DIM*pme->sidx[pme->idxa[i]];
730 x_f[i][XX] += pme->redist_buf[ii+XX];
731 x_f[i][YY] += pme->redist_buf[ii+YY];
732 x_f[i][ZZ] += pme->redist_buf[ii+ZZ];
733 pme->sidx[pme->idxa[i]]++;
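/* Added illustration, not from the original source, of the counts and
 * displacements used in pmeredist_pd() above. For 4 slabs and send counts
 * {3,1,0,2}:
 *   scounts = {3,1,0,2}
 *   sdispls = {0,3,4,4}   (prefix sums of scounts)
 * sidx[] starts as a copy of sdispls[] and is incremented while packing, so
 * the data destined for slab k ends up contiguous in redist_buf starting at
 * element DIM*sdispls[k] before the MPI_Alltoallv call.
 */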
739 static void pme_dd_sendrecv(pme_atomcomm_t *atc,
740 gmx_bool bBackward,int shift,
741 void *buf_s,int nbyte_s,
742 void *buf_r,int nbyte_r)
748 if (bBackward == FALSE) {
749 dest = atc->node_dest[shift];
750 src = atc->node_src[shift];
752 dest = atc->node_src[shift];
753 src = atc->node_dest[shift];
756 if (nbyte_s > 0 && nbyte_r > 0) {
757 MPI_Sendrecv(buf_s,nbyte_s,MPI_BYTE,
759 buf_r,nbyte_r,MPI_BYTE,
761 atc->mpi_comm,&stat);
762 } else if (nbyte_s > 0) {
763 MPI_Send(buf_s,nbyte_s,MPI_BYTE,
766 } else if (nbyte_r > 0) {
767 MPI_Recv(buf_r,nbyte_r,MPI_BYTE,
769 atc->mpi_comm,&stat);
774 static void dd_pmeredist_x_q(gmx_pme_t pme,
775 int n, gmx_bool bX, rvec *x, real *charge,
778 int *commnode,*buf_index;
779 int nnodes_comm,i,nsend,local_pos,buf_pos,node,scount,rcount;
781 commnode = atc->node_dest;
782 buf_index = atc->buf_index;
784 nnodes_comm = min(2*atc->maxshift,atc->nslab-1);
787 for(i=0; i<nnodes_comm; i++) {
788 buf_index[commnode[i]] = nsend;
789 nsend += atc->count[commnode[i]];
792 if (atc->count[atc->nodeid] + nsend != n)
793 gmx_fatal(FARGS,"%d particles communicated to PME node %d are more than 2/3 times the cut-off out of the domain decomposition cell of their charge group in dimension %c.\n"
794 "This usually means that your system is not well equilibrated.",
795 n - (atc->count[atc->nodeid] + nsend),
796 pme->nodeid,'x'+atc->dimind);
798 if (nsend > pme->buf_nalloc) {
799 pme->buf_nalloc = over_alloc_dd(nsend);
800 srenew(pme->bufv,pme->buf_nalloc);
801 srenew(pme->bufr,pme->buf_nalloc);
804 atc->n = atc->count[atc->nodeid];
805 for(i=0; i<nnodes_comm; i++) {
806 scount = atc->count[commnode[i]];
807 /* Communicate the count */
809 fprintf(debug,"dimind %d PME node %d send to node %d: %d\n",
810 atc->dimind,atc->nodeid,commnode[i],scount);
811 pme_dd_sendrecv(atc,FALSE,i,
813 &atc->rcount[i],sizeof(int));
814 atc->n += atc->rcount[i];
817 pme_realloc_atomcomm_things(atc);
823 if (node == atc->nodeid) {
824 /* Copy direct to the receive buffer */
826 copy_rvec(x[i],atc->x[local_pos]);
828 atc->q[local_pos] = charge[i];
831 /* Copy to the send buffer */
833 copy_rvec(x[i],pme->bufv[buf_index[node]]);
835 pme->bufr[buf_index[node]] = charge[i];
841 for(i=0; i<nnodes_comm; i++) {
842 scount = atc->count[commnode[i]];
843 rcount = atc->rcount[i];
844 if (scount > 0 || rcount > 0) {
846 /* Communicate the coordinates */
847 pme_dd_sendrecv(atc,FALSE,i,
848 pme->bufv[buf_pos],scount*sizeof(rvec),
849 atc->x[local_pos],rcount*sizeof(rvec));
851 /* Communicate the charges */
852 pme_dd_sendrecv(atc,FALSE,i,
853 pme->bufr+buf_pos,scount*sizeof(real),
854 atc->q+local_pos,rcount*sizeof(real));
856 local_pos += atc->rcount[i];
861 static void dd_pmeredist_f(gmx_pme_t pme, pme_atomcomm_t *atc,
865 int *commnode,*buf_index;
866 int nnodes_comm,local_pos,buf_pos,i,scount,rcount,node;
868 commnode = atc->node_dest;
869 buf_index = atc->buf_index;
871 nnodes_comm = min(2*atc->maxshift,atc->nslab-1);
873 local_pos = atc->count[atc->nodeid];
875 for(i=0; i<nnodes_comm; i++) {
876 scount = atc->rcount[i];
877 rcount = atc->count[commnode[i]];
878 if (scount > 0 || rcount > 0) {
879 /* Communicate the forces */
880 pme_dd_sendrecv(atc,TRUE,i,
881 atc->f[local_pos],scount*sizeof(rvec),
882 pme->bufv[buf_pos],rcount*sizeof(rvec));
885 buf_index[commnode[i]] = buf_pos;
895 if (node == atc->nodeid)
897 /* Add from the local force array */
898 rvec_inc(f[i],atc->f[local_pos]);
903 /* Add from the receive buffer */
904 rvec_inc(f[i],pme->bufv[buf_index[node]]);
914 if (node == atc->nodeid)
916 /* Copy from the local force array */
917 copy_rvec(atc->f[local_pos],f[i]);
922 /* Copy from the receive buffer */
923 copy_rvec(pme->bufv[buf_index[node]],f[i]);
932 gmx_sum_qgrid_dd(gmx_pme_t pme, real *grid, int direction)
934 pme_overlap_t *overlap;
935 int send_index0,send_nindex;
936 int recv_index0,recv_nindex;
938 int i,j,k,ix,iy,iz,icnt;
939 int ipulse,send_id,recv_id,datasize;
941 real *sendptr,*recvptr;
943 /* Start with minor-rank communication. This is a bit of a pain since it is not contiguous */
944 overlap = &pme->overlap[1];
946 for(ipulse=0;ipulse<overlap->noverlap_nodes;ipulse++)
948 /* Since we have already (un)wrapped the overlap in the z-dimension,
949 * we only have to communicate 0 to nkz (not pmegrid_nz).
951 if (direction==GMX_SUM_QGRID_FORWARD)
953 send_id = overlap->send_id[ipulse];
954 recv_id = overlap->recv_id[ipulse];
955 send_index0 = overlap->comm_data[ipulse].send_index0;
956 send_nindex = overlap->comm_data[ipulse].send_nindex;
957 recv_index0 = overlap->comm_data[ipulse].recv_index0;
958 recv_nindex = overlap->comm_data[ipulse].recv_nindex;
962 send_id = overlap->recv_id[ipulse];
963 recv_id = overlap->send_id[ipulse];
964 send_index0 = overlap->comm_data[ipulse].recv_index0;
965 send_nindex = overlap->comm_data[ipulse].recv_nindex;
966 recv_index0 = overlap->comm_data[ipulse].send_index0;
967 recv_nindex = overlap->comm_data[ipulse].send_nindex;
970 /* Copy data to contiguous send buffer */
973 fprintf(debug,"PME send node %d %d -> %d grid start %d Communicating %d to %d\n",
974 pme->nodeid,overlap->nodeid,send_id,
975 pme->pmegrid_start_iy,
976 send_index0-pme->pmegrid_start_iy,
977 send_index0-pme->pmegrid_start_iy+send_nindex);
980 for(i=0;i<pme->pmegrid_nx;i++)
983 for(j=0;j<send_nindex;j++)
985 iy = j + send_index0 - pme->pmegrid_start_iy;
986 for(k=0;k<pme->nkz;k++)
989 overlap->sendbuf[icnt++] = grid[ix*(pme->pmegrid_ny*pme->pmegrid_nz)+iy*(pme->pmegrid_nz)+iz];
994 datasize = pme->pmegrid_nx * pme->nkz;
996 MPI_Sendrecv(overlap->sendbuf,send_nindex*datasize,GMX_MPI_REAL,
998 overlap->recvbuf,recv_nindex*datasize,GMX_MPI_REAL,
1000 overlap->mpi_comm,&stat);
1002 /* Get data from contiguous recv buffer */
1005 fprintf(debug,"PME recv node %d %d <- %d grid start %d Communicating %d to %d\n",
1006 pme->nodeid,overlap->nodeid,recv_id,
1007 pme->pmegrid_start_iy,
1008 recv_index0-pme->pmegrid_start_iy,
1009 recv_index0-pme->pmegrid_start_iy+recv_nindex);
1012 for(i=0;i<pme->pmegrid_nx;i++)
1015 for(j=0;j<recv_nindex;j++)
1017 iy = j + recv_index0 - pme->pmegrid_start_iy;
1018 for(k=0;k<pme->nkz;k++)
1021 if(direction==GMX_SUM_QGRID_FORWARD)
1023 grid[ix*(pme->pmegrid_ny*pme->pmegrid_nz)+iy*(pme->pmegrid_nz)+iz] += overlap->recvbuf[icnt++];
1027 grid[ix*(pme->pmegrid_ny*pme->pmegrid_nz)+iy*(pme->pmegrid_nz)+iz] = overlap->recvbuf[icnt++];
1034 /* Major dimension is easier, no copying required,
1035 * but we might have to sum to a separate array.
1036 * Since we don't copy, we have to communicate up to pmegrid_nz,
1037 * not nkz as for the minor direction.
1039 overlap = &pme->overlap[0];
1041 for(ipulse=0;ipulse<overlap->noverlap_nodes;ipulse++)
1043 if(direction==GMX_SUM_QGRID_FORWARD)
1045 send_id = overlap->send_id[ipulse];
1046 recv_id = overlap->recv_id[ipulse];
1047 send_index0 = overlap->comm_data[ipulse].send_index0;
1048 send_nindex = overlap->comm_data[ipulse].send_nindex;
1049 recv_index0 = overlap->comm_data[ipulse].recv_index0;
1050 recv_nindex = overlap->comm_data[ipulse].recv_nindex;
1051 recvptr = overlap->recvbuf;
1055 send_id = overlap->recv_id[ipulse];
1056 recv_id = overlap->send_id[ipulse];
1057 send_index0 = overlap->comm_data[ipulse].recv_index0;
1058 send_nindex = overlap->comm_data[ipulse].recv_nindex;
1059 recv_index0 = overlap->comm_data[ipulse].send_index0;
1060 recv_nindex = overlap->comm_data[ipulse].send_nindex;
1061 recvptr = grid + (recv_index0-pme->pmegrid_start_ix)*(pme->pmegrid_ny*pme->pmegrid_nz);
1064 sendptr = grid + (send_index0-pme->pmegrid_start_ix)*(pme->pmegrid_ny*pme->pmegrid_nz);
1065 datasize = pme->pmegrid_ny * pme->pmegrid_nz;
1069 fprintf(debug,"PME send node %d %d -> %d grid start %d Communicating %d to %d\n",
1070 pme->nodeid,overlap->nodeid,send_id,
1071 pme->pmegrid_start_ix,
1072 send_index0-pme->pmegrid_start_ix,
1073 send_index0-pme->pmegrid_start_ix+send_nindex);
1074 fprintf(debug,"PME recv node %d %d <- %d grid start %d Communicating %d to %d\n",
1075 pme->nodeid,overlap->nodeid,recv_id,
1076 pme->pmegrid_start_ix,
1077 recv_index0-pme->pmegrid_start_ix,
1078 recv_index0-pme->pmegrid_start_ix+recv_nindex);
1081 MPI_Sendrecv(sendptr,send_nindex*datasize,GMX_MPI_REAL,
1083 recvptr,recv_nindex*datasize,GMX_MPI_REAL,
1085 overlap->mpi_comm,&stat);
1087 /* ADD data from contiguous recv buffer */
1088 if(direction==GMX_SUM_QGRID_FORWARD)
1090 p = grid + (recv_index0-pme->pmegrid_start_ix)*(pme->pmegrid_ny*pme->pmegrid_nz);
1091 for(i=0;i<recv_nindex*datasize;i++)
1093 p[i] += overlap->recvbuf[i];
1102 copy_pmegrid_to_fftgrid(gmx_pme_t pme, real *pmegrid, real *fftgrid)
1104 ivec local_fft_ndata,local_fft_offset,local_fft_size;
1105 ivec local_pme_size;
1109 /* Dimensions should be identical for A/B grid, so we just use A here */
1110 gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
1115 local_pme_size[0] = pme->pmegrid_nx;
1116 local_pme_size[1] = pme->pmegrid_ny;
1117 local_pme_size[2] = pme->pmegrid_nz;
1119 /* The fftgrid is always 'justified' to the lower-left corner of the PME grid,
1120 the offset is identical, and the PME grid always has more data (due to overlap)
1125 char fn[STRLEN],format[STRLEN];
1127 sprintf(fn,"pmegrid%d.pdb",pme->nodeid);
1128 fp = ffopen(fn,"w");
1129 sprintf(fn,"pmegrid%d.txt",pme->nodeid);
1130 fp2 = ffopen(fn,"w");
1131 sprintf(format,"%s%s\n",pdbformat,"%6.2f%6.2f");
1134 for(ix=0;ix<local_fft_ndata[XX];ix++)
1136 for(iy=0;iy<local_fft_ndata[YY];iy++)
1138 for(iz=0;iz<local_fft_ndata[ZZ];iz++)
1140 pmeidx = ix*(local_pme_size[YY]*local_pme_size[ZZ])+iy*(local_pme_size[ZZ])+iz;
1141 fftidx = ix*(local_fft_size[YY]*local_fft_size[ZZ])+iy*(local_fft_size[ZZ])+iz;
1142 fftgrid[fftidx] = pmegrid[pmeidx];
1144 val = 100*pmegrid[pmeidx];
1145 if (pmegrid[pmeidx] != 0)
1146 fprintf(fp,format,"ATOM",pmeidx,"CA","GLY",' ',pmeidx,' ',
1147 5.0*ix,5.0*iy,5.0*iz,1.0,val);
1148 if (pmegrid[pmeidx] != 0)
1149 fprintf(fp2,"%-12s %5d %5d %5d %12.5e\n",
1151 pme->pmegrid_start_ix + ix,
1152 pme->pmegrid_start_iy + iy,
1153 pme->pmegrid_start_iz + iz,
1168 static gmx_cycles_t omp_cyc_start()
1170 return gmx_cycles_read();
1173 static gmx_cycles_t omp_cyc_end(gmx_cycles_t c)
1175 return gmx_cycles_read() - c;
1180 copy_fftgrid_to_pmegrid(gmx_pme_t pme, const real *fftgrid, real *pmegrid,
1181 int nthread,int thread)
1183 ivec local_fft_ndata,local_fft_offset,local_fft_size;
1184 ivec local_pme_size;
1185 int ixy0,ixy1,ixy,ix,iy,iz;
1187 #ifdef PME_TIME_THREADS
1189 static double cs1=0;
1193 #ifdef PME_TIME_THREADS
1194 c1 = omp_cyc_start();
1196 /* Dimensions should be identical for A/B grid, so we just use A here */
1197 gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
1202 local_pme_size[0] = pme->pmegrid_nx;
1203 local_pme_size[1] = pme->pmegrid_ny;
1204 local_pme_size[2] = pme->pmegrid_nz;
1206 /* The fftgrid is always 'justified' to the lower-left corner of the PME grid,
1207 the offset is identical, and the PME grid always has more data (due to overlap)
1209 ixy0 = ((thread )*local_fft_ndata[XX]*local_fft_ndata[YY])/nthread;
1210 ixy1 = ((thread+1)*local_fft_ndata[XX]*local_fft_ndata[YY])/nthread;
1212 for(ixy=ixy0;ixy<ixy1;ixy++)
1214 ix = ixy/local_fft_ndata[YY];
1215 iy = ixy - ix*local_fft_ndata[YY];
1217 pmeidx = (ix*local_pme_size[YY] + iy)*local_pme_size[ZZ];
1218 fftidx = (ix*local_fft_size[YY] + iy)*local_fft_size[ZZ];
1219 for(iz=0;iz<local_fft_ndata[ZZ];iz++)
1221 pmegrid[pmeidx+iz] = fftgrid[fftidx+iz];
1225 #ifdef PME_TIME_THREADS
1226 c1 = omp_cyc_end(c1);
1231 printf("copy %.2f\n",cs1*1e-9);
1240 wrap_periodic_pmegrid(gmx_pme_t pme, real *pmegrid)
1242 int nx,ny,nz,pnx,pny,pnz,ny_x,overlap,ix,iy,iz;
1248 pnx = pme->pmegrid_nx;
1249 pny = pme->pmegrid_ny;
1250 pnz = pme->pmegrid_nz;
1252 overlap = pme->pme_order - 1;
1254 /* Add periodic overlap in z */
1255 for(ix=0; ix<pme->pmegrid_nx; ix++)
1257 for(iy=0; iy<pme->pmegrid_ny; iy++)
1259 for(iz=0; iz<overlap; iz++)
1261 pmegrid[(ix*pny+iy)*pnz+iz] +=
1262 pmegrid[(ix*pny+iy)*pnz+nz+iz];
1267 if (pme->nnodes_minor == 1)
1269 for(ix=0; ix<pme->pmegrid_nx; ix++)
1271 for(iy=0; iy<overlap; iy++)
1273 for(iz=0; iz<nz; iz++)
1275 pmegrid[(ix*pny+iy)*pnz+iz] +=
1276 pmegrid[(ix*pny+ny+iy)*pnz+iz];
1282 if (pme->nnodes_major == 1)
1284 ny_x = (pme->nnodes_minor == 1 ? ny : pme->pmegrid_ny);
1286 for(ix=0; ix<overlap; ix++)
1288 for(iy=0; iy<ny_x; iy++)
1290 for(iz=0; iz<nz; iz++)
1292 pmegrid[(ix*pny+iy)*pnz+iz] +=
1293 pmegrid[((nx+ix)*pny+iy)*pnz+iz];
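/* Added illustration, not part of the original code: in 1D, with nz = 8 and
 * pme_order = 4 (overlap = 3, ignoring any SSE alignment padding), spreading
 * uses pnz = 8 + 3 = 11 points and wrapping folds the extra points back:
 *
 *   grid[0] += grid[8];  grid[1] += grid[9];  grid[2] += grid[10];
 *
 * unwrap_periodic_pmegrid() below performs the inverse copies before force
 * gathering, so splines that reach past the upper edge read valid data.
 */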
1302 unwrap_periodic_pmegrid(gmx_pme_t pme, real *pmegrid)
1304 int nx,ny,nz,pnx,pny,pnz,ny_x,overlap,ix;
1310 pnx = pme->pmegrid_nx;
1311 pny = pme->pmegrid_ny;
1312 pnz = pme->pmegrid_nz;
1314 overlap = pme->pme_order - 1;
1316 if (pme->nnodes_major == 1)
1318 ny_x = (pme->nnodes_minor == 1 ? ny : pme->pmegrid_ny);
1320 for(ix=0; ix<overlap; ix++)
1324 for(iy=0; iy<ny_x; iy++)
1326 for(iz=0; iz<nz; iz++)
1328 pmegrid[((nx+ix)*pny+iy)*pnz+iz] =
1329 pmegrid[(ix*pny+iy)*pnz+iz];
1335 if (pme->nnodes_minor == 1)
1337 #pragma omp parallel for num_threads(pme->nthread) schedule(static)
1338 for(ix=0; ix<pme->pmegrid_nx; ix++)
1342 for(iy=0; iy<overlap; iy++)
1344 for(iz=0; iz<nz; iz++)
1346 pmegrid[(ix*pny+ny+iy)*pnz+iz] =
1347 pmegrid[(ix*pny+iy)*pnz+iz];
1353 /* Copy periodic overlap in z */
1354 #pragma omp parallel for num_threads(pme->nthread) schedule(static)
1355 for(ix=0; ix<pme->pmegrid_nx; ix++)
1359 for(iy=0; iy<pme->pmegrid_ny; iy++)
1361 for(iz=0; iz<overlap; iz++)
1363 pmegrid[(ix*pny+iy)*pnz+nz+iz] =
1364 pmegrid[(ix*pny+iy)*pnz+iz];
1370 static void clear_grid(int nx,int ny,int nz,real *grid,
1372 int fx,int fy,int fz,
1376 int fsx,fsy,fsz,gx,gy,gz,g0x,g0y,x,y,z;
1379 nc = 2 + (order - 2)/FLBS;
1380 ncz = 2 + (order - 2)/FLBSZ;
1382 for(fsx=fx; fsx<fx+nc; fsx++)
1384 for(fsy=fy; fsy<fy+nc; fsy++)
1386 for(fsz=fz; fsz<fz+ncz; fsz++)
1388 flind = (fsx*fs[YY] + fsy)*fs[ZZ] + fsz;
1389 if (flag[flind] == 0)
1394 g0x = (gx*ny + gy)*nz + gz;
1395 for(x=0; x<FLBS; x++)
1398 for(y=0; y<FLBS; y++)
1400 for(z=0; z<FLBSZ; z++)
1416 /* This has to be a macro to enable full compiler optimization with xlC (and probably others too) */
1417 #define DO_BSPLINE(order) \
1418 for(ithx=0; (ithx<order); ithx++) \
1420 index_x = (i0+ithx)*pny*pnz; \
1421 valx = qn*thx[ithx]; \
1423 for(ithy=0; (ithy<order); ithy++) \
1425 valxy = valx*thy[ithy]; \
1426 index_xy = index_x+(j0+ithy)*pnz; \
1428 for(ithz=0; (ithz<order); ithz++) \
1430 index_xyz = index_xy+(k0+ithz); \
1431 grid[index_xyz] += valxy*thz[ithz]; \
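/* Added sketch, not necessarily the exact (elided) code that uses the macro:
 * DO_BSPLINE is instantiated with a literal order so the compiler can fully
 * unroll the three nested loops, e.g.
 *
 *   switch (order)
 *   {
 *   case 4:  DO_BSPLINE(4);     break;
 *   case 5:  DO_BSPLINE(5);     break;
 *   default: DO_BSPLINE(order); break;
 *   }
 */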
1437 static void spread_q_bsplines_thread(pmegrid_t *pmegrid,
1438 pme_atomcomm_t *atc, splinedata_t *spline,
1439 pme_spline_work_t *work)
1442 /* spread charges from home atoms to local grid */
1445 int b,i,nn,n,ithx,ithy,ithz,i0,j0,k0;
1447 int order,norder,index_x,index_xy,index_xyz;
1449 real *thx,*thy,*thz;
1450 int localsize, bndsize;
1451 int pnx,pny,pnz,ndatatot;
1454 pnx = pmegrid->s[XX];
1455 pny = pmegrid->s[YY];
1456 pnz = pmegrid->s[ZZ];
1458 offx = pmegrid->offset[XX];
1459 offy = pmegrid->offset[YY];
1460 offz = pmegrid->offset[ZZ];
1462 ndatatot = pnx*pny*pnz;
1463 grid = pmegrid->grid;
1464 for(i=0;i<ndatatot;i++)
1469 order = pmegrid->order;
1471 for(nn=0; nn<spline->n; nn++)
1473 n = spline->ind[nn];
1478 idxptr = atc->idx[n];
1481 i0 = idxptr[XX] - offx;
1482 j0 = idxptr[YY] - offy;
1483 k0 = idxptr[ZZ] - offz;
1485 thx = spline->theta[XX] + norder;
1486 thy = spline->theta[YY] + norder;
1487 thz = spline->theta[ZZ] + norder;
1492 #ifdef PME_SSE_UNALIGNED
1493 #define PME_SPREAD_SSE_ORDER4
1495 #define PME_SPREAD_SSE_ALIGNED
1498 #include "pme_sse_single.h"
1505 #define PME_SPREAD_SSE_ALIGNED
1507 #include "pme_sse_single.h"
1520 static void set_grid_alignment(int *pmegrid_nz,int pme_order)
1524 #ifndef PME_SSE_UNALIGNED
1529 /* Round nz up to a multiple of 4 to ensure alignment */
1530 *pmegrid_nz = ((*pmegrid_nz + 3) & ~3);
1535 static void set_gridsize_alignment(int *gridsize,int pme_order)
1538 #ifndef PME_SSE_UNALIGNED
1541 /* Add extra elements to ensure that aligned operations do not go
1542 * beyond the allocated grid size.
1543 * Note that for pme_order=5, the pme grid z-size alignment
1544 * ensures that we will not go beyond the grid size.
1552 static void pmegrid_init(pmegrid_t *grid,
1553 int cx, int cy, int cz,
1554 int x0, int y0, int z0,
1555 int x1, int y1, int z1,
1556 gmx_bool set_alignment,
1565 grid->offset[XX] = x0;
1566 grid->offset[YY] = y0;
1567 grid->offset[ZZ] = z0;
1568 grid->n[XX] = x1 - x0 + pme_order - 1;
1569 grid->n[YY] = y1 - y0 + pme_order - 1;
1570 grid->n[ZZ] = z1 - z0 + pme_order - 1;
1571 copy_ivec(grid->n,grid->s);
1574 set_grid_alignment(&nz,pme_order);
1579 else if (nz != grid->s[ZZ])
1581 gmx_incons("pmegrid_init call with an unaligned z size");
1584 grid->order = pme_order;
1587 gridsize = grid->s[XX]*grid->s[YY]*grid->s[ZZ];
1588 set_gridsize_alignment(&gridsize,pme_order);
1589 snew_aligned(grid->grid,gridsize,16);
1597 static int div_round_up(int enumerator,int denominator)
1599 return (enumerator + denominator - 1)/denominator;
1602 static void make_subgrid_division(const ivec n,int ovl,int nthread,
1605 int gsize_opt,gsize;
1610 for(nsx=1; nsx<=nthread; nsx++)
1612 if (nthread % nsx == 0)
1614 for(nsy=1; nsy<=nthread; nsy++)
1616 if (nsx*nsy <= nthread && nthread % (nsx*nsy) == 0)
1618 nsz = nthread/(nsx*nsy);
1620 /* Determine the number of grid points per thread */
1622 (div_round_up(n[XX],nsx) + ovl)*
1623 (div_round_up(n[YY],nsy) + ovl)*
1624 (div_round_up(n[ZZ],nsz) + ovl);
1626 /* Minimize the number of grid points per thread
1627 * and, secondarily, the number of cuts in minor dimensions.
1629 if (gsize_opt == -1 ||
1630 gsize < gsize_opt ||
1631 (gsize == gsize_opt &&
1632 (nsz < nsub[ZZ] || (nsz == nsub[ZZ] && nsy < nsub[YY]))))
1644 env = getenv("GMX_PME_THREAD_DIVISION");
1647 sscanf(env,"%d %d %d",&nsub[XX],&nsub[YY],&nsub[ZZ]);
1650 if (nsub[XX]*nsub[YY]*nsub[ZZ] != nthread)
1652 gmx_fatal(FARGS,"PME grid thread division (%d x %d x %d) does not match the total number of threads (%d)",nsub[XX],nsub[YY],nsub[ZZ],nthread);
1656 static void pmegrids_init(pmegrids_t *grids,
1657 int nx,int ny,int nz,int nz_base,
1663 ivec n,n_base,g0,g1;
1664 int t,x,y,z,d,i,tfac;
1665 int max_comm_lines=-1;
1667 n[XX] = nx - (pme_order - 1);
1668 n[YY] = ny - (pme_order - 1);
1669 n[ZZ] = nz - (pme_order - 1);
1671 copy_ivec(n,n_base);
1672 n_base[ZZ] = nz_base;
1674 pmegrid_init(&grids->grid,0,0,0,0,0,0,n[XX],n[YY],n[ZZ],FALSE,pme_order,
1677 grids->nthread = nthread;
1679 make_subgrid_division(n_base,pme_order-1,grids->nthread,grids->nc);
1681 if (grids->nthread > 1)
1686 for(d=0; d<DIM; d++)
1688 nst[d] = div_round_up(n[d],grids->nc[d]) + pme_order - 1;
1690 set_grid_alignment(&nst[ZZ],pme_order);
1694 fprintf(debug,"pmegrid thread local division: %d x %d x %d\n",
1695 grids->nc[XX],grids->nc[YY],grids->nc[ZZ]);
1696 fprintf(debug,"pmegrid %d %d %d max thread pmegrid %d %d %d\n",
1698 nst[XX],nst[YY],nst[ZZ]);
1701 snew(grids->grid_th,grids->nthread);
1703 gridsize = nst[XX]*nst[YY]*nst[ZZ];
1704 set_gridsize_alignment(&gridsize,pme_order);
1705 snew_aligned(grids->grid_all,
1706 grids->nthread*gridsize+(grids->nthread+1)*GMX_CACHE_SEP,
1709 for(x=0; x<grids->nc[XX]; x++)
1711 for(y=0; y<grids->nc[YY]; y++)
1713 for(z=0; z<grids->nc[ZZ]; z++)
1715 pmegrid_init(&grids->grid_th[t],
1717 (n[XX]*(x ))/grids->nc[XX],
1718 (n[YY]*(y ))/grids->nc[YY],
1719 (n[ZZ]*(z ))/grids->nc[ZZ],
1720 (n[XX]*(x+1))/grids->nc[XX],
1721 (n[YY]*(y+1))/grids->nc[YY],
1722 (n[ZZ]*(z+1))/grids->nc[ZZ],
1725 grids->grid_all+GMX_CACHE_SEP+t*(gridsize+GMX_CACHE_SEP));
1732 snew(grids->g2t,DIM);
1734 for(d=DIM-1; d>=0; d--)
1736 snew(grids->g2t[d],n[d]);
1738 for(i=0; i<n[d]; i++)
1740 /* The second check should match the parameters
1741 * of the pmegrid_init call above.
1743 while (t + 1 < grids->nc[d] && i >= (n[d]*(t+1))/grids->nc[d])
1747 grids->g2t[d][i] = t*tfac;
1750 tfac *= grids->nc[d];
1754 case XX: max_comm_lines = overlap_x; break;
1755 case YY: max_comm_lines = overlap_y; break;
1756 case ZZ: max_comm_lines = pme_order - 1; break;
1758 grids->nthread_comm[d] = 0;
1759 while ((n[d]*grids->nthread_comm[d])/grids->nc[d] < max_comm_lines &&
1760 grids->nthread_comm[d] < grids->nc[d])
1762 grids->nthread_comm[d]++;
1766 fprintf(debug,"pmegrid thread grid communication range in %c: %d\n",
1767 'x'+d,grids->nthread_comm[d]);
1769 /* It should be possible to make grids->nthread_comm[d]==grids->nc[d]
1770 * work, but this is not a problematic restriction.
1772 if (grids->nc[d] > 1 && grids->nthread_comm[d] > grids->nc[d])
1774 gmx_fatal(FARGS,"Too many threads for PME (%d) compared to the number of grid lines, reduce the number of threads doing PME",grids->nthread);
1780 static void pmegrids_destroy(pmegrids_t *grids)
1784 if (grids->grid.grid != NULL)
1786 sfree(grids->grid.grid);
1788 if (grids->nthread > 0)
1790 for(t=0; t<grids->nthread; t++)
1792 sfree(grids->grid_th[t].grid);
1794 sfree(grids->grid_th);
1800 static void realloc_work(pme_work_t *work,int nkx)
1802 if (nkx > work->nalloc)
1805 srenew(work->mhx ,work->nalloc);
1806 srenew(work->mhy ,work->nalloc);
1807 srenew(work->mhz ,work->nalloc);
1808 srenew(work->m2 ,work->nalloc);
1809 /* Allocate an aligned pointer for SSE operations, including 3 extra
1810 * elements at the end since SSE operates on 4 elements at a time.
1812 sfree_aligned(work->denom);
1813 sfree_aligned(work->tmp1);
1814 sfree_aligned(work->eterm);
1815 snew_aligned(work->denom,work->nalloc+3,16);
1816 snew_aligned(work->tmp1 ,work->nalloc+3,16);
1817 snew_aligned(work->eterm,work->nalloc+3,16);
1818 srenew(work->m2inv,work->nalloc);
1823 static void free_work(pme_work_t *work)
1829 sfree_aligned(work->denom);
1830 sfree_aligned(work->tmp1);
1831 sfree_aligned(work->eterm);
1837 /* Calculate exponentials through SSE in float precision */
1838 inline static void calc_exponentials(int start, int end, real f, real *d_aligned, real *r_aligned, real *e_aligned)
1841 const __m128 two = _mm_set_ps(2.0f,2.0f,2.0f,2.0f);
1844 __m128 tmp_d1,d_inv,tmp_r,tmp_e;
1846 f_sse = _mm_load1_ps(&f);
1847 for(kx=0; kx<end; kx+=4)
1849 tmp_d1 = _mm_load_ps(d_aligned+kx);
1850 lu = _mm_rcp_ps(tmp_d1);
1851 d_inv = _mm_mul_ps(lu,_mm_sub_ps(two,_mm_mul_ps(lu,tmp_d1)));
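/* _mm_rcp_ps gives only ~12 bits of precision; the line above refines the
 * estimate with one Newton-Raphson step, x1 = x0*(2 - d*x0), so d_inv is
 * accurate to nearly full single precision.
 */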
1852 tmp_r = _mm_load_ps(r_aligned+kx);
1853 tmp_r = gmx_mm_exp_ps(tmp_r);
1854 tmp_e = _mm_mul_ps(f_sse,d_inv);
1855 tmp_e = _mm_mul_ps(tmp_e,tmp_r);
1856 _mm_store_ps(e_aligned+kx,tmp_e);
1861 inline static void calc_exponentials(int start, int end, real f, real *d, real *r, real *e)
1864 for(kx=start; kx<end; kx++)
1868 for(kx=start; kx<end; kx++)
1872 for(kx=start; kx<end; kx++)
1874 e[kx] = f*r[kx]*d[kx];
1880 static int solve_pme_yzx(gmx_pme_t pme,t_complex *grid,
1881 real ewaldcoeff,real vol,
1883 int nthread,int thread)
1885 /* do recip sum over local cells in grid */
1886 /* y major, z middle, x minor or continuous */
1888 int kx,ky,kz,maxkx,maxky,maxkz;
1889 int nx,ny,nz,iyz0,iyz1,iyz,iy,iz,kxstart,kxend;
1891 real factor=M_PI*M_PI/(ewaldcoeff*ewaldcoeff);
1892 real ets2,struct2,vfactor,ets2vf;
1893 real d1,d2,energy=0;
1895 real virxx=0,virxy=0,virxz=0,viryy=0,viryz=0,virzz=0;
1896 real rxx,ryx,ryy,rzx,rzy,rzz;
1898 real *mhx,*mhy,*mhz,*m2,*denom,*tmp1,*eterm,*m2inv;
1899 real mhxk,mhyk,mhzk,m2k;
1902 ivec local_ndata,local_offset,local_size;
1905 elfac = ONE_4PI_EPS0/pme->epsilon_r;
1911 /* Dimensions should be identical for A/B grid, so we just use A here */
1912 gmx_parallel_3dfft_complex_limits(pme->pfft_setupA,
1918 rxx = pme->recipbox[XX][XX];
1919 ryx = pme->recipbox[YY][XX];
1920 ryy = pme->recipbox[YY][YY];
1921 rzx = pme->recipbox[ZZ][XX];
1922 rzy = pme->recipbox[ZZ][YY];
1923 rzz = pme->recipbox[ZZ][ZZ];
1929 work = &pme->work[thread];
1934 denom = work->denom;
1936 eterm = work->eterm;
1937 m2inv = work->m2inv;
1939 iyz0 = local_ndata[YY]*local_ndata[ZZ]* thread /nthread;
1940 iyz1 = local_ndata[YY]*local_ndata[ZZ]*(thread+1)/nthread;
1942 for(iyz=iyz0; iyz<iyz1; iyz++)
1944 iy = iyz/local_ndata[ZZ];
1945 iz = iyz - iy*local_ndata[ZZ];
1947 ky = iy + local_offset[YY];
1958 by = M_PI*vol*pme->bsp_mod[YY][ky];
1960 kz = iz + local_offset[ZZ];
1964 bz = pme->bsp_mod[ZZ][kz];
1966 /* 0.5 correction for corner points */
1968 if (kz == 0 || kz == (nz+1)/2)
1973 p0 = grid + iy*local_size[ZZ]*local_size[XX] + iz*local_size[XX];
1975 /* We should skip the k-space point (0,0,0) */
1976 if (local_offset[XX] > 0 || ky > 0 || kz > 0)
1978 kxstart = local_offset[XX];
1982 kxstart = local_offset[XX] + 1;
1985 kxend = local_offset[XX] + local_ndata[XX];
1989 /* More expensive inner loop, especially because of the storage
1990 * of the mh elements in arrays.
1991 * Because x is the minor grid index, all mh elements
1992 * depend on kx for triclinic unit cells.
1995 /* Two explicit loops to avoid a conditional inside the loop */
1996 for(kx=kxstart; kx<maxkx; kx++)
2001 mhyk = mx * ryx + my * ryy;
2002 mhzk = mx * rzx + my * rzy + mz * rzz;
2003 m2k = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
2008 denom[kx] = m2k*bz*by*pme->bsp_mod[XX][kx];
2009 tmp1[kx] = -factor*m2k;
2012 for(kx=maxkx; kx<kxend; kx++)
2017 mhyk = mx * ryx + my * ryy;
2018 mhzk = mx * rzx + my * rzy + mz * rzz;
2019 m2k = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
2024 denom[kx] = m2k*bz*by*pme->bsp_mod[XX][kx];
2025 tmp1[kx] = -factor*m2k;
2028 for(kx=kxstart; kx<kxend; kx++)
2030 m2inv[kx] = 1.0/m2[kx];
2033 calc_exponentials(kxstart,kxend,elfac,denom,tmp1,eterm);
2035 for(kx=kxstart; kx<kxend; kx++,p0++)
2040 p0->re = d1*eterm[kx];
2041 p0->im = d2*eterm[kx];
2043 struct2 = 2.0*(d1*d1+d2*d2);
2045 tmp1[kx] = eterm[kx]*struct2;
2048 for(kx=kxstart; kx<kxend; kx++)
2050 ets2 = corner_fac*tmp1[kx];
2051 vfactor = (factor*m2[kx] + 1.0)*2.0*m2inv[kx];
2054 ets2vf = ets2*vfactor;
2055 virxx += ets2vf*mhx[kx]*mhx[kx] - ets2;
2056 virxy += ets2vf*mhx[kx]*mhy[kx];
2057 virxz += ets2vf*mhx[kx]*mhz[kx];
2058 viryy += ets2vf*mhy[kx]*mhy[kx] - ets2;
2059 viryz += ets2vf*mhy[kx]*mhz[kx];
2060 virzz += ets2vf*mhz[kx]*mhz[kx] - ets2;
2065 /* We don't need to calculate the energy and the virial.
2066 * In this case the triclinic overhead is small.
2069 /* Two explicit loops to avoid a conditional inside the loop */
2071 for(kx=kxstart; kx<maxkx; kx++)
2076 mhyk = mx * ryx + my * ryy;
2077 mhzk = mx * rzx + my * rzy + mz * rzz;
2078 m2k = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
2079 denom[kx] = m2k*bz*by*pme->bsp_mod[XX][kx];
2080 tmp1[kx] = -factor*m2k;
2083 for(kx=maxkx; kx<kxend; kx++)
2088 mhyk = mx * ryx + my * ryy;
2089 mhzk = mx * rzx + my * rzy + mz * rzz;
2090 m2k = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
2091 denom[kx] = m2k*bz*by*pme->bsp_mod[XX][kx];
2092 tmp1[kx] = -factor*m2k;
2095 calc_exponentials(kxstart,kxend,elfac,denom,tmp1,eterm);
2097 for(kx=kxstart; kx<kxend; kx++,p0++)
2102 p0->re = d1*eterm[kx];
2103 p0->im = d2*eterm[kx];
2110 /* Update virial with local values.
2111 * The virial is symmetric by definition.
2112 * this virial seems ok for isotropic scaling, but I'm
2113 * experiencing problems on semiisotropic membranes.
2114 * IS THAT COMMENT STILL VALID??? (DvdS, 2001/02/07).
2116 work->vir[XX][XX] = 0.25*virxx;
2117 work->vir[YY][YY] = 0.25*viryy;
2118 work->vir[ZZ][ZZ] = 0.25*virzz;
2119 work->vir[XX][YY] = work->vir[YY][XX] = 0.25*virxy;
2120 work->vir[XX][ZZ] = work->vir[ZZ][XX] = 0.25*virxz;
2121 work->vir[YY][ZZ] = work->vir[ZZ][YY] = 0.25*viryz;
2123 /* This energy should be corrected for a charged system */
2124 work->energy = 0.5*energy;
2127 /* Return the loop count */
2128 return local_ndata[YY]*local_ndata[XX];
2131 static void get_pme_ener_vir(const gmx_pme_t pme,int nthread,
2132 real *mesh_energy,matrix vir)
2134 /* This function sums output over threads
2135 * and should therefore only be called after thread synchronization.
2139 *mesh_energy = pme->work[0].energy;
2140 copy_mat(pme->work[0].vir,vir);
2142 for(thread=1; thread<nthread; thread++)
2144 *mesh_energy += pme->work[thread].energy;
2145 m_add(vir,pme->work[thread].vir,vir);
2149 #define DO_FSPLINE(order) \
2150 for(ithx=0; (ithx<order); ithx++) \
2152 index_x = (i0+ithx)*pny*pnz; \
2156 for(ithy=0; (ithy<order); ithy++) \
2158 index_xy = index_x+(j0+ithy)*pnz; \
2163 for(ithz=0; (ithz<order); ithz++) \
2165 gval = grid[index_xy+(k0+ithz)]; \
2166 fxy1 += thz[ithz]*gval; \
2167 fz1 += dthz[ithz]*gval; \
2176 static void gather_f_bsplines(gmx_pme_t pme,real *grid,
2177 gmx_bool bClearF,pme_atomcomm_t *atc,
2178 splinedata_t *spline,
2181 /* sum forces for local particles */
2182 int nn,n,ithx,ithy,ithz,i0,j0,k0;
2183 int index_x,index_xy;
2184 int nx,ny,nz,pnx,pny,pnz;
2186 real tx,ty,dx,dy,qn;
2189 real *thx,*thy,*thz,*dthx,*dthy,*dthz;
2191 real rxx,ryx,ryy,rzx,rzy,rzz;
2194 pme_spline_work_t *work;
2196 work = pme->spline_work;
2198 order = pme->pme_order;
2199 thx = spline->theta[XX];
2200 thy = spline->theta[YY];
2201 thz = spline->theta[ZZ];
2202 dthx = spline->dtheta[XX];
2203 dthy = spline->dtheta[YY];
2204 dthz = spline->dtheta[ZZ];
2208 pnx = pme->pmegrid_nx;
2209 pny = pme->pmegrid_ny;
2210 pnz = pme->pmegrid_nz;
2212 rxx = pme->recipbox[XX][XX];
2213 ryx = pme->recipbox[YY][XX];
2214 ryy = pme->recipbox[YY][YY];
2215 rzx = pme->recipbox[ZZ][XX];
2216 rzy = pme->recipbox[ZZ][YY];
2217 rzz = pme->recipbox[ZZ][ZZ];
2219 for(nn=0; nn<spline->n; nn++)
2221 n = spline->ind[nn];
2222 qn = scale*atc->q[n];
2235 idxptr = atc->idx[n];
2242 /* Pointer arithmetic alert, next six statements */
2243 thx = spline->theta[XX] + norder;
2244 thy = spline->theta[YY] + norder;
2245 thz = spline->theta[ZZ] + norder;
2246 dthx = spline->dtheta[XX] + norder;
2247 dthy = spline->dtheta[YY] + norder;
2248 dthz = spline->dtheta[ZZ] + norder;
2253 #ifdef PME_SSE_UNALIGNED
2254 #define PME_GATHER_F_SSE_ORDER4
2256 #define PME_GATHER_F_SSE_ALIGNED
2259 #include "pme_sse_single.h"
2266 #define PME_GATHER_F_SSE_ALIGNED
2268 #include "pme_sse_single.h"
2278 atc->f[n][XX] += -qn*( fx*nx*rxx );
2279 atc->f[n][YY] += -qn*( fx*nx*ryx + fy*ny*ryy );
2280 atc->f[n][ZZ] += -qn*( fx*nx*rzx + fy*ny*rzy + fz*nz*rzz );
2283 /* Since the energy and not forces are interpolated
2284 * the net force might not be exactly zero.
2285 * This can be solved by also interpolating F, but
2286 * that comes at a cost.
2287 * A better hack is to remove the net force every
2288 * step, but that must be done at a higher level
2289 * since this routine doesn't see all atoms if running
2290 * in parallel. Don't know how important it is? EL 990726
2295 static real gather_energy_bsplines(gmx_pme_t pme,real *grid,
2296 pme_atomcomm_t *atc)
2298 splinedata_t *spline;
2299 int n,ithx,ithy,ithz,i0,j0,k0;
2300 int index_x,index_xy;
2302 real energy,pot,tx,ty,qn,gval;
2303 real *thx,*thy,*thz;
2307 spline = &atc->spline[0];
2309 order = pme->pme_order;
2312 for(n=0; (n<atc->n); n++) {
2316 idxptr = atc->idx[n];
2323 /* Pointer arithmetic alert, next three statements */
2324 thx = spline->theta[XX] + norder;
2325 thy = spline->theta[YY] + norder;
2326 thz = spline->theta[ZZ] + norder;
2329 for(ithx=0; (ithx<order); ithx++)
2331 index_x = (i0+ithx)*pme->pmegrid_ny*pme->pmegrid_nz;
2334 for(ithy=0; (ithy<order); ithy++)
2336 index_xy = index_x+(j0+ithy)*pme->pmegrid_nz;
2339 for(ithz=0; (ithz<order); ithz++)
2341 gval = grid[index_xy+(k0+ithz)];
2342 pot += tx*ty*thz[ithz]*gval;
2355 /* Macro to force loop unrolling by fixing order.
2356 * This gives a significant performance gain.
2358 #define CALC_SPLINE(order) \
2362 real data[PME_ORDER_MAX]; \
2363 real ddata[PME_ORDER_MAX]; \
2365 for(j=0; (j<DIM); j++) \
2369 /* dr is relative offset from lower cell limit */ \
2370 data[order-1] = 0; \
2374 for(k=3; (k<order); k++) \
2376 div = 1.0/(k - 1.0); \
2377 data[k-1] = div*dr*data[k-2]; \
2378 for(l=1; (l<(k-1)); l++) \
2380 data[k-l-1] = div*((dr+l)*data[k-l-2]+(k-l-dr)* \
2383 data[0] = div*(1-dr)*data[0]; \
2385 /* differentiate */ \
2386 ddata[0] = -data[0]; \
2387 for(k=1; (k<order); k++) \
2389 ddata[k] = data[k-1] - data[k]; \
2392 div = 1.0/(order - 1); \
2393 data[order-1] = div*dr*data[order-2]; \
2394 for(l=1; (l<(order-1)); l++) \
2396 data[order-l-1] = div*((dr+l)*data[order-l-2]+ \
2397 (order-l-dr)*data[order-l-1]); \
2399 data[0] = div*(1 - dr)*data[0]; \
2401 for(k=0; k<order; k++) \
2403 theta[j][i*order+k] = data[k]; \
2404 dtheta[j][i*order+k] = ddata[k]; \
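/* Added background note, not in the original source: CALC_SPLINE evaluates
 * the cardinal B-spline recursion used in smooth PME (Essmann et al.,
 * J. Chem. Phys. 103, 8577 (1995)),
 *
 *   M_n(u) = u/(n-1) * M_{n-1}(u) + (n-u)/(n-1) * M_{n-1}(u-1)
 *
 * and its derivative dM_n(u)/du = M_{n-1}(u) - M_{n-1}(u-1), which is what
 * the ddata[k] = data[k-1] - data[k] differentiation step computes for the
 * stored coefficients.
 */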
2409 void make_bsplines(splinevec theta,splinevec dtheta,int order,
2410 rvec fractx[],int nr,int ind[],real charge[],
2411 gmx_bool bFreeEnergy)
2413 /* construct splines for local atoms */
2419 /* With free energy we do not use the charge check.
2420 * In most cases this will be more efficient than calling make_bsplines
2421 * twice, since usually more than half the particles have charges.
2424 if (bFreeEnergy || charge[ii] != 0.0) {
2427 case 4: CALC_SPLINE(4); break;
2428 case 5: CALC_SPLINE(5); break;
2429 default: CALC_SPLINE(order); break;
2436 void make_dft_mod(real *mod,real *data,int ndata)
2441 for(i=0;i<ndata;i++) {
2443 for(j=0;j<ndata;j++) {
2444 arg=(2.0*M_PI*i*j)/ndata;
2445 sc+=data[j]*cos(arg);
2446 ss+=data[j]*sin(arg);
2450 for(i=0;i<ndata;i++)
2452 mod[i]=(mod[i-1]+mod[i+1])*0.5;
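/* Added note, not part of the original code: make_dft_mod() computes
 *
 *   mod[i] = (sum_j data[j]*cos(2*pi*i*j/ndata))^2
 *          + (sum_j data[j]*sin(2*pi*i*j/ndata))^2
 *
 * i.e. the squared modulus of the DFT of the B-spline coefficients, which
 * enters the denominator of the reciprocal-space term. The neighbour
 * averaging above guards entries that are (numerically) zero, where a later
 * division by mod[i] would otherwise blow up.
 */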
2456 static void make_bspline_moduli(splinevec bsp_mod,
2457 int nx,int ny,int nz,int order)
2459 int nmax=max(nx,max(ny,nz));
2460 real *data,*ddata,*bsp_data;
2466 snew(bsp_data,nmax);
2472 for(k=3;k<order;k++) {
2475 for(l=1;l<(k-1);l++)
2476 data[k-l-1]=div*(l*data[k-l-2]+(k-l)*data[k-l-1]);
2477 data[0]=div*data[0];
2481 for(k=1;k<order;k++)
2482 ddata[k]=data[k-1]-data[k];
2485 for(l=1;l<(order-1);l++)
2486 data[order-l-1]=div*(l*data[order-l-2]+(order-l)*data[order-l-1]);
2487 data[0]=div*data[0];
2491 for(i=1;i<=order;i++)
2492 bsp_data[i]=data[i-1];
2494 make_dft_mod(bsp_mod[XX],bsp_data,nx);
2495 make_dft_mod(bsp_mod[YY],bsp_data,ny);
2496 make_dft_mod(bsp_mod[ZZ],bsp_data,nz);
2504 /* Return the P3M optimal influence function */
2505 static double do_p3m_influence(double z, int order)
2512 /* The formula and most constants can be found in:
2513 * Ballenegger et al., JCTC 8, 936 (2012)
2518 return 1.0 - 2.0*z2/3.0;
2521 return 1.0 - z2 + 2.0*z4/15.0;
2524 return 1.0 - 4.0*z2/3.0 + 2.0*z4/5.0 + 4.0*z2*z4/315.0;
2527 return 1.0 - 5.0*z2/3.0 + 7.0*z4/9.0 - 17.0*z2*z4/189.0 + 2.0*z4*z4/2835.0;
2530 return 1.0 - 2.0*z2 + 19.0*z4/15.0 - 256.0*z2*z4/945.0 + 62.0*z4*z4/4725.0 + 4.0*z2*z4*z4/155925.0;
2533 return 1.0 - 7.0*z2/3.0 + 28.0*z4/15.0 - 16.0*z2*z4/27.0 + 26.0*z4*z4/405.0 - 2.0*z2*z4*z4/1485.0 + 4.0*z4*z4*z4/6081075.0;
2535 return 1.0 - 8.0*z2/3.0 + 116.0*z4/45.0 - 344.0*z2*z4/315.0 + 914.0*z4*z4/4725.0 - 248.0*z4*z4*z2/22275.0 + 21844.0*z4*z4*z4/212837625.0 - 8.0*z4*z4*z4*z2/638512875.0;
2542 /* Calculate the P3M B-spline moduli for one dimension */
2543 static void make_p3m_bspline_moduli_dim(real *bsp_mod,int n,int order)
2545 double zarg,zai,sinzai,infl;
2550 gmx_fatal(FARGS,"The current P3M code only supports orders up to 8");
2557 for(i=-maxk; i<0; i++)
2561 infl = do_p3m_influence(sinzai,order);
2562 bsp_mod[n+i] = infl*infl*pow(sinzai/zai,-2.0*order);
2565 for(i=1; i<maxk; i++)
2569 infl = do_p3m_influence(sinzai,order);
2570 bsp_mod[i] = infl*infl*pow(sinzai/zai,-2.0*order);
2574 /* Calculate the P3M B-spline moduli */
2575 static void make_p3m_bspline_moduli(splinevec bsp_mod,
2576 int nx,int ny,int nz,int order)
2578 make_p3m_bspline_moduli_dim(bsp_mod[XX],nx,order);
2579 make_p3m_bspline_moduli_dim(bsp_mod[YY],ny,order);
2580 make_p3m_bspline_moduli_dim(bsp_mod[ZZ],nz,order);
2584 static void setup_coordinate_communication(pme_atomcomm_t *atc)
2592 for(i=1; i<=nslab/2; i++) {
2593 fw = (atc->nodeid + i) % nslab;
2594 bw = (atc->nodeid - i + nslab) % nslab;
2595 if (n < nslab - 1) {
2596 atc->node_dest[n] = fw;
2597 atc->node_src[n] = bw;
2600 if (n < nslab - 1) {
2601 atc->node_dest[n] = bw;
2602 atc->node_src[n] = fw;
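/* Added worked example, not in the original source, assuming the counter n
 * is incremented after each assignment as in the full function: for
 * nslab = 4 and nodeid = 1 this loop produces
 *
 *   node_dest = { 2, 0, 3 }     node_src = { 0, 2, 3 }
 *
 * i.e. alternating forward and backward neighbours at increasing distance,
 * which is the pulse order in which dd_pmeredist_x_q() communicates.
 */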
2608 int gmx_pme_destroy(FILE *log,gmx_pme_t *pmedata)
2614 fprintf(log,"Destroying PME data structures.\n");
2617 sfree((*pmedata)->nnx);
2618 sfree((*pmedata)->nny);
2619 sfree((*pmedata)->nnz);
2621 pmegrids_destroy(&(*pmedata)->pmegridA);
2623 sfree((*pmedata)->fftgridA);
2624 sfree((*pmedata)->cfftgridA);
2625 gmx_parallel_3dfft_destroy((*pmedata)->pfft_setupA);
2627 if ((*pmedata)->pmegridB.grid.grid != NULL)
2629 pmegrids_destroy(&(*pmedata)->pmegridB);
2630 sfree((*pmedata)->fftgridB);
2631 sfree((*pmedata)->cfftgridB);
2632 gmx_parallel_3dfft_destroy((*pmedata)->pfft_setupB);
2634 for(thread=0; thread<(*pmedata)->nthread; thread++)
2636 free_work(&(*pmedata)->work[thread]);
2638 sfree((*pmedata)->work);
2646 static int mult_up(int n,int f)
2648 return ((n + f - 1)/f)*f;
2652 static double pme_load_imbalance(gmx_pme_t pme)
2657 nma = pme->nnodes_major;
2658 nmi = pme->nnodes_minor;
2660 n1 = mult_up(pme->nkx,nma)*mult_up(pme->nky,nmi)*pme->nkz;
2661 n2 = mult_up(pme->nkx,nma)*mult_up(pme->nkz,nmi)*pme->nky;
2662 n3 = mult_up(pme->nky,nma)*mult_up(pme->nkz,nmi)*pme->nkx;
2664 /* pme_solve is roughly double the cost of an fft */
2666 return (n1 + n2 + 3*n3)/(double)(6*pme->nkx*pme->nky*pme->nkz);
2669 static void init_atomcomm(gmx_pme_t pme,pme_atomcomm_t *atc, t_commrec *cr,
2670 int dimind,gmx_bool bSpread)
2674 atc->dimind = dimind;
2679 if (pme->nnodes > 1)
2681 atc->mpi_comm = pme->mpi_comm_d[dimind];
2682 MPI_Comm_size(atc->mpi_comm,&atc->nslab);
2683 MPI_Comm_rank(atc->mpi_comm,&atc->nodeid);
2687 fprintf(debug,"For PME atom communication in dimind %d: nslab %d rank %d\n",atc->dimind,atc->nslab,atc->nodeid);
2691 atc->bSpread = bSpread;
2692 atc->pme_order = pme->pme_order;
2696 /* These three allocations are not required for particle decomp. */
2697 snew(atc->node_dest,atc->nslab);
2698 snew(atc->node_src,atc->nslab);
2699 setup_coordinate_communication(atc);
2701 snew(atc->count_thread,pme->nthread);
2702 for(thread=0; thread<pme->nthread; thread++)
2704 snew(atc->count_thread[thread],atc->nslab);
2706 atc->count = atc->count_thread[0];
2707 snew(atc->rcount,atc->nslab);
2708 snew(atc->buf_index,atc->nslab);
2711 atc->nthread = pme->nthread;
2712 if (atc->nthread > 1)
2714 snew(atc->thread_plist,atc->nthread);
2716 snew(atc->spline,atc->nthread);
2717 for(thread=0; thread<atc->nthread; thread++)
2719 if (atc->nthread > 1)
2721 snew(atc->thread_plist[thread].n,atc->nthread+2*GMX_CACHE_SEP);
2722 atc->thread_plist[thread].n += GMX_CACHE_SEP;
2724 snew(atc->spline[thread].thread_one,pme->nthread);
2725 atc->spline[thread].thread_one[thread] = 1;
2730 init_overlap_comm(pme_overlap_t * ol,
2740 int lbnd,rbnd,maxlr,b,i;
2743 pme_grid_comm_t *pgc;
2745 int fft_start,fft_end,send_index1,recv_index1;
2749 ol->mpi_comm = comm;
2752 ol->nnodes = nnodes;
2753 ol->nodeid = nodeid;
2755 /* Linear translation of the PME grid won't affect reciprocal space
2756 * calculations, so to optimize we only interpolate "upwards",
2757 * which also means we only have to consider overlap in one direction.
2758 * I.e., particles on this node might also be spread to grid indices
2759 * that belong to higher nodes (modulo nnodes)
2762 snew(ol->s2g0,ol->nnodes+1);
2763 snew(ol->s2g1,ol->nnodes);
2764 if (debug) { fprintf(debug,"PME slab boundaries:"); }
2765 for(i=0; i<nnodes; i++)
2767 /* s2g0 the local interpolation grid start.
2768 * s2g1 the local interpolation grid end.
2769 * Because grid overlap communication only goes forward,
2770 * the grid slabs for the FFTs should be rounded down.
2772 ol->s2g0[i] = ( i *ndata + 0 )/nnodes;
2773 ol->s2g1[i] = ((i+1)*ndata + nnodes-1)/nnodes + norder - 1;
2777 fprintf(debug," %3d %3d",ol->s2g0[i],ol->s2g1[i]);
2780 ol->s2g0[nnodes] = ndata;
2781 if (debug) { fprintf(debug,"\n"); }
2783 /* Determine with how many nodes we need to communicate the grid overlap */
2789 for(i=0; i<nnodes; i++)
2791 if ((i+b < nnodes && ol->s2g1[i] > ol->s2g0[i+b]) ||
2792 (i+b >= nnodes && ol->s2g1[i] > ol->s2g0[i+b-nnodes] + ndata))
2798 while (bCont && b < nnodes);
2799 ol->noverlap_nodes = b - 1;
2801 snew(ol->send_id,ol->noverlap_nodes);
2802 snew(ol->recv_id,ol->noverlap_nodes);
2803 for(b=0; b<ol->noverlap_nodes; b++)
2805 ol->send_id[b] = (ol->nodeid + (b + 1)) % ol->nnodes;
2806 ol->recv_id[b] = (ol->nodeid - (b + 1) + ol->nnodes) % ol->nnodes;
2808 snew(ol->comm_data, ol->noverlap_nodes);
2811 for(b=0; b<ol->noverlap_nodes; b++)
2813 pgc = &ol->comm_data[b];
2815 fft_start = ol->s2g0[ol->send_id[b]];
2816 fft_end = ol->s2g0[ol->send_id[b]+1];
2817 if (ol->send_id[b] < nodeid)
2822 send_index1 = ol->s2g1[nodeid];
2823 send_index1 = min(send_index1,fft_end);
2824 pgc->send_index0 = fft_start;
2825 pgc->send_nindex = max(0,send_index1 - pgc->send_index0);
2826 ol->send_size += pgc->send_nindex;
2828 /* We always start receiving at the first index of our slab */
2829 fft_start = ol->s2g0[ol->nodeid];
2830 fft_end = ol->s2g0[ol->nodeid+1];
2831 recv_index1 = ol->s2g1[ol->recv_id[b]];
2832 if (ol->recv_id[b] > nodeid)
2834 recv_index1 -= ndata;
2836 recv_index1 = min(recv_index1,fft_end);
2837 pgc->recv_index0 = fft_start;
2838 pgc->recv_nindex = max(0,recv_index1 - pgc->recv_index0);
2842 /* Communicate the buffer sizes to receive */
2843 for(b=0; b<ol->noverlap_nodes; b++)
2845 MPI_Sendrecv(&ol->send_size ,1,MPI_INT,ol->send_id[b],b,
2846 &ol->comm_data[b].recv_size,1,MPI_INT,ol->recv_id[b],b,
2847 ol->mpi_comm,&stat);
2851 /* For a non-divisible grid we need pme_order instead of pme_order-1 */
2852 snew(ol->sendbuf,norder*commplainsize);
2853 snew(ol->recvbuf,norder*commplainsize);
2857 make_gridindex5_to_localindex(int n,int local_start,int local_range,
2858 int **global_to_local,
2859 real **fraction_shift)
2867 for(i=0; (i<5*n); i++)
2869 /* Determine the global to local grid index */
2870 gtl[i] = (i - local_start + n) % n;
2871 /* For coordinates that fall within the local grid the fraction
2872 * is correct, so we don't need to shift it.
2875 if (local_range < n)
2877 /* Due to rounding issues i could be 1 beyond the lower or
2878 * upper boundary of the local grid. Correct the index for this.
2879 * If we shift the index, we need to shift the fraction by
2880 * the same amount in the other direction so as not to affect the weights.
2882 * Note that due to this shifting the weights at the end of
2883 * the spline might change, but that will only involve values
2884 * between zero and values close to the precision of a real,
2885 * which is anyhow the accuracy of the whole mesh calculation.
2887 /* With local_range=0 we should not change i=local_start */
2888 if (i % n != local_start)
2895 else if (gtl[i] == local_range)
2897 gtl[i] = local_range - 1;
2904 *global_to_local = gtl;
2905 *fraction_shift = fsh;
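/* Illustrative example (values assumed for clarity): with n = 10,
 * local_start = 4 and local_range = 4, the mapping above gives
 * gtl[i] = (i - 4 + 10) % 10, so global grid lines 4,5,6,7 map to local
 * indices 0,1,2,3; an index that rounds exactly onto a slab edge is
 * shifted back into range, with a compensating shift stored in fsh[].
 */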
2908 static pme_spline_work_t *make_pme_spline_work(int order)
2910 pme_spline_work_t *work;
2917 snew_aligned(work,1,16);
2919 zero_SSE = _mm_setzero_ps();
2921 /* Generate bit masks to mask out the unused grid entries,
2922 * as we only operate on "order" of the 8 grid entries that are
2923 * loaded into 2 SSE float registers.
2925 for(of=0; of<8-(order-1); of++)
2929 tmp[i] = (i >= of && i < of+order ? 1 : 0);
2931 work->mask_SSE0[of] = _mm_loadu_ps(tmp);
2932 work->mask_SSE1[of] = _mm_loadu_ps(tmp+4);
2933 work->mask_SSE0[of] = _mm_cmpgt_ps(work->mask_SSE0[of],zero_SSE);
2934 work->mask_SSE1[of] = _mm_cmpgt_ps(work->mask_SSE1[of],zero_SSE);
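/* Example (illustrative): for order = 4 and offset of = 1 the temporary
 * array is {0,1,1,1, 1,0,0,0}, so mask_SSE0 selects grid entries 1-3 and
 * mask_SSE1 selects entry 4, while the remaining lanes of the two 4-float
 * registers are masked out.
 */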
2944 gmx_pme_check_grid_restrictions(FILE *fplog,char dim,int nnodes,int *nk)
2948 if (*nk % nnodes != 0)
2950 nk_new = nnodes*(*nk/nnodes + 1);
2952 if (2*nk_new >= 3*(*nk))
2954 gmx_fatal(FARGS,"The PME grid size in dim %c (%d) is not divisble by the number of nodes doing PME in dim %c (%d). The grid size would have to be increased by more than 50%% to make the grid divisible. Change the total number of nodes or the number of domain decomposition cells in x or the PME grid %c dimension (and the cut-off).",
2955 dim,*nk,dim,nnodes,dim);
2960 fprintf(fplog,"\nNOTE: The PME grid size in dim %c (%d) is not divisible by the number of nodes doing PME in dim %c (%d). Increasing the PME grid size in dim %c to %d. This will increase the accuracy and will not decrease the performance significantly on this number of nodes. For optimal performance change the total number of nodes or the number of domain decomposition cells in x or the PME grid %c dimension (and the cut-off).\n\n",
2961 dim,*nk,dim,nnodes,dim,nk_new,dim);
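/* Worked example (illustrative): for *nk = 125 grid lines on nnodes = 8 PME
 * nodes, 125 % 8 != 0 and nk_new = 8*(125/8 + 1) = 128; since 2*128 < 3*125
 * the grid is enlarged to 128 with the note printed above instead of aborting.
 */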
2968 int gmx_pme_init(gmx_pme_t * pmedata,
2974 gmx_bool bFreeEnergy,
2975 gmx_bool bReproducible,
2980 pme_atomcomm_t *atc;
2984 fprintf(debug,"Creating PME data structures.\n");
2987 pme->redist_init = FALSE;
2988 pme->sum_qgrid_tmp = NULL;
2989 pme->sum_qgrid_dd_tmp = NULL;
2990 pme->buf_nalloc = 0;
2991 pme->redist_buf_nalloc = 0;
2994 pme->bPPnode = TRUE;
2996 pme->nnodes_major = nnodes_major;
2997 pme->nnodes_minor = nnodes_minor;
3000 if (nnodes_major*nnodes_minor > 1)
3002 pme->mpi_comm = cr->mpi_comm_mygroup;
3004 MPI_Comm_rank(pme->mpi_comm,&pme->nodeid);
3005 MPI_Comm_size(pme->mpi_comm,&pme->nnodes);
3006 if (pme->nnodes != nnodes_major*nnodes_minor)
3008 gmx_incons("PME node count mismatch");
3013 pme->mpi_comm = MPI_COMM_NULL;
3017 if (pme->nnodes == 1)
3020 pme->mpi_comm_d[0] = MPI_COMM_NULL;
3021 pme->mpi_comm_d[1] = MPI_COMM_NULL;
3023 pme->ndecompdim = 0;
3024 pme->nodeid_major = 0;
3025 pme->nodeid_minor = 0;
3027 pme->mpi_comm_d[0] = pme->mpi_comm_d[1] = MPI_COMM_NULL;
3032 if (nnodes_minor == 1)
3035 pme->mpi_comm_d[0] = pme->mpi_comm;
3036 pme->mpi_comm_d[1] = MPI_COMM_NULL;
3038 pme->ndecompdim = 1;
3039 pme->nodeid_major = pme->nodeid;
3040 pme->nodeid_minor = 0;
3043 else if (nnodes_major == 1)
3046 pme->mpi_comm_d[0] = MPI_COMM_NULL;
3047 pme->mpi_comm_d[1] = pme->mpi_comm;
3049 pme->ndecompdim = 1;
3050 pme->nodeid_major = 0;
3051 pme->nodeid_minor = pme->nodeid;
3055 if (pme->nnodes % nnodes_major != 0)
3057 gmx_incons("For 2D PME decomposition, #PME nodes must be divisible by the number of nodes in the major dimension");
3059 pme->ndecompdim = 2;
3062 MPI_Comm_split(pme->mpi_comm,pme->nodeid % nnodes_minor,
3063 pme->nodeid,&pme->mpi_comm_d[0]); /* My communicator along major dimension */
3064 MPI_Comm_split(pme->mpi_comm,pme->nodeid/nnodes_minor,
3065 pme->nodeid,&pme->mpi_comm_d[1]); /* My communicator along minor dimension */
3067 MPI_Comm_rank(pme->mpi_comm_d[0],&pme->nodeid_major);
3068 MPI_Comm_size(pme->mpi_comm_d[0],&pme->nnodes_major);
3069 MPI_Comm_rank(pme->mpi_comm_d[1],&pme->nodeid_minor);
3070 MPI_Comm_size(pme->mpi_comm_d[1],&pme->nnodes_minor);
3073 pme->bPPnode = (cr->duty & DUTY_PP);
3076 pme->nthread = nthread;
3078 if (ir->ePBC == epbcSCREW)
3080 gmx_fatal(FARGS,"pme does not (yet) work with pbc = screw");
3083 pme->bFEP = ((ir->efep != efepNO) && bFreeEnergy);
3087 pme->bP3M = (ir->coulombtype == eelP3M_AD || getenv("GMX_PME_P3M") != NULL);
3088 pme->pme_order = ir->pme_order;
3089 pme->epsilon_r = ir->epsilon_r;
3091 if (pme->pme_order > PME_ORDER_MAX)
3093 gmx_fatal(FARGS,"pme_order (%d) is larger than the maximum allowed value (%d). Modify and recompile the code if you really need such a high order.",
3094 pme->pme_order,PME_ORDER_MAX);
3097 /* Currently pme.c supports only the fft5d FFT code.
3098 * Therefore the grid always needs to be divisible by nnodes.
3099 * When the old 1D code is also supported again, change this check.
3101 * This check should be done before calling gmx_pme_init
3102 * and fplog should be passed instead of stderr.
3104 if (pme->ndecompdim >= 2)
3106 if (pme->ndecompdim >= 1)
3109 gmx_pme_check_grid_restrictions(pme->nodeid==0 ? stderr : NULL,
3110 'x',nnodes_major,&pme->nkx);
3111 gmx_pme_check_grid_restrictions(pme->nodeid==0 ? stderr : NULL,
3112 'y',nnodes_minor,&pme->nky);
3116 if (pme->nkx <= pme->pme_order*(pme->nnodes_major > 1 ? 2 : 1) ||
3117 pme->nky <= pme->pme_order*(pme->nnodes_minor > 1 ? 2 : 1) ||
3118 pme->nkz <= pme->pme_order)
3120 gmx_fatal(FARGS,"The PME grid sizes need to be larger than pme_order (%d) and for dimensions with domain decomposition larger than 2*pme_order",pme->pme_order);
3123 if (pme->nnodes > 1) {
3127 MPI_Type_contiguous(DIM, mpi_type, &(pme->rvec_mpi));
3128 MPI_Type_commit(&(pme->rvec_mpi));
3131 /* Note that the charge spreading and force gathering, which usually
3132 * take about the same amount of time as FFT+solve_pme,
3133 * are always fully load balanced
3134 * (unless the charge distribution is inhomogeneous).
3137 imbal = pme_load_imbalance(pme);
3138 if (imbal >= 1.2 && pme->nodeid_major == 0 && pme->nodeid_minor == 0)
3142 "NOTE: The load imbalance in PME FFT and solve is %d%%.\n"
3143 " For optimal PME load balancing\n"
3144 " PME grid_x (%d) and grid_y (%d) should be divisible by #PME_nodes_x (%d)\n"
3145 " and PME grid_y (%d) and grid_z (%d) should be divisible by #PME_nodes_y (%d)\n"
3147 (int)((imbal-1)*100 + 0.5),
3148 pme->nkx,pme->nky,pme->nnodes_major,
3149 pme->nky,pme->nkz,pme->nnodes_minor);
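/* Example (illustrative): an imbalance factor of 1.17 returned by
 * pme_load_imbalance() is reported above as (int)((1.17-1)*100 + 0.5) = 17%.
 */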
3153 /* For a non-divisible grid we need pme_order instead of pme_order-1 */
3154 /* In sum_qgrid_dd x overlap is copied in place: take padding into account.
3155 * y is always copied through a buffer: we don't need padding in z,
3156 * but we do need the overlap in x because of the communication order.
3158 init_overlap_comm(&pme->overlap[0],pme->pme_order,
3162 pme->nnodes_major,pme->nodeid_major,
3164 (div_round_up(pme->nky,pme->nnodes_minor)+pme->pme_order)*(pme->nkz+pme->pme_order-1));
3166 /* Along overlap dim 1 we can send in multiple pulses in sum_fftgrid_dd.
3167 * We do this with an offset buffer of equal size, so we need to allocate
3168 * extra for the offset. That's what the (+1)*pme->nkz is for.
3170 init_overlap_comm(&pme->overlap[1],pme->pme_order,
3174 pme->nnodes_minor,pme->nodeid_minor,
3176 (div_round_up(pme->nkx,pme->nnodes_major)+pme->pme_order+1)*pme->nkz);
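/* Sizing sketch (assuming div_round_up(n,d) rounds up to the next integer):
 * with nkx = 100 and 3 nodes along the major dimension,
 * div_round_up(100,3) = 34, so the commplainsize passed here is
 * (34 + pme_order + 1)*nkz, and init_overlap_comm allocates pme_order times
 * that many reals for each of the send and receive buffers.
 */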
3178 /* Check for a limitation of the (current) sum_fftgrid_dd code.
3179 * We only allow multiple communication pulses in dim 1, not in dim 0.
3181 if (pme->nthread > 1 && (pme->overlap[0].noverlap_nodes > 1 ||
3182 pme->nkx < pme->nnodes_major*pme->pme_order))
3184 gmx_fatal(FARGS,"The number of PME grid lines per node along x is %g. But when using OpenMP threads, the number of grid lines per node along x and should be >= pme_order (%d). To resolve this issue, use less nodes along x (and possibly more along y and/or z) by specifying -dd manually.",
3185 pme->nkx/(double)pme->nnodes_major,pme->pme_order);
3188 snew(pme->bsp_mod[XX],pme->nkx);
3189 snew(pme->bsp_mod[YY],pme->nky);
3190 snew(pme->bsp_mod[ZZ],pme->nkz);
3192 /* The required size of the interpolation grid, including overlap.
3193 * The allocated size (pmegrid_n?) might be slightly larger.
3195 pme->pmegrid_nx = pme->overlap[0].s2g1[pme->nodeid_major] -
3196 pme->overlap[0].s2g0[pme->nodeid_major];
3197 pme->pmegrid_ny = pme->overlap[1].s2g1[pme->nodeid_minor] -
3198 pme->overlap[1].s2g0[pme->nodeid_minor];
3199 pme->pmegrid_nz_base = pme->nkz;
3200 pme->pmegrid_nz = pme->pmegrid_nz_base + pme->pme_order - 1;
3201 set_grid_alignment(&pme->pmegrid_nz,pme->pme_order);
3203 pme->pmegrid_start_ix = pme->overlap[0].s2g0[pme->nodeid_major];
3204 pme->pmegrid_start_iy = pme->overlap[1].s2g0[pme->nodeid_minor];
3205 pme->pmegrid_start_iz = 0;
3207 make_gridindex5_to_localindex(pme->nkx,
3208 pme->pmegrid_start_ix,
3209 pme->pmegrid_nx - (pme->pme_order-1),
3210 &pme->nnx,&pme->fshx);
3211 make_gridindex5_to_localindex(pme->nky,
3212 pme->pmegrid_start_iy,
3213 pme->pmegrid_ny - (pme->pme_order-1),
3214 &pme->nny,&pme->fshy);
3215 make_gridindex5_to_localindex(pme->nkz,
3216 pme->pmegrid_start_iz,
3217 pme->pmegrid_nz_base,
3218 &pme->nnz,&pme->fshz);
3220 pmegrids_init(&pme->pmegridA,
3221 pme->pmegrid_nx,pme->pmegrid_ny,pme->pmegrid_nz,
3222 pme->pmegrid_nz_base,
3225 pme->overlap[0].s2g1[pme->nodeid_major]-pme->overlap[0].s2g0[pme->nodeid_major+1],
3226 pme->overlap[1].s2g1[pme->nodeid_minor]-pme->overlap[1].s2g0[pme->nodeid_minor+1]);
3228 pme->spline_work = make_pme_spline_work(pme->pme_order);
3230 ndata[0] = pme->nkx;
3231 ndata[1] = pme->nky;
3232 ndata[2] = pme->nkz;
3234 /* This routine will allocate the grid data to fit the FFTs */
3235 gmx_parallel_3dfft_init(&pme->pfft_setupA,ndata,
3236 &pme->fftgridA,&pme->cfftgridA,
3238 pme->overlap[0].s2g0,pme->overlap[1].s2g0,
3239 bReproducible,pme->nthread);
3243 pmegrids_init(&pme->pmegridB,
3244 pme->pmegrid_nx,pme->pmegrid_ny,pme->pmegrid_nz,
3245 pme->pmegrid_nz_base,
3248 pme->nkx % pme->nnodes_major != 0,
3249 pme->nky % pme->nnodes_minor != 0);
3251 gmx_parallel_3dfft_init(&pme->pfft_setupB,ndata,
3252 &pme->fftgridB,&pme->cfftgridB,
3254 pme->overlap[0].s2g0,pme->overlap[1].s2g0,
3255 bReproducible,pme->nthread);
3259 pme->pmegridB.grid.grid = NULL;
3260 pme->fftgridB = NULL;
3261 pme->cfftgridB = NULL;
3266 /* Use plain SPME B-spline interpolation */
3267 make_bspline_moduli(pme->bsp_mod,pme->nkx,pme->nky,pme->nkz,pme->pme_order);
3271 /* Use the P3M grid-optimized influence function */
3272 make_p3m_bspline_moduli(pme->bsp_mod,pme->nkx,pme->nky,pme->nkz,pme->pme_order);
3275 /* Use atc[0] for spreading */
3276 init_atomcomm(pme,&pme->atc[0],cr,nnodes_major > 1 ? 0 : 1,TRUE);
3277 if (pme->ndecompdim >= 2)
3279 init_atomcomm(pme,&pme->atc[1],cr,1,FALSE);
3282 if (pme->nnodes == 1) {
3283 pme->atc[0].n = homenr;
3284 pme_realloc_atomcomm_things(&pme->atc[0]);
3290 /* Use fft5d, order after FFT is y major, z, x minor */
3292 snew(pme->work,pme->nthread);
3293 for(thread=0; thread<pme->nthread; thread++)
3295 realloc_work(&pme->work[thread],pme->nkx);
3304 static void reuse_pmegrids(const pmegrids_t *old,pmegrids_t *new)
3308 for(d=0; d<DIM; d++)
3310 if (new->grid.n[d] > old->grid.n[d])
3316 sfree_aligned(new->grid.grid);
3317 new->grid.grid = old->grid.grid;
3319 if (new->nthread > 1 && new->nthread == old->nthread)
3321 sfree_aligned(new->grid_all);
3322 for(t=0; t<new->nthread; t++)
3324 new->grid_th[t].grid = old->grid_th[t].grid;
3329 int gmx_pme_reinit(gmx_pme_t * pmedata,
3332 const t_inputrec * ir,
3340 irc.nkx = grid_size[XX];
3341 irc.nky = grid_size[YY];
3342 irc.nkz = grid_size[ZZ];
3344 if (pme_src->nnodes == 1)
3346 homenr = pme_src->atc[0].n;
3353 ret = gmx_pme_init(pmedata,cr,pme_src->nnodes_major,pme_src->nnodes_minor,
3354 &irc,homenr,pme_src->bFEP,FALSE,pme_src->nthread);
3358 /* We can easily reuse the allocated pme grids in pme_src */
3359 reuse_pmegrids(&pme_src->pmegridA,&(*pmedata)->pmegridA);
3360 /* We would like to reuse the fft grids, but that's harder */
3367 static void copy_local_grid(gmx_pme_t pme,
3368 pmegrids_t *pmegrids,int thread,real *fftgrid)
3370 ivec local_fft_ndata,local_fft_offset,local_fft_size;
3374 int offx,offy,offz,x,y,z,i0,i0t;
3379 gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
3383 fft_my = local_fft_size[YY];
3384 fft_mz = local_fft_size[ZZ];
3386 pmegrid = &pmegrids->grid_th[thread];
3388 nsx = pmegrid->s[XX];
3389 nsy = pmegrid->s[YY];
3390 nsz = pmegrid->s[ZZ];
3392 for(d=0; d<DIM; d++)
3394 nf[d] = min(pmegrid->n[d] - (pmegrid->order - 1),
3395 local_fft_ndata[d] - pmegrid->offset[d]);
3398 offx = pmegrid->offset[XX];
3399 offy = pmegrid->offset[YY];
3400 offz = pmegrid->offset[ZZ];
3402 /* Directly copy the non-overlapping parts of the local grids.
3403 * This also initializes the full grid.
3405 grid_th = pmegrid->grid;
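/* Both grids are stored x-major: the fft grid uses pitches fft_my and fft_mz,
 * the thread grid uses nsy and nsz, so point (x,y,z) of the thread grid is
 * written to fft index ((offx+x)*fft_my + (offy+y))*fft_mz + (offz+z).
 */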
3406 for(x=0; x<nf[XX]; x++)
3408 for(y=0; y<nf[YY]; y++)
3410 i0 = ((offx + x)*fft_my + (offy + y))*fft_mz + offz;
3411 i0t = (x*nsy + y)*nsz;
3412 for(z=0; z<nf[ZZ]; z++)
3414 fftgrid[i0+z] = grid_th[i0t+z];
3421 reduce_threadgrid_overlap(gmx_pme_t pme,
3422 const pmegrids_t *pmegrids,int thread,
3423 real *fftgrid,real *commbuf_x,real *commbuf_y)
3425 ivec local_fft_ndata,local_fft_offset,local_fft_size;
3426 int fft_nx,fft_ny,fft_nz;
3431 int offx,offy,offz,x,y,z,i0,i0t;
3432 int sx,sy,sz,fx,fy,fz,tx1,ty1,tz1,ox,oy,oz;
3433 gmx_bool bClearBufX,bClearBufY,bClearBufXY,bClearBuf;
3434 gmx_bool bCommX,bCommY;
3437 const pmegrid_t *pmegrid,*pmegrid_g,*pmegrid_f;
3438 const real *grid_th;
3441 gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
3445 fft_nx = local_fft_ndata[XX];
3446 fft_ny = local_fft_ndata[YY];
3447 fft_nz = local_fft_ndata[ZZ];
3449 fft_my = local_fft_size[YY];
3450 fft_mz = local_fft_size[ZZ];
3452 /* This routine is called when all threads have finished spreading.
3453 * Here each thread sums grid contributions calculated by other threads
3454 * to the thread local grid volume.
3455 * To minimize the number of grid copying operations,
3456 * this routine sums immediately from the pmegrid to the fftgrid.
3459 /* Determine which part of the full node grid we should operate on:
3460 * this is our thread-local part of the full grid.
3462 pmegrid = &pmegrids->grid_th[thread];
3464 for(d=0; d<DIM; d++)
3466 ne[d] = min(pmegrid->offset[d]+pmegrid->n[d]-(pmegrid->order-1),
3467 local_fft_ndata[d]);
3470 offx = pmegrid->offset[XX];
3471 offy = pmegrid->offset[YY];
3472 offz = pmegrid->offset[ZZ];
3479 /* Now loop over all the thread data blocks that contribute
3480 * to the grid region we (our thread) are operating on.
3482 /* Note that fft_nx/y is equal to the number of grid points
3483 * between the first point of our node grid and the one of the next node.
3485 for(sx=0; sx>=-pmegrids->nthread_comm[XX]; sx--)
3487 fx = pmegrid->ci[XX] + sx;
3491 fx += pmegrids->nc[XX];
3493 bCommX = (pme->nnodes_major > 1);
3495 pmegrid_g = &pmegrids->grid_th[fx*pmegrids->nc[YY]*pmegrids->nc[ZZ]];
3496 ox += pmegrid_g->offset[XX];
3499 tx1 = min(ox + pmegrid_g->n[XX],ne[XX]);
3503 tx1 = min(ox + pmegrid_g->n[XX],pme->pme_order);
3506 for(sy=0; sy>=-pmegrids->nthread_comm[YY]; sy--)
3508 fy = pmegrid->ci[YY] + sy;
3512 fy += pmegrids->nc[YY];
3514 bCommY = (pme->nnodes_minor > 1);
3516 pmegrid_g = &pmegrids->grid_th[fy*pmegrids->nc[ZZ]];
3517 oy += pmegrid_g->offset[YY];
3520 ty1 = min(oy + pmegrid_g->n[YY],ne[YY]);
3524 ty1 = min(oy + pmegrid_g->n[YY],pme->pme_order);
3527 for(sz=0; sz>=-pmegrids->nthread_comm[ZZ]; sz--)
3529 fz = pmegrid->ci[ZZ] + sz;
3533 fz += pmegrids->nc[ZZ];
3536 pmegrid_g = &pmegrids->grid_th[fz];
3537 oz += pmegrid_g->offset[ZZ];
3538 tz1 = min(oz + pmegrid_g->n[ZZ],ne[ZZ]);
3540 if (sx == 0 && sy == 0 && sz == 0)
3542 /* We have already added our local contribution
3543 * before calling this routine, so skip it here.
3548 thread_f = (fx*pmegrids->nc[YY] + fy)*pmegrids->nc[ZZ] + fz;
3550 pmegrid_f = &pmegrids->grid_th[thread_f];
3552 grid_th = pmegrid_f->grid;
3554 nsx = pmegrid_f->s[XX];
3555 nsy = pmegrid_f->s[YY];
3556 nsz = pmegrid_f->s[ZZ];
3558 #ifdef DEBUG_PME_REDUCE
3559 printf("n%d t%d add %d %2d %2d %2d %2d %2d %2d %2d-%2d %2d-%2d, %2d-%2d %2d-%2d, %2d-%2d %2d-%2d\n",
3560 pme->nodeid,thread,thread_f,
3561 pme->pmegrid_start_ix,
3562 pme->pmegrid_start_iy,
3563 pme->pmegrid_start_iz,
3565 offx-ox,tx1-ox,offx,tx1,
3566 offy-oy,ty1-oy,offy,ty1,
3567 offz-oz,tz1-oz,offz,tz1);
3570 if (!(bCommX || bCommY))
3572 /* Copy from the thread local grid to the node grid */
3573 for(x=offx; x<tx1; x++)
3575 for(y=offy; y<ty1; y++)
3577 i0 = (x*fft_my + y)*fft_mz;
3578 i0t = ((x - ox)*nsy + (y - oy))*nsz - oz;
3579 for(z=offz; z<tz1; z++)
3581 fftgrid[i0+z] += grid_th[i0t+z];
3588 /* The order of this conditional decides
3589 * where the corner volume gets stored with x+y decomp.
3593 commbuf = commbuf_y;
3594 buf_my = ty1 - offy;
3597 /* We index commbuf modulo the local grid size */
3598 commbuf += buf_my*fft_nx*fft_nz;
3600 bClearBuf = bClearBufXY;
3601 bClearBufXY = FALSE;
3605 bClearBuf = bClearBufY;
3611 commbuf = commbuf_x;
3613 bClearBuf = bClearBufX;
3617 /* Copy to the communication buffer */
3618 for(x=offx; x<tx1; x++)
3620 for(y=offy; y<ty1; y++)
3622 i0 = (x*buf_my + y)*fft_nz;
3623 i0t = ((x - ox)*nsy + (y - oy))*nsz - oz;
3627 /* First access of commbuf, initialize it */
3628 for(z=offz; z<tz1; z++)
3630 commbuf[i0+z] = grid_th[i0t+z];
3635 for(z=offz; z<tz1; z++)
3637 commbuf[i0+z] += grid_th[i0t+z];
3649 static void sum_fftgrid_dd(gmx_pme_t pme,real *fftgrid)
3651 ivec local_fft_ndata,local_fft_offset,local_fft_size;
3652 pme_overlap_t *overlap;
3653 int send_index0,send_nindex;
3658 int send_size_y,recv_size_y;
3659 int ipulse,send_id,recv_id,datasize,gridsize,size_yx;
3660 real *sendptr,*recvptr;
3661 int x,y,z,indg,indb;
3663 /* Note that this routine is only used for forward communication.
3664 * Since the force gathering, unlike the charge spreading,
3665 * can be trivially parallelized over the particles,
3666 * the backwards process is much simpler and can use the "old"
3667 * communication setup.
3670 gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
3675 if (pme->nnodes_minor > 1)
3677 /* Major dimension */
3678 overlap = &pme->overlap[1];
3680 if (pme->nnodes_major > 1)
3682 size_yx = pme->overlap[0].comm_data[0].send_nindex;
3688 datasize = (local_fft_ndata[XX] + size_yx)*local_fft_ndata[ZZ];
3690 send_size_y = overlap->send_size;
3692 for(ipulse=0;ipulse<overlap->noverlap_nodes;ipulse++)
3694 send_id = overlap->send_id[ipulse];
3695 recv_id = overlap->recv_id[ipulse];
3697 overlap->comm_data[ipulse].send_index0 -
3698 overlap->comm_data[0].send_index0;
3699 send_nindex = overlap->comm_data[ipulse].send_nindex;
3700 /* We don't use recv_index0, as we always receive starting at 0 */
3701 recv_nindex = overlap->comm_data[ipulse].recv_nindex;
3702 recv_size_y = overlap->comm_data[ipulse].recv_size;
3704 sendptr = overlap->sendbuf + send_index0*local_fft_ndata[ZZ];
3705 recvptr = overlap->recvbuf;
3708 MPI_Sendrecv(sendptr,send_size_y*datasize,GMX_MPI_REAL,
3710 recvptr,recv_size_y*datasize,GMX_MPI_REAL,
3712 overlap->mpi_comm,&stat);
3715 for(x=0; x<local_fft_ndata[XX]; x++)
3717 for(y=0; y<recv_nindex; y++)
3719 indg = (x*local_fft_size[YY] + y)*local_fft_size[ZZ];
3720 indb = (x*recv_size_y + y)*local_fft_ndata[ZZ];
3721 for(z=0; z<local_fft_ndata[ZZ]; z++)
3723 fftgrid[indg+z] += recvptr[indb+z];
3728 if (pme->nnodes_major > 1)
3730 /* Copy from the received buffer to the send buffer for dim 0 */
3731 sendptr = pme->overlap[0].sendbuf;
3732 for(x=0; x<size_yx; x++)
3734 for(y=0; y<recv_nindex; y++)
3736 indg = (x*local_fft_ndata[YY] + y)*local_fft_ndata[ZZ];
3737 indb = ((local_fft_ndata[XX] + x)*recv_size_y + y)*local_fft_ndata[ZZ];
3738 for(z=0; z<local_fft_ndata[ZZ]; z++)
3740 sendptr[indg+z] += recvptr[indb+z];
3748 /* We only support a single pulse here.
3749 * This is not a severe limitation, as this code is only used
3750 * with OpenMP and with OpenMP the (PME) domains can be larger.
3752 if (pme->nnodes_major > 1)
3754 /* Major dimension */
3755 overlap = &pme->overlap[0];
3757 datasize = local_fft_ndata[YY]*local_fft_ndata[ZZ];
3758 gridsize = local_fft_size[YY] *local_fft_size[ZZ];
3762 send_id = overlap->send_id[ipulse];
3763 recv_id = overlap->recv_id[ipulse];
3764 send_nindex = overlap->comm_data[ipulse].send_nindex;
3765 /* We don't use recv_index0, as we always receive starting at 0 */
3766 recv_nindex = overlap->comm_data[ipulse].recv_nindex;
3768 sendptr = overlap->sendbuf;
3769 recvptr = overlap->recvbuf;
3773 fprintf(debug,"PME fftgrid comm %2d x %2d x %2d\n",
3774 send_nindex,local_fft_ndata[YY],local_fft_ndata[ZZ]);
3778 MPI_Sendrecv(sendptr,send_nindex*datasize,GMX_MPI_REAL,
3780 recvptr,recv_nindex*datasize,GMX_MPI_REAL,
3782 overlap->mpi_comm,&stat);
3785 for(x=0; x<recv_nindex; x++)
3787 for(y=0; y<local_fft_ndata[YY]; y++)
3789 indg = (x*local_fft_size[YY] + y)*local_fft_size[ZZ];
3790 indb = (x*local_fft_ndata[YY] + y)*local_fft_ndata[ZZ];
3791 for(z=0; z<local_fft_ndata[ZZ]; z++)
3793 fftgrid[indg+z] += recvptr[indb+z];
3801 static void spread_on_grid(gmx_pme_t pme,
3802 pme_atomcomm_t *atc,pmegrids_t *grids,
3803 gmx_bool bCalcSplines,gmx_bool bSpread,
3807 #ifdef PME_TIME_THREADS
3808 gmx_cycles_t c1,c2,c3,ct1a,ct1b,ct1c;
3809 static double cs1=0,cs2=0,cs3=0;
3810 static double cs1a[6]={0,0,0,0,0,0};
3814 nthread = pme->nthread;
3817 #ifdef PME_TIME_THREADS
3818 c1 = omp_cyc_start();
3822 #pragma omp parallel for num_threads(nthread) schedule(static)
3823 for(thread=0; thread<nthread; thread++)
3827 start = atc->n* thread /nthread;
3828 end = atc->n*(thread+1)/nthread;
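/* Example (illustrative): for atc->n = 10 atoms and nthread = 4 the static
 * chunks are [0,2), [2,5), [5,7) and [7,10), so the integer rounding makes
 * the chunk sizes differ by at most one atom.
 */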
3830 /* Compute fftgrid index for all atoms,
3831 * with help of some extra variables.
3833 calc_interpolation_idx(pme,atc,start,end,thread);
3836 #ifdef PME_TIME_THREADS
3837 c1 = omp_cyc_end(c1);
3841 #ifdef PME_TIME_THREADS
3842 c2 = omp_cyc_start();
3844 #pragma omp parallel for num_threads(nthread) schedule(static)
3845 for(thread=0; thread<nthread; thread++)
3847 splinedata_t *spline;
3850 /* make local bsplines */
3851 if (grids == NULL || grids->nthread == 1)
3853 spline = &atc->spline[0];
3857 grid = &grids->grid;
3861 spline = &atc->spline[thread];
3863 make_thread_local_ind(atc,thread,spline);
3865 grid = &grids->grid_th[thread];
3870 make_bsplines(spline->theta,spline->dtheta,pme->pme_order,
3871 atc->fractx,spline->n,spline->ind,atc->q,pme->bFEP);
3876 /* put local atoms on grid. */
3877 #ifdef PME_TIME_SPREAD
3878 ct1a = omp_cyc_start();
3880 spread_q_bsplines_thread(grid,atc,spline,pme->spline_work);
3882 if (grids->nthread > 1)
3884 copy_local_grid(pme,grids,thread,fftgrid);
3886 #ifdef PME_TIME_SPREAD
3887 ct1a = omp_cyc_end(ct1a);
3888 cs1a[thread] += (double)ct1a;
3892 #ifdef PME_TIME_THREADS
3893 c2 = omp_cyc_end(c2);
3897 if (bSpread && grids->nthread > 1)
3899 #ifdef PME_TIME_THREADS
3900 c3 = omp_cyc_start();
3902 #pragma omp parallel for num_threads(grids->nthread) schedule(static)
3903 for(thread=0; thread<grids->nthread; thread++)
3905 reduce_threadgrid_overlap(pme,grids,thread,
3907 pme->overlap[0].sendbuf,
3908 pme->overlap[1].sendbuf);
3910 #ifdef PME_TIME_THREADS
3911 c3 = omp_cyc_end(c3);
3915 if (pme->nnodes > 1)
3917 /* Communicate the overlapping part of the fftgrid */
3918 sum_fftgrid_dd(pme,fftgrid);
3922 #ifdef PME_TIME_THREADS
3926 printf("idx %.2f spread %.2f red %.2f",
3927 cs1*1e-9,cs2*1e-9,cs3*1e-9);
3928 #ifdef PME_TIME_SPREAD
3929 for(thread=0; thread<nthread; thread++)
3930 printf(" %.2f",cs1a[thread]*1e-9);
3938 static void dump_grid(FILE *fp,
3939 int sx,int sy,int sz,int nx,int ny,int nz,
3940 int my,int mz,const real *g)
3950 fprintf(fp,"%2d %2d %2d %6.3f\n",
3951 sx+x,sy+y,sz+z,g[(x*my + y)*mz + z]);
3957 static void dump_local_fftgrid(gmx_pme_t pme,const real *fftgrid)
3959 ivec local_fft_ndata,local_fft_offset,local_fft_size;
3961 gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
3967 pme->pmegrid_start_ix,
3968 pme->pmegrid_start_iy,
3969 pme->pmegrid_start_iz,
3970 pme->pmegrid_nx-pme->pme_order+1,
3971 pme->pmegrid_ny-pme->pme_order+1,
3972 pme->pmegrid_nz-pme->pme_order+1,
3979 void gmx_pme_calc_energy(gmx_pme_t pme,int n,rvec *x,real *q,real *V)
3981 pme_atomcomm_t *atc;
3984 if (pme->nnodes > 1)
3986 gmx_incons("gmx_pme_calc_energy called in parallel");
3990 gmx_incons("gmx_pme_calc_energy with free energy");
3993 atc = &pme->atc_energy;
3995 if (atc->spline == NULL)
3997 snew(atc->spline,atc->nthread);
4000 atc->bSpread = TRUE;
4001 atc->pme_order = pme->pme_order;
4003 pme_realloc_atomcomm_things(atc);
4007 /* We only use the A-charges grid */
4008 grid = &pme->pmegridA;
4010 spread_on_grid(pme,atc,NULL,TRUE,FALSE,pme->fftgridA);
4012 *V = gather_energy_bsplines(pme,grid->grid.grid,atc);
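/* Hypothetical usage sketch (not a call made in this file):
 *
 *     real V;
 *     gmx_pme_calc_energy(pme, n_insert, x_insert, q_insert, &V);
 *
 * Here n_insert, x_insert and q_insert are caller-provided test charges;
 * the routine computes their spline weights and gathers an energy from the
 * A charge grid into V, and is only valid for a serial, non-free-energy
 * PME setup, as checked above.
 */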
4016 static void reset_pmeonly_counters(t_commrec *cr,gmx_wallcycle_t wcycle,
4017 t_nrnb *nrnb,t_inputrec *ir, gmx_large_int_t step_rel)
4019 /* Reset all the counters related to performance over the run */
4020 wallcycle_stop(wcycle,ewcRUN);
4021 wallcycle_reset_all(wcycle);
4023 ir->init_step += step_rel;
4024 ir->nsteps -= step_rel;
4025 wallcycle_start(wcycle,ewcRUN);
4029 static void gmx_pmeonly_switch(int *npmedata, gmx_pme_t **pmedata,
4031 t_commrec *cr, t_inputrec *ir,
4035 gmx_pme_t pme = NULL;
4038 while (ind < *npmedata)
4040 pme = (*pmedata)[ind];
4041 if (pme->nkx == grid_size[XX] &&
4042 pme->nky == grid_size[YY] &&
4043 pme->nkz == grid_size[ZZ])
4054 srenew(*pmedata,*npmedata);
4056 /* Generate a new PME data structure, copying part of the old pointers */
4057 gmx_pme_reinit(&((*pmedata)[ind]),cr,pme,ir,grid_size);
4059 *pme_ret = (*pmedata)[ind];
4063 int gmx_pmeonly(gmx_pme_t pme,
4064 t_commrec *cr, t_nrnb *nrnb,
4065 gmx_wallcycle_t wcycle,
4066 real ewaldcoeff, gmx_bool bGatherOnly,
4071 gmx_pme_pp_t pme_pp;
4074 rvec *x_pp=NULL,*f_pp=NULL;
4075 real *chargeA=NULL,*chargeB=NULL;
4077 int maxshift_x=0,maxshift_y=0;
4078 real energy,dvdlambda;
4083 gmx_large_int_t step,step_rel;
4086 /* This data will only be used with PME tuning, i.e. switching PME grids */
4088 snew(pmedata,npmedata);
4091 pme_pp = gmx_pme_pp_init(cr);
4096 do /****** this is a quasi-loop over time steps! */
4098 /* The reason for having a loop here is PME grid tuning/switching */
4101 /* Domain decomposition */
4102 natoms = gmx_pme_recv_q_x(pme_pp,
4103 &chargeA,&chargeB,box,&x_pp,&f_pp,
4104 &maxshift_x,&maxshift_y,
4108 grid_switch,&ewaldcoeff);
4112 /* Switch the PME grid to grid_switch */
4113 gmx_pmeonly_switch(&npmedata,&pmedata,grid_switch,cr,ir,&pme);
4116 while (natoms == -2);
4120 /* We should stop: break out of the loop */
4124 step_rel = step - ir->init_step;
4127 wallcycle_start(wcycle,ewcRUN);
4129 wallcycle_start(wcycle,ewcPMEMESH);
4133 gmx_pme_do(pme,0,natoms,x_pp,f_pp,chargeA,chargeB,box,
4134 cr,maxshift_x,maxshift_y,nrnb,wcycle,vir,ewaldcoeff,
4135 &energy,lambda,&dvdlambda,
4136 GMX_PME_DO_ALL_F | (bEnerVir ? GMX_PME_CALC_ENER_VIR : 0));
4138 cycles = wallcycle_stop(wcycle,ewcPMEMESH);
4140 gmx_pme_send_force_vir_ener(pme_pp,
4141 f_pp,vir,energy,dvdlambda,
4146 if (step_rel == wcycle_get_reset_counters(wcycle))
4148 /* Reset all the counters related to performance over the run */
4149 reset_pmeonly_counters(cr,wcycle,nrnb,ir,step_rel);
4150 wcycle_set_reset_counters(wcycle, 0);
4153 } /***** end of quasi-loop, we stop with the break above */
4159 int gmx_pme_do(gmx_pme_t pme,
4160 int start, int homenr,
4162 real *chargeA, real *chargeB,
4163 matrix box, t_commrec *cr,
4164 int maxshift_x, int maxshift_y,
4165 t_nrnb *nrnb, gmx_wallcycle_t wcycle,
4166 matrix vir, real ewaldcoeff,
4167 real *energy, real lambda,
4168 real *dvdlambda, int flags)
4170 int q,d,i,j,ntot,npme;
4173 pme_atomcomm_t *atc=NULL;
4174 pmegrids_t *pmegrid=NULL;
4178 real *charge=NULL,*q_d;
4182 gmx_parallel_3dfft_t pfft_setup;
4184 t_complex * cfftgrid;
4186 const gmx_bool bCalcEnerVir = flags & GMX_PME_CALC_ENER_VIR;
4187 const gmx_bool bCalcF = flags & GMX_PME_CALC_F;
4189 assert(pme->nnodes > 0);
4190 assert(pme->nnodes == 1 || pme->ndecompdim > 0);
4192 if (pme->nnodes > 1) {
4195 if (atc->npd > atc->pd_nalloc) {
4196 atc->pd_nalloc = over_alloc_dd(atc->npd);
4197 srenew(atc->pd,atc->pd_nalloc);
4199 atc->maxshift = (atc->dimind==0 ? maxshift_x : maxshift_y);
4203 /* This could be necessary for TPI */
4204 pme->atc[0].n = homenr;
4207 for(q=0; q<(pme->bFEP ? 2 : 1); q++) {
4209 pmegrid = &pme->pmegridA;
4210 fftgrid = pme->fftgridA;
4211 cfftgrid = pme->cfftgridA;
4212 pfft_setup = pme->pfft_setupA;
4213 charge = chargeA+start;
4215 pmegrid = &pme->pmegridB;
4216 fftgrid = pme->fftgridB;
4217 cfftgrid = pme->cfftgridB;
4218 pfft_setup = pme->pfft_setupB;
4219 charge = chargeB+start;
4221 grid = pmegrid->grid.grid;
4222 /* Unpack structure */
4224 fprintf(debug,"PME: nnodes = %d, nodeid = %d\n",
4225 cr->nnodes,cr->nodeid);
4226 fprintf(debug,"Grid = %p\n",(void*)grid);
4228 gmx_fatal(FARGS,"No grid!");
4232 m_inv_ur0(box,pme->recipbox);
4234 if (pme->nnodes == 1) {
4236 if (DOMAINDECOMP(cr)) {
4238 pme_realloc_atomcomm_things(atc);
4244 wallcycle_start(wcycle,ewcPME_REDISTXF);
4245 for(d=pme->ndecompdim-1; d>=0; d--)
4247 if (d == pme->ndecompdim-1)
4255 n_d = pme->atc[d+1].n;
4261 if (atc->npd > atc->pd_nalloc) {
4262 atc->pd_nalloc = over_alloc_dd(atc->npd);
4263 srenew(atc->pd,atc->pd_nalloc);
4265 atc->maxshift = (atc->dimind==0 ? maxshift_x : maxshift_y);
4266 pme_calc_pidx_wrapper(n_d,pme->recipbox,x_d,atc);
4269 GMX_BARRIER(cr->mpi_comm_mygroup);
4270 /* Redistribute x (only once) and qA or qB */
4271 if (DOMAINDECOMP(cr)) {
4272 dd_pmeredist_x_q(pme, n_d, q==0, x_d, q_d, atc);
4274 pmeredist_pd(pme, TRUE, n_d, q==0, x_d, q_d, atc);
4279 wallcycle_stop(wcycle,ewcPME_REDISTXF);
4283 fprintf(debug,"Node= %6d, pme local particles=%6d\n",
4286 if (flags & GMX_PME_SPREAD_Q)
4288 wallcycle_start(wcycle,ewcPME_SPREADGATHER);
4290 /* Spread the charges on a grid */
4291 GMX_MPE_LOG(ev_spread_on_grid_start);
4293 /* Spread the charges on a grid */
4294 spread_on_grid(pme,&pme->atc[0],pmegrid,q==0,TRUE,fftgrid);
4295 GMX_MPE_LOG(ev_spread_on_grid_finish);
4299 inc_nrnb(nrnb,eNR_WEIGHTS,DIM*atc->n);
4301 inc_nrnb(nrnb,eNR_SPREADQBSP,
4302 pme->pme_order*pme->pme_order*pme->pme_order*atc->n);
4304 if (pme->nthread == 1)
4306 wrap_periodic_pmegrid(pme,grid);
4308 /* sum contributions to local grid from other nodes */
4310 if (pme->nnodes > 1)
4312 GMX_BARRIER(cr->mpi_comm_mygroup);
4313 gmx_sum_qgrid_dd(pme,grid,GMX_SUM_QGRID_FORWARD);
4318 copy_pmegrid_to_fftgrid(pme,grid,fftgrid);
4321 wallcycle_stop(wcycle,ewcPME_SPREADGATHER);
4324 dump_local_fftgrid(pme,fftgrid);
4329 /* Here we start a large thread parallel region */
4330 #pragma omp parallel num_threads(pme->nthread) private(thread)
4332 thread=gmx_omp_get_thread_num();
4333 if (flags & GMX_PME_SOLVE)
4340 GMX_BARRIER(cr->mpi_comm_mygroup);
4341 GMX_MPE_LOG(ev_gmxfft3d_start);
4342 wallcycle_start(wcycle,ewcPME_FFT);
4344 gmx_parallel_3dfft_execute(pfft_setup,GMX_FFT_REAL_TO_COMPLEX,
4345 fftgrid,cfftgrid,thread,wcycle);
4348 wallcycle_stop(wcycle,ewcPME_FFT);
4349 GMX_MPE_LOG(ev_gmxfft3d_finish);
4353 /* solve in k-space for our local cells */
4356 GMX_BARRIER(cr->mpi_comm_mygroup);
4357 GMX_MPE_LOG(ev_solve_pme_start);
4358 wallcycle_start(wcycle,ewcPME_SOLVE);
4361 solve_pme_yzx(pme,cfftgrid,ewaldcoeff,
4362 box[XX][XX]*box[YY][YY]*box[ZZ][ZZ],
4364 pme->nthread,thread);
4367 wallcycle_stop(wcycle,ewcPME_SOLVE);
4369 GMX_MPE_LOG(ev_solve_pme_finish);
4370 inc_nrnb(nrnb,eNR_SOLVEPME,loop_count);
4379 GMX_BARRIER(cr->mpi_comm_mygroup);
4380 GMX_MPE_LOG(ev_gmxfft3d_start);
4382 wallcycle_start(wcycle,ewcPME_FFT);
4384 gmx_parallel_3dfft_execute(pfft_setup,GMX_FFT_COMPLEX_TO_REAL,
4385 cfftgrid,fftgrid,thread,wcycle);
4388 wallcycle_stop(wcycle,ewcPME_FFT);
4391 GMX_MPE_LOG(ev_gmxfft3d_finish);
4393 if (pme->nodeid == 0)
4395 ntot = pme->nkx*pme->nky*pme->nkz;
4396 npme = ntot*log((real)ntot)/log(2.0);
4397 inc_nrnb(nrnb,eNR_FFT,2*npme);
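/* Example (illustrative): for a 32x32x32 grid ntot = 32768, so
 * npme = 32768*log2(32768) = 32768*15, counted twice to cover the
 * forward and backward 3D FFT.
 */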
4400 wallcycle_start(wcycle,ewcPME_SPREADGATHER);
4403 copy_fftgrid_to_pmegrid(pme,fftgrid,grid,pme->nthread,thread);
4406 /* End of thread parallel section.
4407 * With MPI we have to synchronize here before gmx_sum_qgrid_dd.
4412 /* distribute local grid to all nodes */
4414 if (pme->nnodes > 1) {
4415 GMX_BARRIER(cr->mpi_comm_mygroup);
4416 gmx_sum_qgrid_dd(pme,grid,GMX_SUM_QGRID_BACKWARD);
4421 unwrap_periodic_pmegrid(pme,grid);
4423 /* interpolate forces for our local atoms */
4424 GMX_BARRIER(cr->mpi_comm_mygroup);
4425 GMX_MPE_LOG(ev_gather_f_bsplines_start);
4429 /* If we are running without parallelization,
4430 * atc->f is the actual force array, not a buffer,
4431 * therefore we should not clear it.
4433 bClearF = (q == 0 && PAR(cr));
4434 #pragma omp parallel for num_threads(pme->nthread) schedule(static)
4435 for(thread=0; thread<pme->nthread; thread++)
4437 gather_f_bsplines(pme,grid,bClearF,atc,
4438 &atc->spline[thread],
4439 pme->bFEP ? (q==0 ? 1.0-lambda : lambda) : 1.0);
4444 GMX_MPE_LOG(ev_gather_f_bsplines_finish);
4446 inc_nrnb(nrnb,eNR_GATHERFBSP,
4447 pme->pme_order*pme->pme_order*pme->pme_order*pme->atc[0].n);
4448 wallcycle_stop(wcycle,ewcPME_SPREADGATHER);
4453 /* This should only be called on the master thread
4454 * and after the threads have synchronized.
4456 get_pme_ener_vir(pme,pme->nthread,&energy_AB[q],vir_AB[q]);
4460 if (bCalcF && pme->nnodes > 1) {
4461 wallcycle_start(wcycle,ewcPME_REDISTXF);
4462 for(d=0; d<pme->ndecompdim; d++)
4465 if (d == pme->ndecompdim - 1)
4472 n_d = pme->atc[d+1].n;
4473 f_d = pme->atc[d+1].f;
4475 GMX_BARRIER(cr->mpi_comm_mygroup);
4476 if (DOMAINDECOMP(cr)) {
4477 dd_pmeredist_f(pme,atc,n_d,f_d,
4478 d==pme->ndecompdim-1 && pme->bPPnode);
4480 pmeredist_pd(pme, FALSE, n_d, TRUE, f_d, NULL, atc);
4484 wallcycle_stop(wcycle,ewcPME_REDISTXF);
4491 *energy = energy_AB[0];
4492 m_add(vir,vir_AB[0],vir);
4494 *energy = (1.0-lambda)*energy_AB[0] + lambda*energy_AB[1];
4495 *dvdlambda += energy_AB[1] - energy_AB[0];
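/* Worked example (illustrative): with lambda = 0.25, E_A = -100 and E_B = -80,
 * the reported energy is 0.75*(-100) + 0.25*(-80) = -95 and dvdlambda gains
 * -80 - (-100) = 20; the virial below is interpolated component-wise with
 * the same weights.
 */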
4496 for(i=0; i<DIM; i++)
4498 for(j=0; j<DIM; j++)
4500 vir[i][j] += (1.0-lambda)*vir_AB[0][i][j] +
4501 lambda*vir_AB[1][i][j];
4513 fprintf(debug,"PME mesh energy: %g\n",*energy);