1 /* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
4 * This source code is part of
8 * GROningen MAchine for Chemical Simulations
11 * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
12 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
13 * Copyright (c) 2001-2004, The GROMACS development team,
14 * check out http://www.gromacs.org for more information.
16 * This program is free software; you can redistribute it and/or
17 * modify it under the terms of the GNU General Public License
18 * as published by the Free Software Foundation; either version 2
19 * of the License, or (at your option) any later version.
21 * If you want to redistribute modifications, please consider that
22 * scientific software is very special. Version control is crucial -
23 * bugs must be traceable. We will be happy to consider code for
24 * inclusion in the official distribution, but derived work must not
25 * be called official GROMACS. Details are found in the README & COPYING
26 * files - if they are missing, get the official version at www.gromacs.org.
28 * To help us fund GROMACS development, we humbly ask that you cite
29 * the papers on the package - you can find them in the top README file.
31 * For more info, check our website at http://www.gromacs.org
34 * GROwing Monsters And Cloning Shrimps
36 /* IMPORTANT FOR DEVELOPERS:
38 * Triclinic pme stuff isn't entirely trivial, and we've experienced
39 * some bugs during development (many of them due to me). To avoid
40 * this in the future, please check the following things if you make
41 * changes in this file:
43 * 1. You should obtain identical (at least to the PME precision)
44 * energies, forces, and virial for
45 * a rectangular box and a triclinic one where the z (or y) axis is
46 * tilted a whole box side. For instance you could use these boxes:
48 * rectangular triclinic
53 * 2. You should check the energy conservation in a triclinic box.
55 * It might seem like overkill, but better safe than sorry.
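 *
 * An illustrative pair of such boxes (added here; these particular numbers
 * are an example, not necessarily the one originally referred to above):
 *
 *    rectangular          triclinic
 *     2  0  0              2  0  0
 *     0  2  0              0  2  0
 *     0  0  6              2  0  6
 *
 * The triclinic c vector differs from the rectangular one by exactly one
 * a vector, so both boxes describe the same periodic lattice and the
 * energies, forces and virial must agree to PME precision.
 */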
80 #include "gmxcomplex.h"
84 #include "gmx_fatal.h"
90 #include "gmx_wallcycle.h"
91 #include "gmx_parallel_3dfft.h"
93 #include "gmx_cyclecounter.h"
95 #if ( !defined(GMX_DOUBLE) && ( defined(GMX_IA32_SSE) || defined(GMX_X86_64_SSE) || defined(GMX_X86_64_SSE2) ) )
96 #include "gmx_sse2_single.h"
99 /* Some old AMD processors could have problems with unaligned loads+stores */
101 #define PME_SSE_UNALIGNED
105 #include "mpelogging.h"
108 /* #define PRT_FORCE */
109 /* conditions for on-the-fly time measurement */
110 /* #define TAKETIME (step > 1 && timesteps < 10) */
111 #define TAKETIME FALSE
113 /* #define PME_TIME_THREADS */
116 #define mpi_type MPI_DOUBLE
118 #define mpi_type MPI_FLOAT
121 /* GMX_CACHE_SEP should be a multiple of 16 to preserve alignment */
122 #define GMX_CACHE_SEP 64
124 /* We only define a maximum to be able to use local arrays without allocation.
125 * An order larger than 12 should never be needed, even for test cases.
126 * If needed it can be changed here.
128 #define PME_ORDER_MAX 12
130 /* Internal datastructures */
146 int *send_id,*recv_id;
147 pme_grid_comm_t *comm_data;
153 int *n; /* Cumulative counts of the number of particles per thread */
154 int nalloc; /* Allocation size of i */
155 int *i; /* Particle indices ordered on thread index (n) */
166 int dimind; /* The index of the dimension, 0=x, 1=y */
173 int *node_dest; /* The nodes to send x and q to with DD */
174 int *node_src; /* The nodes to receive x and q from with DD */
175 int *buf_index; /* Index for commnode into the buffers */
182 int *count; /* The number of atoms to send to each node */
184 int *rcount; /* The number of atoms to receive */
191 gmx_bool bSpread; /* These coordinates are used for spreading */
194 rvec *fractx; /* Fractional coordinate relative to the
195 * lower cell boundary
198 int *thread_idx; /* Which thread should spread which charge */
199 thread_plist_t *thread_plist;
200 splinedata_t *spline;
207 ivec ci; /* The spatial location of this grid */
208 ivec n; /* The size of *grid, including order-1 */
209 ivec offset; /* The grid offset from the full node grid */
210 int order; /* PME spreading order */
211     real *grid;     /* The local grid for this thread, size n            */
215 pmegrid_t grid; /* The full node grid (non thread-local) */
216 int nthread; /* The number of threads operating on this grid */
217 ivec nc; /* The local spatial decomposition over the threads */
218 pmegrid_t *grid_th; /* Array of grids for each thread */
219 int **g2t; /* The grid to thread index */
220 ivec nthread_comm; /* The number of threads to communicate with */
226 /* Masks for SSE aligned spreading and gathering */
227 __m128 mask_SSE0[6],mask_SSE1[6];
229 int dummy; /* C89 requires that struct has at least one member */
234 /* work data for solve_pme */
250 typedef struct gmx_pme {
251 int ndecompdim; /* The number of decomposition dimensions */
252 int nodeid; /* Our nodeid in mpi->mpi_comm */
255 int nnodes; /* The number of nodes doing PME */
260 MPI_Comm mpi_comm_d[2]; /* Indexed on dimension, 0=x, 1=y */
262 MPI_Datatype rvec_mpi; /* the pme vector's MPI type */
265 int nthread; /* The number of threads doing PME */
267 gmx_bool bPPnode; /* Node also does particle-particle forces */
268 gmx_bool bFEP; /* Compute Free energy contribution */
269 int nkx,nky,nkz; /* Grid dimensions */
273 pmegrids_t pmegridA; /* Grids on which we do spreading/interpolation, includes overlap */
275 /* The PME charge spreading grid sizes/strides, includes pme_order-1 */
276 int pmegrid_nx,pmegrid_ny,pmegrid_nz;
277 /* pmegrid_nz might be larger than strictly necessary to ensure
278 * memory alignment, pmegrid_nz_base gives the real base size.
281 /* The local PME grid starting indices */
282 int pmegrid_start_ix,pmegrid_start_iy,pmegrid_start_iz;
284 /* Work data for spreading and gathering */
285 pme_spline_work_t spline_work;
287 real *fftgridA; /* Grids for FFT. With 1D FFT decomposition this can be a pointer */
288 real *fftgridB; /* inside the interpolation grid, but separate for 2D PME decomp. */
289 int fftgrid_nx,fftgrid_ny,fftgrid_nz;
291 t_complex *cfftgridA; /* Grids for complex FFT data */
292 t_complex *cfftgridB;
293 int cfftgrid_nx,cfftgrid_ny,cfftgrid_nz;
295 gmx_parallel_3dfft_t pfft_setupA;
296 gmx_parallel_3dfft_t pfft_setupB;
299 real *fshx,*fshy,*fshz;
301 pme_atomcomm_t atc[2]; /* Indexed on decomposition index */
305 pme_overlap_t overlap[2]; /* Indexed on dimension, 0=x, 1=y */
307 pme_atomcomm_t atc_energy; /* Only for gmx_pme_calc_energy */
309 rvec *bufv; /* Communication buffer */
310 real *bufr; /* Communication buffer */
311 int buf_nalloc; /* The communication buffer size */
313 /* thread local work data for solve_pme */
316 /* Work data for PME_redist */
317 gmx_bool redist_init;
325 int redist_buf_nalloc;
327 /* Work data for sum_qgrid */
328 real * sum_qgrid_tmp;
329 real * sum_qgrid_dd_tmp;
333 static void calc_interpolation_idx(gmx_pme_t pme,pme_atomcomm_t *atc,
334 int start,int end,int thread)
337 int *idxptr,tix,tiy,tiz;
338 real *xptr,*fptr,tx,ty,tz;
339 real rxx,ryx,ryy,rzx,rzy,rzz;
341 int start_ix,start_iy,start_iz;
342 int *g2tx,*g2ty,*g2tz;
344 int *thread_idx=NULL;
345 thread_plist_t *tpl=NULL;
353 start_ix = pme->pmegrid_start_ix;
354 start_iy = pme->pmegrid_start_iy;
355 start_iz = pme->pmegrid_start_iz;
357 rxx = pme->recipbox[XX][XX];
358 ryx = pme->recipbox[YY][XX];
359 ryy = pme->recipbox[YY][YY];
360 rzx = pme->recipbox[ZZ][XX];
361 rzy = pme->recipbox[ZZ][YY];
362 rzz = pme->recipbox[ZZ][ZZ];
364 g2tx = pme->pmegridA.g2t[XX];
365 g2ty = pme->pmegridA.g2t[YY];
366 g2tz = pme->pmegridA.g2t[ZZ];
368 bThreads = (atc->nthread > 1);
371 thread_idx = atc->thread_idx;
373 tpl = &atc->thread_plist[thread];
375 for(i=0; i<atc->nthread; i++)
381 for(i=start; i<end; i++) {
383 idxptr = atc->idx[i];
384 fptr = atc->fractx[i];
386 /* Fractional coordinates along box vectors, add 2.0 to make 100% sure we are positive for triclinic boxes */
387 tx = nx * ( xptr[XX] * rxx + xptr[YY] * ryx + xptr[ZZ] * rzx + 2.0 );
388 ty = ny * ( xptr[YY] * ryy + xptr[ZZ] * rzy + 2.0 );
389 tz = nz * ( xptr[ZZ] * rzz + 2.0 );
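        /* Added explanatory note: with the reciprocal box stored as a
         * lower-triangular matrix, the fractional coordinates are
         *   ux = x*rxx + y*ryx + z*rzx
         *   uy =         y*ryy + z*rzy
         *   uz =                 z*rzz
         * The +2.0 keeps nx*ux, ny*uy and nz*uz positive even for atoms that
         * have drifted slightly outside a triclinic box; the shift is folded
         * back out through the nnx/nny/nnz wrap tables below.
         */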
395 /* Because decomposition only occurs in x and y,
396 * we never have a fraction correction in z.
398 fptr[XX] = tx - tix + pme->fshx[tix];
399 fptr[YY] = ty - tiy + pme->fshy[tiy];
402 idxptr[XX] = pme->nnx[tix];
403 idxptr[YY] = pme->nny[tiy];
404 idxptr[ZZ] = pme->nnz[tiz];
407 range_check(idxptr[XX],0,pme->pmegrid_nx);
408 range_check(idxptr[YY],0,pme->pmegrid_ny);
409 range_check(idxptr[ZZ],0,pme->pmegrid_nz);
414 thread_i = g2tx[idxptr[XX]] + g2ty[idxptr[YY]] + g2tz[idxptr[ZZ]];
415 thread_idx[i] = thread_i;
422 /* Make a list of particle indices sorted on thread */
424 /* Get the cumulative count */
425 for(i=1; i<atc->nthread; i++)
427 tpl_n[i] += tpl_n[i-1];
429 /* The current implementation distributes particles equally
430 * over the threads, so we could actually allocate for that
431 * in pme_realloc_atomcomm_things.
433 if (tpl_n[atc->nthread-1] > tpl->nalloc)
435 tpl->nalloc = over_alloc_large(tpl_n[atc->nthread-1]);
436 srenew(tpl->i,tpl->nalloc);
438 /* Set tpl_n to the cumulative start */
439 for(i=atc->nthread-1; i>=1; i--)
441 tpl_n[i] = tpl_n[i-1];
445 /* Fill our thread local array with indices sorted on thread */
446 for(i=start; i<end; i++)
448 tpl->i[tpl_n[atc->thread_idx[i]]++] = i;
450     /* Now tpl_n contains the cumulative count again */
454 static void make_thread_local_ind(pme_atomcomm_t *atc,
455 int thread,splinedata_t *spline)
460 /* Combine the indices made by each thread into one index */
464 for(t=0; t<atc->nthread; t++)
466 tpl = &atc->thread_plist[t];
467 /* Copy our part (start - end) from the list of thread t */
470 start = tpl->n[thread-1];
472 end = tpl->n[thread];
473 for(i=start; i<end; i++)
475 spline->ind[n++] = tpl->i[i];
483 static void pme_calc_pidx(int start, int end,
484 matrix recipbox, rvec x[],
485 pme_atomcomm_t *atc, int *count)
490 real rxx,ryx,rzx,ryy,rzy;
493     /* Calculate the PME task index (pidx) for each particle.
494 * Here we always assign equally sized slabs to each node
495 * for load balancing reasons (the PME grid spacing is not used).
501 /* Reset the count */
502 for(i=0; i<nslab; i++)
507 if (atc->dimind == 0)
509 rxx = recipbox[XX][XX];
510 ryx = recipbox[YY][XX];
511 rzx = recipbox[ZZ][XX];
512 /* Calculate the node index in x-dimension */
513 for(i=start; i<end; i++)
516 /* Fractional coordinates along box vectors */
517 s = nslab*(xptr[XX]*rxx + xptr[YY]*ryx + xptr[ZZ]*rzx);
518 si = (int)(s + 2*nslab) % nslab;
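            /* Added note: s can be slightly negative for atoms just outside
             * the unit cell, so 2*nslab is added before the truncating cast
             * to keep the modulo argument non-negative; si therefore always
             * ends up in [0,nslab).
             */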
525 ryy = recipbox[YY][YY];
526 rzy = recipbox[ZZ][YY];
527 /* Calculate the node index in y-dimension */
528 for(i=start; i<end; i++)
531 /* Fractional coordinates along box vectors */
532 s = nslab*(xptr[YY]*ryy + xptr[ZZ]*rzy);
533 si = (int)(s + 2*nslab) % nslab;
540 static void pme_calc_pidx_wrapper(int natoms, matrix recipbox, rvec x[],
543 int nthread,thread,slab;
545 nthread = atc->nthread;
547 #pragma omp parallel for num_threads(nthread) schedule(static)
548 for(thread=0; thread<nthread; thread++)
550 pme_calc_pidx(natoms* thread /nthread,
551 natoms*(thread+1)/nthread,
552 recipbox,x,atc,atc->count_thread[thread]);
554 /* Non-parallel reduction, since nslab is small */
556 for(thread=1; thread<nthread; thread++)
558 for(slab=0; slab<atc->nslab; slab++)
560 atc->count_thread[0][slab] += atc->count_thread[thread][slab];
565 static void pme_realloc_splinedata(splinedata_t *spline, pme_atomcomm_t *atc)
569 srenew(spline->ind,atc->nalloc);
570 /* Initialize the index to identity so it works without threads */
571 for(i=0; i<atc->nalloc; i++)
578 srenew(spline->theta[d] ,atc->pme_order*atc->nalloc);
579 srenew(spline->dtheta[d],atc->pme_order*atc->nalloc);
583 static void pme_realloc_atomcomm_things(pme_atomcomm_t *atc)
585 int nalloc_old,i,j,nalloc_tpl;
587 /* We have to avoid a NULL pointer for atc->x to avoid
588 * possible fatal errors in MPI routines.
590 if (atc->n > atc->nalloc || atc->nalloc == 0)
592 nalloc_old = atc->nalloc;
593 atc->nalloc = over_alloc_dd(max(atc->n,1));
595 if (atc->nslab > 1) {
596 srenew(atc->x,atc->nalloc);
597 srenew(atc->q,atc->nalloc);
598 srenew(atc->f,atc->nalloc);
599 for(i=nalloc_old; i<atc->nalloc; i++)
601 clear_rvec(atc->f[i]);
605 srenew(atc->fractx,atc->nalloc);
606 srenew(atc->idx ,atc->nalloc);
608 if (atc->nthread > 1)
610 srenew(atc->thread_idx,atc->nalloc);
613 for(i=0; i<atc->nthread; i++)
615 pme_realloc_splinedata(&atc->spline[i],atc);
621 static void pmeredist_pd(gmx_pme_t pme, gmx_bool forw,
622 int n, gmx_bool bXF, rvec *x_f, real *charge,
624 /* Redistribute particle data for PME calculation */
625 /* domain decomposition by x coordinate */
630 if(FALSE == pme->redist_init) {
631 snew(pme->scounts,atc->nslab);
632 snew(pme->rcounts,atc->nslab);
633 snew(pme->sdispls,atc->nslab);
634 snew(pme->rdispls,atc->nslab);
635 snew(pme->sidx,atc->nslab);
636 pme->redist_init = TRUE;
638 if (n > pme->redist_buf_nalloc) {
639 pme->redist_buf_nalloc = over_alloc_dd(n);
640 srenew(pme->redist_buf,pme->redist_buf_nalloc*DIM);
647 /* forward, redistribution from pp to pme */
649 /* Calculate send counts and exchange them with other nodes */
650 for(i=0; (i<atc->nslab); i++) pme->scounts[i]=0;
651 for(i=0; (i<n); i++) pme->scounts[pme->idxa[i]]++;
652 MPI_Alltoall( pme->scounts, 1, MPI_INT, pme->rcounts, 1, MPI_INT, atc->mpi_comm);
654 /* Calculate send and receive displacements and index into send
659 for(i=1; i<atc->nslab; i++) {
660 pme->sdispls[i]=pme->sdispls[i-1]+pme->scounts[i-1];
661 pme->rdispls[i]=pme->rdispls[i-1]+pme->rcounts[i-1];
662 pme->sidx[i]=pme->sdispls[i];
664 /* Total # of particles to be received */
665 atc->n = pme->rdispls[atc->nslab-1] + pme->rcounts[atc->nslab-1];
667 pme_realloc_atomcomm_things(atc);
669 /* Copy particle coordinates into send buffer and exchange*/
670 for(i=0; (i<n); i++) {
671 ii=DIM*pme->sidx[pme->idxa[i]];
672 pme->sidx[pme->idxa[i]]++;
673 pme->redist_buf[ii+XX]=x_f[i][XX];
674 pme->redist_buf[ii+YY]=x_f[i][YY];
675 pme->redist_buf[ii+ZZ]=x_f[i][ZZ];
677 MPI_Alltoallv(pme->redist_buf, pme->scounts, pme->sdispls,
678 pme->rvec_mpi, atc->x, pme->rcounts, pme->rdispls,
679 pme->rvec_mpi, atc->mpi_comm);
682 /* Copy charge into send buffer and exchange*/
683 for(i=0; i<atc->nslab; i++) pme->sidx[i]=pme->sdispls[i];
684 for(i=0; (i<n); i++) {
685 ii=pme->sidx[pme->idxa[i]];
686 pme->sidx[pme->idxa[i]]++;
687 pme->redist_buf[ii]=charge[i];
689 MPI_Alltoallv(pme->redist_buf, pme->scounts, pme->sdispls, mpi_type,
690 atc->q, pme->rcounts, pme->rdispls, mpi_type,
693 else { /* backward, redistribution from pme to pp */
694 MPI_Alltoallv(atc->f, pme->rcounts, pme->rdispls, pme->rvec_mpi,
695 pme->redist_buf, pme->scounts, pme->sdispls,
696 pme->rvec_mpi, atc->mpi_comm);
698 /* Copy data from receive buffer */
699 for(i=0; i<atc->nslab; i++)
700 pme->sidx[i] = pme->sdispls[i];
701 for(i=0; (i<n); i++) {
702 ii = DIM*pme->sidx[pme->idxa[i]];
703 x_f[i][XX] += pme->redist_buf[ii+XX];
704 x_f[i][YY] += pme->redist_buf[ii+YY];
705 x_f[i][ZZ] += pme->redist_buf[ii+ZZ];
706 pme->sidx[pme->idxa[i]]++;
712 static void pme_dd_sendrecv(pme_atomcomm_t *atc,
713 gmx_bool bBackward,int shift,
714 void *buf_s,int nbyte_s,
715 void *buf_r,int nbyte_r)
721 if (bBackward == FALSE) {
722 dest = atc->node_dest[shift];
723 src = atc->node_src[shift];
725 dest = atc->node_src[shift];
726 src = atc->node_dest[shift];
729 if (nbyte_s > 0 && nbyte_r > 0) {
730 MPI_Sendrecv(buf_s,nbyte_s,MPI_BYTE,
732 buf_r,nbyte_r,MPI_BYTE,
734 atc->mpi_comm,&stat);
735 } else if (nbyte_s > 0) {
736 MPI_Send(buf_s,nbyte_s,MPI_BYTE,
739 } else if (nbyte_r > 0) {
740 MPI_Recv(buf_r,nbyte_r,MPI_BYTE,
742 atc->mpi_comm,&stat);
747 static void dd_pmeredist_x_q(gmx_pme_t pme,
748 int n, gmx_bool bX, rvec *x, real *charge,
751 int *commnode,*buf_index;
752 int nnodes_comm,i,nsend,local_pos,buf_pos,node,scount,rcount;
754 commnode = atc->node_dest;
755 buf_index = atc->buf_index;
757 nnodes_comm = min(2*atc->maxshift,atc->nslab-1);
760 for(i=0; i<nnodes_comm; i++) {
761 buf_index[commnode[i]] = nsend;
762 nsend += atc->count[commnode[i]];
765 if (atc->count[atc->nodeid] + nsend != n)
766 gmx_fatal(FARGS,"%d particles communicated to PME node %d are more than 2/3 times the cut-off out of the domain decomposition cell of their charge group in dimension %c.\n"
767 "This usually means that your system is not well equilibrated.",
768 n - (atc->count[atc->nodeid] + nsend),
769 pme->nodeid,'x'+atc->dimind);
771 if (nsend > pme->buf_nalloc) {
772 pme->buf_nalloc = over_alloc_dd(nsend);
773 srenew(pme->bufv,pme->buf_nalloc);
774 srenew(pme->bufr,pme->buf_nalloc);
777 atc->n = atc->count[atc->nodeid];
778 for(i=0; i<nnodes_comm; i++) {
779 scount = atc->count[commnode[i]];
780 /* Communicate the count */
782 fprintf(debug,"dimind %d PME node %d send to node %d: %d\n",
783 atc->dimind,atc->nodeid,commnode[i],scount);
784 pme_dd_sendrecv(atc,FALSE,i,
786 &atc->rcount[i],sizeof(int));
787 atc->n += atc->rcount[i];
790 pme_realloc_atomcomm_things(atc);
796 if (node == atc->nodeid) {
797 /* Copy direct to the receive buffer */
799 copy_rvec(x[i],atc->x[local_pos]);
801 atc->q[local_pos] = charge[i];
804 /* Copy to the send buffer */
806 copy_rvec(x[i],pme->bufv[buf_index[node]]);
808 pme->bufr[buf_index[node]] = charge[i];
814 for(i=0; i<nnodes_comm; i++) {
815 scount = atc->count[commnode[i]];
816 rcount = atc->rcount[i];
817 if (scount > 0 || rcount > 0) {
819 /* Communicate the coordinates */
820 pme_dd_sendrecv(atc,FALSE,i,
821 pme->bufv[buf_pos],scount*sizeof(rvec),
822 atc->x[local_pos],rcount*sizeof(rvec));
824 /* Communicate the charges */
825 pme_dd_sendrecv(atc,FALSE,i,
826 pme->bufr+buf_pos,scount*sizeof(real),
827 atc->q+local_pos,rcount*sizeof(real));
829 local_pos += atc->rcount[i];
834 static void dd_pmeredist_f(gmx_pme_t pme, pme_atomcomm_t *atc,
838 int *commnode,*buf_index;
839 int nnodes_comm,local_pos,buf_pos,i,scount,rcount,node;
841 commnode = atc->node_dest;
842 buf_index = atc->buf_index;
844 nnodes_comm = min(2*atc->maxshift,atc->nslab-1);
846 local_pos = atc->count[atc->nodeid];
848 for(i=0; i<nnodes_comm; i++) {
849 scount = atc->rcount[i];
850 rcount = atc->count[commnode[i]];
851 if (scount > 0 || rcount > 0) {
852 /* Communicate the forces */
853 pme_dd_sendrecv(atc,TRUE,i,
854 atc->f[local_pos],scount*sizeof(rvec),
855 pme->bufv[buf_pos],rcount*sizeof(rvec));
858 buf_index[commnode[i]] = buf_pos;
868 if (node == atc->nodeid)
870 /* Add from the local force array */
871 rvec_inc(f[i],atc->f[local_pos]);
876 /* Add from the receive buffer */
877 rvec_inc(f[i],pme->bufv[buf_index[node]]);
887 if (node == atc->nodeid)
889 /* Copy from the local force array */
890 copy_rvec(atc->f[local_pos],f[i]);
895 /* Copy from the receive buffer */
896 copy_rvec(pme->bufv[buf_index[node]],f[i]);
905 gmx_sum_qgrid_dd(gmx_pme_t pme, real *grid, int direction)
907 pme_overlap_t *overlap;
908 int send_index0,send_nindex;
909 int recv_index0,recv_nindex;
911 int i,j,k,ix,iy,iz,icnt;
912 int ipulse,send_id,recv_id,datasize;
914 real *sendptr,*recvptr;
916 /* Start with minor-rank communication. This is a bit of a pain since it is not contiguous */
917 overlap = &pme->overlap[1];
919 for(ipulse=0;ipulse<overlap->noverlap_nodes;ipulse++)
921 /* Since we have already (un)wrapped the overlap in the z-dimension,
922 * we only have to communicate 0 to nkz (not pmegrid_nz).
924 if (direction==GMX_SUM_QGRID_FORWARD)
926 send_id = overlap->send_id[ipulse];
927 recv_id = overlap->recv_id[ipulse];
928 send_index0 = overlap->comm_data[ipulse].send_index0;
929 send_nindex = overlap->comm_data[ipulse].send_nindex;
930 recv_index0 = overlap->comm_data[ipulse].recv_index0;
931 recv_nindex = overlap->comm_data[ipulse].recv_nindex;
935 send_id = overlap->recv_id[ipulse];
936 recv_id = overlap->send_id[ipulse];
937 send_index0 = overlap->comm_data[ipulse].recv_index0;
938 send_nindex = overlap->comm_data[ipulse].recv_nindex;
939 recv_index0 = overlap->comm_data[ipulse].send_index0;
940 recv_nindex = overlap->comm_data[ipulse].send_nindex;
943 /* Copy data to contiguous send buffer */
946 fprintf(debug,"PME send node %d %d -> %d grid start %d Communicating %d to %d\n",
947 pme->nodeid,overlap->nodeid,send_id,
948 pme->pmegrid_start_iy,
949 send_index0-pme->pmegrid_start_iy,
950 send_index0-pme->pmegrid_start_iy+send_nindex);
953 for(i=0;i<pme->pmegrid_nx;i++)
956 for(j=0;j<send_nindex;j++)
958 iy = j + send_index0 - pme->pmegrid_start_iy;
959 for(k=0;k<pme->nkz;k++)
962 overlap->sendbuf[icnt++] = grid[ix*(pme->pmegrid_ny*pme->pmegrid_nz)+iy*(pme->pmegrid_nz)+iz];
967 datasize = pme->pmegrid_nx * pme->nkz;
969 MPI_Sendrecv(overlap->sendbuf,send_nindex*datasize,GMX_MPI_REAL,
971 overlap->recvbuf,recv_nindex*datasize,GMX_MPI_REAL,
973 overlap->mpi_comm,&stat);
975 /* Get data from contiguous recv buffer */
978 fprintf(debug,"PME recv node %d %d <- %d grid start %d Communicating %d to %d\n",
979 pme->nodeid,overlap->nodeid,recv_id,
980 pme->pmegrid_start_iy,
981 recv_index0-pme->pmegrid_start_iy,
982 recv_index0-pme->pmegrid_start_iy+recv_nindex);
985 for(i=0;i<pme->pmegrid_nx;i++)
988 for(j=0;j<recv_nindex;j++)
990 iy = j + recv_index0 - pme->pmegrid_start_iy;
991 for(k=0;k<pme->nkz;k++)
994 if(direction==GMX_SUM_QGRID_FORWARD)
996 grid[ix*(pme->pmegrid_ny*pme->pmegrid_nz)+iy*(pme->pmegrid_nz)+iz] += overlap->recvbuf[icnt++];
1000 grid[ix*(pme->pmegrid_ny*pme->pmegrid_nz)+iy*(pme->pmegrid_nz)+iz] = overlap->recvbuf[icnt++];
1007 /* Major dimension is easier, no copying required,
1008 * but we might have to sum to separate array.
1009 * Since we don't copy, we have to communicate up to pmegrid_nz,
1010 * not nkz as for the minor direction.
1012 overlap = &pme->overlap[0];
1014 for(ipulse=0;ipulse<overlap->noverlap_nodes;ipulse++)
1016 if(direction==GMX_SUM_QGRID_FORWARD)
1018 send_id = overlap->send_id[ipulse];
1019 recv_id = overlap->recv_id[ipulse];
1020 send_index0 = overlap->comm_data[ipulse].send_index0;
1021 send_nindex = overlap->comm_data[ipulse].send_nindex;
1022 recv_index0 = overlap->comm_data[ipulse].recv_index0;
1023 recv_nindex = overlap->comm_data[ipulse].recv_nindex;
1024 recvptr = overlap->recvbuf;
1028 send_id = overlap->recv_id[ipulse];
1029 recv_id = overlap->send_id[ipulse];
1030 send_index0 = overlap->comm_data[ipulse].recv_index0;
1031 send_nindex = overlap->comm_data[ipulse].recv_nindex;
1032 recv_index0 = overlap->comm_data[ipulse].send_index0;
1033 recv_nindex = overlap->comm_data[ipulse].send_nindex;
1034 recvptr = grid + (recv_index0-pme->pmegrid_start_ix)*(pme->pmegrid_ny*pme->pmegrid_nz);
1037 sendptr = grid + (send_index0-pme->pmegrid_start_ix)*(pme->pmegrid_ny*pme->pmegrid_nz);
1038 datasize = pme->pmegrid_ny * pme->pmegrid_nz;
1042 fprintf(debug,"PME send node %d %d -> %d grid start %d Communicating %d to %d\n",
1043 pme->nodeid,overlap->nodeid,send_id,
1044 pme->pmegrid_start_ix,
1045 send_index0-pme->pmegrid_start_ix,
1046 send_index0-pme->pmegrid_start_ix+send_nindex);
1047 fprintf(debug,"PME recv node %d %d <- %d grid start %d Communicating %d to %d\n",
1048 pme->nodeid,overlap->nodeid,recv_id,
1049 pme->pmegrid_start_ix,
1050 recv_index0-pme->pmegrid_start_ix,
1051 recv_index0-pme->pmegrid_start_ix+recv_nindex);
1054 MPI_Sendrecv(sendptr,send_nindex*datasize,GMX_MPI_REAL,
1056 recvptr,recv_nindex*datasize,GMX_MPI_REAL,
1058 overlap->mpi_comm,&stat);
1060 /* ADD data from contiguous recv buffer */
1061 if(direction==GMX_SUM_QGRID_FORWARD)
1063 p = grid + (recv_index0-pme->pmegrid_start_ix)*(pme->pmegrid_ny*pme->pmegrid_nz);
1064 for(i=0;i<recv_nindex*datasize;i++)
1066 p[i] += overlap->recvbuf[i];
1075 copy_pmegrid_to_fftgrid(gmx_pme_t pme, real *pmegrid, real *fftgrid)
1077 ivec local_fft_ndata,local_fft_offset,local_fft_size;
1078 ivec local_pme_size;
1082 /* Dimensions should be identical for A/B grid, so we just use A here */
1083 gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
1088 local_pme_size[0] = pme->pmegrid_nx;
1089 local_pme_size[1] = pme->pmegrid_ny;
1090 local_pme_size[2] = pme->pmegrid_nz;
1092 /* The fftgrid is always 'justified' to the lower-left corner of the PME grid,
1093 the offset is identical, and the PME grid always has more data (due to overlap)
1098 char fn[STRLEN],format[STRLEN];
1100 sprintf(fn,"pmegrid%d.pdb",pme->nodeid);
1101 fp = ffopen(fn,"w");
1102 sprintf(fn,"pmegrid%d.txt",pme->nodeid);
1103 fp2 = ffopen(fn,"w");
1104 sprintf(format,"%s%s\n",pdbformat,"%6.2f%6.2f");
1107 for(ix=0;ix<local_fft_ndata[XX];ix++)
1109 for(iy=0;iy<local_fft_ndata[YY];iy++)
1111 for(iz=0;iz<local_fft_ndata[ZZ];iz++)
1113 pmeidx = ix*(local_pme_size[YY]*local_pme_size[ZZ])+iy*(local_pme_size[ZZ])+iz;
1114 fftidx = ix*(local_fft_size[YY]*local_fft_size[ZZ])+iy*(local_fft_size[ZZ])+iz;
1115 fftgrid[fftidx] = pmegrid[pmeidx];
1117 val = 100*pmegrid[pmeidx];
1118 if (pmegrid[pmeidx] != 0)
1119 fprintf(fp,format,"ATOM",pmeidx,"CA","GLY",' ',pmeidx,' ',
1120 5.0*ix,5.0*iy,5.0*iz,1.0,val);
1121 if (pmegrid[pmeidx] != 0)
1122 fprintf(fp2,"%-12s %5d %5d %5d %12.5e\n",
1124 pme->pmegrid_start_ix + ix,
1125 pme->pmegrid_start_iy + iy,
1126 pme->pmegrid_start_iz + iz,
1141 static gmx_cycles_t omp_cyc_start()
1143 return gmx_cycles_read();
1146 static gmx_cycles_t omp_cyc_end(gmx_cycles_t c)
1148 return gmx_cycles_read() - c;
1153 copy_fftgrid_to_pmegrid(gmx_pme_t pme, const real *fftgrid, real *pmegrid,
1154 int nthread,int thread)
1156 ivec local_fft_ndata,local_fft_offset,local_fft_size;
1157 ivec local_pme_size;
1158 int ixy0,ixy1,ixy,ix,iy,iz;
1160 #ifdef PME_TIME_THREADS
1162 static double cs1=0;
1166 #ifdef PME_TIME_THREADS
1167 c1 = omp_cyc_start();
1169 /* Dimensions should be identical for A/B grid, so we just use A here */
1170 gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
1175 local_pme_size[0] = pme->pmegrid_nx;
1176 local_pme_size[1] = pme->pmegrid_ny;
1177 local_pme_size[2] = pme->pmegrid_nz;
1179 /* The fftgrid is always 'justified' to the lower-left corner of the PME grid,
1180 the offset is identical, and the PME grid always has more data (due to overlap)
1182 ixy0 = ((thread )*local_fft_ndata[XX]*local_fft_ndata[YY])/nthread;
1183 ixy1 = ((thread+1)*local_fft_ndata[XX]*local_fft_ndata[YY])/nthread;
1185 for(ixy=ixy0;ixy<ixy1;ixy++)
1187 ix = ixy/local_fft_ndata[YY];
1188 iy = ixy - ix*local_fft_ndata[YY];
1190 pmeidx = (ix*local_pme_size[YY] + iy)*local_pme_size[ZZ];
1191 fftidx = (ix*local_fft_size[YY] + iy)*local_fft_size[ZZ];
1192 for(iz=0;iz<local_fft_ndata[ZZ];iz++)
1194 pmegrid[pmeidx+iz] = fftgrid[fftidx+iz];
1198 #ifdef PME_TIME_THREADS
1199 c1 = omp_cyc_end(c1);
1204 printf("copy %.2f\n",cs1*1e-9);
1213 wrap_periodic_pmegrid(gmx_pme_t pme, real *pmegrid)
1215 int nx,ny,nz,pnx,pny,pnz,ny_x,overlap,ix,iy,iz;
1221 pnx = pme->pmegrid_nx;
1222 pny = pme->pmegrid_ny;
1223 pnz = pme->pmegrid_nz;
1225 overlap = pme->pme_order - 1;
1227 /* Add periodic overlap in z */
1228 for(ix=0; ix<pme->pmegrid_nx; ix++)
1230 for(iy=0; iy<pme->pmegrid_ny; iy++)
1232 for(iz=0; iz<overlap; iz++)
1234 pmegrid[(ix*pny+iy)*pnz+iz] +=
1235 pmegrid[(ix*pny+iy)*pnz+nz+iz];
1240 if (pme->nnodes_minor == 1)
1242 for(ix=0; ix<pme->pmegrid_nx; ix++)
1244 for(iy=0; iy<overlap; iy++)
1246 for(iz=0; iz<nz; iz++)
1248 pmegrid[(ix*pny+iy)*pnz+iz] +=
1249 pmegrid[(ix*pny+ny+iy)*pnz+iz];
1255 if (pme->nnodes_major == 1)
1257 ny_x = (pme->nnodes_minor == 1 ? ny : pme->pmegrid_ny);
1259 for(ix=0; ix<overlap; ix++)
1261 for(iy=0; iy<ny_x; iy++)
1263 for(iz=0; iz<nz; iz++)
1265 pmegrid[(ix*pny+iy)*pnz+iz] +=
1266 pmegrid[((nx+ix)*pny+iy)*pnz+iz];
1275 unwrap_periodic_pmegrid(gmx_pme_t pme, real *pmegrid)
1277 int nx,ny,nz,pnx,pny,pnz,ny_x,overlap,ix;
1283 pnx = pme->pmegrid_nx;
1284 pny = pme->pmegrid_ny;
1285 pnz = pme->pmegrid_nz;
1287 overlap = pme->pme_order - 1;
1289 if (pme->nnodes_major == 1)
1291 ny_x = (pme->nnodes_minor == 1 ? ny : pme->pmegrid_ny);
1293 for(ix=0; ix<overlap; ix++)
1297 for(iy=0; iy<ny_x; iy++)
1299 for(iz=0; iz<nz; iz++)
1301 pmegrid[((nx+ix)*pny+iy)*pnz+iz] =
1302 pmegrid[(ix*pny+iy)*pnz+iz];
1308 if (pme->nnodes_minor == 1)
1310 #pragma omp parallel for num_threads(pme->nthread) schedule(static)
1311 for(ix=0; ix<pme->pmegrid_nx; ix++)
1315 for(iy=0; iy<overlap; iy++)
1317 for(iz=0; iz<nz; iz++)
1319 pmegrid[(ix*pny+ny+iy)*pnz+iz] =
1320 pmegrid[(ix*pny+iy)*pnz+iz];
1326 /* Copy periodic overlap in z */
1327 #pragma omp parallel for num_threads(pme->nthread) schedule(static)
1328 for(ix=0; ix<pme->pmegrid_nx; ix++)
1332 for(iy=0; iy<pme->pmegrid_ny; iy++)
1334 for(iz=0; iz<overlap; iz++)
1336 pmegrid[(ix*pny+iy)*pnz+nz+iz] =
1337 pmegrid[(ix*pny+iy)*pnz+iz];
1343 static void clear_grid(int nx,int ny,int nz,real *grid,
1345 int fx,int fy,int fz,
1349 int fsx,fsy,fsz,gx,gy,gz,g0x,g0y,x,y,z;
1352 nc = 2 + (order - 2)/FLBS;
1353 ncz = 2 + (order - 2)/FLBSZ;
1355 for(fsx=fx; fsx<fx+nc; fsx++)
1357 for(fsy=fy; fsy<fy+nc; fsy++)
1359 for(fsz=fz; fsz<fz+ncz; fsz++)
1361 flind = (fsx*fs[YY] + fsy)*fs[ZZ] + fsz;
1362 if (flag[flind] == 0)
1367 g0x = (gx*ny + gy)*nz + gz;
1368 for(x=0; x<FLBS; x++)
1371 for(y=0; y<FLBS; y++)
1373 for(z=0; z<FLBSZ; z++)
1389 /* This has to be a macro to enable full compiler optimization with xlC (and probably others too) */
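/* Added note: DO_BSPLINE(order) accumulates the charge qn of one particle
 * onto an order x order x order block of grid points,
 *   grid[i0+ix][j0+iy][k0+iz] += qn * thx[ix]*thy[iy]*thz[iz],
 * i.e. a separable tensor product of the three 1D B-spline weights.
 */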
1390 #define DO_BSPLINE(order) \
1391 for(ithx=0; (ithx<order); ithx++) \
1393 index_x = (i0+ithx)*pny*pnz; \
1394 valx = qn*thx[ithx]; \
1396 for(ithy=0; (ithy<order); ithy++) \
1398 valxy = valx*thy[ithy]; \
1399 index_xy = index_x+(j0+ithy)*pnz; \
1401 for(ithz=0; (ithz<order); ithz++) \
1403 index_xyz = index_xy+(k0+ithz); \
1404 grid[index_xyz] += valxy*thz[ithz]; \
1410 static void spread_q_bsplines_thread(pmegrid_t *pmegrid,
1411 pme_atomcomm_t *atc, splinedata_t *spline,
1412 pme_spline_work_t *work)
1415 /* spread charges from home atoms to local grid */
1418 int b,i,nn,n,ithx,ithy,ithz,i0,j0,k0;
1420 int order,norder,index_x,index_xy,index_xyz;
1422 real *thx,*thy,*thz;
1423 int localsize, bndsize;
1424 int pnx,pny,pnz,ndatatot;
1427 pnx = pmegrid->n[XX];
1428 pny = pmegrid->n[YY];
1429 pnz = pmegrid->n[ZZ];
1431 offx = pmegrid->offset[XX];
1432 offy = pmegrid->offset[YY];
1433 offz = pmegrid->offset[ZZ];
1435 ndatatot = pnx*pny*pnz;
1436 grid = pmegrid->grid;
1437 for(i=0;i<ndatatot;i++)
1442 order = pmegrid->order;
1444 for(nn=0; nn<spline->n; nn++)
1446 n = spline->ind[nn];
1451 idxptr = atc->idx[n];
1454 i0 = idxptr[XX] - offx;
1455 j0 = idxptr[YY] - offy;
1456 k0 = idxptr[ZZ] - offz;
1458 thx = spline->theta[XX] + norder;
1459 thy = spline->theta[YY] + norder;
1460 thz = spline->theta[ZZ] + norder;
1465 #ifdef PME_SSE_UNALIGNED
1466 #define PME_SPREAD_SSE_ORDER4
1468 #define PME_SPREAD_SSE_ALIGNED
1471 #include "pme_sse_single.h"
1478 #define PME_SPREAD_SSE_ALIGNED
1480 #include "pme_sse_single.h"
1494 static void alloc_real_aligned(int n,real **ptr_raw,real **ptr)
1498 *ptr = (real *) (((size_t) *ptr_raw + 16) & (~((size_t) 15)));
1501 static void set_grid_alignment(int *pmegrid_nz,int pme_order)
1505 #ifndef PME_SSE_UNALIGNED
1510 /* Round nz up to a multiple of 4 to ensure alignment */
1511 *pmegrid_nz = ((*pmegrid_nz + 3) & ~3);
1516 static void set_gridsize_alignment(int *gridsize,int pme_order)
1519 #ifndef PME_SSE_UNALIGNED
1522     /* Add extra elements to ensure that aligned operations do not go
1523 * beyond the allocated grid size.
1524 * Note that for pme_order=5, the pme grid z-size alignment
1525 * ensures that we will not go beyond the grid size.
1533 static void pmegrid_init(pmegrid_t *grid,
1534 int cx, int cy, int cz,
1535 int x0, int y0, int z0,
1536 int x1, int y1, int z1,
1537 gmx_bool set_alignment,
1546 grid->offset[XX] = x0;
1547 grid->offset[YY] = y0;
1548 grid->offset[ZZ] = z0;
1549 grid->n[XX] = x1 - x0 + pme_order - 1;
1550 grid->n[YY] = y1 - y0 + pme_order - 1;
1551 grid->n[ZZ] = z1 - z0 + pme_order - 1;
1554 set_grid_alignment(&nz,pme_order);
1559 else if (nz != grid->n[ZZ])
1561 gmx_incons("pmegrid_init call with an unaligned z size");
1564 grid->order = pme_order;
1567 gridsize = grid->n[XX]*grid->n[YY]*grid->n[ZZ];
1568 set_gridsize_alignment(&gridsize,pme_order);
1569 snew_aligned(grid->grid,gridsize,16);
1577 static int div_round_up(int numerator,int denominator)
1579     return (numerator + denominator - 1)/denominator;
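/* Added example: div_round_up(10,4) == 3, i.e. the smallest integer m
 * such that m*denominator >= numerator.
 */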
1582 static void make_subgrid_division(const ivec n,int ovl,int nthread,
1585 int gsize_opt,gsize;
1590 for(nsx=1; nsx<=nthread; nsx++)
1592 if (nthread % nsx == 0)
1594 for(nsy=1; nsy<=nthread; nsy++)
1596 if (nsx*nsy <= nthread && nthread % (nsx*nsy) == 0)
1598 nsz = nthread/(nsx*nsy);
1600 /* Determine the number of grid points per thread */
1602 (div_round_up(n[XX],nsx) + ovl)*
1603 (div_round_up(n[YY],nsy) + ovl)*
1604 (div_round_up(n[ZZ],nsz) + ovl);
1606 /* Minimize the number of grids points per thread
1607 * and, secondarily, the number of cuts in minor dimensions.
1609 if (gsize_opt == -1 ||
1610 gsize < gsize_opt ||
1611 (gsize == gsize_opt &&
1612 (nsz < nsub[ZZ] || (nsz == nsub[ZZ] && nsy < nsub[YY]))))
1624 env = getenv("GMX_PME_THREAD_DIVISION");
1627 sscanf(env,"%d %d %d",&nsub[XX],&nsub[YY],&nsub[ZZ]);
1630 if (nsub[XX]*nsub[YY]*nsub[ZZ] != nthread)
1632 gmx_fatal(FARGS,"PME grid thread division (%d x %d x %d) does not match the total number of threads (%d)",nsub[XX],nsub[YY],nsub[ZZ],nthread);
1636 static void pmegrids_init(pmegrids_t *grids,
1637 int nx,int ny,int nz,int nz_base,
1643 ivec n,n_base,g0,g1;
1644 int t,x,y,z,d,i,tfac;
1647 n[XX] = nx - (pme_order - 1);
1648 n[YY] = ny - (pme_order - 1);
1649 n[ZZ] = nz - (pme_order - 1);
1651 copy_ivec(n,n_base);
1652 n_base[ZZ] = nz_base;
1654 pmegrid_init(&grids->grid,0,0,0,0,0,0,n[XX],n[YY],n[ZZ],FALSE,pme_order,
1657 grids->nthread = nthread;
1659 make_subgrid_division(n_base,pme_order-1,grids->nthread,grids->nc);
1661 if (grids->nthread > 1)
1667 for(d=0; d<DIM; d++)
1669 nst[d] = div_round_up(n[d],grids->nc[d]) + pme_order - 1;
1671 set_grid_alignment(&nst[ZZ],pme_order);
1675 fprintf(debug,"pmegrid thread local division: %d x %d x %d\n",
1676 grids->nc[XX],grids->nc[YY],grids->nc[ZZ]);
1677 fprintf(debug,"pmegrid %d %d %d max thread pmegrid %d %d %d\n",
1679 nst[XX],nst[YY],nst[ZZ]);
1682 snew(grids->grid_th,grids->nthread);
1684 gridsize = nst[XX]*nst[YY]*nst[ZZ];
1685 set_gridsize_alignment(&gridsize,pme_order);
1686 snew_aligned(grid_all,
1687 grids->nthread*gridsize+(grids->nthread+1)*GMX_CACHE_SEP,
1690 for(x=0; x<grids->nc[XX]; x++)
1692 for(y=0; y<grids->nc[YY]; y++)
1694 for(z=0; z<grids->nc[ZZ]; z++)
1696 pmegrid_init(&grids->grid_th[t],
1698 (n[XX]*(x ))/grids->nc[XX],
1699 (n[YY]*(y ))/grids->nc[YY],
1700 (n[ZZ]*(z ))/grids->nc[ZZ],
1701 (n[XX]*(x+1))/grids->nc[XX],
1702 (n[YY]*(y+1))/grids->nc[YY],
1703 (n[ZZ]*(z+1))/grids->nc[ZZ],
1706 grid_all+GMX_CACHE_SEP+t*(gridsize+GMX_CACHE_SEP));
1713 snew(grids->g2t,DIM);
1715 for(d=DIM-1; d>=0; d--)
1717 snew(grids->g2t[d],n[d]);
1719 for(i=0; i<n[d]; i++)
1721 /* The second check should match the parameters
1722 * of the pmegrid_init call above.
1724 while (t + 1 < grids->nc[d] && i >= (n[d]*(t+1))/grids->nc[d])
1728 grids->g2t[d][i] = t*tfac;
1731 tfac *= grids->nc[d];
1735 case XX: max_comm_lines = overlap_x; break;
1736 case YY: max_comm_lines = overlap_y; break;
1737 case ZZ: max_comm_lines = pme_order - 1; break;
1739 grids->nthread_comm[d] = 0;
1740 while ((n[d]*grids->nthread_comm[d])/grids->nc[d] < max_comm_lines)
1742 grids->nthread_comm[d]++;
1746 fprintf(debug,"pmegrid thread grid communication range in %c: %d\n",
1747 'x'+d,grids->nthread_comm[d]);
1749 /* It should be possible to make grids->nthread_comm[d]==grids->nc[d]
1750 * work, but this is not a problematic restriction.
1752 if (grids->nc[d] > 1 && grids->nthread_comm[d] > grids->nc[d])
1754 gmx_fatal(FARGS,"Too many threads for PME (%d) compared to the number of grid lines, reduce the number of threads doing PME",grids->nthread);
1760 static void pmegrids_destroy(pmegrids_t *grids)
1764 if (grids->grid.grid != NULL)
1766 sfree(grids->grid.grid);
1768 if (grids->nthread > 0)
1770 for(t=0; t<grids->nthread; t++)
1772 sfree(grids->grid_th[t].grid);
1774 sfree(grids->grid_th);
1780 static void realloc_work(pme_work_t *work,int nkx)
1782 if (nkx > work->nalloc)
1785 srenew(work->mhx ,work->nalloc);
1786 srenew(work->mhy ,work->nalloc);
1787 srenew(work->mhz ,work->nalloc);
1788 srenew(work->m2 ,work->nalloc);
1789 srenew(work->denom,work->nalloc);
1790 /* Allocate an aligned pointer for SSE operations, including 3 extra
1791 * elements at the end since SSE operates on 4 elements at a time.
1793 sfree_aligned(work->denom);
1794 sfree_aligned(work->tmp1);
1795 sfree_aligned(work->eterm);
1796 snew_aligned(work->denom,work->nalloc+3,16);
1797 snew_aligned(work->tmp1 ,work->nalloc+3,16);
1798 snew_aligned(work->eterm,work->nalloc+3,16);
1799 srenew(work->m2inv,work->nalloc);
1804 static void free_work(pme_work_t *work)
1810 sfree_aligned(work->denom);
1811 sfree_aligned(work->tmp1);
1812 sfree_aligned(work->eterm);
1818 /* Calculate exponentials through SSE in float precision */
1819 inline static void calc_exponentials(int start, int end, real f, real *d_aligned, real *r_aligned, real *e_aligned)
1822 const __m128 two = _mm_set_ps(2.0f,2.0f,2.0f,2.0f);
1825 __m128 tmp_d1,d_inv,tmp_r,tmp_e;
1827 f_sse = _mm_load1_ps(&f);
1828 for(kx=0; kx<end; kx+=4)
1830 tmp_d1 = _mm_load_ps(d_aligned+kx);
1831 lu = _mm_rcp_ps(tmp_d1);
1832 d_inv = _mm_mul_ps(lu,_mm_sub_ps(two,_mm_mul_ps(lu,tmp_d1)));
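        /* Added note: _mm_rcp_ps gives only ~12 bits of precision, so the
         * line above applies one Newton-Raphson refinement,
         * x1 = x0*(2 - d*x0), to the approximate reciprocal.
         */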
1833 tmp_r = _mm_load_ps(r_aligned+kx);
1834 tmp_r = gmx_mm_exp_ps(tmp_r);
1835 tmp_e = _mm_mul_ps(f_sse,d_inv);
1836 tmp_e = _mm_mul_ps(tmp_e,tmp_r);
1837 _mm_store_ps(e_aligned+kx,tmp_e);
1842 inline static void calc_exponentials(int start, int end, real f, real *d, real *r, real *e)
1845 for(kx=start; kx<end; kx++)
1849 for(kx=start; kx<end; kx++)
1853 for(kx=start; kx<end; kx++)
1855 e[kx] = f*r[kx]*d[kx];
1861 static int solve_pme_yzx(gmx_pme_t pme,t_complex *grid,
1862 real ewaldcoeff,real vol,
1864 int nthread,int thread)
1866 /* do recip sum over local cells in grid */
1867 /* y major, z middle, x minor or continuous */
1869 int kx,ky,kz,maxkx,maxky,maxkz;
1870 int nx,ny,nz,iyz0,iyz1,iyz,iy,iz,kxstart,kxend;
1872 real factor=M_PI*M_PI/(ewaldcoeff*ewaldcoeff);
1873 real ets2,struct2,vfactor,ets2vf;
1874 real d1,d2,energy=0;
1876 real virxx=0,virxy=0,virxz=0,viryy=0,viryz=0,virzz=0;
1877 real rxx,ryx,ryy,rzx,rzy,rzz;
1879 real *mhx,*mhy,*mhz,*m2,*denom,*tmp1,*eterm,*m2inv;
1880 real mhxk,mhyk,mhzk,m2k;
1883 ivec local_ndata,local_offset,local_size;
1886 elfac = ONE_4PI_EPS0/pme->epsilon_r;
1892 /* Dimensions should be identical for A/B grid, so we just use A here */
1893 gmx_parallel_3dfft_complex_limits(pme->pfft_setupA,
1899 rxx = pme->recipbox[XX][XX];
1900 ryx = pme->recipbox[YY][XX];
1901 ryy = pme->recipbox[YY][YY];
1902 rzx = pme->recipbox[ZZ][XX];
1903 rzy = pme->recipbox[ZZ][YY];
1904 rzz = pme->recipbox[ZZ][ZZ];
1910 work = &pme->work[thread];
1915 denom = work->denom;
1917 eterm = work->eterm;
1918 m2inv = work->m2inv;
1920 iyz0 = local_ndata[YY]*local_ndata[ZZ]* thread /nthread;
1921 iyz1 = local_ndata[YY]*local_ndata[ZZ]*(thread+1)/nthread;
1923 for(iyz=iyz0; iyz<iyz1; iyz++)
1925 iy = iyz/local_ndata[ZZ];
1926 iz = iyz - iy*local_ndata[ZZ];
1928 ky = iy + local_offset[YY];
1939 by = M_PI*vol*pme->bsp_mod[YY][ky];
1941 kz = iz + local_offset[ZZ];
1945 bz = pme->bsp_mod[ZZ][kz];
1947 /* 0.5 correction for corner points */
1949 if (kz == 0 || kz == (nz+1)/2)
1954 p0 = grid + iy*local_size[ZZ]*local_size[XX] + iz*local_size[XX];
1956 /* We should skip the k-space point (0,0,0) */
1957 if (local_offset[XX] > 0 || ky > 0 || kz > 0)
1959 kxstart = local_offset[XX];
1963 kxstart = local_offset[XX] + 1;
1966 kxend = local_offset[XX] + local_ndata[XX];
1970 /* More expensive inner loop, especially because of the storage
1971             * of the mh elements in arrays.
1972 * Because x is the minor grid index, all mh elements
1973 * depend on kx for triclinic unit cells.
1976 /* Two explicit loops to avoid a conditional inside the loop */
1977 for(kx=kxstart; kx<maxkx; kx++)
1982 mhyk = mx * ryx + my * ryy;
1983 mhzk = mx * rzx + my * rzy + mz * rzz;
1984 m2k = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
1989 denom[kx] = m2k*bz*by*pme->bsp_mod[XX][kx];
1990 tmp1[kx] = -factor*m2k;
1993 for(kx=maxkx; kx<kxend; kx++)
1998 mhyk = mx * ryx + my * ryy;
1999 mhzk = mx * rzx + my * rzy + mz * rzz;
2000 m2k = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
2005 denom[kx] = m2k*bz*by*pme->bsp_mod[XX][kx];
2006 tmp1[kx] = -factor*m2k;
2009 for(kx=kxstart; kx<kxend; kx++)
2011 m2inv[kx] = 1.0/m2[kx];
2014 calc_exponentials(kxstart,kxend,elfac,denom,tmp1,eterm);
2016 for(kx=kxstart; kx<kxend; kx++,p0++)
2021 p0->re = d1*eterm[kx];
2022 p0->im = d2*eterm[kx];
2024 struct2 = 2.0*(d1*d1+d2*d2);
2026 tmp1[kx] = eterm[kx]*struct2;
2029 for(kx=kxstart; kx<kxend; kx++)
2031 ets2 = corner_fac*tmp1[kx];
2032 vfactor = (factor*m2[kx] + 1.0)*2.0*m2inv[kx];
2035 ets2vf = ets2*vfactor;
2036 virxx += ets2vf*mhx[kx]*mhx[kx] - ets2;
2037 virxy += ets2vf*mhx[kx]*mhy[kx];
2038 virxz += ets2vf*mhx[kx]*mhz[kx];
2039 viryy += ets2vf*mhy[kx]*mhy[kx] - ets2;
2040 viryz += ets2vf*mhy[kx]*mhz[kx];
2041 virzz += ets2vf*mhz[kx]*mhz[kx] - ets2;
2046 /* We don't need to calculate the energy and the virial.
2047 * In this case the triclinic overhead is small.
2050 /* Two explicit loops to avoid a conditional inside the loop */
2052 for(kx=kxstart; kx<maxkx; kx++)
2057 mhyk = mx * ryx + my * ryy;
2058 mhzk = mx * rzx + my * rzy + mz * rzz;
2059 m2k = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
2060 denom[kx] = m2k*bz*by*pme->bsp_mod[XX][kx];
2061 tmp1[kx] = -factor*m2k;
2064 for(kx=maxkx; kx<kxend; kx++)
2069 mhyk = mx * ryx + my * ryy;
2070 mhzk = mx * rzx + my * rzy + mz * rzz;
2071 m2k = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
2072 denom[kx] = m2k*bz*by*pme->bsp_mod[XX][kx];
2073 tmp1[kx] = -factor*m2k;
2076 calc_exponentials(kxstart,kxend,elfac,denom,tmp1,eterm);
2078 for(kx=kxstart; kx<kxend; kx++,p0++)
2083 p0->re = d1*eterm[kx];
2084 p0->im = d2*eterm[kx];
2091 /* Update virial with local values.
2092 * The virial is symmetric by definition.
2093 * this virial seems ok for isotropic scaling, but I'm
2094 * experiencing problems on semiisotropic membranes.
2095 * IS THAT COMMENT STILL VALID??? (DvdS, 2001/02/07).
2097 work->vir[XX][XX] = 0.25*virxx;
2098 work->vir[YY][YY] = 0.25*viryy;
2099 work->vir[ZZ][ZZ] = 0.25*virzz;
2100 work->vir[XX][YY] = work->vir[YY][XX] = 0.25*virxy;
2101 work->vir[XX][ZZ] = work->vir[ZZ][XX] = 0.25*virxz;
2102 work->vir[YY][ZZ] = work->vir[ZZ][YY] = 0.25*viryz;
2104 /* This energy should be corrected for a charged system */
2105 work->energy = 0.5*energy;
2108 /* Return the loop count */
2109 return local_ndata[YY]*local_ndata[XX];
2112 static void get_pme_ener_vir(const gmx_pme_t pme,int nthread,
2113 real *mesh_energy,matrix vir)
2115 /* This function sums output over threads
2116 * and should therefore only be called after thread synchronization.
2120 *mesh_energy = pme->work[0].energy;
2121 copy_mat(pme->work[0].vir,vir);
2123 for(thread=1; thread<nthread; thread++)
2125 *mesh_energy += pme->work[thread].energy;
2126 m_add(vir,pme->work[thread].vir,vir);
2130 static int solve_pme_yzx_wrapper(gmx_pme_t pme,t_complex *grid,
2131 real ewaldcoeff,real vol,
2132 gmx_bool bEnerVir,real *mesh_energy,matrix vir)
2137 nthread = pme->nthread;
2139 #pragma omp parallel for num_threads(nthread) schedule(static)
2140 for(thread=0; thread<nthread; thread++)
2144 n = solve_pme_yzx(pme,grid,ewaldcoeff,vol,bEnerVir,nthread,thread);
2153 get_pme_ener_vir(pme,nthread,mesh_energy,vir);
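/* Added note: DO_FSPLINE(order) is the adjoint of DO_BSPLINE: for one
 * particle it sweeps the same order^3 block of grid points and accumulates
 *   fx += dthx[ix]* thy[iy]* thz[iz] * grid[...]
 *   fy +=  thx[ix]*dthy[iy]* thz[iz] * grid[...]
 *   fz +=  thx[ix]* thy[iy]*dthz[iz] * grid[...]
 * i.e. the gradient of the interpolated potential with respect to the
 * fractional coordinates, which gather_f_bsplines then converts to
 * Cartesian forces using the reciprocal box.
 */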
2160 #define DO_FSPLINE(order) \
2161 for(ithx=0; (ithx<order); ithx++) \
2163 index_x = (i0+ithx)*pny*pnz; \
2167 for(ithy=0; (ithy<order); ithy++) \
2169 index_xy = index_x+(j0+ithy)*pnz; \
2174 for(ithz=0; (ithz<order); ithz++) \
2176 gval = grid[index_xy+(k0+ithz)]; \
2177 fxy1 += thz[ithz]*gval; \
2178 fz1 += dthz[ithz]*gval; \
2187 void gather_f_bsplines(gmx_pme_t pme,real *grid,
2188 gmx_bool bClearF,pme_atomcomm_t *atc,
2189 splinedata_t *spline,
2192 /* sum forces for local particles */
2193 int nn,n,ithx,ithy,ithz,i0,j0,k0;
2194 int index_x,index_xy;
2195 int nx,ny,nz,pnx,pny,pnz;
2197 real tx,ty,dx,dy,qn;
2200 real *thx,*thy,*thz,*dthx,*dthy,*dthz;
2202 real rxx,ryx,ryy,rzx,rzy,rzz;
2205 pme_spline_work_t *work;
2207 work = &pme->spline_work;
2209 order = pme->pme_order;
2210 thx = spline->theta[XX];
2211 thy = spline->theta[YY];
2212 thz = spline->theta[ZZ];
2213 dthx = spline->dtheta[XX];
2214 dthy = spline->dtheta[YY];
2215 dthz = spline->dtheta[ZZ];
2219 pnx = pme->pmegrid_nx;
2220 pny = pme->pmegrid_ny;
2221 pnz = pme->pmegrid_nz;
2223 rxx = pme->recipbox[XX][XX];
2224 ryx = pme->recipbox[YY][XX];
2225 ryy = pme->recipbox[YY][YY];
2226 rzx = pme->recipbox[ZZ][XX];
2227 rzy = pme->recipbox[ZZ][YY];
2228 rzz = pme->recipbox[ZZ][ZZ];
2230 for(nn=0; nn<spline->n; nn++)
2232 n = spline->ind[nn];
2246 idxptr = atc->idx[n];
2253 /* Pointer arithmetic alert, next six statements */
2254 thx = spline->theta[XX] + norder;
2255 thy = spline->theta[YY] + norder;
2256 thz = spline->theta[ZZ] + norder;
2257 dthx = spline->dtheta[XX] + norder;
2258 dthy = spline->dtheta[YY] + norder;
2259 dthz = spline->dtheta[ZZ] + norder;
2264 #ifdef PME_SSE_UNALIGNED
2265 #define PME_GATHER_F_SSE_ORDER4
2267 #define PME_GATHER_F_SSE_ALIGNED
2270 #include "pme_sse_single.h"
2277 #define PME_GATHER_F_SSE_ALIGNED
2279 #include "pme_sse_single.h"
2289 atc->f[n][XX] += -qn*( fx*nx*rxx );
2290 atc->f[n][YY] += -qn*( fx*nx*ryx + fy*ny*ryy );
2291 atc->f[n][ZZ] += -qn*( fx*nx*rzx + fy*ny*rzy + fz*nz*rzz );
2294 /* Since the energy and not forces are interpolated
2295 * the net force might not be exactly zero.
2296 * This can be solved by also interpolating F, but
2297 * that comes at a cost.
2298 * A better hack is to remove the net force every
2299 * step, but that must be done at a higher level
2300 * since this routine doesn't see all atoms if running
2301      * in parallel. I don't know how important it is. EL 990726
2306 static real gather_energy_bsplines(gmx_pme_t pme,real *grid,
2307 pme_atomcomm_t *atc)
2309 splinedata_t *spline;
2310 int n,ithx,ithy,ithz,i0,j0,k0;
2311 int index_x,index_xy;
2313 real energy,pot,tx,ty,qn,gval;
2314 real *thx,*thy,*thz;
2318 spline = &atc->spline[0];
2320 order = pme->pme_order;
2323 for(n=0; (n<atc->n); n++) {
2327 idxptr = atc->idx[n];
2334 /* Pointer arithmetic alert, next three statements */
2335 thx = spline->theta[XX] + norder;
2336 thy = spline->theta[YY] + norder;
2337 thz = spline->theta[ZZ] + norder;
2340 for(ithx=0; (ithx<order); ithx++)
2342 index_x = (i0+ithx)*pme->pmegrid_ny*pme->pmegrid_nz;
2345 for(ithy=0; (ithy<order); ithy++)
2347 index_xy = index_x+(j0+ithy)*pme->pmegrid_nz;
2350 for(ithz=0; (ithz<order); ithz++)
2352 gval = grid[index_xy+(k0+ithz)];
2353 pot += tx*ty*thz[ithz]*gval;
2366 /* Macro to force loop unrolling by fixing order.
2367 * This gives a significant performance gain.
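 *
 * Added note on the algorithm: the code below builds the cardinal B-spline
 * of the requested order with the standard recursion (as in the SPME paper
 * of Essmann et al.),
 *   M_k(u)  = ( u*M_{k-1}(u) + (k-u)*M_{k-1}(u-1) ) / (k-1)
 *   M_k'(u) =     M_{k-1}(u) -       M_{k-1}(u-1)
 * data[] holds M_order evaluated at the 'order' grid points covered by one
 * particle and ddata[] the corresponding derivatives.
 */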
2369 #define CALC_SPLINE(order) \
2373 real data[PME_ORDER_MAX]; \
2374 real ddata[PME_ORDER_MAX]; \
2376 for(j=0; (j<DIM); j++) \
2380 /* dr is relative offset from lower cell limit */ \
2381 data[order-1] = 0; \
2385 for(k=3; (k<order); k++) \
2387 div = 1.0/(k - 1.0); \
2388 data[k-1] = div*dr*data[k-2]; \
2389 for(l=1; (l<(k-1)); l++) \
2391 data[k-l-1] = div*((dr+l)*data[k-l-2]+(k-l-dr)* \
2394 data[0] = div*(1-dr)*data[0]; \
2396 /* differentiate */ \
2397 ddata[0] = -data[0]; \
2398 for(k=1; (k<order); k++) \
2400 ddata[k] = data[k-1] - data[k]; \
2403 div = 1.0/(order - 1); \
2404 data[order-1] = div*dr*data[order-2]; \
2405 for(l=1; (l<(order-1)); l++) \
2407 data[order-l-1] = div*((dr+l)*data[order-l-2]+ \
2408 (order-l-dr)*data[order-l-1]); \
2410 data[0] = div*(1 - dr)*data[0]; \
2412 for(k=0; k<order; k++) \
2414 theta[j][i*order+k] = data[k]; \
2415 dtheta[j][i*order+k] = ddata[k]; \
2420 void make_bsplines(splinevec theta,splinevec dtheta,int order,
2421 rvec fractx[],int nr,int ind[],real charge[],
2422 gmx_bool bFreeEnergy)
2424 /* construct splines for local atoms */
2430 /* With free energy we do not use the charge check.
2431 * In most cases this will be more efficient than calling make_bsplines
2432 * twice, since usually more than half the particles have charges.
2435 if (bFreeEnergy || charge[ii] != 0.0) {
2438 case 4: CALC_SPLINE(4); break;
2439 case 5: CALC_SPLINE(5); break;
2440 default: CALC_SPLINE(order); break;
2447 void make_dft_mod(real *mod,real *data,int ndata)
2452 for(i=0;i<ndata;i++) {
2454 for(j=0;j<ndata;j++) {
2455 arg=(2.0*M_PI*i*j)/ndata;
2456 sc+=data[j]*cos(arg);
2457 ss+=data[j]*sin(arg);
2461 for(i=0;i<ndata;i++)
2463 mod[i]=(mod[i-1]+mod[i+1])*0.5;
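  /* Added note: the double loop above computes
   *   mod[k] = |sum_j data[j]*exp(2*pi*I*j*k/ndata)|^2,
   * the squared modulus of the DFT of the B-spline values; averaging with
   * the neighbours guards against (near-)zero moduli, which would otherwise
   * cause divisions by zero in the influence function.
   */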
2468 void make_bspline_moduli(splinevec bsp_mod,int nx,int ny,int nz,int order)
2470 int nmax=max(nx,max(ny,nz));
2471 real *data,*ddata,*bsp_data;
2477 snew(bsp_data,nmax);
2483 for(k=3;k<order;k++) {
2486 for(l=1;l<(k-1);l++)
2487 data[k-l-1]=div*(l*data[k-l-2]+(k-l)*data[k-l-1]);
2488 data[0]=div*data[0];
2492 for(k=1;k<order;k++)
2493 ddata[k]=data[k-1]-data[k];
2496 for(l=1;l<(order-1);l++)
2497 data[order-l-1]=div*(l*data[order-l-2]+(order-l)*data[order-l-1]);
2498 data[0]=div*data[0];
2502 for(i=1;i<=order;i++)
2503 bsp_data[i]=data[i-1];
2505 make_dft_mod(bsp_mod[XX],bsp_data,nx);
2506 make_dft_mod(bsp_mod[YY],bsp_data,ny);
2507 make_dft_mod(bsp_mod[ZZ],bsp_data,nz);
2514 static void setup_coordinate_communication(pme_atomcomm_t *atc)
2522 for(i=1; i<=nslab/2; i++) {
2523 fw = (atc->nodeid + i) % nslab;
2524 bw = (atc->nodeid - i + nslab) % nslab;
2525 if (n < nslab - 1) {
2526 atc->node_dest[n] = fw;
2527 atc->node_src[n] = bw;
2530 if (n < nslab - 1) {
2531 atc->node_dest[n] = bw;
2532 atc->node_src[n] = fw;
2538 int gmx_pme_destroy(FILE *log,gmx_pme_t *pmedata)
2544 fprintf(log,"Destroying PME data structures.\n");
2547 sfree((*pmedata)->nnx);
2548 sfree((*pmedata)->nny);
2549 sfree((*pmedata)->nnz);
2551 pmegrids_destroy(&(*pmedata)->pmegridA);
2553 sfree((*pmedata)->fftgridA);
2554 sfree((*pmedata)->cfftgridA);
2555 gmx_parallel_3dfft_destroy((*pmedata)->pfft_setupA);
2557 if ((*pmedata)->pmegridB.grid.grid != NULL)
2559 pmegrids_destroy(&(*pmedata)->pmegridB);
2560 sfree((*pmedata)->fftgridB);
2561 sfree((*pmedata)->cfftgridB);
2562 gmx_parallel_3dfft_destroy((*pmedata)->pfft_setupB);
2564 for(thread=0; thread<(*pmedata)->nthread; thread++)
2566 free_work(&(*pmedata)->work[thread]);
2568 sfree((*pmedata)->work);
2576 static int mult_up(int n,int f)
2578 return ((n + f - 1)/f)*f;
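/* Added example: mult_up(25,8) == 32, the smallest multiple of f that is
 * >= n; pme_load_imbalance below uses it to get the padded grid sizes that
 * the slab decomposition effectively works on.
 */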
2582 static double pme_load_imbalance(gmx_pme_t pme)
2587 nma = pme->nnodes_major;
2588 nmi = pme->nnodes_minor;
2590 n1 = mult_up(pme->nkx,nma)*mult_up(pme->nky,nmi)*pme->nkz;
2591 n2 = mult_up(pme->nkx,nma)*mult_up(pme->nkz,nmi)*pme->nky;
2592 n3 = mult_up(pme->nky,nma)*mult_up(pme->nkz,nmi)*pme->nkx;
2594 /* pme_solve is roughly double the cost of an fft */
2596 return (n1 + n2 + 3*n3)/(double)(6*pme->nkx*pme->nky*pme->nkz);
2599 static void init_atomcomm(gmx_pme_t pme,pme_atomcomm_t *atc, t_commrec *cr,
2600 int dimind,gmx_bool bSpread)
2604 atc->dimind = dimind;
2609 if (pme->nnodes > 1)
2611 atc->mpi_comm = pme->mpi_comm_d[dimind];
2612 MPI_Comm_size(atc->mpi_comm,&atc->nslab);
2613 MPI_Comm_rank(atc->mpi_comm,&atc->nodeid);
2617 fprintf(debug,"For PME atom communication in dimind %d: nslab %d rank %d\n",atc->dimind,atc->nslab,atc->nodeid);
2621 atc->bSpread = bSpread;
2622 atc->pme_order = pme->pme_order;
2626 /* These three allocations are not required for particle decomp. */
2627 snew(atc->node_dest,atc->nslab);
2628 snew(atc->node_src,atc->nslab);
2629 setup_coordinate_communication(atc);
2631 snew(atc->count_thread,pme->nthread);
2632 for(thread=0; thread<pme->nthread; thread++)
2634 snew(atc->count_thread[thread],atc->nslab);
2636 atc->count = atc->count_thread[0];
2637 snew(atc->rcount,atc->nslab);
2638 snew(atc->buf_index,atc->nslab);
2641 atc->nthread = pme->nthread;
2642 if (atc->nthread > 1)
2644 snew(atc->thread_plist,atc->nthread);
2646 snew(atc->spline,atc->nthread);
2647 for(thread=0; thread<atc->nthread; thread++)
2649 if (atc->nthread > 1)
2651 snew(atc->thread_plist[thread].n,atc->nthread+2*GMX_CACHE_SEP);
2652 atc->thread_plist[thread].n += GMX_CACHE_SEP;
2658 init_overlap_comm(pme_overlap_t * ol,
2668 int lbnd,rbnd,maxlr,b,i;
2671 pme_grid_comm_t *pgc;
2673 int fft_start,fft_end,send_index1,recv_index1;
2676 ol->mpi_comm = comm;
2679 ol->nnodes = nnodes;
2680 ol->nodeid = nodeid;
2682      * Linear translation of the PME grid won't affect reciprocal space
2683 * calculations, so to optimize we only interpolate "upwards",
2684 * which also means we only have to consider overlap in one direction.
2685 * I.e., particles on this node might also be spread to grid indices
2686 * that belong to higher nodes (modulo nnodes)
2689 snew(ol->s2g0,ol->nnodes+1);
2690 snew(ol->s2g1,ol->nnodes);
2691 if (debug) { fprintf(debug,"PME slab boundaries:"); }
2692 for(i=0; i<nnodes; i++)
2694 /* s2g0 the local interpolation grid start.
2695 * s2g1 the local interpolation grid end.
2696 * Because grid overlap communication only goes forward,
2697          * the grid ranges assigned to the FFT slabs should be rounded down.
2699 ol->s2g0[i] = ( i *ndata + 0 )/nnodes;
2700 ol->s2g1[i] = ((i+1)*ndata + nnodes-1)/nnodes + norder - 1;
2704 fprintf(debug," %3d %3d",ol->s2g0[i],ol->s2g1[i]);
2707 ol->s2g0[nnodes] = ndata;
2708 if (debug) { fprintf(debug,"\n"); }
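    /* A small worked example (added; the numbers are illustrative only):
     * with ndata = 30 grid lines, nnodes = 4 and norder = 4 the tables are
     *   s2g0 = {0, 7, 15, 22, 30}
     *   s2g1 = {11, 18, 26, 33}
     * so the spreading region of a slab can extend up to norder grid lines
     * (norder-1 from the spline support plus 1 from the rounding) into the
     * following slab(s), which is exactly the overlap communicated below.
     */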
2710 /* Determine with how many nodes we need to communicate the grid overlap */
2716 for(i=0; i<nnodes; i++)
2718 if ((i+b < nnodes && ol->s2g1[i] > ol->s2g0[i+b]) ||
2719 (i+b >= nnodes && ol->s2g1[i] > ol->s2g0[i+b-nnodes] + ndata))
2725 while (bCont && b < nnodes);
2726 ol->noverlap_nodes = b - 1;
2728 snew(ol->send_id,ol->noverlap_nodes);
2729 snew(ol->recv_id,ol->noverlap_nodes);
2730 for(b=0; b<ol->noverlap_nodes; b++)
2732 ol->send_id[b] = (ol->nodeid + (b + 1)) % ol->nnodes;
2733 ol->recv_id[b] = (ol->nodeid - (b + 1) + ol->nnodes) % ol->nnodes;
2735 snew(ol->comm_data, ol->noverlap_nodes);
2737 for(b=0; b<ol->noverlap_nodes; b++)
2739 pgc = &ol->comm_data[b];
2741 fft_start = ol->s2g0[ol->send_id[b]];
2742 fft_end = ol->s2g0[ol->send_id[b]+1];
2743 if (ol->send_id[b] < nodeid)
2748 send_index1 = ol->s2g1[nodeid];
2749 send_index1 = min(send_index1,fft_end);
2750 pgc->send_index0 = fft_start;
2751 pgc->send_nindex = max(0,send_index1 - pgc->send_index0);
2753         /* We always start receiving at the first index of our slab */
2754 fft_start = ol->s2g0[ol->nodeid];
2755 fft_end = ol->s2g0[ol->nodeid+1];
2756 recv_index1 = ol->s2g1[ol->recv_id[b]];
2757 if (ol->recv_id[b] > nodeid)
2759 recv_index1 -= ndata;
2761 recv_index1 = min(recv_index1,fft_end);
2762 pgc->recv_index0 = fft_start;
2763 pgc->recv_nindex = max(0,recv_index1 - pgc->recv_index0);
2766     /* For a non-divisible grid we need pme_order instead of pme_order-1 */
2767 snew(ol->sendbuf,norder*commplainsize);
2768 snew(ol->recvbuf,norder*commplainsize);
2772 make_gridindex5_to_localindex(int n,int local_start,int local_range,
2773 int **global_to_local,
2774 real **fraction_shift)
2782 for(i=0; (i<5*n); i++)
2784 /* Determine the global to local grid index */
2785 gtl[i] = (i - local_start + n) % n;
2786 /* For coordinates that fall within the local grid the fraction
2787          * is correct; we don't need to shift it.
2790 if (local_range < n)
2792 /* Due to rounding issues i could be 1 beyond the lower or
2793 * upper boundary of the local grid. Correct the index for this.
2794 * If we shift the index, we need to shift the fraction by
2795 * the same amount in the other direction to not affect
2797 * Note that due to this shifting the weights at the end of
2798 * the spline might change, but that will only involve values
2799 * between zero and values close to the precision of a real,
2800 * which is anyhow the accuracy of the whole mesh calculation.
2802 /* With local_range=0 we should not change i=local_start */
2803 if (i % n != local_start)
2810 else if (gtl[i] == local_range)
2812 gtl[i] = local_range - 1;
2819 *global_to_local = gtl;
2820 *fraction_shift = fsh;
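    /* Added note: the tables run over 5*n entries because the fractional
     * index computed in calc_interpolation_idx includes a +2.0 shift (i.e.
     * +2*n grid lines) to stay positive for triclinic boxes, so the integer
     * index that is looked up lies somewhere in [0,5*n) rather than [0,n);
     * the "% n" above folds it back onto the local grid.
     */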
2823 static void sse_mask_init(pme_spline_work_t *work,int order)
2830 zero_SSE = _mm_setzero_ps();
2832 for(of=0; of<8-(order-1); of++)
2836 tmp[i] = (i >= of && i < of+order ? 1 : 0);
2838 work->mask_SSE0[of] = _mm_loadu_ps(tmp);
2839 work->mask_SSE1[of] = _mm_loadu_ps(tmp+4);
2840 work->mask_SSE0[of] = _mm_cmpgt_ps(work->mask_SSE0[of],zero_SSE);
2841 work->mask_SSE1[of] = _mm_cmpgt_ps(work->mask_SSE1[of],zero_SSE);
2847 gmx_pme_check_grid_restrictions(FILE *fplog,char dim,int nnodes,int *nk)
2851 if (*nk % nnodes != 0)
2853 nk_new = nnodes*(*nk/nnodes + 1);
2855 if (2*nk_new >= 3*(*nk))
2857             gmx_fatal(FARGS,"The PME grid size in dim %c (%d) is not divisible by the number of nodes doing PME in dim %c (%d). The grid size would have to be increased by more than 50%% to make the grid divisible. Change the total number of nodes or the number of domain decomposition cells in x or the PME grid %c dimension (and the cut-off).",
2858 dim,*nk,dim,nnodes,dim);
2863 fprintf(fplog,"\nNOTE: The PME grid size in dim %c (%d) is not divisible by the number of nodes doing PME in dim %c (%d). Increasing the PME grid size in dim %c to %d. This will increase the accuracy and will not decrease the performance significantly on this number of nodes. For optimal performance change the total number of nodes or the number of domain decomposition cells in x or the PME grid %c dimension (and the cut-off).\n\n",
2864 dim,*nk,dim,nnodes,dim,nk_new,dim);
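/* Example: a grid of 50 points on 8 PME nodes is rounded up to 56 (8*7),
 * which only costs a note; a grid of 10 points on 8 nodes would need 16,
 * an increase of more than 50%, and triggers the fatal error above.
 */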
2871 int gmx_pme_init(gmx_pme_t * pmedata,
2877 gmx_bool bFreeEnergy,
2878 gmx_bool bReproducible,
2883 pme_atomcomm_t *atc;
2887 fprintf(debug,"Creating PME data structures.\n");
2890 pme->redist_init = FALSE;
2891 pme->sum_qgrid_tmp = NULL;
2892 pme->sum_qgrid_dd_tmp = NULL;
2893 pme->buf_nalloc = 0;
2894 pme->redist_buf_nalloc = 0;
2897 pme->bPPnode = TRUE;
2899 pme->nnodes_major = nnodes_major;
2900 pme->nnodes_minor = nnodes_minor;
2905 pme->mpi_comm = cr->mpi_comm_mygroup;
2907 MPI_Comm_rank(pme->mpi_comm,&pme->nodeid);
2908 MPI_Comm_size(pme->mpi_comm,&pme->nnodes);
2912 if (pme->nnodes == 1)
2914 pme->ndecompdim = 0;
2915 pme->nodeid_major = 0;
2916 pme->nodeid_minor = 0;
2918 pme->mpi_comm_d[0] = pme->mpi_comm_d[1] = MPI_COMM_NULL;
2923 if (nnodes_minor == 1)
2926 pme->mpi_comm_d[0] = pme->mpi_comm;
2927 pme->mpi_comm_d[1] = MPI_COMM_NULL;
2929 pme->ndecompdim = 1;
2930 pme->nodeid_major = pme->nodeid;
2931 pme->nodeid_minor = 0;
2934 else if (nnodes_major == 1)
2937 pme->mpi_comm_d[0] = MPI_COMM_NULL;
2938 pme->mpi_comm_d[1] = pme->mpi_comm;
2940 pme->ndecompdim = 1;
2941 pme->nodeid_major = 0;
2942 pme->nodeid_minor = pme->nodeid;
2946 if (pme->nnodes % nnodes_major != 0)
2948 gmx_incons("For 2D PME decomposition, #PME nodes must be divisible by the number of nodes in the major dimension");
2950 pme->ndecompdim = 2;
2953 MPI_Comm_split(pme->mpi_comm,pme->nodeid % nnodes_minor,
2954 pme->nodeid,&pme->mpi_comm_d[0]); /* My communicator along major dimension */
2955 MPI_Comm_split(pme->mpi_comm,pme->nodeid/nnodes_minor,
2956 pme->nodeid,&pme->mpi_comm_d[1]); /* My communicator along minor dimension */
2958 MPI_Comm_rank(pme->mpi_comm_d[0],&pme->nodeid_major);
2959 MPI_Comm_size(pme->mpi_comm_d[0],&pme->nnodes_major);
2960 MPI_Comm_rank(pme->mpi_comm_d[1],&pme->nodeid_minor);
2961 MPI_Comm_size(pme->mpi_comm_d[1],&pme->nnodes_minor);
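/* With this split, node i gets nodeid_major = i/nnodes_minor and
 * nodeid_minor = i%nnodes_minor. Example: 6 PME nodes with nnodes_minor=2
 * map to (major,minor) = (0,0),(0,1),(1,0),(1,1),(2,0),(2,1).
 */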
2964 pme->bPPnode = (cr->duty & DUTY_PP);
2967 pme->nthread = nthread;
2969 if (ir->ePBC == epbcSCREW)
2971 gmx_fatal(FARGS,"pme does not (yet) work with pbc = screw");
2974 pme->bFEP = ((ir->efep != efepNO) && bFreeEnergy);
2978 pme->pme_order = ir->pme_order;
2979 pme->epsilon_r = ir->epsilon_r;
2981 if (pme->pme_order > PME_ORDER_MAX)
2983 gmx_fatal(FARGS,"pme_order (%d) is larger than the maximum allowed value (%d). Modify and recompile the code if you really need such a high order.",
2984 pme->pme_order,PME_ORDER_MAX);
2987 /* Currently pme.c supports only the fft5d FFT code.
2988 * Therefore the grid always needs to be divisible by nnodes.
2989 * When the old 1D code is also supported again, change this check.
2991 * This check should be done before calling gmx_pme_init
2992 * and fplog should be passed instead of stderr.
2994 if (pme->ndecompdim >= 2)
2996 if (pme->ndecompdim >= 1)
2999 gmx_pme_check_grid_restrictions(pme->nodeid==0 ? stderr : NULL,
3000 'x',nnodes_major,&pme->nkx);
3001 gmx_pme_check_grid_restrictions(pme->nodeid==0 ? stderr : NULL,
3002 'y',nnodes_minor,&pme->nky);
3006 if (pme->nkx <= pme->pme_order*(pme->nnodes_major > 1 ? 2 : 1) ||
3007 pme->nky <= pme->pme_order*(pme->nnodes_minor > 1 ? 2 : 1) ||
3008 pme->nkz <= pme->pme_order)
3010 gmx_fatal(FARGS,"The pme grid dimensions need to be larger than pme_order (%d) and in parallel larger than 2*pme_ordern for x and/or y",pme->pme_order);
3013 if (pme->nnodes > 1) {
3017 MPI_Type_contiguous(DIM, mpi_type, &(pme->rvec_mpi));
3018 MPI_Type_commit(&(pme->rvec_mpi));
3021 /* Note that the charge spreading and force gathering, which usually
3022 * take about the same amount of time as FFT+solve_pme,
3023 * are always fully load balanced
3024 * (unless the charge distribution is inhomogeneous).
3027 imbal = pme_load_imbalance(pme);
3028 if (imbal >= 1.2 && pme->nodeid_major == 0 && pme->nodeid_minor == 0)
3032 "NOTE: The load imbalance in PME FFT and solve is %d%%.\n"
3033 " For optimal PME load balancing\n"
3034 " PME grid_x (%d) and grid_y (%d) should be divisible by #PME_nodes_x (%d)\n"
3035 " and PME grid_y (%d) and grid_z (%d) should be divisible by #PME_nodes_y (%d)\n"
3037 (int)((imbal-1)*100 + 0.5),
3038 pme->nkx,pme->nky,pme->nnodes_major,
3039 pme->nky,pme->nkz,pme->nnodes_minor);
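/* Example: pme_load_imbalance() returning 1.25 is reported as a 25% imbalance. */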
3043 /* For a non-divisible grid we need pme_order instead of pme_order-1 */
3044 /* In sum_qgrid_dd x overlap is copied in place: take padding into account.
3045 * y is always copied through a buffer: we don't need padding in z,
3046 * but we do need the overlap in x because of the communication order.
3048 init_overlap_comm(&pme->overlap[0],pme->pme_order,
3052 pme->nnodes_major,pme->nodeid_major,
3054 (div_round_up(pme->nky,pme->nnodes_minor)+pme->pme_order)*(pme->nkz+pme->pme_order-1));
3056 init_overlap_comm(&pme->overlap[1],pme->pme_order,
3060 pme->nnodes_minor,pme->nodeid_minor,
3062 (div_round_up(pme->nkx,pme->nnodes_major)+pme->pme_order)*pme->nkz);
3064 /* Check for a limitation of the (current) sum_fftgrid_dd code */
3065 if (pme->nthread > 1 &&
3066 (pme->overlap[0].noverlap_nodes > 1 ||
3067 pme->overlap[1].noverlap_nodes > 1))
3069 gmx_fatal(FARGS,"With threads the number of grid lines per node along x and or y should be pme_order (%d) or more or exactly pme_order-1",pme->pme_order);
3072 snew(pme->bsp_mod[XX],pme->nkx);
3073 snew(pme->bsp_mod[YY],pme->nky);
3074 snew(pme->bsp_mod[ZZ],pme->nkz);
3076 /* The required size of the interpolation grid, including overlap.
3077 * The allocated size (pmegrid_n?) might be slightly larger.
3079 pme->pmegrid_nx = pme->overlap[0].s2g1[pme->nodeid_major] -
3080 pme->overlap[0].s2g0[pme->nodeid_major];
3081 pme->pmegrid_ny = pme->overlap[1].s2g1[pme->nodeid_minor] -
3082 pme->overlap[1].s2g0[pme->nodeid_minor];
3083 pme->pmegrid_nz_base = pme->nkz;
3084 pme->pmegrid_nz = pme->pmegrid_nz_base + pme->pme_order - 1;
3085 set_grid_alignment(&pme->pmegrid_nz,pme->pme_order);
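/* The extra pme_order-1 points in z hold the periodic overlap of the
 * spreading stencils; set_grid_alignment may round pmegrid_nz up further
 * (e.g. for SSE alignment), so the allocated z size can exceed
 * pmegrid_nz_base + pme_order - 1.
 */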
3087 pme->pmegrid_start_ix = pme->overlap[0].s2g0[pme->nodeid_major];
3088 pme->pmegrid_start_iy = pme->overlap[1].s2g0[pme->nodeid_minor];
3089 pme->pmegrid_start_iz = 0;
3091 make_gridindex5_to_localindex(pme->nkx,
3092 pme->pmegrid_start_ix,
3093 pme->pmegrid_nx - (pme->pme_order-1),
3094 &pme->nnx,&pme->fshx);
3095 make_gridindex5_to_localindex(pme->nky,
3096 pme->pmegrid_start_iy,
3097 pme->pmegrid_ny - (pme->pme_order-1),
3098 &pme->nny,&pme->fshy);
3099 make_gridindex5_to_localindex(pme->nkz,
3100 pme->pmegrid_start_iz,
3101 pme->pmegrid_nz_base,
3102 &pme->nnz,&pme->fshz);
3104 pmegrids_init(&pme->pmegridA,
3105 pme->pmegrid_nx,pme->pmegrid_ny,pme->pmegrid_nz,
3106 pme->pmegrid_nz_base,
3109 pme->overlap[0].s2g1[pme->nodeid_major]-pme->overlap[0].s2g0[pme->nodeid_major+1],
3110 pme->overlap[1].s2g1[pme->nodeid_minor]-pme->overlap[1].s2g0[pme->nodeid_minor+1]);
3112 sse_mask_init(&pme->spline_work,pme->pme_order);
3114 ndata[0] = pme->nkx;
3115 ndata[1] = pme->nky;
3116 ndata[2] = pme->nkz;
3118 /* This routine will allocate the grid data to fit the FFTs */
3119 gmx_parallel_3dfft_init(&pme->pfft_setupA,ndata,
3120 &pme->fftgridA,&pme->cfftgridA,
3122 pme->overlap[0].s2g0,pme->overlap[1].s2g0,
3123 bReproducible,pme->nthread);
3127 pmegrids_init(&pme->pmegridB,
3128 pme->pmegrid_nx,pme->pmegrid_ny,pme->pmegrid_nz,
3129 pme->pmegrid_nz_base,
3132 pme->nkx % pme->nnodes_major != 0,
3133 pme->nky % pme->nnodes_minor != 0);
3135 gmx_parallel_3dfft_init(&pme->pfft_setupB,ndata,
3136 &pme->fftgridB,&pme->cfftgridB,
3138 pme->overlap[0].s2g0,pme->overlap[1].s2g0,
3139 bReproducible,pme->nthread);
3143 pme->pmegridB.grid.grid = NULL;
3144 pme->fftgridB = NULL;
3145 pme->cfftgridB = NULL;
3148 make_bspline_moduli(pme->bsp_mod,pme->nkx,pme->nky,pme->nkz,pme->pme_order);
3150 /* Use atc[0] for spreading */
3151 init_atomcomm(pme,&pme->atc[0],cr,nnodes_major > 1 ? 0 : 1,TRUE);
3152 if (pme->ndecompdim >= 2)
3154 init_atomcomm(pme,&pme->atc[1],cr,1,FALSE);
3157 if (pme->nnodes == 1) {
3158 pme->atc[0].n = homenr;
3159 pme_realloc_atomcomm_things(&pme->atc[0]);
3165 /* Use fft5d, order after FFT is y major, z, x minor */
3167 snew(pme->work,pme->nthread);
3168 for(thread=0; thread<pme->nthread; thread++)
3170 realloc_work(&pme->work[thread],pme->nkx);
3180 static void copy_local_grid(gmx_pme_t pme,
3181 pmegrids_t *pmegrids,int thread,real *fftgrid)
3183 ivec local_fft_ndata,local_fft_offset,local_fft_size;
3187 int offx,offy,offz,x,y,z,i0,i0t;
3192 gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
3196 fft_my = local_fft_size[YY];
3197 fft_mz = local_fft_size[ZZ];
3199 pmegrid = &pmegrids->grid_th[thread];
3201 nsx = pmegrid->n[XX];
3202 nsy = pmegrid->n[YY];
3203 nsz = pmegrid->n[ZZ];
3205 for(d=0; d<DIM; d++)
3207 nf[d] = min(pmegrid->n[d] - (pmegrid->order - 1),
3208 local_fft_ndata[d] - pmegrid->offset[d]);
3211 offx = pmegrid->offset[XX];
3212 offy = pmegrid->offset[YY];
3213 offz = pmegrid->offset[ZZ];
3215 /* Directly copy the non-overlapping parts of the local grids.
3216 * This also initializes the full grid.
3218 grid_th = pmegrid->grid;
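/* Index bookkeeping below: i0 walks the padded node FFT grid with strides
 * fft_my*fft_mz and fft_mz, while i0t walks the compact thread-local grid
 * with strides nsy*nsz and nsz; offx/offy/offz place this thread's block
 * inside the node grid.
 */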
3219 for(x=0; x<nf[XX]; x++)
3221 for(y=0; y<nf[YY]; y++)
3223 i0 = ((offx + x)*fft_my + (offy + y))*fft_mz + offz;
3224 i0t = (x*nsy + y)*nsz;
3225 for(z=0; z<nf[ZZ]; z++)
3227 fftgrid[i0+z] = grid_th[i0t+z];
3233 static void print_sendbuf(gmx_pme_t pme,real *sendbuf)
3235 ivec local_fft_ndata,local_fft_offset,local_fft_size;
3236 pme_overlap_t *overlap;
3240 gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
3244 /* Major dimension */
3245 overlap = &pme->overlap[0];
3247 nind = overlap->comm_data[0].send_nindex;
3249 for(y=0; y<local_fft_ndata[YY]; y++) {
3255 for(x=0; x<nind; x++) {
3256 for(y=0; y<local_fft_ndata[YY]; y++) {
3258 for(z=0; z<local_fft_ndata[ZZ]; z++) {
3259 if (sendbuf[i] != 0) n++;
3269 reduce_threadgrid_overlap(gmx_pme_t pme,
3270 const pmegrids_t *pmegrids,int thread,
3271 real *fftgrid,real *commbuf_x,real *commbuf_y)
3273 ivec local_fft_ndata,local_fft_offset,local_fft_size;
3274 int fft_nx,fft_ny,fft_nz;
3279 int offx,offy,offz,x,y,z,i0,i0t;
3280 int sx,sy,sz,fx,fy,fz,tx1,ty1,tz1,ox,oy,oz;
3281 gmx_bool bClearBufX,bClearBufY,bClearBufXY,bClearBuf;
3282 gmx_bool bCommX,bCommY;
3285 const pmegrid_t *pmegrid,*pmegrid_g,*pmegrid_f;
3286 const real *grid_th;
3289 gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
3293 fft_nx = local_fft_ndata[XX];
3294 fft_ny = local_fft_ndata[YY];
3295 fft_nz = local_fft_ndata[ZZ];
3297 fft_my = local_fft_size[YY];
3298 fft_mz = local_fft_size[ZZ];
3300 /* This routine is called when all threads have finished spreading.
3301 * Here each thread sums grid contributions calculated by other threads
3302 * to the thread local grid volume.
3303 * To minimize the number of grid copying operations,
3304 * this routine sums directly from the pmegrid to the fftgrid.
3307 /* Determine which part of the full node grid we should operate on:
3308 * this is our thread-local part of the full grid.
3310 pmegrid = &pmegrids->grid_th[thread];
3312 for(d=0; d<DIM; d++)
3314 ne[d] = min(pmegrid->offset[d]+pmegrid->n[d]-(pmegrid->order-1),
3315 local_fft_ndata[d]);
3318 offx = pmegrid->offset[XX];
3319 offy = pmegrid->offset[YY];
3320 offz = pmegrid->offset[ZZ];
3327 /* Now loop over all the thread data blocks that contribute
3328 * to the grid region we (our thread) are operating on.
3330 /* Note that fft_nx/y is equal to the number of grid points
3331 * between the first point of our node grid and the one of the next node.
3333 for(sx=0; sx>=-pmegrids->nthread_comm[XX]; sx--)
3335 fx = pmegrid->ci[XX] + sx;
3339 fx += pmegrids->nc[XX];
3341 bCommX = (pme->nnodes_major > 1);
3343 pmegrid_g = &pmegrids->grid_th[fx*pmegrids->nc[YY]*pmegrids->nc[ZZ]];
3344 ox += pmegrid_g->offset[XX];
3347 tx1 = min(ox + pmegrid_g->n[XX],ne[XX]);
3351 tx1 = min(ox + pmegrid_g->n[XX],pme->pme_order);
3354 for(sy=0; sy>=-pmegrids->nthread_comm[YY]; sy--)
3356 fy = pmegrid->ci[YY] + sy;
3360 fy += pmegrids->nc[YY];
3362 bCommY = (pme->nnodes_minor > 1);
3364 pmegrid_g = &pmegrids->grid_th[fy*pmegrids->nc[ZZ]];
3365 oy += pmegrid_g->offset[YY];
3368 ty1 = min(oy + pmegrid_g->n[YY],ne[YY]);
3372 ty1 = min(oy + pmegrid_g->n[YY],pme->pme_order);
3375 for(sz=0; sz>=-pmegrids->nthread_comm[ZZ]; sz--)
3377 fz = pmegrid->ci[ZZ] + sz;
3381 fz += pmegrids->nc[ZZ];
3384 pmegrid_g = &pmegrids->grid_th[fz];
3385 oz += pmegrid_g->offset[ZZ];
3386 tz1 = min(oz + pmegrid_g->n[ZZ],ne[ZZ]);
3388 if (sx == 0 && sy == 0 && sz == 0)
3390 /* We have already added our local contribution
3391 * before calling this routine, so skip it here.
3396 thread_f = (fx*pmegrids->nc[YY] + fy)*pmegrids->nc[ZZ] + fz;
3398 pmegrid_f = &pmegrids->grid_th[thread_f];
3400 grid_th = pmegrid_f->grid;
3402 nsx = pmegrid_f->n[XX];
3403 nsy = pmegrid_f->n[YY];
3404 nsz = pmegrid_f->n[ZZ];
3406 #ifdef DEBUG_PME_REDUCE
3407 printf("n%d t%d add %d %2d %2d %2d %2d %2d %2d %2d-%2d %2d-%2d, %2d-%2d %2d-%2d, %2d-%2d %2d-%2d\n",
3408 pme->nodeid,thread,thread_f,
3409 pme->pmegrid_start_ix,
3410 pme->pmegrid_start_iy,
3411 pme->pmegrid_start_iz,
3413 offx-ox,tx1-ox,offx,tx1,
3414 offy-oy,ty1-oy,offy,ty1,
3415 offz-oz,tz1-oz,offz,tz1);
3418 if (!(bCommX || bCommY))
3420 /* Copy from the thread local grid to the node grid */
3421 for(x=offx; x<tx1; x++)
3423 for(y=offy; y<ty1; y++)
3425 i0 = (x*fft_my + y)*fft_mz;
3426 i0t = ((x - ox)*nsy + (y - oy))*nsz - oz;
3427 for(z=offz; z<tz1; z++)
3429 fftgrid[i0+z] += grid_th[i0t+z];
3436 /* The order of this conditional decides
3437 * where the corner volume gets stored with x+y decomp.
3441 commbuf = commbuf_y;
3442 buf_my = ty1 - offy;
3445 /* We index commbuf modulo the local grid size */
3446 commbuf += buf_my*fft_nx*fft_nz;
3448 bClearBuf = bClearBufXY;
3449 bClearBufXY = FALSE;
3453 bClearBuf = bClearBufY;
3459 commbuf = commbuf_x;
3461 bClearBuf = bClearBufX;
3465 /* Copy to the communication buffer */
3466 for(x=offx; x<tx1; x++)
3468 for(y=offy; y<ty1; y++)
3470 i0 = (x*buf_my + y)*fft_nz;
3471 i0t = ((x - ox)*nsy + (y - oy))*nsz - oz;
3475 /* First access of commbuf, initialize it */
3476 for(z=offz; z<tz1; z++)
3478 commbuf[i0+z] = grid_th[i0t+z];
3483 for(z=offz; z<tz1; z++)
3485 commbuf[i0+z] += grid_th[i0t+z];
3497 static void sum_fftgrid_dd(gmx_pme_t pme,real *fftgrid)
3499 ivec local_fft_ndata,local_fft_offset,local_fft_size;
3500 pme_overlap_t *overlap;
3502 int recv_index0,recv_nindex;
3506 int ipulse,send_id,recv_id,datasize,gridsize,size_yx;
3507 real *sendptr,*recvptr;
3508 int x,y,z,indg,indb;
3510 /* Note that this routine is only used for forward communication.
3511 * Since the force gathering, unlike the charge spreading,
3512 * can be trivially parallelized over the particles,
3513 * the backwards process is much simpler and can use the "old"
3514 * communication setup.
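/* The overlap is reduced in two stages: first along the minor (y) dimension,
 * then along the major (x) dimension. With a 2D decomposition the y stage
 * also receives the corner volume destined for the x neighbor; it is
 * accumulated into overlap[0].sendbuf below so that the x stage forwards it.
 */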
3517 gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
3522 /* Currently supports only a single communication pulse */
3524 /* for(ipulse=0;ipulse<overlap->noverlap_nodes;ipulse++) */
3525 if (pme->nnodes_minor > 1)
3527 /* Minor dimension */
3528 overlap = &pme->overlap[1];
3530 if (pme->nnodes_major > 1)
3532 size_yx = pme->overlap[0].comm_data[0].send_nindex;
3538 datasize = (local_fft_ndata[XX]+size_yx)*local_fft_ndata[ZZ];
3542 send_id = overlap->send_id[ipulse];
3543 recv_id = overlap->recv_id[ipulse];
3544 send_nindex = overlap->comm_data[ipulse].send_nindex;
3545 /* recv_index0 = overlap->comm_data[ipulse].recv_index0; */
3547 recv_nindex = overlap->comm_data[ipulse].recv_nindex;
3549 sendptr = overlap->sendbuf;
3550 recvptr = overlap->recvbuf;
3553 printf("node %d comm %2d x %2d x %2d\n",pme->nodeid,
3554 local_fft_ndata[XX]+size_yx,send_nindex,local_fft_ndata[ZZ]);
3555 printf("node %d send %f, %f\n",pme->nodeid,
3556 sendptr[0],sendptr[send_nindex*datasize-1]);
3560 MPI_Sendrecv(sendptr,send_nindex*datasize,GMX_MPI_REAL,
3562 recvptr,recv_nindex*datasize,GMX_MPI_REAL,
3564 overlap->mpi_comm,&stat);
3567 for(x=0; x<local_fft_ndata[XX]; x++)
3569 for(y=0; y<recv_nindex; y++)
3571 indg = (x*local_fft_size[YY] + y)*local_fft_size[ZZ];
3572 indb = (x*recv_nindex + y)*local_fft_ndata[ZZ];
3573 for(z=0; z<local_fft_ndata[ZZ]; z++)
3575 fftgrid[indg+z] += recvptr[indb+z];
3579 if (pme->nnodes_major > 1)
3581 sendptr = pme->overlap[0].sendbuf;
3582 for(x=0; x<size_yx; x++)
3584 for(y=0; y<recv_nindex; y++)
3586 indg = (x*local_fft_ndata[YY] + y)*local_fft_ndata[ZZ];
3587 indb = ((local_fft_ndata[XX] + x)*recv_nindex +y)*local_fft_ndata[ZZ];
3588 for(z=0; z<local_fft_ndata[ZZ]; z++)
3590 sendptr[indg+z] += recvptr[indb+z];
3597 /* for(ipulse=0;ipulse<overlap->noverlap_nodes;ipulse++) */
3598 if (pme->nnodes_major > 1)
3600 /* Major dimension */
3601 overlap = &pme->overlap[0];
3603 datasize = local_fft_ndata[YY]*local_fft_ndata[ZZ];
3604 gridsize = local_fft_size[YY] *local_fft_size[ZZ];
3608 send_id = overlap->send_id[ipulse];
3609 recv_id = overlap->recv_id[ipulse];
3610 send_nindex = overlap->comm_data[ipulse].send_nindex;
3611 /* recv_index0 = overlap->comm_data[ipulse].recv_index0; */
3613 recv_nindex = overlap->comm_data[ipulse].recv_nindex;
3615 sendptr = overlap->sendbuf;
3616 recvptr = overlap->recvbuf;
3620 fprintf(debug,"PME fftgrid comm %2d x %2d x %2d\n",
3621 send_nindex,local_fft_ndata[YY],local_fft_ndata[ZZ]);
3625 MPI_Sendrecv(sendptr,send_nindex*datasize,GMX_MPI_REAL,
3627 recvptr,recv_nindex*datasize,GMX_MPI_REAL,
3629 overlap->mpi_comm,&stat);
3632 for(x=0; x<recv_nindex; x++)
3634 for(y=0; y<local_fft_ndata[YY]; y++)
3636 indg = (x*local_fft_size[YY] + y)*local_fft_size[ZZ];
3637 indb = (x*local_fft_ndata[YY] + y)*local_fft_ndata[ZZ];
3638 for(z=0; z<local_fft_ndata[ZZ]; z++)
3640 fftgrid[indg+z] += recvptr[indb+z];
3648 static void spread_on_grid(gmx_pme_t pme,
3649 pme_atomcomm_t *atc,pmegrids_t *grids,
3650 gmx_bool bCalcSplines,gmx_bool bSpread,
3654 #ifdef PME_TIME_THREADS
3655 gmx_cycles_t c1,c2,c3,ct1a,ct1b,ct1c;
3656 static double cs1=0,cs2=0,cs3=0;
3657 static double cs1a[6]={0,0,0,0,0,0};
3661 nthread = pme->nthread;
3663 #ifdef PME_TIME_THREADS
3664 c1 = omp_cyc_start();
3668 #pragma omp parallel for num_threads(nthread) schedule(static)
3669 for(thread=0; thread<nthread; thread++)
3673 start = atc->n* thread /nthread;
3674 end = atc->n*(thread+1)/nthread;
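/* Each thread gets a contiguous slice of the atoms; e.g. with atc->n=1000
 * and nthread=4, thread 2 handles atoms 500 up to (but not including) 750.
 */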
3676 /* Compute fftgrid index for all atoms,
3677 * with help of some extra variables.
3679 calc_interpolation_idx(pme,atc,start,end,thread);
3682 #ifdef PME_TIME_THREADS
3683 c1 = omp_cyc_end(c1);
3687 #ifdef PME_TIME_THREADS
3688 c2 = omp_cyc_start();
3690 #pragma omp parallel for num_threads(nthread) schedule(static)
3691 for(thread=0; thread<nthread; thread++)
3693 splinedata_t *spline;
3696 /* make local bsplines */
3697 if (grids->nthread == 1)
3699 spline = &atc->spline[0];
3703 grid = &grids->grid;
3707 spline = &atc->spline[thread];
3709 make_thread_local_ind(atc,thread,spline);
3711 grid = &grids->grid_th[thread];
3716 make_bsplines(spline->theta,spline->dtheta,pme->pme_order,
3717 atc->fractx,spline->n,spline->ind,atc->q,pme->bFEP);
3722 /* put local atoms on grid. */
3723 #ifdef PME_TIME_SPREAD
3724 ct1a = omp_cyc_start();
3726 spread_q_bsplines_thread(grid,atc,spline,&pme->spline_work);
3728 if (grids->nthread > 1)
3730 copy_local_grid(pme,grids,thread,fftgrid);
3732 #ifdef PME_TIME_SPREAD
3733 ct1a = omp_cyc_end(ct1a);
3734 cs1a[thread] += (double)ct1a;
3738 #ifdef PME_TIME_THREADS
3739 c2 = omp_cyc_end(c2);
3743 if (grids->nthread > 1)
3745 #ifdef PME_TIME_THREADS
3746 c3 = omp_cyc_start();
3748 #pragma omp parallel for num_threads(grids->nthread) schedule(static)
3749 for(thread=0; thread<grids->nthread; thread++)
3751 reduce_threadgrid_overlap(pme,grids,thread,
3753 pme->overlap[0].sendbuf,
3754 pme->overlap[1].sendbuf);
3755 #ifdef PRINT_PME_SENDBUF
3756 print_sendbuf(pme,pme->overlap[0].sendbuf);
3759 #ifdef PME_TIME_THREADS
3760 c3 = omp_cyc_end(c3);
3764 if (pme->nnodes > 1)
3766 /* Communicate the overlapping part of the fftgrid */
3767 sum_fftgrid_dd(pme,fftgrid);
3771 #ifdef PME_TIME_THREADS
3775 printf("idx %.2f spread %.2f red %.2f",
3776 cs1*1e-9,cs2*1e-9,cs3*1e-9);
3777 #ifdef PME_TIME_SPREAD
3778 for(thread=0; thread<nthread; thread++)
3779 printf(" %.2f",cs1a[thread]*1e-9);
3787 static void dump_grid(FILE *fp,
3788 int sx,int sy,int sz,int nx,int ny,int nz,
3789 int my,int mz,const real *g)
3799 fprintf(fp,"%2d %2d %2d %6.3f\n",
3800 sx+x,sy+y,sz+z,g[(x*my + y)*mz + z]);
3806 static void dump_local_fftgrid(gmx_pme_t pme,const real *fftgrid)
3808 ivec local_fft_ndata,local_fft_offset,local_fft_size;
3810 gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
3816 pme->pmegrid_start_ix,
3817 pme->pmegrid_start_iy,
3818 pme->pmegrid_start_iz,
3819 pme->pmegrid_nx-pme->pme_order+1,
3820 pme->pmegrid_ny-pme->pme_order+1,
3821 pme->pmegrid_nz-pme->pme_order+1,
3828 void gmx_pme_calc_energy(gmx_pme_t pme,int n,rvec *x,real *q,real *V)
3830 pme_atomcomm_t *atc;
3833 if (pme->nnodes > 1)
3835 gmx_incons("gmx_pme_calc_energy called in parallel");
3839 gmx_incons("gmx_pme_calc_energy with free energy");
3842 atc = &pme->atc_energy;
3844 atc->bSpread = TRUE;
3845 atc->pme_order = pme->pme_order;
3847 pme_realloc_atomcomm_things(atc);
3851 /* We only use the A-charges grid */
3852 grid = &pme->pmegridA;
3854 spread_on_grid(pme,atc,NULL,TRUE,FALSE,pme->fftgridA);
3856 *V = gather_energy_bsplines(pme,grid->grid.grid,atc);
3860 static void reset_pmeonly_counters(t_commrec *cr,gmx_wallcycle_t wcycle,
3861 t_nrnb *nrnb,t_inputrec *ir, gmx_large_int_t step_rel)
3863 /* Reset all the counters related to performance over the run */
3864 wallcycle_stop(wcycle,ewcRUN);
3865 wallcycle_reset_all(wcycle);
3867 ir->init_step += step_rel;
3868 ir->nsteps -= step_rel;
3869 wallcycle_start(wcycle,ewcRUN);
3873 int gmx_pmeonly(gmx_pme_t pme,
3874 t_commrec *cr, t_nrnb *nrnb,
3875 gmx_wallcycle_t wcycle,
3876 real ewaldcoeff, gmx_bool bGatherOnly,
3879 gmx_pme_pp_t pme_pp;
3882 rvec *x_pp=NULL,*f_pp=NULL;
3883 real *chargeA=NULL,*chargeB=NULL;
3885 int maxshift_x=0,maxshift_y=0;
3886 real energy,dvdlambda;
3891 gmx_large_int_t step,step_rel;
3894 pme_pp = gmx_pme_pp_init(cr);
3899 do /****** this is a quasi-loop over time steps! */
3901 /* Domain decomposition */
3902 natoms = gmx_pme_recv_q_x(pme_pp,
3903 &chargeA,&chargeB,box,&x_pp,&f_pp,
3904 &maxshift_x,&maxshift_y,
3910 /* We should stop: break out of the loop */
3914 step_rel = step - ir->init_step;
3917 wallcycle_start(wcycle,ewcRUN);
3919 wallcycle_start(wcycle,ewcPMEMESH);
3923 gmx_pme_do(pme,0,natoms,x_pp,f_pp,chargeA,chargeB,box,
3924 cr,maxshift_x,maxshift_y,nrnb,wcycle,vir,ewaldcoeff,
3925 &energy,lambda,&dvdlambda,
3926 GMX_PME_DO_ALL_F | (bEnerVir ? GMX_PME_CALC_ENER_VIR : 0));
3928 cycles = wallcycle_stop(wcycle,ewcPMEMESH);
3930 gmx_pme_send_force_vir_ener(pme_pp,
3931 f_pp,vir,energy,dvdlambda,
3936 if (step_rel == wcycle_get_reset_counters(wcycle))
3938 /* Reset all the counters related to performance over the run */
3939 reset_pmeonly_counters(cr,wcycle,nrnb,ir,step_rel);
3940 wcycle_set_reset_counters(wcycle, 0);
3943 } /***** end of quasi-loop, we stop with the break above */
3949 int gmx_pme_do(gmx_pme_t pme,
3950 int start, int homenr,
3952 real *chargeA, real *chargeB,
3953 matrix box, t_commrec *cr,
3954 int maxshift_x, int maxshift_y,
3955 t_nrnb *nrnb, gmx_wallcycle_t wcycle,
3956 matrix vir, real ewaldcoeff,
3957 real *energy, real lambda,
3958 real *dvdlambda, int flags)
3960 int q,d,i,j,ntot,npme;
3963 pme_atomcomm_t *atc=NULL;
3964 pmegrids_t *pmegrid=NULL;
3968 real *charge=NULL,*q_d;
3972 gmx_parallel_3dfft_t pfft_setup;
3974 t_complex * cfftgrid;
3977 if (pme->nnodes > 1) {
3980 if (atc->npd > atc->pd_nalloc) {
3981 atc->pd_nalloc = over_alloc_dd(atc->npd);
3982 srenew(atc->pd,atc->pd_nalloc);
3984 atc->maxshift = (atc->dimind==0 ? maxshift_x : maxshift_y);
3988 /* This could be necessary for TPI */
3989 pme->atc[0].n = homenr;
3992 for(q=0; q<(pme->bFEP ? 2 : 1); q++) {
3994 pmegrid = &pme->pmegridA;
3995 fftgrid = pme->fftgridA;
3996 cfftgrid = pme->cfftgridA;
3997 pfft_setup = pme->pfft_setupA;
3998 charge = chargeA+start;
4000 pmegrid = &pme->pmegridB;
4001 fftgrid = pme->fftgridB;
4002 cfftgrid = pme->cfftgridB;
4003 pfft_setup = pme->pfft_setupB;
4004 charge = chargeB+start;
4006 grid = pmegrid->grid.grid;
4007 /* Unpack structure */
4009 fprintf(debug,"PME: nnodes = %d, nodeid = %d\n",
4010 cr->nnodes,cr->nodeid);
4011 fprintf(debug,"Grid = %p\n",(void*)grid);
4013 gmx_fatal(FARGS,"No grid!");
4017 m_inv_ur0(box,pme->recipbox);
4019 if (pme->nnodes == 1) {
4021 if (DOMAINDECOMP(cr)) {
4023 pme_realloc_atomcomm_things(atc);
4029 wallcycle_start(wcycle,ewcPME_REDISTXF);
4030 for(d=pme->ndecompdim-1; d>=0; d--)
4032 if (d == pme->ndecompdim-1)
4040 n_d = pme->atc[d+1].n;
4046 if (atc->npd > atc->pd_nalloc) {
4047 atc->pd_nalloc = over_alloc_dd(atc->npd);
4048 srenew(atc->pd,atc->pd_nalloc);
4050 atc->maxshift = (atc->dimind==0 ? maxshift_x : maxshift_y);
4051 pme_calc_pidx_wrapper(n_d,pme->recipbox,x_d,atc);
4054 GMX_BARRIER(cr->mpi_comm_mygroup);
4055 /* Redistribute x (only once) and qA or qB */
4056 if (DOMAINDECOMP(cr)) {
4057 dd_pmeredist_x_q(pme, n_d, q==0, x_d, q_d, atc);
4059 pmeredist_pd(pme, TRUE, n_d, q==0, x_d, q_d, atc);
4064 wallcycle_stop(wcycle,ewcPME_REDISTXF);
4068 fprintf(debug,"Node= %6d, pme local particles=%6d\n",
4071 if (flags & GMX_PME_SPREAD_Q)
4073 wallcycle_start(wcycle,ewcPME_SPREADGATHER);
4075 /* Spread the charges on a grid */
4076 GMX_MPE_LOG(ev_spread_on_grid_start);
4079 spread_on_grid(pme,&pme->atc[0],pmegrid,q==0,TRUE,fftgrid);
4080 GMX_MPE_LOG(ev_spread_on_grid_finish);
4084 inc_nrnb(nrnb,eNR_WEIGHTS,DIM*atc->n);
4086 inc_nrnb(nrnb,eNR_SPREADQBSP,
4087 pme->pme_order*pme->pme_order*pme->pme_order*atc->n);
4089 if (pme->nthread == 1)
4091 wrap_periodic_pmegrid(pme,grid);
4093 /* sum contributions to local grid from other nodes */
4095 if (pme->nnodes > 1)
4097 GMX_BARRIER(cr->mpi_comm_mygroup);
4098 gmx_sum_qgrid_dd(pme,grid,GMX_SUM_QGRID_FORWARD);
4103 copy_pmegrid_to_fftgrid(pme,grid,fftgrid);
4106 wallcycle_stop(wcycle,ewcPME_SPREADGATHER);
4109 dump_local_fftgrid(pme,fftgrid);
4114 /* Here we start a large thread parallel region */
4115 #pragma omp parallel for num_threads(pme->nthread) schedule(static)
4116 for(thread=0; thread<pme->nthread; thread++)
4118 if (flags & GMX_PME_SOLVE)
4125 GMX_BARRIER(cr->mpi_comm_mygroup);
4126 GMX_MPE_LOG(ev_gmxfft3d_start);
4127 wallcycle_start(wcycle,ewcPME_FFT);
4129 gmx_parallel_3dfft_execute(pfft_setup,GMX_FFT_REAL_TO_COMPLEX,
4130 fftgrid,cfftgrid,thread,wcycle);
4133 wallcycle_stop(wcycle,ewcPME_FFT);
4134 GMX_MPE_LOG(ev_gmxfft3d_finish);
4138 /* solve in k-space for our local cells */
4141 GMX_BARRIER(cr->mpi_comm_mygroup);
4142 GMX_MPE_LOG(ev_solve_pme_start);
4143 wallcycle_start(wcycle,ewcPME_SOLVE);
4146 solve_pme_yzx(pme,cfftgrid,ewaldcoeff,
4147 box[XX][XX]*box[YY][YY]*box[ZZ][ZZ],
4148 flags & GMX_PME_CALC_ENER_VIR,
4149 pme->nthread,thread);
4152 wallcycle_stop(wcycle,ewcPME_SOLVE);
4154 GMX_MPE_LOG(ev_solve_pme_finish);
4155 inc_nrnb(nrnb,eNR_SOLVEPME,loop_count);
4159 if (flags & GMX_PME_CALC_F)
4164 GMX_BARRIER(cr->mpi_comm_mygroup);
4165 GMX_MPE_LOG(ev_gmxfft3d_start);
4167 wallcycle_start(wcycle,ewcPME_FFT);
4169 gmx_parallel_3dfft_execute(pfft_setup,GMX_FFT_COMPLEX_TO_REAL,
4170 cfftgrid,fftgrid,thread,wcycle);
4173 wallcycle_stop(wcycle,ewcPME_FFT);
4176 GMX_MPE_LOG(ev_gmxfft3d_finish);
4178 if (pme->nodeid == 0)
4180 ntot = pme->nkx*pme->nky*pme->nkz;
4181 npme = ntot*log((real)ntot)/log(2.0);
4182 inc_nrnb(nrnb,eNR_FFT,2*npme);
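/* ntot*log2(ntot) approximates the flop count of one 3D FFT over the full
 * grid; the factor 2 accounts for the forward and backward transforms, and
 * only node 0 adds it so the grid is not counted more than once.
 */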
4185 wallcycle_start(wcycle,ewcPME_SPREADGATHER);
4188 copy_fftgrid_to_pmegrid(pme,fftgrid,grid,pme->nthread,thread);
4191 /* End of thread parallel section.
4192 * With MPI we have to synchronize here before gmx_sum_qgrid_dd.
4195 if (flags & GMX_PME_CALC_F)
4197 /* distribute local grid to all nodes */
4199 if (pme->nnodes > 1) {
4200 GMX_BARRIER(cr->mpi_comm_mygroup);
4201 gmx_sum_qgrid_dd(pme,grid,GMX_SUM_QGRID_BACKWARD);
4206 unwrap_periodic_pmegrid(pme,grid);
4208 /* interpolate forces for our local atoms */
4209 GMX_BARRIER(cr->mpi_comm_mygroup);
4210 GMX_MPE_LOG(ev_gather_f_bsplines_start);
4214 /* If we are running without parallelization,
4215 * atc->f is the actual force array, not a buffer,
4216 * so we should not clear it.
4218 bClearF = (q == 0 && PAR(cr));
4219 #pragma omp parallel for num_threads(pme->nthread) schedule(static)
4220 for(thread=0; thread<pme->nthread; thread++)
4222 gather_f_bsplines(pme,grid,bClearF,atc,
4223 &atc->spline[thread],
4224 pme->bFEP ? (q==0 ? 1.0-lambda : lambda) : 1.0);
4229 GMX_MPE_LOG(ev_gather_f_bsplines_finish);
4231 inc_nrnb(nrnb,eNR_GATHERFBSP,
4232 pme->pme_order*pme->pme_order*pme->pme_order*pme->atc[0].n);
4233 wallcycle_stop(wcycle,ewcPME_SPREADGATHER);
4236 if (flags & GMX_PME_CALC_ENER_VIR)
4238 /* This should only be called on the master thread
4239 * and after the threads have synchronized.
4241 get_pme_ener_vir(pme,pme->nthread,&energy_AB[q],vir_AB[q]);
4245 if ((flags & GMX_PME_CALC_F) && pme->nnodes > 1) {
4246 wallcycle_start(wcycle,ewcPME_REDISTXF);
4247 for(d=0; d<pme->ndecompdim; d++)
4250 if (d == pme->ndecompdim - 1)
4257 n_d = pme->atc[d+1].n;
4258 f_d = pme->atc[d+1].f;
4260 GMX_BARRIER(cr->mpi_comm_mygroup);
4261 if (DOMAINDECOMP(cr)) {
4262 dd_pmeredist_f(pme,atc,n_d,f_d,
4263 d==pme->ndecompdim-1 && pme->bPPnode);
4265 pmeredist_pd(pme, FALSE, n_d, TRUE, f_d, NULL, atc);
4269 wallcycle_stop(wcycle,ewcPME_REDISTXF);
4274 *energy = energy_AB[0];
4275 m_add(vir,vir_AB[0],vir);
4277 *energy = (1.0-lambda)*energy_AB[0] + lambda*energy_AB[1];
4278 *dvdlambda += energy_AB[1] - energy_AB[0];
4279 for(i=0; i<DIM; i++)
4280 for(j=0; j<DIM; j++)
4281 vir[i][j] += (1.0-lambda)*vir_AB[0][i][j] + lambda*vir_AB[1][i][j];
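/* Linear interpolation in the coupling parameter lambda:
 * E = (1-lambda)*E_A + lambda*E_B, hence dE/dlambda = E_B - E_A,
 * and each virial component is mixed with the same weights.
 */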
4286 fprintf(debug,"PME mesh energy: %g\n",*energy);