/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 *
 * This source code is part of
 * GROningen MAchine for Chemical Simulations
 *
 * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 * Copyright (c) 2001-2004, The GROMACS development team,
 * check out http://www.gromacs.org for more information.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * If you want to redistribute modifications, please consider that
 * scientific software is very special. Version control is crucial -
 * bugs must be traceable. We will be happy to consider code for
 * inclusion in the official distribution, but derived work must not
 * be called official GROMACS. Details are found in the README & COPYING
 * files - if they are missing, get the official version at www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the papers on the package - you can find them in the top README file.
 *
 * For more info, check our website at http://www.gromacs.org
 *
 * GROwing Monsters And Cloning Shrimps
 */

/* IMPORTANT FOR DEVELOPERS:
 *
 * Triclinic pme stuff isn't entirely trivial, and we've experienced
 * some bugs during development (many of them due to me). To avoid
 * this in the future, please check the following things if you make
 * changes in this file:
 *
 * 1. You should obtain identical (at least to the PME precision)
 *    energies, forces, and virial for
 *    a rectangular box and a triclinic one where the z (or y) axis is
 *    tilted a whole box side. For instance you could use these boxes:
 *
 *    rectangular       triclinic
 *
 * 2. You should check the energy conservation in a triclinic box.
 *
 * It might seem like overkill, but better safe than sorry.
 */
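/* A minimal illustration of such a box pair (values chosen here for
 * illustration only; the original example table is not reproduced above):
 * a rectangular box diag(2,2,6) and a triclinic one whose z vector is
 * tilted by the whole x box side:
 *
 *    rectangular       triclinic
 *     2  0  0           2  0  0
 *     0  2  0           0  2  0
 *     0  0  6           2  0  6
 */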
#include "gmxcomplex.h"
#include "gmx_fatal.h"
#include "gmx_wallcycle.h"
#include "gmx_parallel_3dfft.h"
#include "gmx_cyclecounter.h"

/* Single precision, with SSE2 or higher available */
#if defined(GMX_X86_SSE2) && !defined(GMX_DOUBLE)
#include "gmx_x86_sse2.h"
#include "gmx_math_x86_sse2_single.h"
/* Some old AMD processors could have problems with unaligned loads+stores */
#define PME_SSE_UNALIGNED
#include "mpelogging.h"

/* #define PRT_FORCE */
/* conditions for on-the-fly time measurement */
/* #define TAKETIME (step > 1 && timesteps < 10) */
#define TAKETIME FALSE

/* #define PME_TIME_THREADS */

#ifdef GMX_DOUBLE
#define mpi_type MPI_DOUBLE
#else
#define mpi_type MPI_FLOAT
#endif

/* GMX_CACHE_SEP should be a multiple of 16 to preserve alignment */
#define GMX_CACHE_SEP 64
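/* Our reading (not stated in the original): 64 reals is at least one
 * full cache line even in double precision, so thread-local regions
 * padded with GMX_CACHE_SEP elements cannot falsely share a line.
 */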
/* We only define a maximum to be able to use local arrays without allocation.
 * An order larger than 12 should never be needed, even for test cases.
 * If needed it can be changed here.
 */
#define PME_ORDER_MAX 12

/* Internal data structures */

    int *send_id,*recv_id;
    pme_grid_comm_t *comm_data;

    int *n;     /* Cumulative counts of the number of particles per thread */
    int nalloc; /* Allocation size of i */
    int *i;     /* Particle indices ordered on thread index (n) */

    int dimind;             /* The index of the dimension, 0=x, 1=y */
    int *node_dest;         /* The nodes to send x and q to with DD */
    int *node_src;          /* The nodes to receive x and q from with DD */
    int *buf_index;         /* Index for commnode into the buffers */

    int *count;             /* The number of atoms to send to each node */
    int *rcount;            /* The number of atoms to receive */

    gmx_bool bSpread;       /* These coordinates are used for spreading */
    rvec *fractx;           /* Fractional coordinate relative to the
                             * lower cell boundary */
    int *thread_idx;        /* Which thread should spread which charge */
    thread_plist_t *thread_plist;
    splinedata_t *spline;

    ivec ci;     /* The spatial location of this grid */
    ivec n;      /* The size of *grid, including order-1 */
    ivec offset; /* The grid offset from the full node grid */
    int order;   /* PME spreading order */
    real *grid;  /* The local thread's part of the grid, size n */

    pmegrid_t grid;     /* The full node grid (non thread-local) */
    int nthread;        /* The number of threads operating on this grid */
    ivec nc;            /* The local spatial decomposition over the threads */
    pmegrid_t *grid_th; /* Array of grids for each thread */
    int **g2t;          /* The grid to thread index */
    ivec nthread_comm;  /* The number of threads to communicate with */

    /* Masks for SSE aligned spreading and gathering */
    __m128 mask_SSE0[6],mask_SSE1[6];
    int dummy; /* C89 requires that a struct has at least one member */

/* work data for solve_pme */

typedef struct gmx_pme {
    int ndecompdim;         /* The number of decomposition dimensions */
    int nodeid;             /* Our nodeid in mpi->mpi_comm */
    int nnodes;             /* The number of nodes doing PME */
    MPI_Comm mpi_comm_d[2]; /* Indexed on dimension, 0=x, 1=y */
    MPI_Datatype rvec_mpi;  /* the pme vector's MPI type */

    int nthread;            /* The number of threads doing PME */

    gmx_bool bPPnode;       /* Node also does particle-particle forces */
    gmx_bool bFEP;          /* Compute Free energy contribution */
    int nkx,nky,nkz;        /* Grid dimensions */
    gmx_bool bP3M;          /* Do P3M: optimize the influence function */

    pmegrids_t pmegridA;    /* Grids on which we do spreading/interpolation, includes overlap */

    /* The PME charge spreading grid sizes/strides, includes pme_order-1 */
    int pmegrid_nx,pmegrid_ny,pmegrid_nz;
    /* pmegrid_nz might be larger than strictly necessary to ensure
     * memory alignment, pmegrid_nz_base gives the real base size.
     */

    /* The local PME grid starting indices */
    int pmegrid_start_ix,pmegrid_start_iy,pmegrid_start_iz;

    /* Work data for spreading and gathering */
    pme_spline_work_t *spline_work;

    real *fftgridA; /* Grids for FFT. With 1D FFT decomposition this can be a pointer */
    real *fftgridB; /* inside the interpolation grid, but separate for 2D PME decomp. */
    int fftgrid_nx,fftgrid_ny,fftgrid_nz;

    t_complex *cfftgridA; /* Grids for complex FFT data */
    t_complex *cfftgridB;
    int cfftgrid_nx,cfftgrid_ny,cfftgrid_nz;

    gmx_parallel_3dfft_t pfft_setupA;
    gmx_parallel_3dfft_t pfft_setupB;

    real *fshx,*fshy,*fshz;

    pme_atomcomm_t atc[2];     /* Indexed on decomposition index */
    pme_overlap_t overlap[2];  /* Indexed on dimension, 0=x, 1=y */
    pme_atomcomm_t atc_energy; /* Only for gmx_pme_calc_energy */

    rvec *bufv;     /* Communication buffer */
    real *bufr;     /* Communication buffer */
    int buf_nalloc; /* The communication buffer size */

    /* thread local work data for solve_pme */

    /* Work data for PME_redist */
    gmx_bool redist_init;
    int redist_buf_nalloc;

    /* Work data for sum_qgrid */
    real *sum_qgrid_tmp;
    real *sum_qgrid_dd_tmp;

static void calc_interpolation_idx(gmx_pme_t pme,pme_atomcomm_t *atc,
                                   int start,int end,int thread)
    int *idxptr,tix,tiy,tiz;
    real *xptr,*fptr,tx,ty,tz;
    real rxx,ryx,ryy,rzx,rzy,rzz;
    int start_ix,start_iy,start_iz;
    int *g2tx,*g2ty,*g2tz;
    int *thread_idx=NULL;
    thread_plist_t *tpl=NULL;

    start_ix = pme->pmegrid_start_ix;
    start_iy = pme->pmegrid_start_iy;
    start_iz = pme->pmegrid_start_iz;

    rxx = pme->recipbox[XX][XX];
    ryx = pme->recipbox[YY][XX];
    ryy = pme->recipbox[YY][YY];
    rzx = pme->recipbox[ZZ][XX];
    rzy = pme->recipbox[ZZ][YY];
    rzz = pme->recipbox[ZZ][ZZ];

    g2tx = pme->pmegridA.g2t[XX];
    g2ty = pme->pmegridA.g2t[YY];
    g2tz = pme->pmegridA.g2t[ZZ];

    bThreads = (atc->nthread > 1);
        thread_idx = atc->thread_idx;
        tpl = &atc->thread_plist[thread];
        for(i=0; i<atc->nthread; i++)

    for(i=start; i<end; i++) {
        idxptr = atc->idx[i];
        fptr   = atc->fractx[i];

        /* Fractional coordinates along box vectors, add 2.0 to make 100% sure we are positive for triclinic boxes */
        tx = nx * ( xptr[XX] * rxx + xptr[YY] * ryx + xptr[ZZ] * rzx + 2.0 );
        ty = ny * ( xptr[YY] * ryy + xptr[ZZ] * rzy + 2.0 );
        tz = nz * ( xptr[ZZ] * rzz + 2.0 );
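        /* Descriptive note (our reading): the +2.0 only adds the integer
         * 2*n to the scaled coordinate, so the fractional part used below
         * is unchanged and the truncated index lies in [0,5*n). The
         * nnx/nny/nnz and fshx/fshy tables are built with length 5*n in
         * make_gridindex5_to_localindex() to fold exactly this range
         * back onto the local grid.
         */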
        /* Because decomposition only occurs in x and y,
         * we never have a fraction correction in z.
         */
        fptr[XX] = tx - tix + pme->fshx[tix];
        fptr[YY] = ty - tiy + pme->fshy[tiy];

        idxptr[XX] = pme->nnx[tix];
        idxptr[YY] = pme->nny[tiy];
        idxptr[ZZ] = pme->nnz[tiz];

        range_check(idxptr[XX],0,pme->pmegrid_nx);
        range_check(idxptr[YY],0,pme->pmegrid_ny);
        range_check(idxptr[ZZ],0,pme->pmegrid_nz);

            thread_i      = g2tx[idxptr[XX]] + g2ty[idxptr[YY]] + g2tz[idxptr[ZZ]];
            thread_idx[i] = thread_i;

    /* Make a list of particle indices sorted on thread */

    /* Get the cumulative count */
    for(i=1; i<atc->nthread; i++)
        tpl_n[i] += tpl_n[i-1];

    /* The current implementation distributes particles equally
     * over the threads, so we could actually allocate for that
     * in pme_realloc_atomcomm_things.
     */
    if (tpl_n[atc->nthread-1] > tpl->nalloc)
        tpl->nalloc = over_alloc_large(tpl_n[atc->nthread-1]);
        srenew(tpl->i,tpl->nalloc);

    /* Set tpl_n to the cumulative start */
    for(i=atc->nthread-1; i>=1; i--)
        tpl_n[i] = tpl_n[i-1];

    /* Fill our thread local array with indices sorted on thread */
    for(i=start; i<end; i++)
        tpl->i[tpl_n[atc->thread_idx[i]]++] = i;
    /* Now tpl_n contains the cumulative count again */

static void make_thread_local_ind(pme_atomcomm_t *atc,
                                  int thread,splinedata_t *spline)
    /* Combine the indices made by each thread into one index */

    for(t=0; t<atc->nthread; t++)
        tpl = &atc->thread_plist[t];
        /* Copy our part (start - end) from the list of thread t */
            start = tpl->n[thread-1];
        end = tpl->n[thread];
        for(i=start; i<end; i++)
            spline->ind[n++] = tpl->i[i];

static void pme_calc_pidx(int start, int end,
                          matrix recipbox, rvec x[],
                          pme_atomcomm_t *atc, int *count)
    real rxx,ryx,rzx,ryy,rzy;

    /* Calculate PME task index (pidx) for each grid index.
     * Here we always assign equally sized slabs to each node
     * for load balancing reasons (the PME grid spacing is not used).
     */

    /* Reset the count */
    for(i=0; i<nslab; i++)

    if (atc->dimind == 0)
        rxx = recipbox[XX][XX];
        ryx = recipbox[YY][XX];
        rzx = recipbox[ZZ][XX];
        /* Calculate the node index in x-dimension */
        for(i=start; i<end; i++)
            /* Fractional coordinates along box vectors */
            s = nslab*(xptr[XX]*rxx + xptr[YY]*ryx + xptr[ZZ]*rzx);
            si = (int)(s + 2*nslab) % nslab;
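            /* Note (our reading): C's cast to int and '%' both round
             * toward zero, so a negative s would yield a negative slab
             * index; adding the integer 2*nslab before truncation keeps
             * the argument positive and si in [0,nslab).
             */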
        ryy = recipbox[YY][YY];
        rzy = recipbox[ZZ][YY];
        /* Calculate the node index in y-dimension */
        for(i=start; i<end; i++)
            /* Fractional coordinates along box vectors */
            s = nslab*(xptr[YY]*ryy + xptr[ZZ]*rzy);
            si = (int)(s + 2*nslab) % nslab;

static void pme_calc_pidx_wrapper(int natoms, matrix recipbox, rvec x[],
    int nthread,thread,slab;

    nthread = atc->nthread;

#pragma omp parallel for num_threads(nthread) schedule(static)
    for(thread=0; thread<nthread; thread++)
        pme_calc_pidx(natoms* thread   /nthread,
                      natoms*(thread+1)/nthread,
                      recipbox,x,atc,atc->count_thread[thread]);
    /* Non-parallel reduction, since nslab is small */

    for(thread=1; thread<nthread; thread++)
        for(slab=0; slab<atc->nslab; slab++)
            atc->count_thread[0][slab] += atc->count_thread[thread][slab];

static void pme_realloc_splinedata(splinedata_t *spline, pme_atomcomm_t *atc)
    srenew(spline->ind,atc->nalloc);
    /* Initialize the index to identity so it works without threads */
    for(i=0; i<atc->nalloc; i++)

        srenew(spline->theta[d] ,atc->pme_order*atc->nalloc);
        srenew(spline->dtheta[d],atc->pme_order*atc->nalloc);

static void pme_realloc_atomcomm_things(pme_atomcomm_t *atc)
    int nalloc_old,i,j,nalloc_tpl;

    /* We have to avoid a NULL pointer for atc->x to avoid
     * possible fatal errors in MPI routines.
     */
    if (atc->n > atc->nalloc || atc->nalloc == 0)
        nalloc_old = atc->nalloc;
        atc->nalloc = over_alloc_dd(max(atc->n,1));

        if (atc->nslab > 1) {
            srenew(atc->x,atc->nalloc);
            srenew(atc->q,atc->nalloc);
            srenew(atc->f,atc->nalloc);
            for(i=nalloc_old; i<atc->nalloc; i++)
                clear_rvec(atc->f[i]);

            srenew(atc->fractx,atc->nalloc);
            srenew(atc->idx   ,atc->nalloc);

            if (atc->nthread > 1)
                srenew(atc->thread_idx,atc->nalloc);

            for(i=0; i<atc->nthread; i++)
                pme_realloc_splinedata(&atc->spline[i],atc);

static void pmeredist_pd(gmx_pme_t pme, gmx_bool forw,
                         int n, gmx_bool bXF, rvec *x_f, real *charge,
/* Redistribute particle data for PME calculation */
/* domain decomposition by x coordinate */
    if(FALSE == pme->redist_init) {
        snew(pme->scounts,atc->nslab);
        snew(pme->rcounts,atc->nslab);
        snew(pme->sdispls,atc->nslab);
        snew(pme->rdispls,atc->nslab);
        snew(pme->sidx,atc->nslab);
        pme->redist_init = TRUE;

    if (n > pme->redist_buf_nalloc) {
        pme->redist_buf_nalloc = over_alloc_dd(n);
        srenew(pme->redist_buf,pme->redist_buf_nalloc*DIM);

    /* forward, redistribution from pp to pme */

        /* Calculate send counts and exchange them with other nodes */
        for(i=0; (i<atc->nslab); i++) pme->scounts[i]=0;
        for(i=0; (i<n); i++) pme->scounts[pme->idxa[i]]++;
        MPI_Alltoall( pme->scounts, 1, MPI_INT, pme->rcounts, 1, MPI_INT, atc->mpi_comm);

        /* Calculate send and receive displacements and index into send
         * buffer */
        for(i=1; i<atc->nslab; i++) {
            pme->sdispls[i]=pme->sdispls[i-1]+pme->scounts[i-1];
            pme->rdispls[i]=pme->rdispls[i-1]+pme->rcounts[i-1];
            pme->sidx[i]=pme->sdispls[i];

        /* Total # of particles to be received */
        atc->n = pme->rdispls[atc->nslab-1] + pme->rcounts[atc->nslab-1];

        pme_realloc_atomcomm_things(atc);

        /* Copy particle coordinates into send buffer and exchange */
        for(i=0; (i<n); i++) {
            ii=DIM*pme->sidx[pme->idxa[i]];
            pme->sidx[pme->idxa[i]]++;
            pme->redist_buf[ii+XX]=x_f[i][XX];
            pme->redist_buf[ii+YY]=x_f[i][YY];
            pme->redist_buf[ii+ZZ]=x_f[i][ZZ];

        MPI_Alltoallv(pme->redist_buf, pme->scounts, pme->sdispls,
                      pme->rvec_mpi, atc->x, pme->rcounts, pme->rdispls,
                      pme->rvec_mpi, atc->mpi_comm);

        /* Copy charge into send buffer and exchange */
        for(i=0; i<atc->nslab; i++) pme->sidx[i]=pme->sdispls[i];
        for(i=0; (i<n); i++) {
            ii=pme->sidx[pme->idxa[i]];
            pme->sidx[pme->idxa[i]]++;
            pme->redist_buf[ii]=charge[i];

        MPI_Alltoallv(pme->redist_buf, pme->scounts, pme->sdispls, mpi_type,
                      atc->q, pme->rcounts, pme->rdispls, mpi_type,

    else { /* backward, redistribution from pme to pp */
        MPI_Alltoallv(atc->f, pme->rcounts, pme->rdispls, pme->rvec_mpi,
                      pme->redist_buf, pme->scounts, pme->sdispls,
                      pme->rvec_mpi, atc->mpi_comm);

        /* Copy data from receive buffer */
        for(i=0; i<atc->nslab; i++)
            pme->sidx[i] = pme->sdispls[i];
        for(i=0; (i<n); i++) {
            ii = DIM*pme->sidx[pme->idxa[i]];
            x_f[i][XX] += pme->redist_buf[ii+XX];
            x_f[i][YY] += pme->redist_buf[ii+YY];
            x_f[i][ZZ] += pme->redist_buf[ii+ZZ];
            pme->sidx[pme->idxa[i]]++;

static void pme_dd_sendrecv(pme_atomcomm_t *atc,
                            gmx_bool bBackward,int shift,
                            void *buf_s,int nbyte_s,
                            void *buf_r,int nbyte_r)
    if (bBackward == FALSE) {
        dest = atc->node_dest[shift];
        src  = atc->node_src[shift];
        dest = atc->node_src[shift];
        src  = atc->node_dest[shift];

    if (nbyte_s > 0 && nbyte_r > 0) {
        MPI_Sendrecv(buf_s,nbyte_s,MPI_BYTE,
                     buf_r,nbyte_r,MPI_BYTE,
                     atc->mpi_comm,&stat);
    } else if (nbyte_s > 0) {
        MPI_Send(buf_s,nbyte_s,MPI_BYTE,
    } else if (nbyte_r > 0) {
        MPI_Recv(buf_r,nbyte_r,MPI_BYTE,
                 atc->mpi_comm,&stat);

static void dd_pmeredist_x_q(gmx_pme_t pme,
                             int n, gmx_bool bX, rvec *x, real *charge,
    int *commnode,*buf_index;
    int nnodes_comm,i,nsend,local_pos,buf_pos,node,scount,rcount;

    commnode  = atc->node_dest;
    buf_index = atc->buf_index;

    nnodes_comm = min(2*atc->maxshift,atc->nslab-1);

    for(i=0; i<nnodes_comm; i++) {
        buf_index[commnode[i]] = nsend;
        nsend += atc->count[commnode[i]];

        if (atc->count[atc->nodeid] + nsend != n)
            gmx_fatal(FARGS,"%d particles communicated to PME node %d are more than 2/3 times the cut-off out of the domain decomposition cell of their charge group in dimension %c.\n"
                      "This usually means that your system is not well equilibrated.",
                      n - (atc->count[atc->nodeid] + nsend),
                      pme->nodeid,'x'+atc->dimind);

        if (nsend > pme->buf_nalloc) {
            pme->buf_nalloc = over_alloc_dd(nsend);
            srenew(pme->bufv,pme->buf_nalloc);
            srenew(pme->bufr,pme->buf_nalloc);

        atc->n = atc->count[atc->nodeid];
        for(i=0; i<nnodes_comm; i++) {
            scount = atc->count[commnode[i]];
            /* Communicate the count */
                fprintf(debug,"dimind %d PME node %d send to node %d: %d\n",
                        atc->dimind,atc->nodeid,commnode[i],scount);
            pme_dd_sendrecv(atc,FALSE,i,
                            &atc->rcount[i],sizeof(int));
            atc->n += atc->rcount[i];

        pme_realloc_atomcomm_things(atc);

        if (node == atc->nodeid) {
            /* Copy direct to the receive buffer */
                copy_rvec(x[i],atc->x[local_pos]);
            atc->q[local_pos] = charge[i];
            /* Copy to the send buffer */
                copy_rvec(x[i],pme->bufv[buf_index[node]]);
            pme->bufr[buf_index[node]] = charge[i];

    for(i=0; i<nnodes_comm; i++) {
        scount = atc->count[commnode[i]];
        rcount = atc->rcount[i];
        if (scount > 0 || rcount > 0) {
            /* Communicate the coordinates */
            pme_dd_sendrecv(atc,FALSE,i,
                            pme->bufv[buf_pos],scount*sizeof(rvec),
                            atc->x[local_pos],rcount*sizeof(rvec));
            /* Communicate the charges */
            pme_dd_sendrecv(atc,FALSE,i,
                            pme->bufr+buf_pos,scount*sizeof(real),
                            atc->q+local_pos,rcount*sizeof(real));
            local_pos += atc->rcount[i];

static void dd_pmeredist_f(gmx_pme_t pme, pme_atomcomm_t *atc,
    int *commnode,*buf_index;
    int nnodes_comm,local_pos,buf_pos,i,scount,rcount,node;

    commnode  = atc->node_dest;
    buf_index = atc->buf_index;

    nnodes_comm = min(2*atc->maxshift,atc->nslab-1);

    local_pos = atc->count[atc->nodeid];
    for(i=0; i<nnodes_comm; i++) {
        scount = atc->rcount[i];
        rcount = atc->count[commnode[i]];
        if (scount > 0 || rcount > 0) {
            /* Communicate the forces */
            pme_dd_sendrecv(atc,TRUE,i,
                            atc->f[local_pos],scount*sizeof(rvec),
                            pme->bufv[buf_pos],rcount*sizeof(rvec));
        buf_index[commnode[i]] = buf_pos;

        if (node == atc->nodeid)
            /* Add from the local force array */
            rvec_inc(f[i],atc->f[local_pos]);
            /* Add from the receive buffer */
            rvec_inc(f[i],pme->bufv[buf_index[node]]);

        if (node == atc->nodeid)
            /* Copy from the local force array */
            copy_rvec(atc->f[local_pos],f[i]);
            /* Copy from the receive buffer */
            copy_rvec(pme->bufv[buf_index[node]],f[i]);

gmx_sum_qgrid_dd(gmx_pme_t pme, real *grid, int direction)
    pme_overlap_t *overlap;
    int send_index0,send_nindex;
    int recv_index0,recv_nindex;
    int i,j,k,ix,iy,iz,icnt;
    int ipulse,send_id,recv_id,datasize;
    real *sendptr,*recvptr;

    /* Start with minor-rank communication. This is a bit of a pain since it is not contiguous */
    overlap = &pme->overlap[1];

    for(ipulse=0;ipulse<overlap->noverlap_nodes;ipulse++)
        /* Since we have already (un)wrapped the overlap in the z-dimension,
         * we only have to communicate 0 to nkz (not pmegrid_nz).
         */
        if (direction==GMX_SUM_QGRID_FORWARD)
            send_id = overlap->send_id[ipulse];
            recv_id = overlap->recv_id[ipulse];
            send_index0   = overlap->comm_data[ipulse].send_index0;
            send_nindex   = overlap->comm_data[ipulse].send_nindex;
            recv_index0   = overlap->comm_data[ipulse].recv_index0;
            recv_nindex   = overlap->comm_data[ipulse].recv_nindex;
            send_id = overlap->recv_id[ipulse];
            recv_id = overlap->send_id[ipulse];
            send_index0   = overlap->comm_data[ipulse].recv_index0;
            send_nindex   = overlap->comm_data[ipulse].recv_nindex;
            recv_index0   = overlap->comm_data[ipulse].send_index0;
            recv_nindex   = overlap->comm_data[ipulse].send_nindex;

        /* Copy data to contiguous send buffer */
            fprintf(debug,"PME send node %d %d -> %d grid start %d Communicating %d to %d\n",
                    pme->nodeid,overlap->nodeid,send_id,
                    pme->pmegrid_start_iy,
                    send_index0-pme->pmegrid_start_iy,
                    send_index0-pme->pmegrid_start_iy+send_nindex);
        for(i=0;i<pme->pmegrid_nx;i++)
            for(j=0;j<send_nindex;j++)
                iy = j + send_index0 - pme->pmegrid_start_iy;
                for(k=0;k<pme->nkz;k++)
                    overlap->sendbuf[icnt++] = grid[ix*(pme->pmegrid_ny*pme->pmegrid_nz)+iy*(pme->pmegrid_nz)+iz];

        datasize = pme->pmegrid_nx * pme->nkz;

        MPI_Sendrecv(overlap->sendbuf,send_nindex*datasize,GMX_MPI_REAL,
                     overlap->recvbuf,recv_nindex*datasize,GMX_MPI_REAL,
                     overlap->mpi_comm,&stat);

        /* Get data from contiguous recv buffer */
            fprintf(debug,"PME recv node %d %d <- %d grid start %d Communicating %d to %d\n",
                    pme->nodeid,overlap->nodeid,recv_id,
                    pme->pmegrid_start_iy,
                    recv_index0-pme->pmegrid_start_iy,
                    recv_index0-pme->pmegrid_start_iy+recv_nindex);
        for(i=0;i<pme->pmegrid_nx;i++)
            for(j=0;j<recv_nindex;j++)
                iy = j + recv_index0 - pme->pmegrid_start_iy;
                for(k=0;k<pme->nkz;k++)
                    if(direction==GMX_SUM_QGRID_FORWARD)
                        grid[ix*(pme->pmegrid_ny*pme->pmegrid_nz)+iy*(pme->pmegrid_nz)+iz] += overlap->recvbuf[icnt++];
                        grid[ix*(pme->pmegrid_ny*pme->pmegrid_nz)+iy*(pme->pmegrid_nz)+iz] = overlap->recvbuf[icnt++];

    /* Major dimension is easier, no copying required,
     * but we might have to sum to separate array.
     * Since we don't copy, we have to communicate up to pmegrid_nz,
     * not nkz as for the minor direction.
     */
    overlap = &pme->overlap[0];

    for(ipulse=0;ipulse<overlap->noverlap_nodes;ipulse++)
        if(direction==GMX_SUM_QGRID_FORWARD)
            send_id = overlap->send_id[ipulse];
            recv_id = overlap->recv_id[ipulse];
            send_index0   = overlap->comm_data[ipulse].send_index0;
            send_nindex   = overlap->comm_data[ipulse].send_nindex;
            recv_index0   = overlap->comm_data[ipulse].recv_index0;
            recv_nindex   = overlap->comm_data[ipulse].recv_nindex;
            recvptr = overlap->recvbuf;
            send_id = overlap->recv_id[ipulse];
            recv_id = overlap->send_id[ipulse];
            send_index0   = overlap->comm_data[ipulse].recv_index0;
            send_nindex   = overlap->comm_data[ipulse].recv_nindex;
            recv_index0   = overlap->comm_data[ipulse].send_index0;
            recv_nindex   = overlap->comm_data[ipulse].send_nindex;
            recvptr = grid + (recv_index0-pme->pmegrid_start_ix)*(pme->pmegrid_ny*pme->pmegrid_nz);

        sendptr  = grid + (send_index0-pme->pmegrid_start_ix)*(pme->pmegrid_ny*pme->pmegrid_nz);
        datasize = pme->pmegrid_ny * pme->pmegrid_nz;

            fprintf(debug,"PME send node %d %d -> %d grid start %d Communicating %d to %d\n",
                    pme->nodeid,overlap->nodeid,send_id,
                    pme->pmegrid_start_ix,
                    send_index0-pme->pmegrid_start_ix,
                    send_index0-pme->pmegrid_start_ix+send_nindex);
            fprintf(debug,"PME recv node %d %d <- %d grid start %d Communicating %d to %d\n",
                    pme->nodeid,overlap->nodeid,recv_id,
                    pme->pmegrid_start_ix,
                    recv_index0-pme->pmegrid_start_ix,
                    recv_index0-pme->pmegrid_start_ix+recv_nindex);

        MPI_Sendrecv(sendptr,send_nindex*datasize,GMX_MPI_REAL,
                     recvptr,recv_nindex*datasize,GMX_MPI_REAL,
                     overlap->mpi_comm,&stat);

        /* ADD data from contiguous recv buffer */
        if(direction==GMX_SUM_QGRID_FORWARD)
            p = grid + (recv_index0-pme->pmegrid_start_ix)*(pme->pmegrid_ny*pme->pmegrid_nz);
            for(i=0;i<recv_nindex*datasize;i++)
                p[i] += overlap->recvbuf[i];

copy_pmegrid_to_fftgrid(gmx_pme_t pme, real *pmegrid, real *fftgrid)
    ivec local_fft_ndata,local_fft_offset,local_fft_size;
    ivec local_pme_size;

    /* Dimensions should be identical for A/B grid, so we just use A here */
    gmx_parallel_3dfft_real_limits(pme->pfft_setupA,

    local_pme_size[0] = pme->pmegrid_nx;
    local_pme_size[1] = pme->pmegrid_ny;
    local_pme_size[2] = pme->pmegrid_nz;

    /* The fftgrid is always 'justified' to the lower-left corner of the PME grid,
     * the offset is identical, and the PME grid always has more data (due to overlap)
     */
        char fn[STRLEN],format[STRLEN];

        sprintf(fn,"pmegrid%d.pdb",pme->nodeid);
        fp = ffopen(fn,"w");
        sprintf(fn,"pmegrid%d.txt",pme->nodeid);
        fp2 = ffopen(fn,"w");
        sprintf(format,"%s%s\n",pdbformat,"%6.2f%6.2f");

    for(ix=0;ix<local_fft_ndata[XX];ix++)
        for(iy=0;iy<local_fft_ndata[YY];iy++)
            for(iz=0;iz<local_fft_ndata[ZZ];iz++)
                pmeidx = ix*(local_pme_size[YY]*local_pme_size[ZZ])+iy*(local_pme_size[ZZ])+iz;
                fftidx = ix*(local_fft_size[YY]*local_fft_size[ZZ])+iy*(local_fft_size[ZZ])+iz;
                fftgrid[fftidx] = pmegrid[pmeidx];

                val = 100*pmegrid[pmeidx];
                if (pmegrid[pmeidx] != 0)
                    fprintf(fp,format,"ATOM",pmeidx,"CA","GLY",' ',pmeidx,' ',
                            5.0*ix,5.0*iy,5.0*iz,1.0,val);
                if (pmegrid[pmeidx] != 0)
                    fprintf(fp2,"%-12s %5d %5d %5d %12.5e\n",
                            pme->pmegrid_start_ix + ix,
                            pme->pmegrid_start_iy + iy,
                            pme->pmegrid_start_iz + iz,

static gmx_cycles_t omp_cyc_start()
    return gmx_cycles_read();

static gmx_cycles_t omp_cyc_end(gmx_cycles_t c)
    return gmx_cycles_read() - c;

copy_fftgrid_to_pmegrid(gmx_pme_t pme, const real *fftgrid, real *pmegrid,
                        int nthread,int thread)
    ivec local_fft_ndata,local_fft_offset,local_fft_size;
    ivec local_pme_size;
    int ixy0,ixy1,ixy,ix,iy,iz;
#ifdef PME_TIME_THREADS
    static double cs1=0;

#ifdef PME_TIME_THREADS
    c1 = omp_cyc_start();

    /* Dimensions should be identical for A/B grid, so we just use A here */
    gmx_parallel_3dfft_real_limits(pme->pfft_setupA,

    local_pme_size[0] = pme->pmegrid_nx;
    local_pme_size[1] = pme->pmegrid_ny;
    local_pme_size[2] = pme->pmegrid_nz;

    /* The fftgrid is always 'justified' to the lower-left corner of the PME grid,
     * the offset is identical, and the PME grid always has more data (due to overlap)
     */
    ixy0 = ((thread  )*local_fft_ndata[XX]*local_fft_ndata[YY])/nthread;
    ixy1 = ((thread+1)*local_fft_ndata[XX]*local_fft_ndata[YY])/nthread;

    for(ixy=ixy0;ixy<ixy1;ixy++)
        ix = ixy/local_fft_ndata[YY];
        iy = ixy - ix*local_fft_ndata[YY];

        pmeidx = (ix*local_pme_size[YY] + iy)*local_pme_size[ZZ];
        fftidx = (ix*local_fft_size[YY] + iy)*local_fft_size[ZZ];
        for(iz=0;iz<local_fft_ndata[ZZ];iz++)
            pmegrid[pmeidx+iz] = fftgrid[fftidx+iz];

#ifdef PME_TIME_THREADS
    c1 = omp_cyc_end(c1);
        printf("copy %.2f\n",cs1*1e-9);

wrap_periodic_pmegrid(gmx_pme_t pme, real *pmegrid)
    int nx,ny,nz,pnx,pny,pnz,ny_x,overlap,ix,iy,iz;

    pnx = pme->pmegrid_nx;
    pny = pme->pmegrid_ny;
    pnz = pme->pmegrid_nz;

    overlap = pme->pme_order - 1;

    /* Add periodic overlap in z */
    for(ix=0; ix<pme->pmegrid_nx; ix++)
        for(iy=0; iy<pme->pmegrid_ny; iy++)
            for(iz=0; iz<overlap; iz++)
                pmegrid[(ix*pny+iy)*pnz+iz] +=
                    pmegrid[(ix*pny+iy)*pnz+nz+iz];

    if (pme->nnodes_minor == 1)
        for(ix=0; ix<pme->pmegrid_nx; ix++)
            for(iy=0; iy<overlap; iy++)
                for(iz=0; iz<nz; iz++)
                    pmegrid[(ix*pny+iy)*pnz+iz] +=
                        pmegrid[(ix*pny+ny+iy)*pnz+iz];

    if (pme->nnodes_major == 1)
        ny_x = (pme->nnodes_minor == 1 ? ny : pme->pmegrid_ny);

        for(ix=0; ix<overlap; ix++)
            for(iy=0; iy<ny_x; iy++)
                for(iz=0; iz<nz; iz++)
                    pmegrid[(ix*pny+iy)*pnz+iz] +=
                        pmegrid[((nx+ix)*pny+iy)*pnz+iz];

unwrap_periodic_pmegrid(gmx_pme_t pme, real *pmegrid)
    int nx,ny,nz,pnx,pny,pnz,ny_x,overlap,ix;

    pnx = pme->pmegrid_nx;
    pny = pme->pmegrid_ny;
    pnz = pme->pmegrid_nz;

    overlap = pme->pme_order - 1;

    if (pme->nnodes_major == 1)
        ny_x = (pme->nnodes_minor == 1 ? ny : pme->pmegrid_ny);

        for(ix=0; ix<overlap; ix++)
            for(iy=0; iy<ny_x; iy++)
                for(iz=0; iz<nz; iz++)
                    pmegrid[((nx+ix)*pny+iy)*pnz+iz] =
                        pmegrid[(ix*pny+iy)*pnz+iz];

    if (pme->nnodes_minor == 1)
#pragma omp parallel for num_threads(pme->nthread) schedule(static)
        for(ix=0; ix<pme->pmegrid_nx; ix++)
            for(iy=0; iy<overlap; iy++)
                for(iz=0; iz<nz; iz++)
                    pmegrid[(ix*pny+ny+iy)*pnz+iz] =
                        pmegrid[(ix*pny+iy)*pnz+iz];

    /* Copy periodic overlap in z */
#pragma omp parallel for num_threads(pme->nthread) schedule(static)
    for(ix=0; ix<pme->pmegrid_nx; ix++)
        for(iy=0; iy<pme->pmegrid_ny; iy++)
            for(iz=0; iz<overlap; iz++)
                pmegrid[(ix*pny+iy)*pnz+nz+iz] =
                    pmegrid[(ix*pny+iy)*pnz+iz];

static void clear_grid(int nx,int ny,int nz,real *grid,
                       int fx,int fy,int fz,
    int fsx,fsy,fsz,gx,gy,gz,g0x,g0y,x,y,z;

    nc  = 2 + (order - 2)/FLBS;
    ncz = 2 + (order - 2)/FLBSZ;

    for(fsx=fx; fsx<fx+nc; fsx++)
        for(fsy=fy; fsy<fy+nc; fsy++)
            for(fsz=fz; fsz<fz+ncz; fsz++)
                flind = (fsx*fs[YY] + fsy)*fs[ZZ] + fsz;
                if (flag[flind] == 0)
                    g0x = (gx*ny + gy)*nz + gz;
                    for(x=0; x<FLBS; x++)
                        for(y=0; y<FLBS; y++)
                            for(z=0; z<FLBSZ; z++)

/* This has to be a macro to enable full compiler optimization with xlC (and probably others too) */
#define DO_BSPLINE(order) \
    for(ithx=0; (ithx<order); ithx++) \
        index_x = (i0+ithx)*pny*pnz; \
        valx    = qn*thx[ithx]; \
        for(ithy=0; (ithy<order); ithy++) \
            valxy    = valx*thy[ithy]; \
            index_xy = index_x+(j0+ithy)*pnz; \
            for(ithz=0; (ithz<order); ithz++) \
                index_xyz        = index_xy+(k0+ithz); \
                grid[index_xyz] += valxy*thz[ithz]; \

static void spread_q_bsplines_thread(pmegrid_t *pmegrid,
                                     pme_atomcomm_t *atc, splinedata_t *spline,
                                     pme_spline_work_t *work)
    /* spread charges from home atoms to local grid */
    int b,i,nn,n,ithx,ithy,ithz,i0,j0,k0;
    int order,norder,index_x,index_xy,index_xyz;
    real *thx,*thy,*thz;
    int localsize, bndsize;
    int pnx,pny,pnz,ndatatot;

    pnx = pmegrid->n[XX];
    pny = pmegrid->n[YY];
    pnz = pmegrid->n[ZZ];

    offx = pmegrid->offset[XX];
    offy = pmegrid->offset[YY];
    offz = pmegrid->offset[ZZ];

    ndatatot = pnx*pny*pnz;
    grid = pmegrid->grid;
    for(i=0;i<ndatatot;i++)

    order = pmegrid->order;

    for(nn=0; nn<spline->n; nn++)
        n = spline->ind[nn];
            idxptr = atc->idx[n];

            i0 = idxptr[XX] - offx;
            j0 = idxptr[YY] - offy;
            k0 = idxptr[ZZ] - offz;

            thx = spline->theta[XX] + norder;
            thy = spline->theta[YY] + norder;
            thz = spline->theta[ZZ] + norder;

#ifdef PME_SSE_UNALIGNED
#define PME_SPREAD_SSE_ORDER4
#define PME_SPREAD_SSE_ALIGNED
#include "pme_sse_single.h"
#define PME_SPREAD_SSE_ALIGNED
#include "pme_sse_single.h"

static void set_grid_alignment(int *pmegrid_nz,int pme_order)
#ifndef PME_SSE_UNALIGNED
    /* Round nz up to a multiple of 4 to ensure alignment */
    *pmegrid_nz = ((*pmegrid_nz + 3) & ~3);

static void set_gridsize_alignment(int *gridsize,int pme_order)
#ifndef PME_SSE_UNALIGNED
    /* Add extra elements to ensure that aligned operations do not go
     * beyond the allocated grid size.
     * Note that for pme_order=5, the pme grid z-size alignment
     * ensures that we will not go beyond the grid size.
     */

static void pmegrid_init(pmegrid_t *grid,
                         int cx, int cy, int cz,
                         int x0, int y0, int z0,
                         int x1, int y1, int z1,
                         gmx_bool set_alignment,
    grid->offset[XX] = x0;
    grid->offset[YY] = y0;
    grid->offset[ZZ] = z0;
    grid->n[XX]      = x1 - x0 + pme_order - 1;
    grid->n[YY]      = y1 - y0 + pme_order - 1;
    grid->n[ZZ]      = z1 - z0 + pme_order - 1;

        set_grid_alignment(&nz,pme_order);
    else if (nz != grid->n[ZZ])
        gmx_incons("pmegrid_init call with an unaligned z size");

    grid->order = pme_order;
        gridsize = grid->n[XX]*grid->n[YY]*grid->n[ZZ];
        set_gridsize_alignment(&gridsize,pme_order);
        snew_aligned(grid->grid,gridsize,16);

static int div_round_up(int enumerator,int denominator)
    return (enumerator + denominator - 1)/denominator;
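/* e.g. div_round_up(10,4) == 3: ceiling division for positive ints */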
static void make_subgrid_division(const ivec n,int ovl,int nthread,
    int gsize_opt,gsize;

    for(nsx=1; nsx<=nthread; nsx++)
        if (nthread % nsx == 0)
            for(nsy=1; nsy<=nthread; nsy++)
                if (nsx*nsy <= nthread && nthread % (nsx*nsy) == 0)
                    nsz = nthread/(nsx*nsy);

                    /* Determine the number of grid points per thread */
                    (div_round_up(n[XX],nsx) + ovl)*
                    (div_round_up(n[YY],nsy) + ovl)*
                    (div_round_up(n[ZZ],nsz) + ovl);
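                    /* Worked example (ours, not from the original): for
                     * n=(32,32,32), ovl=3, nthread=6, the 6x1x1 division
                     * gives (6+3)*(32+3)*(32+3) = 11025 points per thread,
                     * while 3x2x1 gives (11+3)*(16+3)*(32+3) = 9310, so
                     * 3x2x1 is selected by the test below.
                     */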
                    /* Minimize the number of grid points per thread
                     * and, secondarily, the number of cuts in minor dimensions.
                     */
                    if (gsize_opt == -1 ||
                        gsize < gsize_opt ||
                        (gsize == gsize_opt &&
                         (nsz < nsub[ZZ] || (nsz == nsub[ZZ] && nsy < nsub[YY]))))
    env = getenv("GMX_PME_THREAD_DIVISION");
        sscanf(env,"%d %d %d",&nsub[XX],&nsub[YY],&nsub[ZZ]);

    if (nsub[XX]*nsub[YY]*nsub[ZZ] != nthread)
        gmx_fatal(FARGS,"PME grid thread division (%d x %d x %d) does not match the total number of threads (%d)",nsub[XX],nsub[YY],nsub[ZZ],nthread);

static void pmegrids_init(pmegrids_t *grids,
                          int nx,int ny,int nz,int nz_base,
    ivec n,n_base,g0,g1;
    int t,x,y,z,d,i,tfac;

    n[XX] = nx - (pme_order - 1);
    n[YY] = ny - (pme_order - 1);
    n[ZZ] = nz - (pme_order - 1);

    copy_ivec(n,n_base);
    n_base[ZZ] = nz_base;

    pmegrid_init(&grids->grid,0,0,0,0,0,0,n[XX],n[YY],n[ZZ],FALSE,pme_order,

    grids->nthread = nthread;

    make_subgrid_division(n_base,pme_order-1,grids->nthread,grids->nc);

    if (grids->nthread > 1)
        for(d=0; d<DIM; d++)
            nst[d] = div_round_up(n[d],grids->nc[d]) + pme_order - 1;
        set_grid_alignment(&nst[ZZ],pme_order);

            fprintf(debug,"pmegrid thread local division: %d x %d x %d\n",
                    grids->nc[XX],grids->nc[YY],grids->nc[ZZ]);
            fprintf(debug,"pmegrid %d %d %d max thread pmegrid %d %d %d\n",
                    nst[XX],nst[YY],nst[ZZ]);

        snew(grids->grid_th,grids->nthread);
        gridsize = nst[XX]*nst[YY]*nst[ZZ];
        set_gridsize_alignment(&gridsize,pme_order);
        snew_aligned(grid_all,
                     grids->nthread*gridsize+(grids->nthread+1)*GMX_CACHE_SEP,

        for(x=0; x<grids->nc[XX]; x++)
            for(y=0; y<grids->nc[YY]; y++)
                for(z=0; z<grids->nc[ZZ]; z++)
                    pmegrid_init(&grids->grid_th[t],
                                 (n[XX]*(x  ))/grids->nc[XX],
                                 (n[YY]*(y  ))/grids->nc[YY],
                                 (n[ZZ]*(z  ))/grids->nc[ZZ],
                                 (n[XX]*(x+1))/grids->nc[XX],
                                 (n[YY]*(y+1))/grids->nc[YY],
                                 (n[ZZ]*(z+1))/grids->nc[ZZ],
                                 grid_all+GMX_CACHE_SEP+t*(gridsize+GMX_CACHE_SEP));

    snew(grids->g2t,DIM);
    for(d=DIM-1; d>=0; d--)
        snew(grids->g2t[d],n[d]);
        for(i=0; i<n[d]; i++)
            /* The second check should match the parameters
             * of the pmegrid_init call above.
             */
            while (t + 1 < grids->nc[d] && i >= (n[d]*(t+1))/grids->nc[d])
            grids->g2t[d][i] = t*tfac;
        tfac *= grids->nc[d];

        case XX: max_comm_lines = overlap_x;     break;
        case YY: max_comm_lines = overlap_y;     break;
        case ZZ: max_comm_lines = pme_order - 1; break;
        grids->nthread_comm[d] = 0;
        while ((n[d]*grids->nthread_comm[d])/grids->nc[d] < max_comm_lines)
            grids->nthread_comm[d]++;
            fprintf(debug,"pmegrid thread grid communication range in %c: %d\n",
                    'x'+d,grids->nthread_comm[d]);
        /* It should be possible to make grids->nthread_comm[d]==grids->nc[d]
         * work, but this is not a problematic restriction.
         */
        if (grids->nc[d] > 1 && grids->nthread_comm[d] > grids->nc[d])
            gmx_fatal(FARGS,"Too many threads for PME (%d) compared to the number of grid lines, reduce the number of threads doing PME",grids->nthread);

static void pmegrids_destroy(pmegrids_t *grids)
    if (grids->grid.grid != NULL)
        sfree(grids->grid.grid);

    if (grids->nthread > 0)
        for(t=0; t<grids->nthread; t++)
            sfree(grids->grid_th[t].grid);
        sfree(grids->grid_th);

static void realloc_work(pme_work_t *work,int nkx)
    if (nkx > work->nalloc)
        srenew(work->mhx  ,work->nalloc);
        srenew(work->mhy  ,work->nalloc);
        srenew(work->mhz  ,work->nalloc);
        srenew(work->m2   ,work->nalloc);
        /* Allocate an aligned pointer for SSE operations, including 3 extra
         * elements at the end since SSE operates on 4 elements at a time.
         */
        sfree_aligned(work->denom);
        sfree_aligned(work->tmp1);
        sfree_aligned(work->eterm);
        snew_aligned(work->denom,work->nalloc+3,16);
        snew_aligned(work->tmp1 ,work->nalloc+3,16);
        snew_aligned(work->eterm,work->nalloc+3,16);
        srenew(work->m2inv,work->nalloc);

static void free_work(pme_work_t *work)
    sfree_aligned(work->denom);
    sfree_aligned(work->tmp1);
    sfree_aligned(work->eterm);

/* Calculate exponentials through SSE in float precision */
inline static void calc_exponentials(int start, int end, real f, real *d_aligned, real *r_aligned, real *e_aligned)
    const __m128 two = _mm_set_ps(2.0f,2.0f,2.0f,2.0f);
    __m128 tmp_d1,d_inv,tmp_r,tmp_e;

    f_sse = _mm_load1_ps(&f);
    for(kx=0; kx<end; kx+=4)
        tmp_d1 = _mm_load_ps(d_aligned+kx);
        lu     = _mm_rcp_ps(tmp_d1);
        d_inv  = _mm_mul_ps(lu,_mm_sub_ps(two,_mm_mul_ps(lu,tmp_d1)));
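        /* One Newton-Raphson step on the ~12-bit _mm_rcp_ps estimate:
         * x1 = x0*(2 - x0*d) refines 1/d to roughly single precision.
         */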
        tmp_r = _mm_load_ps(r_aligned+kx);
        tmp_r = gmx_mm_exp_ps(tmp_r);
        tmp_e = _mm_mul_ps(f_sse,d_inv);
        tmp_e = _mm_mul_ps(tmp_e,tmp_r);
        _mm_store_ps(e_aligned+kx,tmp_e);

inline static void calc_exponentials(int start, int end, real f, real *d, real *r, real *e)
    for(kx=start; kx<end; kx++)
    for(kx=start; kx<end; kx++)
    for(kx=start; kx<end; kx++)
        e[kx] = f*r[kx]*d[kx];

static int solve_pme_yzx(gmx_pme_t pme,t_complex *grid,
                         real ewaldcoeff,real vol,
                         int nthread,int thread)
    /* do recip sum over local cells in grid */
    /* y major, z middle, x minor or continuous */
    int kx,ky,kz,maxkx,maxky,maxkz;
    int nx,ny,nz,iyz0,iyz1,iyz,iy,iz,kxstart,kxend;
    real factor=M_PI*M_PI/(ewaldcoeff*ewaldcoeff);
    real ets2,struct2,vfactor,ets2vf;
    real d1,d2,energy=0;
    real virxx=0,virxy=0,virxz=0,viryy=0,viryz=0,virzz=0;
    real rxx,ryx,ryy,rzx,rzy,rzz;
    real *mhx,*mhy,*mhz,*m2,*denom,*tmp1,*eterm,*m2inv;
    real mhxk,mhyk,mhzk,m2k;
    ivec local_ndata,local_offset,local_size;

    elfac = ONE_4PI_EPS0/pme->epsilon_r;

    /* Dimensions should be identical for A/B grid, so we just use A here */
    gmx_parallel_3dfft_complex_limits(pme->pfft_setupA,

    rxx = pme->recipbox[XX][XX];
    ryx = pme->recipbox[YY][XX];
    ryy = pme->recipbox[YY][YY];
    rzx = pme->recipbox[ZZ][XX];
    rzy = pme->recipbox[ZZ][YY];
    rzz = pme->recipbox[ZZ][ZZ];

    work = &pme->work[thread];

    denom = work->denom;
    eterm = work->eterm;
    m2inv = work->m2inv;

    iyz0 = local_ndata[YY]*local_ndata[ZZ]* thread   /nthread;
    iyz1 = local_ndata[YY]*local_ndata[ZZ]*(thread+1)/nthread;

    for(iyz=iyz0; iyz<iyz1; iyz++)
        iy = iyz/local_ndata[ZZ];
        iz = iyz - iy*local_ndata[ZZ];

        ky = iy + local_offset[YY];
        by = M_PI*vol*pme->bsp_mod[YY][ky];

        kz = iz + local_offset[ZZ];
        bz = pme->bsp_mod[ZZ][kz];

        /* 0.5 correction for corner points */
        if (kz == 0 || kz == (nz+1)/2)

        p0 = grid + iy*local_size[ZZ]*local_size[XX] + iz*local_size[XX];

        /* We should skip the k-space point (0,0,0) */
        if (local_offset[XX] > 0 || ky > 0 || kz > 0)
            kxstart = local_offset[XX];
            kxstart = local_offset[XX] + 1;
        kxend = local_offset[XX] + local_ndata[XX];

            /* More expensive inner loop, especially because of the storage
             * of the mh elements in arrays.
             * Because x is the minor grid index, all mh elements
             * depend on kx for triclinic unit cells.
             */

            /* Two explicit loops to avoid a conditional inside the loop */
            for(kx=kxstart; kx<maxkx; kx++)
                mhyk      = mx * ryx + my * ryy;
                mhzk      = mx * rzx + my * rzy + mz * rzz;
                m2k       = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
                denom[kx] = m2k*bz*by*pme->bsp_mod[XX][kx];
                tmp1[kx]  = -factor*m2k;

            for(kx=maxkx; kx<kxend; kx++)
                mhyk      = mx * ryx + my * ryy;
                mhzk      = mx * rzx + my * rzy + mz * rzz;
                m2k       = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
                denom[kx] = m2k*bz*by*pme->bsp_mod[XX][kx];
                tmp1[kx]  = -factor*m2k;

            for(kx=kxstart; kx<kxend; kx++)
                m2inv[kx] = 1.0/m2[kx];

            calc_exponentials(kxstart,kxend,elfac,denom,tmp1,eterm);

            for(kx=kxstart; kx<kxend; kx++,p0++)
                p0->re = d1*eterm[kx];
                p0->im = d2*eterm[kx];

                struct2 = 2.0*(d1*d1+d2*d2);

                tmp1[kx] = eterm[kx]*struct2;

            for(kx=kxstart; kx<kxend; kx++)
                ets2    = corner_fac*tmp1[kx];
                vfactor = (factor*m2[kx] + 1.0)*2.0*m2inv[kx];

                ets2vf  = ets2*vfactor;
                virxx  += ets2vf*mhx[kx]*mhx[kx] - ets2;
                virxy  += ets2vf*mhx[kx]*mhy[kx];
                virxz  += ets2vf*mhx[kx]*mhz[kx];
                viryy  += ets2vf*mhy[kx]*mhy[kx] - ets2;
                viryz  += ets2vf*mhy[kx]*mhz[kx];
                virzz  += ets2vf*mhz[kx]*mhz[kx] - ets2;

            /* We don't need to calculate the energy and the virial.
             * In this case the triclinic overhead is small.
             */

            /* Two explicit loops to avoid a conditional inside the loop */
            for(kx=kxstart; kx<maxkx; kx++)
                mhyk      = mx * ryx + my * ryy;
                mhzk      = mx * rzx + my * rzy + mz * rzz;
                m2k       = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
                denom[kx] = m2k*bz*by*pme->bsp_mod[XX][kx];
                tmp1[kx]  = -factor*m2k;

            for(kx=maxkx; kx<kxend; kx++)
                mhyk      = mx * ryx + my * ryy;
                mhzk      = mx * rzx + my * rzy + mz * rzz;
                m2k       = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
                denom[kx] = m2k*bz*by*pme->bsp_mod[XX][kx];
                tmp1[kx]  = -factor*m2k;

            calc_exponentials(kxstart,kxend,elfac,denom,tmp1,eterm);

            for(kx=kxstart; kx<kxend; kx++,p0++)
                p0->re = d1*eterm[kx];
                p0->im = d2*eterm[kx];

    /* Update virial with local values.
     * The virial is symmetric by definition.
     * this virial seems ok for isotropic scaling, but I'm
     * experiencing problems on semiisotropic membranes.
     * IS THAT COMMENT STILL VALID??? (DvdS, 2001/02/07).
     */
    work->vir[XX][XX] = 0.25*virxx;
    work->vir[YY][YY] = 0.25*viryy;
    work->vir[ZZ][ZZ] = 0.25*virzz;
    work->vir[XX][YY] = work->vir[YY][XX] = 0.25*virxy;
    work->vir[XX][ZZ] = work->vir[ZZ][XX] = 0.25*virxz;
    work->vir[YY][ZZ] = work->vir[ZZ][YY] = 0.25*viryz;

    /* This energy should be corrected for a charged system */
    work->energy = 0.5*energy;

    /* Return the loop count */
    return local_ndata[YY]*local_ndata[XX];

static void get_pme_ener_vir(const gmx_pme_t pme,int nthread,
                             real *mesh_energy,matrix vir)
    /* This function sums output over threads
     * and should therefore only be called after thread synchronization.
     */
    *mesh_energy = pme->work[0].energy;
    copy_mat(pme->work[0].vir,vir);

    for(thread=1; thread<nthread; thread++)
        *mesh_energy += pme->work[thread].energy;
        m_add(vir,pme->work[thread].vir,vir);

#define DO_FSPLINE(order) \
    for(ithx=0; (ithx<order); ithx++) \
        index_x = (i0+ithx)*pny*pnz; \
        for(ithy=0; (ithy<order); ithy++) \
            index_xy = index_x+(j0+ithy)*pnz; \
            for(ithz=0; (ithz<order); ithz++) \
                gval  = grid[index_xy+(k0+ithz)]; \
                fxy1 += thz[ithz]*gval; \
                fz1  += dthz[ithz]*gval; \

static void gather_f_bsplines(gmx_pme_t pme,real *grid,
                              gmx_bool bClearF,pme_atomcomm_t *atc,
                              splinedata_t *spline,
    /* sum forces for local particles */
    int nn,n,ithx,ithy,ithz,i0,j0,k0;
    int index_x,index_xy;
    int nx,ny,nz,pnx,pny,pnz;
    real tx,ty,dx,dy,qn;
    real *thx,*thy,*thz,*dthx,*dthy,*dthz;
    real rxx,ryx,ryy,rzx,rzy,rzz;
    pme_spline_work_t *work;

    work = pme->spline_work;

    order = pme->pme_order;
    thx   = spline->theta[XX];
    thy   = spline->theta[YY];
    thz   = spline->theta[ZZ];
    dthx  = spline->dtheta[XX];
    dthy  = spline->dtheta[YY];
    dthz  = spline->dtheta[ZZ];

    pnx = pme->pmegrid_nx;
    pny = pme->pmegrid_ny;
    pnz = pme->pmegrid_nz;

    rxx = pme->recipbox[XX][XX];
    ryx = pme->recipbox[YY][XX];
    ryy = pme->recipbox[YY][YY];
    rzx = pme->recipbox[ZZ][XX];
    rzy = pme->recipbox[ZZ][YY];
    rzz = pme->recipbox[ZZ][ZZ];

    for(nn=0; nn<spline->n; nn++)
        n  = spline->ind[nn];
        qn = scale*atc->q[n];
            idxptr = atc->idx[n];

            /* Pointer arithmetic alert, next six statements */
            thx  = spline->theta[XX] + norder;
            thy  = spline->theta[YY] + norder;
            thz  = spline->theta[ZZ] + norder;
            dthx = spline->dtheta[XX] + norder;
            dthy = spline->dtheta[YY] + norder;
            dthz = spline->dtheta[ZZ] + norder;

#ifdef PME_SSE_UNALIGNED
#define PME_GATHER_F_SSE_ORDER4
#define PME_GATHER_F_SSE_ALIGNED
#include "pme_sse_single.h"
#define PME_GATHER_F_SSE_ALIGNED
#include "pme_sse_single.h"

            atc->f[n][XX] += -qn*( fx*nx*rxx );
            atc->f[n][YY] += -qn*( fx*nx*ryx + fy*ny*ryy );
            atc->f[n][ZZ] += -qn*( fx*nx*rzx + fy*ny*rzy + fz*nz*rzz );

    /* Since the energy and not forces are interpolated
     * the net force might not be exactly zero.
     * This can be solved by also interpolating F, but
     * that comes at a cost.
     * A better hack is to remove the net force every
     * step, but that must be done at a higher level
     * since this routine doesn't see all atoms if running
     * in parallel. Don't know how important it is?  EL 990726
     */

static real gather_energy_bsplines(gmx_pme_t pme,real *grid,
                                   pme_atomcomm_t *atc)
    splinedata_t *spline;
    int n,ithx,ithy,ithz,i0,j0,k0;
    int index_x,index_xy;
    real energy,pot,tx,ty,qn,gval;
    real *thx,*thy,*thz;

    spline = &atc->spline[0];

    order = pme->pme_order;

    for(n=0; (n<atc->n); n++) {
        idxptr = atc->idx[n];

        /* Pointer arithmetic alert, next three statements */
        thx = spline->theta[XX] + norder;
        thy = spline->theta[YY] + norder;
        thz = spline->theta[ZZ] + norder;

        for(ithx=0; (ithx<order); ithx++)
            index_x = (i0+ithx)*pme->pmegrid_ny*pme->pmegrid_nz;

            for(ithy=0; (ithy<order); ithy++)
                index_xy = index_x+(j0+ithy)*pme->pmegrid_nz;

                for(ithz=0; (ithz<order); ithz++)
                    gval = grid[index_xy+(k0+ithz)];
                    pot += tx*ty*thz[ithz]*gval;

/* Macro to force loop unrolling by fixing order.
 * This gives a significant performance gain.
 */
#define CALC_SPLINE(order) \
    real data[PME_ORDER_MAX]; \
    real ddata[PME_ORDER_MAX]; \
    for(j=0; (j<DIM); j++) \
        /* dr is the relative offset from the lower cell limit */ \
        data[order-1] = 0; \
        for(k=3; (k<order); k++) \
            div = 1.0/(k - 1.0); \
            data[k-1] = div*dr*data[k-2]; \
            for(l=1; (l<(k-1)); l++) \
                data[k-l-1] = div*((dr+l)*data[k-l-2]+(k-l-dr)* \
            data[0] = div*(1-dr)*data[0]; \
        /* differentiate */ \
        ddata[0] = -data[0]; \
        for(k=1; (k<order); k++) \
            ddata[k] = data[k-1] - data[k]; \
        div           = 1.0/(order - 1); \
        data[order-1] = div*dr*data[order-2]; \
        for(l=1; (l<(order-1)); l++) \
            data[order-l-1] = div*((dr+l)*data[order-l-2]+ \
                                   (order-l-dr)*data[order-l-1]); \
        data[0] = div*(1 - dr)*data[0]; \
        for(k=0; k<order; k++) \
            theta[j][i*order+k]  = data[k]; \
            dtheta[j][i*order+k] = ddata[k]; \

void make_bsplines(splinevec theta,splinevec dtheta,int order,
                   rvec fractx[],int nr,int ind[],real charge[],
                   gmx_bool bFreeEnergy)
    /* construct splines for local atoms */
        /* With free energy we do not use the charge check.
         * In most cases this will be more efficient than calling make_bsplines
         * twice, since usually more than half the particles have charges.
         */
        if (bFreeEnergy || charge[ii] != 0.0) {
            case 4:  CALC_SPLINE(4);     break;
            case 5:  CALC_SPLINE(5);     break;
            default: CALC_SPLINE(order); break;

void make_dft_mod(real *mod,real *data,int ndata)
    for(i=0;i<ndata;i++) {
        for(j=0;j<ndata;j++) {
            arg=(2.0*M_PI*i*j)/ndata;
            sc+=data[j]*cos(arg);
            ss+=data[j]*sin(arg);

    for(i=0;i<ndata;i++)
        mod[i]=(mod[i-1]+mod[i+1])*0.5;
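        /* Averaging the neighbours here replaces a (near-)zero modulus;
         * our reading is that this avoids zero denominators when the
         * moduli later enter the denom array in solve_pme_yzx().
         */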
static void make_bspline_moduli(splinevec bsp_mod,
                                int nx,int ny,int nz,int order)
    int nmax=max(nx,max(ny,nz));
    real *data,*ddata,*bsp_data;

    snew(bsp_data,nmax);

    for(k=3;k<order;k++) {
        for(l=1;l<(k-1);l++)
            data[k-l-1]=div*(l*data[k-l-2]+(k-l)*data[k-l-1]);
        data[0]=div*data[0];

    for(k=1;k<order;k++)
        ddata[k]=data[k-1]-data[k];

    for(l=1;l<(order-1);l++)
        data[order-l-1]=div*(l*data[order-l-2]+(order-l)*data[order-l-1]);
    data[0]=div*data[0];

    for(i=1;i<=order;i++)
        bsp_data[i]=data[i-1];

    make_dft_mod(bsp_mod[XX],bsp_data,nx);
    make_dft_mod(bsp_mod[YY],bsp_data,ny);
    make_dft_mod(bsp_mod[ZZ],bsp_data,nz);

/* Return the P3M optimal influence function */
static double do_p3m_influence(double z, int order)
    /* The formula and most constants can be found in:
     * Ballenegger et al., JCTC 8, 936 (2012)
     */
        return 1.0 - 2.0*z2/3.0;
        return 1.0 - z2 + 2.0*z4/15.0;
        return 1.0 - 4.0*z2/3.0 + 2.0*z4/5.0 + 4.0*z2*z4/315.0;
        return 1.0 - 5.0*z2/3.0 + 7.0*z4/9.0 - 17.0*z2*z4/189.0 + 2.0*z4*z4/2835.0;
        return 1.0 - 2.0*z2 + 19.0*z4/15.0 - 256.0*z2*z4/945.0 + 62.0*z4*z4/4725.0 + 4.0*z2*z4*z4/155925.0;
        return 1.0 - 7.0*z2/3.0 + 28.0*z4/15.0 - 16.0*z2*z4/27.0 + 26.0*z4*z4/405.0 - 2.0*z2*z4*z4/1485.0 + 4.0*z4*z4*z4/6081075.0;
        return 1.0 - 8.0*z2/3.0 + 116.0*z4/45.0 - 344.0*z2*z4/315.0 + 914.0*z4*z4/4725.0 - 248.0*z4*z4*z2/22275.0 + 21844.0*z4*z4*z4/212837625.0 - 8.0*z4*z4*z4*z2/638512875.0;

/* Calculate the P3M B-spline moduli for one dimension */
static void make_p3m_bspline_moduli_dim(real *bsp_mod,int n,int order)
    double zarg,zai,sinzai,infl;

        gmx_fatal(FARGS,"The current P3M code only supports orders up to 8");

    for(i=-maxk; i<0; i++)
        infl = do_p3m_influence(sinzai,order);
        bsp_mod[n+i] = infl*infl*pow(sinzai/zai,-2.0*order);

    for(i=1; i<maxk; i++)
        infl = do_p3m_influence(sinzai,order);
        bsp_mod[i] = infl*infl*pow(sinzai/zai,-2.0*order);

/* Calculate the P3M B-spline moduli */
static void make_p3m_bspline_moduli(splinevec bsp_mod,
                                    int nx,int ny,int nz,int order)
    make_p3m_bspline_moduli_dim(bsp_mod[XX],nx,order);
    make_p3m_bspline_moduli_dim(bsp_mod[YY],ny,order);
    make_p3m_bspline_moduli_dim(bsp_mod[ZZ],nz,order);

static void setup_coordinate_communication(pme_atomcomm_t *atc)
    for(i=1; i<=nslab/2; i++) {
        fw = (atc->nodeid + i) % nslab;
        bw = (atc->nodeid - i + nslab) % nslab;
        if (n < nslab - 1) {
            atc->node_dest[n] = fw;
            atc->node_src[n]  = bw;
        if (n < nslab - 1) {
            atc->node_dest[n] = bw;
            atc->node_src[n]  = fw;

int gmx_pme_destroy(FILE *log,gmx_pme_t *pmedata)
    fprintf(log,"Destroying PME data structures.\n");

    sfree((*pmedata)->nnx);
    sfree((*pmedata)->nny);
    sfree((*pmedata)->nnz);

    pmegrids_destroy(&(*pmedata)->pmegridA);

    sfree((*pmedata)->fftgridA);
    sfree((*pmedata)->cfftgridA);
    gmx_parallel_3dfft_destroy((*pmedata)->pfft_setupA);

    if ((*pmedata)->pmegridB.grid.grid != NULL)
        pmegrids_destroy(&(*pmedata)->pmegridB);
        sfree((*pmedata)->fftgridB);
        sfree((*pmedata)->cfftgridB);
        gmx_parallel_3dfft_destroy((*pmedata)->pfft_setupB);

    for(thread=0; thread<(*pmedata)->nthread; thread++)
        free_work(&(*pmedata)->work[thread]);
    sfree((*pmedata)->work);

static int mult_up(int n,int f)
    return ((n + f - 1)/f)*f;
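/* e.g. mult_up(30,4) == 32: the smallest multiple of f that is >= n */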
static double pme_load_imbalance(gmx_pme_t pme)
    nma = pme->nnodes_major;
    nmi = pme->nnodes_minor;

    n1 = mult_up(pme->nkx,nma)*mult_up(pme->nky,nmi)*pme->nkz;
    n2 = mult_up(pme->nkx,nma)*mult_up(pme->nkz,nmi)*pme->nky;
    n3 = mult_up(pme->nky,nma)*mult_up(pme->nkz,nmi)*pme->nkx;

    /* pme_solve is roughly double the cost of an fft */

    return (n1 + n2 + 3*n3)/(double)(6*pme->nkx*pme->nky*pme->nkz);

static void init_atomcomm(gmx_pme_t pme,pme_atomcomm_t *atc, t_commrec *cr,
                          int dimind,gmx_bool bSpread)
    atc->dimind = dimind;

    if (pme->nnodes > 1)
        atc->mpi_comm = pme->mpi_comm_d[dimind];
        MPI_Comm_size(atc->mpi_comm,&atc->nslab);
        MPI_Comm_rank(atc->mpi_comm,&atc->nodeid);
            fprintf(debug,"For PME atom communication in dimind %d: nslab %d rank %d\n",atc->dimind,atc->nslab,atc->nodeid);

    atc->bSpread   = bSpread;
    atc->pme_order = pme->pme_order;

        /* These three allocations are not required for particle decomp. */
        snew(atc->node_dest,atc->nslab);
        snew(atc->node_src,atc->nslab);
        setup_coordinate_communication(atc);

        snew(atc->count_thread,pme->nthread);
        for(thread=0; thread<pme->nthread; thread++)
            snew(atc->count_thread[thread],atc->nslab);
        atc->count = atc->count_thread[0];
        snew(atc->rcount,atc->nslab);
        snew(atc->buf_index,atc->nslab);

    atc->nthread = pme->nthread;
    if (atc->nthread > 1)
        snew(atc->thread_plist,atc->nthread);
    snew(atc->spline,atc->nthread);
    for(thread=0; thread<atc->nthread; thread++)
        if (atc->nthread > 1)
            snew(atc->thread_plist[thread].n,atc->nthread+2*GMX_CACHE_SEP);
            atc->thread_plist[thread].n += GMX_CACHE_SEP;

init_overlap_comm(pme_overlap_t * ol,
    int lbnd,rbnd,maxlr,b,i;
    pme_grid_comm_t *pgc;
    int fft_start,fft_end,send_index1,recv_index1;

    ol->mpi_comm = comm;

    ol->nnodes = nnodes;
    ol->nodeid = nodeid;

    /* Linear translation of the PME grid won't affect reciprocal space
     * calculations, so to optimize we only interpolate "upwards",
     * which also means we only have to consider overlap in one direction.
     * I.e., particles on this node might also be spread to grid indices
     * that belong to higher nodes (modulo nnodes)
     */
    snew(ol->s2g0,ol->nnodes+1);
    snew(ol->s2g1,ol->nnodes);
    if (debug) { fprintf(debug,"PME slab boundaries:"); }
    for(i=0; i<nnodes; i++)
        /* s2g0 the local interpolation grid start.
         * s2g1 the local interpolation grid end.
         * Because grid overlap communication only goes forward,
         * the grid slabs for the FFTs should be rounded down.
         */
        ol->s2g0[i] = ( i   *ndata + 0       )/nnodes;
        ol->s2g1[i] = ((i+1)*ndata + nnodes-1)/nnodes + norder - 1;

            fprintf(debug,"  %3d %3d",ol->s2g0[i],ol->s2g1[i]);
    ol->s2g0[nnodes] = ndata;
    if (debug) { fprintf(debug,"\n"); }

    /* Determine with how many nodes we need to communicate the grid overlap */
        for(i=0; i<nnodes; i++)
            if ((i+b <  nnodes && ol->s2g1[i] > ol->s2g0[i+b]) ||
                (i+b >= nnodes && ol->s2g1[i] > ol->s2g0[i+b-nnodes] + ndata))
    while (bCont && b < nnodes);
    ol->noverlap_nodes = b - 1;

    snew(ol->send_id,ol->noverlap_nodes);
    snew(ol->recv_id,ol->noverlap_nodes);
    for(b=0; b<ol->noverlap_nodes; b++)
        ol->send_id[b] = (ol->nodeid + (b + 1)) % ol->nnodes;
        ol->recv_id[b] = (ol->nodeid - (b + 1) + ol->nnodes) % ol->nnodes;
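        /* Pulse b therefore exchanges with the ranks (b+1) slabs away in
         * the ring: send "up" to nodeid+b+1 and receive "down" from
         * nodeid-(b+1), both modulo nnodes.
         */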
2778 snew(ol->comm_data, ol->noverlap_nodes);
2780 for(b=0; b<ol->noverlap_nodes; b++)
2782 pgc = &ol->comm_data[b];
2784 fft_start = ol->s2g0[ol->send_id[b]];
2785 fft_end = ol->s2g0[ol->send_id[b]+1];
2786 if (ol->send_id[b] < nodeid)
2791 send_index1 = ol->s2g1[nodeid];
2792 send_index1 = min(send_index1,fft_end);
2793 pgc->send_index0 = fft_start;
2794 pgc->send_nindex = max(0,send_index1 - pgc->send_index0);
2796 /* We always start receiving to the first index of our slab */
2797 fft_start = ol->s2g0[ol->nodeid];
2798 fft_end = ol->s2g0[ol->nodeid+1];
2799 recv_index1 = ol->s2g1[ol->recv_id[b]];
2800 if (ol->recv_id[b] > nodeid)
2802 recv_index1 -= ndata;
2804 recv_index1 = min(recv_index1,fft_end);
2805 pgc->recv_index0 = fft_start;
2806 pgc->recv_nindex = max(0,recv_index1 - pgc->recv_index0);
2809 /* For non-divisible grid we need pme_order iso pme_order-1 */
2810 snew(ol->sendbuf,norder*commplainsize);
2811 snew(ol->recvbuf,norder*commplainsize);
2815 make_gridindex5_to_localindex(int n,int local_start,int local_range,
2816 int **global_to_local,
2817 real **fraction_shift)
2825 for(i=0; (i<5*n); i++)
2827 /* Determine the global to local grid index */
2828 gtl[i] = (i - local_start + n) % n;
2829 /* For coordinates that fall within the local grid the fraction
2830 * is correct, we don't need to shift it.
2833 if (local_range < n)
2835 /* Due to rounding issues i could be 1 beyond the lower or
2836 * upper boundary of the local grid. Correct the index for this.
2837 * If we shift the index, we need to shift the fraction by
2838 * the same amount in the other direction to not affect
2840 * Note that due to this shifting the weights at the end of
2841 * the spline might change, but that will only involve values
2842 * between zero and values close to the precision of a real,
2843 * which is anyhow the accuracy of the whole mesh calculation.
2845 /* With local_range=0 we should not change i=local_start */
2846 if (i % n != local_start)
2853 else if (gtl[i] == local_range)
2855 gtl[i] = local_range - 1;
2862 *global_to_local = gtl;
2863 *fraction_shift = fsh;
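/* Standalone sketch (hypothetical sizes): the table built above folds any
 * global index in [0,5n) back into the local grid, so the spreading code
 * can index gtl[idx + shift*n] instead of taking a per-particle modulo.
 */
#include <stdio.h>

int main(void)
{
    const int n = 10, local_start = 4;
    int i;

    for (i = 0; i < 5*n; i += 13)  /* a few sample global indices */
    {
        int local = (i - local_start + n) % n;

        printf("global %2d -> local %d\n", i, local);
    }
    return 0;
}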
2866 static pme_spline_work_t *make_pme_spline_work(int order)
2868 pme_spline_work_t *work;
2875 snew_aligned(work,1,16);
2877 zero_SSE = _mm_setzero_ps();
2879 /* Generate bit masks to mask out the unused grid entries,
2880 * as we only operate on order of the 8 grid entries that are
2881      * loaded into 2 SSE float registers.
2883 for(of=0; of<8-(order-1); of++)
2887 tmp[i] = (i >= of && i < of+order ? 1 : 0);
2889 work->mask_SSE0[of] = _mm_loadu_ps(tmp);
2890 work->mask_SSE1[of] = _mm_loadu_ps(tmp+4);
2891 work->mask_SSE0[of] = _mm_cmpgt_ps(work->mask_SSE0[of],zero_SSE);
2892 work->mask_SSE1[of] = _mm_cmpgt_ps(work->mask_SSE1[of],zero_SSE);
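/* Scalar sketch of the masks generated above (assuming single precision and
 * order <= 5, so a spline fits within the 8 lanes of two SSE registers):
 * for a spline starting at lane `of`, lane l passes iff of <= l < of+order.
 */
#include <stdio.h>

int main(void)
{
    const int order = 4;
    int of, l;

    for (of = 0; of < 8 - (order - 1); of++)
    {
        printf("offset %d lanes:", of);
        for (l = 0; l < 8; l++)
        {
            printf(" %d", (l >= of && l < of + order) ? 1 : 0);
        }
        printf("\n");
    }
    return 0;
}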
2902 gmx_pme_check_grid_restrictions(FILE *fplog,char dim,int nnodes,int *nk)
2906 if (*nk % nnodes != 0)
2908 nk_new = nnodes*(*nk/nnodes + 1);
2910 if (2*nk_new >= 3*(*nk))
2912            gmx_fatal(FARGS,"The PME grid size in dim %c (%d) is not divisible by the number of nodes doing PME in dim %c (%d). The grid size would have to be increased by more than 50%% to make the grid divisible. Change the total number of nodes or the number of domain decomposition cells in x or the PME grid %c dimension (and the cut-off).",
2913 dim,*nk,dim,nnodes,dim);
2918            fprintf(fplog,"\nNOTE: The PME grid size in dim %c (%d) is not divisible by the number of nodes doing PME in dim %c (%d). Increasing the PME grid size in dim %c to %d. This will increase the accuracy and will not decrease the performance significantly on this number of nodes. For optimal performance change the total number of nodes or the number of domain decomposition cells in x or the PME grid %c dimension (and the cut-off).\n\n",
2919 dim,*nk,dim,nnodes,dim,nk_new,dim);
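/* Minimal sketch (hypothetical sizes) of the rounding rule above: a grid of
 * 27 on 4 nodes is bumped to 28; the fatal >50% growth case would need an
 * extreme ratio such as nk=2 on 16 nodes.
 */
#include <stdio.h>

int main(void)
{
    const int nk = 27, nnodes = 4;

    if (nk % nnodes != 0)
    {
        int nk_new = nnodes*(nk/nnodes + 1);  /* round up to a multiple */

        if (2*nk_new >= 3*nk)                 /* would grow by more than 50% */
        {
            printf("fatal: nk %d -> %d grows too much\n", nk, nk_new);
        }
        else
        {
            printf("nk %d -> %d\n", nk, nk_new);
        }
    }
    return 0;
}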
2926 int gmx_pme_init(gmx_pme_t * pmedata,
2932 gmx_bool bFreeEnergy,
2933 gmx_bool bReproducible,
2938 pme_atomcomm_t *atc;
2942 fprintf(debug,"Creating PME data structures.\n");
2945 pme->redist_init = FALSE;
2946 pme->sum_qgrid_tmp = NULL;
2947 pme->sum_qgrid_dd_tmp = NULL;
2948 pme->buf_nalloc = 0;
2949 pme->redist_buf_nalloc = 0;
2952 pme->bPPnode = TRUE;
2954 pme->nnodes_major = nnodes_major;
2955 pme->nnodes_minor = nnodes_minor;
2958 if (nnodes_major*nnodes_minor > 1)
2960 pme->mpi_comm = cr->mpi_comm_mygroup;
2962 MPI_Comm_rank(pme->mpi_comm,&pme->nodeid);
2963 MPI_Comm_size(pme->mpi_comm,&pme->nnodes);
2964 if (pme->nnodes != nnodes_major*nnodes_minor)
2966 gmx_incons("PME node count mismatch");
2971 pme->mpi_comm = MPI_COMM_NULL;
2975 if (pme->nnodes == 1)
2978 pme->mpi_comm_d[0] = MPI_COMM_NULL;
2979 pme->mpi_comm_d[1] = MPI_COMM_NULL;
2981 pme->ndecompdim = 0;
2982 pme->nodeid_major = 0;
2983 pme->nodeid_minor = 0;
2985 pme->mpi_comm_d[0] = pme->mpi_comm_d[1] = MPI_COMM_NULL;
2990 if (nnodes_minor == 1)
2993 pme->mpi_comm_d[0] = pme->mpi_comm;
2994 pme->mpi_comm_d[1] = MPI_COMM_NULL;
2996 pme->ndecompdim = 1;
2997 pme->nodeid_major = pme->nodeid;
2998 pme->nodeid_minor = 0;
3001 else if (nnodes_major == 1)
3004 pme->mpi_comm_d[0] = MPI_COMM_NULL;
3005 pme->mpi_comm_d[1] = pme->mpi_comm;
3007 pme->ndecompdim = 1;
3008 pme->nodeid_major = 0;
3009 pme->nodeid_minor = pme->nodeid;
3013 if (pme->nnodes % nnodes_major != 0)
3015 gmx_incons("For 2D PME decomposition, #PME nodes must be divisible by the number of nodes in the major dimension");
3017 pme->ndecompdim = 2;
3020 MPI_Comm_split(pme->mpi_comm,pme->nodeid % nnodes_minor,
3021 pme->nodeid,&pme->mpi_comm_d[0]); /* My communicator along major dimension */
3022 MPI_Comm_split(pme->mpi_comm,pme->nodeid/nnodes_minor,
3023 pme->nodeid,&pme->mpi_comm_d[1]); /* My communicator along minor dimension */
3025 MPI_Comm_rank(pme->mpi_comm_d[0],&pme->nodeid_major);
3026 MPI_Comm_size(pme->mpi_comm_d[0],&pme->nnodes_major);
3027 MPI_Comm_rank(pme->mpi_comm_d[1],&pme->nodeid_minor);
3028 MPI_Comm_size(pme->mpi_comm_d[1],&pme->nnodes_minor);
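/* Standalone sketch of the 2D split above (hypothetical constants; compile
 * with an MPI compiler): ranks sharing a color land in the same
 * sub-communicator, ordered by key. With nnodes_minor=2, even and odd world
 * ranks form the two major-dimension communicators (color = rank %
 * nnodes_minor), while consecutive pairs form the minor-dimension ones
 * (color = rank / nnodes_minor).
 */
#include <mpi.h>
#include <stdio.h>

int main(int argc, char *argv[])
{
    const int nnodes_minor = 2;
    int rank, rank_major, rank_minor;
    MPI_Comm comm_major, comm_minor;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    MPI_Comm_split(MPI_COMM_WORLD, rank % nnodes_minor, rank, &comm_major);
    MPI_Comm_split(MPI_COMM_WORLD, rank / nnodes_minor, rank, &comm_minor);

    MPI_Comm_rank(comm_major, &rank_major);
    MPI_Comm_rank(comm_minor, &rank_minor);
    printf("world %d -> (major %d, minor %d)\n", rank, rank_major, rank_minor);

    MPI_Comm_free(&comm_major);
    MPI_Comm_free(&comm_minor);
    MPI_Finalize();
    return 0;
}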
3031 pme->bPPnode = (cr->duty & DUTY_PP);
3034 pme->nthread = nthread;
3036 if (ir->ePBC == epbcSCREW)
3038 gmx_fatal(FARGS,"pme does not (yet) work with pbc = screw");
3041 pme->bFEP = ((ir->efep != efepNO) && bFreeEnergy);
3045 pme->bP3M = (ir->coulombtype == eelP3M_AD || getenv("GMX_PME_P3M") != NULL);
3046 pme->pme_order = ir->pme_order;
3047 pme->epsilon_r = ir->epsilon_r;
3049 if (pme->pme_order > PME_ORDER_MAX)
3051 gmx_fatal(FARGS,"pme_order (%d) is larger than the maximum allowed value (%d). Modify and recompile the code if you really need such a high order.",
3052 pme->pme_order,PME_ORDER_MAX);
3055 /* Currently pme.c supports only the fft5d FFT code.
3056 * Therefore the grid always needs to be divisible by nnodes.
3057 * When the old 1D code is also supported again, change this check.
3059 * This check should be done before calling gmx_pme_init
3060     * and fplog should be passed instead of stderr.
3062 if (pme->ndecompdim >= 2)
3064 if (pme->ndecompdim >= 1)
3067 gmx_pme_check_grid_restrictions(pme->nodeid==0 ? stderr : NULL,
3068 'x',nnodes_major,&pme->nkx);
3069 gmx_pme_check_grid_restrictions(pme->nodeid==0 ? stderr : NULL,
3070 'y',nnodes_minor,&pme->nky);
3074 if (pme->nkx <= pme->pme_order*(pme->nnodes_major > 1 ? 2 : 1) ||
3075 pme->nky <= pme->pme_order*(pme->nnodes_minor > 1 ? 2 : 1) ||
3076 pme->nkz <= pme->pme_order)
3078        gmx_fatal(FARGS,"The pme grid dimensions need to be larger than pme_order (%d) and in parallel larger than 2*pme_order for x and/or y",pme->pme_order);
3081 if (pme->nnodes > 1) {
3085 MPI_Type_contiguous(DIM, mpi_type, &(pme->rvec_mpi));
3086 MPI_Type_commit(&(pme->rvec_mpi));
3089 /* Note that the charge spreading and force gathering, which usually
3090     * take about the same amount of time as FFT+solve_pme,
3091 * is always fully load balanced
3092 * (unless the charge distribution is inhomogeneous).
3095 imbal = pme_load_imbalance(pme);
3096 if (imbal >= 1.2 && pme->nodeid_major == 0 && pme->nodeid_minor == 0)
3100 "NOTE: The load imbalance in PME FFT and solve is %d%%.\n"
3101 " For optimal PME load balancing\n"
3102 " PME grid_x (%d) and grid_y (%d) should be divisible by #PME_nodes_x (%d)\n"
3103 " and PME grid_y (%d) and grid_z (%d) should be divisible by #PME_nodes_y (%d)\n"
3105 (int)((imbal-1)*100 + 0.5),
3106 pme->nkx,pme->nky,pme->nnodes_major,
3107 pme->nky,pme->nkz,pme->nnodes_minor);
3111    /* For a non-divisible grid we need pme_order instead of pme_order-1 */
3112 /* In sum_qgrid_dd x overlap is copied in place: take padding into account.
3113 * y is always copied through a buffer: we don't need padding in z,
3114 * but we do need the overlap in x because of the communication order.
3116 init_overlap_comm(&pme->overlap[0],pme->pme_order,
3120 pme->nnodes_major,pme->nodeid_major,
3122 (div_round_up(pme->nky,pme->nnodes_minor)+pme->pme_order)*(pme->nkz+pme->pme_order-1));
3124 init_overlap_comm(&pme->overlap[1],pme->pme_order,
3128 pme->nnodes_minor,pme->nodeid_minor,
3130 (div_round_up(pme->nkx,pme->nnodes_major)+pme->pme_order)*pme->nkz);
3132 /* Check for a limitation of the (current) sum_fftgrid_dd code */
3133 if (pme->nthread > 1 &&
3134 (pme->overlap[0].noverlap_nodes > 1 ||
3135 pme->overlap[1].noverlap_nodes > 1))
3137        gmx_fatal(FARGS,"With threads the number of grid lines per node along x and/or y should be pme_order (%d) or more, or exactly pme_order-1",pme->pme_order);
3140 snew(pme->bsp_mod[XX],pme->nkx);
3141 snew(pme->bsp_mod[YY],pme->nky);
3142 snew(pme->bsp_mod[ZZ],pme->nkz);
3144 /* The required size of the interpolation grid, including overlap.
3145 * The allocated size (pmegrid_n?) might be slightly larger.
3147 pme->pmegrid_nx = pme->overlap[0].s2g1[pme->nodeid_major] -
3148 pme->overlap[0].s2g0[pme->nodeid_major];
3149 pme->pmegrid_ny = pme->overlap[1].s2g1[pme->nodeid_minor] -
3150 pme->overlap[1].s2g0[pme->nodeid_minor];
3151 pme->pmegrid_nz_base = pme->nkz;
3152 pme->pmegrid_nz = pme->pmegrid_nz_base + pme->pme_order - 1;
3153 set_grid_alignment(&pme->pmegrid_nz,pme->pme_order);
3155 pme->pmegrid_start_ix = pme->overlap[0].s2g0[pme->nodeid_major];
3156 pme->pmegrid_start_iy = pme->overlap[1].s2g0[pme->nodeid_minor];
3157 pme->pmegrid_start_iz = 0;
3159 make_gridindex5_to_localindex(pme->nkx,
3160 pme->pmegrid_start_ix,
3161 pme->pmegrid_nx - (pme->pme_order-1),
3162 &pme->nnx,&pme->fshx);
3163 make_gridindex5_to_localindex(pme->nky,
3164 pme->pmegrid_start_iy,
3165 pme->pmegrid_ny - (pme->pme_order-1),
3166 &pme->nny,&pme->fshy);
3167 make_gridindex5_to_localindex(pme->nkz,
3168 pme->pmegrid_start_iz,
3169 pme->pmegrid_nz_base,
3170 &pme->nnz,&pme->fshz);
3172 pmegrids_init(&pme->pmegridA,
3173 pme->pmegrid_nx,pme->pmegrid_ny,pme->pmegrid_nz,
3174 pme->pmegrid_nz_base,
3177 pme->overlap[0].s2g1[pme->nodeid_major]-pme->overlap[0].s2g0[pme->nodeid_major+1],
3178 pme->overlap[1].s2g1[pme->nodeid_minor]-pme->overlap[1].s2g0[pme->nodeid_minor+1]);
3180 pme->spline_work = make_pme_spline_work(pme->pme_order);
3182 ndata[0] = pme->nkx;
3183 ndata[1] = pme->nky;
3184 ndata[2] = pme->nkz;
3186 /* This routine will allocate the grid data to fit the FFTs */
3187 gmx_parallel_3dfft_init(&pme->pfft_setupA,ndata,
3188 &pme->fftgridA,&pme->cfftgridA,
3190 pme->overlap[0].s2g0,pme->overlap[1].s2g0,
3191 bReproducible,pme->nthread);
3195 pmegrids_init(&pme->pmegridB,
3196 pme->pmegrid_nx,pme->pmegrid_ny,pme->pmegrid_nz,
3197 pme->pmegrid_nz_base,
3200 pme->nkx % pme->nnodes_major != 0,
3201 pme->nky % pme->nnodes_minor != 0);
3203 gmx_parallel_3dfft_init(&pme->pfft_setupB,ndata,
3204 &pme->fftgridB,&pme->cfftgridB,
3206 pme->overlap[0].s2g0,pme->overlap[1].s2g0,
3207 bReproducible,pme->nthread);
3211 pme->pmegridB.grid.grid = NULL;
3212 pme->fftgridB = NULL;
3213 pme->cfftgridB = NULL;
3218 /* Use plain SPME B-spline interpolation */
3219 make_bspline_moduli(pme->bsp_mod,pme->nkx,pme->nky,pme->nkz,pme->pme_order);
3223 /* Use the P3M grid-optimized influence function */
3224 make_p3m_bspline_moduli(pme->bsp_mod,pme->nkx,pme->nky,pme->nkz,pme->pme_order);
3227 /* Use atc[0] for spreading */
3228 init_atomcomm(pme,&pme->atc[0],cr,nnodes_major > 1 ? 0 : 1,TRUE);
3229 if (pme->ndecompdim >= 2)
3231 init_atomcomm(pme,&pme->atc[1],cr,1,FALSE);
3234 if (pme->nnodes == 1) {
3235 pme->atc[0].n = homenr;
3236 pme_realloc_atomcomm_things(&pme->atc[0]);
3242 /* Use fft5d, order after FFT is y major, z, x minor */
3244 snew(pme->work,pme->nthread);
3245 for(thread=0; thread<pme->nthread; thread++)
3247 realloc_work(&pme->work[thread],pme->nkx);
3257 static void copy_local_grid(gmx_pme_t pme,
3258 pmegrids_t *pmegrids,int thread,real *fftgrid)
3260 ivec local_fft_ndata,local_fft_offset,local_fft_size;
3264 int offx,offy,offz,x,y,z,i0,i0t;
3269 gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
3273 fft_my = local_fft_size[YY];
3274 fft_mz = local_fft_size[ZZ];
3276 pmegrid = &pmegrids->grid_th[thread];
3278 nsx = pmegrid->n[XX];
3279 nsy = pmegrid->n[YY];
3280 nsz = pmegrid->n[ZZ];
3282 for(d=0; d<DIM; d++)
3284 nf[d] = min(pmegrid->n[d] - (pmegrid->order - 1),
3285 local_fft_ndata[d] - pmegrid->offset[d]);
3288 offx = pmegrid->offset[XX];
3289 offy = pmegrid->offset[YY];
3290 offz = pmegrid->offset[ZZ];
3292 /* Directly copy the non-overlapping parts of the local grids.
3293 * This also initializes the full grid.
3295 grid_th = pmegrid->grid;
3296 for(x=0; x<nf[XX]; x++)
3298 for(y=0; y<nf[YY]; y++)
3300 i0 = ((offx + x)*fft_my + (offy + y))*fft_mz + offz;
3301 i0t = (x*nsy + y)*nsz;
3302 for(z=0; z<nf[ZZ]; z++)
3304 fftgrid[i0+z] = grid_th[i0t+z];
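/* Minimal sketch of the row-major index math used in the copy above: with
 * pitches (my,mz), element (x,y,z) sits at (x*my + y)*mz + z, so the inner
 * z loop walks contiguous memory in both grids.
 */
#include <stdio.h>

int main(void)
{
    const int my = 4, mz = 5;  /* hypothetical pitches */
    const int x = 1, y = 2, z = 3;

    printf("(%d,%d,%d) -> %d\n", x, y, z, (x*my + y)*mz + z);  /* 33 */
    return 0;
}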
3310 static void print_sendbuf(gmx_pme_t pme,real *sendbuf)
3312 ivec local_fft_ndata,local_fft_offset,local_fft_size;
3313 pme_overlap_t *overlap;
3317 gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
3321 /* Major dimension */
3322 overlap = &pme->overlap[0];
3324 nind = overlap->comm_data[0].send_nindex;
3326 for(y=0; y<local_fft_ndata[YY]; y++) {
3332 for(x=0; x<nind; x++) {
3333 for(y=0; y<local_fft_ndata[YY]; y++) {
3335 for(z=0; z<local_fft_ndata[ZZ]; z++) {
3336 if (sendbuf[i] != 0) n++;
3346 reduce_threadgrid_overlap(gmx_pme_t pme,
3347 const pmegrids_t *pmegrids,int thread,
3348 real *fftgrid,real *commbuf_x,real *commbuf_y)
3350 ivec local_fft_ndata,local_fft_offset,local_fft_size;
3351 int fft_nx,fft_ny,fft_nz;
3356 int offx,offy,offz,x,y,z,i0,i0t;
3357 int sx,sy,sz,fx,fy,fz,tx1,ty1,tz1,ox,oy,oz;
3358 gmx_bool bClearBufX,bClearBufY,bClearBufXY,bClearBuf;
3359 gmx_bool bCommX,bCommY;
3362 const pmegrid_t *pmegrid,*pmegrid_g,*pmegrid_f;
3363 const real *grid_th;
3366 gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
3370 fft_nx = local_fft_ndata[XX];
3371 fft_ny = local_fft_ndata[YY];
3372 fft_nz = local_fft_ndata[ZZ];
3374 fft_my = local_fft_size[YY];
3375 fft_mz = local_fft_size[ZZ];
3377    /* This routine is called when all threads have finished spreading.
3378 * Here each thread sums grid contributions calculated by other threads
3379 * to the thread local grid volume.
3380 * To minimize the number of grid copying operations,
3381     * this routine sums immediately from the pmegrid to the fftgrid.
3384    /* Determine which part of the full node grid we should operate on;
3385 * this is our thread local part of the full grid.
3387 pmegrid = &pmegrids->grid_th[thread];
3389 for(d=0; d<DIM; d++)
3391 ne[d] = min(pmegrid->offset[d]+pmegrid->n[d]-(pmegrid->order-1),
3392 local_fft_ndata[d]);
3395 offx = pmegrid->offset[XX];
3396 offy = pmegrid->offset[YY];
3397 offz = pmegrid->offset[ZZ];
3404 /* Now loop over all the thread data blocks that contribute
3405 * to the grid region we (our thread) are operating on.
3407    /* Note that fft_nx/y is equal to the number of grid points
3408 * between the first point of our node grid and the one of the next node.
3410 for(sx=0; sx>=-pmegrids->nthread_comm[XX]; sx--)
3412 fx = pmegrid->ci[XX] + sx;
3416 fx += pmegrids->nc[XX];
3418 bCommX = (pme->nnodes_major > 1);
3420 pmegrid_g = &pmegrids->grid_th[fx*pmegrids->nc[YY]*pmegrids->nc[ZZ]];
3421 ox += pmegrid_g->offset[XX];
3424 tx1 = min(ox + pmegrid_g->n[XX],ne[XX]);
3428 tx1 = min(ox + pmegrid_g->n[XX],pme->pme_order);
3431 for(sy=0; sy>=-pmegrids->nthread_comm[YY]; sy--)
3433 fy = pmegrid->ci[YY] + sy;
3437 fy += pmegrids->nc[YY];
3439 bCommY = (pme->nnodes_minor > 1);
3441 pmegrid_g = &pmegrids->grid_th[fy*pmegrids->nc[ZZ]];
3442 oy += pmegrid_g->offset[YY];
3445 ty1 = min(oy + pmegrid_g->n[YY],ne[YY]);
3449 ty1 = min(oy + pmegrid_g->n[YY],pme->pme_order);
3452 for(sz=0; sz>=-pmegrids->nthread_comm[ZZ]; sz--)
3454 fz = pmegrid->ci[ZZ] + sz;
3458 fz += pmegrids->nc[ZZ];
3461 pmegrid_g = &pmegrids->grid_th[fz];
3462 oz += pmegrid_g->offset[ZZ];
3463 tz1 = min(oz + pmegrid_g->n[ZZ],ne[ZZ]);
3465 if (sx == 0 && sy == 0 && sz == 0)
3467 /* We have already added our local contribution
3468 * before calling this routine, so skip it here.
3473 thread_f = (fx*pmegrids->nc[YY] + fy)*pmegrids->nc[ZZ] + fz;
3475 pmegrid_f = &pmegrids->grid_th[thread_f];
3477 grid_th = pmegrid_f->grid;
3479 nsx = pmegrid_f->n[XX];
3480 nsy = pmegrid_f->n[YY];
3481 nsz = pmegrid_f->n[ZZ];
3483 #ifdef DEBUG_PME_REDUCE
3484 printf("n%d t%d add %d %2d %2d %2d %2d %2d %2d %2d-%2d %2d-%2d, %2d-%2d %2d-%2d, %2d-%2d %2d-%2d\n",
3485 pme->nodeid,thread,thread_f,
3486 pme->pmegrid_start_ix,
3487 pme->pmegrid_start_iy,
3488 pme->pmegrid_start_iz,
3490 offx-ox,tx1-ox,offx,tx1,
3491 offy-oy,ty1-oy,offy,ty1,
3492 offz-oz,tz1-oz,offz,tz1);
3495 if (!(bCommX || bCommY))
3497 /* Copy from the thread local grid to the node grid */
3498 for(x=offx; x<tx1; x++)
3500 for(y=offy; y<ty1; y++)
3502 i0 = (x*fft_my + y)*fft_mz;
3503 i0t = ((x - ox)*nsy + (y - oy))*nsz - oz;
3504 for(z=offz; z<tz1; z++)
3506 fftgrid[i0+z] += grid_th[i0t+z];
3513 /* The order of this conditional decides
3514 * where the corner volume gets stored with x+y decomp.
3518 commbuf = commbuf_y;
3519 buf_my = ty1 - offy;
3522 /* We index commbuf modulo the local grid size */
3523 commbuf += buf_my*fft_nx*fft_nz;
3525 bClearBuf = bClearBufXY;
3526 bClearBufXY = FALSE;
3530 bClearBuf = bClearBufY;
3536 commbuf = commbuf_x;
3538 bClearBuf = bClearBufX;
3542 /* Copy to the communication buffer */
3543 for(x=offx; x<tx1; x++)
3545 for(y=offy; y<ty1; y++)
3547 i0 = (x*buf_my + y)*fft_nz;
3548 i0t = ((x - ox)*nsy + (y - oy))*nsz - oz;
3552 /* First access of commbuf, initialize it */
3553 for(z=offz; z<tz1; z++)
3555 commbuf[i0+z] = grid_th[i0t+z];
3560 for(z=offz; z<tz1; z++)
3562 commbuf[i0+z] += grid_th[i0t+z];
3574 static void sum_fftgrid_dd(gmx_pme_t pme,real *fftgrid)
3576 ivec local_fft_ndata,local_fft_offset,local_fft_size;
3577 pme_overlap_t *overlap;
3579 int recv_index0,recv_nindex;
3583 int ipulse,send_id,recv_id,datasize,gridsize,size_yx;
3584 real *sendptr,*recvptr;
3585 int x,y,z,indg,indb;
3587 /* Note that this routine is only used for forward communication.
3588 * Since the force gathering, unlike the charge spreading,
3589 * can be trivially parallelized over the particles,
3590 * the backwards process is much simpler and can use the "old"
3591 * communication setup.
3594 gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
3599 /* Currently supports only a single communication pulse */
3601 /* for(ipulse=0;ipulse<overlap->noverlap_nodes;ipulse++) */
3602 if (pme->nnodes_minor > 1)
3604        /* Minor dimension */
3605 overlap = &pme->overlap[1];
3607 if (pme->nnodes_major > 1)
3609 size_yx = pme->overlap[0].comm_data[0].send_nindex;
3615 datasize = (local_fft_ndata[XX]+size_yx)*local_fft_ndata[ZZ];
3619 send_id = overlap->send_id[ipulse];
3620 recv_id = overlap->recv_id[ipulse];
3621 send_nindex = overlap->comm_data[ipulse].send_nindex;
3622 /* recv_index0 = overlap->comm_data[ipulse].recv_index0; */
3624 recv_nindex = overlap->comm_data[ipulse].recv_nindex;
3626 sendptr = overlap->sendbuf;
3627 recvptr = overlap->recvbuf;
3630 printf("node %d comm %2d x %2d x %2d\n",pme->nodeid,
3631 local_fft_ndata[XX]+size_yx,send_nindex,local_fft_ndata[ZZ]);
3632 printf("node %d send %f, %f\n",pme->nodeid,
3633 sendptr[0],sendptr[send_nindex*datasize-1]);
3637 MPI_Sendrecv(sendptr,send_nindex*datasize,GMX_MPI_REAL,
3639 recvptr,recv_nindex*datasize,GMX_MPI_REAL,
3641 overlap->mpi_comm,&stat);
3644 for(x=0; x<local_fft_ndata[XX]; x++)
3646 for(y=0; y<recv_nindex; y++)
3648 indg = (x*local_fft_size[YY] + y)*local_fft_size[ZZ];
3649 indb = (x*recv_nindex + y)*local_fft_ndata[ZZ];
3650 for(z=0; z<local_fft_ndata[ZZ]; z++)
3652 fftgrid[indg+z] += recvptr[indb+z];
3656 if (pme->nnodes_major > 1)
3658 sendptr = pme->overlap[0].sendbuf;
3659 for(x=0; x<size_yx; x++)
3661 for(y=0; y<recv_nindex; y++)
3663 indg = (x*local_fft_ndata[YY] + y)*local_fft_ndata[ZZ];
3664                    indb = ((local_fft_ndata[XX] + x)*recv_nindex + y)*local_fft_ndata[ZZ];
3665 for(z=0; z<local_fft_ndata[ZZ]; z++)
3667 sendptr[indg+z] += recvptr[indb+z];
3674 /* for(ipulse=0;ipulse<overlap->noverlap_nodes;ipulse++) */
3675 if (pme->nnodes_major > 1)
3677 /* Major dimension */
3678 overlap = &pme->overlap[0];
3680 datasize = local_fft_ndata[YY]*local_fft_ndata[ZZ];
3681 gridsize = local_fft_size[YY] *local_fft_size[ZZ];
3685 send_id = overlap->send_id[ipulse];
3686 recv_id = overlap->recv_id[ipulse];
3687 send_nindex = overlap->comm_data[ipulse].send_nindex;
3688 /* recv_index0 = overlap->comm_data[ipulse].recv_index0; */
3690 recv_nindex = overlap->comm_data[ipulse].recv_nindex;
3692 sendptr = overlap->sendbuf;
3693 recvptr = overlap->recvbuf;
3697 fprintf(debug,"PME fftgrid comm %2d x %2d x %2d\n",
3698 send_nindex,local_fft_ndata[YY],local_fft_ndata[ZZ]);
3702 MPI_Sendrecv(sendptr,send_nindex*datasize,GMX_MPI_REAL,
3704 recvptr,recv_nindex*datasize,GMX_MPI_REAL,
3706 overlap->mpi_comm,&stat);
3709 for(x=0; x<recv_nindex; x++)
3711 for(y=0; y<local_fft_ndata[YY]; y++)
3713 indg = (x*local_fft_size[YY] + y)*local_fft_size[ZZ];
3714 indb = (x*local_fft_ndata[YY] + y)*local_fft_ndata[ZZ];
3715 for(z=0; z<local_fft_ndata[ZZ]; z++)
3717 fftgrid[indg+z] += recvptr[indb+z];
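/* Minimal sketch (hypothetical pitches) of why indg and indb above differ:
 * the receive buffer is packed tightly with local_fft_ndata pitches while
 * the fftgrid is padded with local_fft_size pitches, so the same (x,y,z)
 * maps to two different linear offsets.
 */
#include <stdio.h>

int main(void)
{
    const int ndata_y = 3, ndata_z = 4;  /* packed buffer pitches */
    const int size_y  = 4, size_z  = 5;  /* padded grid pitches */
    const int x = 1, y = 2, z = 0;

    printf("packed %d  padded %d\n",
           (x*ndata_y + y)*ndata_z + z,  /* 20 */
           (x*size_y  + y)*size_z  + z); /* 30 */
    return 0;
}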
3725 static void spread_on_grid(gmx_pme_t pme,
3726 pme_atomcomm_t *atc,pmegrids_t *grids,
3727 gmx_bool bCalcSplines,gmx_bool bSpread,
3731 #ifdef PME_TIME_THREADS
3732 gmx_cycles_t c1,c2,c3,ct1a,ct1b,ct1c;
3733 static double cs1=0,cs2=0,cs3=0;
3734 static double cs1a[6]={0,0,0,0,0,0};
3738 nthread = pme->nthread;
3741 #ifdef PME_TIME_THREADS
3742 c1 = omp_cyc_start();
3746 #pragma omp parallel for num_threads(nthread) schedule(static)
3747 for(thread=0; thread<nthread; thread++)
3751 start = atc->n* thread /nthread;
3752 end = atc->n*(thread+1)/nthread;
3754 /* Compute fftgrid index for all atoms,
3755         * with the help of some extra variables.
3757 calc_interpolation_idx(pme,atc,start,end,thread);
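/* Minimal sketch (hypothetical counts): the start/end split above gives
 * each thread a contiguous atom range whose sizes differ by at most one,
 * with no explicit remainder handling.
 */
#include <stdio.h>

int main(void)
{
    const int n = 10, nthread = 3;
    int thread;

    for (thread = 0; thread < nthread; thread++)
    {
        int start = n* thread   /nthread;
        int end   = n*(thread+1)/nthread;

        printf("thread %d: atoms [%d,%d)\n", thread, start, end);
    }
    return 0;  /* prints [0,3) [3,6) [6,10) */
}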
3760 #ifdef PME_TIME_THREADS
3761 c1 = omp_cyc_end(c1);
3765 #ifdef PME_TIME_THREADS
3766 c2 = omp_cyc_start();
3768 #pragma omp parallel for num_threads(nthread) schedule(static)
3769 for(thread=0; thread<nthread; thread++)
3771 splinedata_t *spline;
3774 /* make local bsplines */
3775 if (grids == NULL || grids->nthread == 1)
3777 spline = &atc->spline[0];
3781 grid = &grids->grid;
3785 spline = &atc->spline[thread];
3787 make_thread_local_ind(atc,thread,spline);
3789 grid = &grids->grid_th[thread];
3794 make_bsplines(spline->theta,spline->dtheta,pme->pme_order,
3795 atc->fractx,spline->n,spline->ind,atc->q,pme->bFEP);
3800 /* put local atoms on grid. */
3801 #ifdef PME_TIME_SPREAD
3802 ct1a = omp_cyc_start();
3804 spread_q_bsplines_thread(grid,atc,spline,pme->spline_work);
3806 if (grids->nthread > 1)
3808 copy_local_grid(pme,grids,thread,fftgrid);
3810 #ifdef PME_TIME_SPREAD
3811 ct1a = omp_cyc_end(ct1a);
3812 cs1a[thread] += (double)ct1a;
3816 #ifdef PME_TIME_THREADS
3817 c2 = omp_cyc_end(c2);
3821 if (bSpread && grids->nthread > 1)
3823 #ifdef PME_TIME_THREADS
3824 c3 = omp_cyc_start();
3826 #pragma omp parallel for num_threads(grids->nthread) schedule(static)
3827 for(thread=0; thread<grids->nthread; thread++)
3829 reduce_threadgrid_overlap(pme,grids,thread,
3831 pme->overlap[0].sendbuf,
3832 pme->overlap[1].sendbuf);
3833 #ifdef PRINT_PME_SENDBUF
3834 print_sendbuf(pme,pme->overlap[0].sendbuf);
3837 #ifdef PME_TIME_THREADS
3838 c3 = omp_cyc_end(c3);
3842 if (pme->nnodes > 1)
3844 /* Communicate the overlapping part of the fftgrid */
3845 sum_fftgrid_dd(pme,fftgrid);
3849 #ifdef PME_TIME_THREADS
3853 printf("idx %.2f spread %.2f red %.2f",
3854 cs1*1e-9,cs2*1e-9,cs3*1e-9);
3855 #ifdef PME_TIME_SPREAD
3856 for(thread=0; thread<nthread; thread++)
3857 printf(" %.2f",cs1a[thread]*1e-9);
3865 static void dump_grid(FILE *fp,
3866 int sx,int sy,int sz,int nx,int ny,int nz,
3867 int my,int mz,const real *g)
3877 fprintf(fp,"%2d %2d %2d %6.3f\n",
3878 sx+x,sy+y,sz+z,g[(x*my + y)*mz + z]);
3884 static void dump_local_fftgrid(gmx_pme_t pme,const real *fftgrid)
3886 ivec local_fft_ndata,local_fft_offset,local_fft_size;
3888 gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
3894 pme->pmegrid_start_ix,
3895 pme->pmegrid_start_iy,
3896 pme->pmegrid_start_iz,
3897 pme->pmegrid_nx-pme->pme_order+1,
3898 pme->pmegrid_ny-pme->pme_order+1,
3899 pme->pmegrid_nz-pme->pme_order+1,
3906 void gmx_pme_calc_energy(gmx_pme_t pme,int n,rvec *x,real *q,real *V)
3908 pme_atomcomm_t *atc;
3911 if (pme->nnodes > 1)
3913 gmx_incons("gmx_pme_calc_energy called in parallel");
3917 gmx_incons("gmx_pme_calc_energy with free energy");
3920 atc = &pme->atc_energy;
3922 if (atc->spline == NULL)
3924 snew(atc->spline,atc->nthread);
3927 atc->bSpread = TRUE;
3928 atc->pme_order = pme->pme_order;
3930 pme_realloc_atomcomm_things(atc);
3934 /* We only use the A-charges grid */
3935 grid = &pme->pmegridA;
3937 spread_on_grid(pme,atc,NULL,TRUE,FALSE,pme->fftgridA);
3939 *V = gather_energy_bsplines(pme,grid->grid.grid,atc);
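/* Hedged usage sketch (assumes a pme handle initialized elsewhere with
 * gmx_pme_init for serial, non-FEP use; probe_energy is a hypothetical
 * helper, not part of the API): evaluate the reciprocal-space energy of one
 * configuration without computing forces, as TPI-style insertion code might.
 */
static real probe_energy(gmx_pme_t pme, int n, rvec *x, real *q)
{
    real V;

    gmx_pme_calc_energy(pme, n, x, q, &V);  /* splines + energy gather on grid A */
    return V;
}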
3943 static void reset_pmeonly_counters(t_commrec *cr,gmx_wallcycle_t wcycle,
3944 t_nrnb *nrnb,t_inputrec *ir, gmx_large_int_t step_rel)
3946 /* Reset all the counters related to performance over the run */
3947 wallcycle_stop(wcycle,ewcRUN);
3948 wallcycle_reset_all(wcycle);
3950 ir->init_step += step_rel;
3951 ir->nsteps -= step_rel;
3952 wallcycle_start(wcycle,ewcRUN);
3956 int gmx_pmeonly(gmx_pme_t pme,
3957 t_commrec *cr, t_nrnb *nrnb,
3958 gmx_wallcycle_t wcycle,
3959 real ewaldcoeff, gmx_bool bGatherOnly,
3962 gmx_pme_pp_t pme_pp;
3965 rvec *x_pp=NULL,*f_pp=NULL;
3966 real *chargeA=NULL,*chargeB=NULL;
3968 int maxshift_x=0,maxshift_y=0;
3969 real energy,dvdlambda;
3974 gmx_large_int_t step,step_rel;
3977 pme_pp = gmx_pme_pp_init(cr);
3982 do /****** this is a quasi-loop over time steps! */
3984 /* Domain decomposition */
3985 natoms = gmx_pme_recv_q_x(pme_pp,
3986 &chargeA,&chargeB,box,&x_pp,&f_pp,
3987 &maxshift_x,&maxshift_y,
3993 /* We should stop: break out of the loop */
3997 step_rel = step - ir->init_step;
4000 wallcycle_start(wcycle,ewcRUN);
4002 wallcycle_start(wcycle,ewcPMEMESH);
4006 gmx_pme_do(pme,0,natoms,x_pp,f_pp,chargeA,chargeB,box,
4007 cr,maxshift_x,maxshift_y,nrnb,wcycle,vir,ewaldcoeff,
4008 &energy,lambda,&dvdlambda,
4009 GMX_PME_DO_ALL_F | (bEnerVir ? GMX_PME_CALC_ENER_VIR : 0));
4011 cycles = wallcycle_stop(wcycle,ewcPMEMESH);
4013 gmx_pme_send_force_vir_ener(pme_pp,
4014 f_pp,vir,energy,dvdlambda,
4019 if (step_rel == wcycle_get_reset_counters(wcycle))
4021 /* Reset all the counters related to performance over the run */
4022 reset_pmeonly_counters(cr,wcycle,nrnb,ir,step_rel);
4023 wcycle_set_reset_counters(wcycle, 0);
4026 } /***** end of quasi-loop, we stop with the break above */
4032 int gmx_pme_do(gmx_pme_t pme,
4033 int start, int homenr,
4035 real *chargeA, real *chargeB,
4036 matrix box, t_commrec *cr,
4037 int maxshift_x, int maxshift_y,
4038 t_nrnb *nrnb, gmx_wallcycle_t wcycle,
4039 matrix vir, real ewaldcoeff,
4040 real *energy, real lambda,
4041 real *dvdlambda, int flags)
4043 int q,d,i,j,ntot,npme;
4046 pme_atomcomm_t *atc=NULL;
4047 pmegrids_t *pmegrid=NULL;
4051 real *charge=NULL,*q_d;
4055 gmx_parallel_3dfft_t pfft_setup;
4057 t_complex * cfftgrid;
4059 const gmx_bool bCalcEnerVir = flags & GMX_PME_CALC_ENER_VIR;
4060 const gmx_bool bCalcF = flags & GMX_PME_CALC_F;
4062 assert(pme->nnodes > 0);
4063 assert(pme->nnodes == 1 || pme->ndecompdim > 0);
4065 if (pme->nnodes > 1) {
4068 if (atc->npd > atc->pd_nalloc) {
4069 atc->pd_nalloc = over_alloc_dd(atc->npd);
4070 srenew(atc->pd,atc->pd_nalloc);
4072 atc->maxshift = (atc->dimind==0 ? maxshift_x : maxshift_y);
4076 /* This could be necessary for TPI */
4077 pme->atc[0].n = homenr;
4080 for(q=0; q<(pme->bFEP ? 2 : 1); q++) {
4082 pmegrid = &pme->pmegridA;
4083 fftgrid = pme->fftgridA;
4084 cfftgrid = pme->cfftgridA;
4085 pfft_setup = pme->pfft_setupA;
4086 charge = chargeA+start;
4088 pmegrid = &pme->pmegridB;
4089 fftgrid = pme->fftgridB;
4090 cfftgrid = pme->cfftgridB;
4091 pfft_setup = pme->pfft_setupB;
4092 charge = chargeB+start;
4094 grid = pmegrid->grid.grid;
4095 /* Unpack structure */
4097 fprintf(debug,"PME: nnodes = %d, nodeid = %d\n",
4098 cr->nnodes,cr->nodeid);
4099 fprintf(debug,"Grid = %p\n",(void*)grid);
4101 gmx_fatal(FARGS,"No grid!");
4105 m_inv_ur0(box,pme->recipbox);
4107 if (pme->nnodes == 1) {
4109 if (DOMAINDECOMP(cr)) {
4111 pme_realloc_atomcomm_things(atc);
4117 wallcycle_start(wcycle,ewcPME_REDISTXF);
4118 for(d=pme->ndecompdim-1; d>=0; d--)
4120 if (d == pme->ndecompdim-1)
4128 n_d = pme->atc[d+1].n;
4134 if (atc->npd > atc->pd_nalloc) {
4135 atc->pd_nalloc = over_alloc_dd(atc->npd);
4136 srenew(atc->pd,atc->pd_nalloc);
4138 atc->maxshift = (atc->dimind==0 ? maxshift_x : maxshift_y);
4139 pme_calc_pidx_wrapper(n_d,pme->recipbox,x_d,atc);
4142 GMX_BARRIER(cr->mpi_comm_mygroup);
4143 /* Redistribute x (only once) and qA or qB */
4144 if (DOMAINDECOMP(cr)) {
4145 dd_pmeredist_x_q(pme, n_d, q==0, x_d, q_d, atc);
4147 pmeredist_pd(pme, TRUE, n_d, q==0, x_d, q_d, atc);
4152 wallcycle_stop(wcycle,ewcPME_REDISTXF);
4156 fprintf(debug,"Node= %6d, pme local particles=%6d\n",
4159 if (flags & GMX_PME_SPREAD_Q)
4161 wallcycle_start(wcycle,ewcPME_SPREADGATHER);
4163 /* Spread the charges on a grid */
4164 GMX_MPE_LOG(ev_spread_on_grid_start);
4166 /* Spread the charges on a grid */
4167 spread_on_grid(pme,&pme->atc[0],pmegrid,q==0,TRUE,fftgrid);
4168 GMX_MPE_LOG(ev_spread_on_grid_finish);
4172 inc_nrnb(nrnb,eNR_WEIGHTS,DIM*atc->n);
4174 inc_nrnb(nrnb,eNR_SPREADQBSP,
4175 pme->pme_order*pme->pme_order*pme->pme_order*atc->n);
4177 if (pme->nthread == 1)
4179 wrap_periodic_pmegrid(pme,grid);
4181 /* sum contributions to local grid from other nodes */
4183 if (pme->nnodes > 1)
4185 GMX_BARRIER(cr->mpi_comm_mygroup);
4186 gmx_sum_qgrid_dd(pme,grid,GMX_SUM_QGRID_FORWARD);
4191 copy_pmegrid_to_fftgrid(pme,grid,fftgrid);
4194 wallcycle_stop(wcycle,ewcPME_SPREADGATHER);
4197 dump_local_fftgrid(pme,fftgrid);
4202 /* Here we start a large thread parallel region */
4203 #pragma omp parallel for num_threads(pme->nthread) schedule(static)
4204 for(thread=0; thread<pme->nthread; thread++)
4206 if (flags & GMX_PME_SOLVE)
4213 GMX_BARRIER(cr->mpi_comm_mygroup);
4214 GMX_MPE_LOG(ev_gmxfft3d_start);
4215 wallcycle_start(wcycle,ewcPME_FFT);
4217 gmx_parallel_3dfft_execute(pfft_setup,GMX_FFT_REAL_TO_COMPLEX,
4218 fftgrid,cfftgrid,thread,wcycle);
4221 wallcycle_stop(wcycle,ewcPME_FFT);
4222 GMX_MPE_LOG(ev_gmxfft3d_finish);
4226 /* solve in k-space for our local cells */
4229 GMX_BARRIER(cr->mpi_comm_mygroup);
4230 GMX_MPE_LOG(ev_solve_pme_start);
4231 wallcycle_start(wcycle,ewcPME_SOLVE);
4234 solve_pme_yzx(pme,cfftgrid,ewaldcoeff,
4235 box[XX][XX]*box[YY][YY]*box[ZZ][ZZ],
4237 pme->nthread,thread);
4240 wallcycle_stop(wcycle,ewcPME_SOLVE);
4242 GMX_MPE_LOG(ev_solve_pme_finish);
4243 inc_nrnb(nrnb,eNR_SOLVEPME,loop_count);
4252 GMX_BARRIER(cr->mpi_comm_mygroup);
4253 GMX_MPE_LOG(ev_gmxfft3d_start);
4255 wallcycle_start(wcycle,ewcPME_FFT);
4257 gmx_parallel_3dfft_execute(pfft_setup,GMX_FFT_COMPLEX_TO_REAL,
4258 cfftgrid,fftgrid,thread,wcycle);
4261 wallcycle_stop(wcycle,ewcPME_FFT);
4264 GMX_MPE_LOG(ev_gmxfft3d_finish);
4266 if (pme->nodeid == 0)
4268 ntot = pme->nkx*pme->nky*pme->nkz;
4269 npme = ntot*log((real)ntot)/log(2.0);
4270 inc_nrnb(nrnb,eNR_FFT,2*npme);
4273 wallcycle_start(wcycle,ewcPME_SPREADGATHER);
4276 copy_fftgrid_to_pmegrid(pme,fftgrid,grid,pme->nthread,thread);
4279 /* End of thread parallel section.
4280 * With MPI we have to synchronize here before gmx_sum_qgrid_dd.
4285 /* distribute local grid to all nodes */
4287 if (pme->nnodes > 1) {
4288 GMX_BARRIER(cr->mpi_comm_mygroup);
4289 gmx_sum_qgrid_dd(pme,grid,GMX_SUM_QGRID_BACKWARD);
4294 unwrap_periodic_pmegrid(pme,grid);
4296 /* interpolate forces for our local atoms */
4297 GMX_BARRIER(cr->mpi_comm_mygroup);
4298 GMX_MPE_LOG(ev_gather_f_bsplines_start);
4302 /* If we are running without parallelization,
4303 * atc->f is the actual force array, not a buffer,
4304 * therefore we should not clear it.
4306 bClearF = (q == 0 && PAR(cr));
4307 #pragma omp parallel for num_threads(pme->nthread) schedule(static)
4308 for(thread=0; thread<pme->nthread; thread++)
4310 gather_f_bsplines(pme,grid,bClearF,atc,
4311 &atc->spline[thread],
4312 pme->bFEP ? (q==0 ? 1.0-lambda : lambda) : 1.0);
4317 GMX_MPE_LOG(ev_gather_f_bsplines_finish);
4319 inc_nrnb(nrnb,eNR_GATHERFBSP,
4320 pme->pme_order*pme->pme_order*pme->pme_order*pme->atc[0].n);
4321 wallcycle_stop(wcycle,ewcPME_SPREADGATHER);
4326 /* This should only be called on the master thread
4327 * and after the threads have synchronized.
4329 get_pme_ener_vir(pme,pme->nthread,&energy_AB[q],vir_AB[q]);
4333 if (bCalcF && pme->nnodes > 1) {
4334 wallcycle_start(wcycle,ewcPME_REDISTXF);
4335 for(d=0; d<pme->ndecompdim; d++)
4338 if (d == pme->ndecompdim - 1)
4345 n_d = pme->atc[d+1].n;
4346 f_d = pme->atc[d+1].f;
4348 GMX_BARRIER(cr->mpi_comm_mygroup);
4349 if (DOMAINDECOMP(cr)) {
4350 dd_pmeredist_f(pme,atc,n_d,f_d,
4351 d==pme->ndecompdim-1 && pme->bPPnode);
4353 pmeredist_pd(pme, FALSE, n_d, TRUE, f_d, NULL, atc);
4357 wallcycle_stop(wcycle,ewcPME_REDISTXF);
4364 *energy = energy_AB[0];
4365 m_add(vir,vir_AB[0],vir);
4367 *energy = (1.0-lambda)*energy_AB[0] + lambda*energy_AB[1];
4368 *dvdlambda += energy_AB[1] - energy_AB[0];
4369 for(i=0; i<DIM; i++)
4371 for(j=0; j<DIM; j++)
4373 vir[i][j] += (1.0-lambda)*vir_AB[0][i][j] +
4374 lambda*vir_AB[1][i][j];
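/* Minimal numeric sketch of the linear lambda mixing above: with
 * hypothetical energies E_A=-10, E_B=-4 and lambda=0.25, the mixed energy
 * is 0.75*(-10) + 0.25*(-4) = -8.5 and dV/dlambda = E_B - E_A = 6.
 */
#include <stdio.h>

int main(void)
{
    const double e_A = -10.0, e_B = -4.0, lambda = 0.25;

    printf("E = %g  dV/dl = %g\n",
           (1.0 - lambda)*e_A + lambda*e_B, e_B - e_A);
    return 0;
}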
4386 fprintf(debug,"PME mesh energy: %g\n",*energy);