/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 *
 * This source code is part of
 *
 * GROningen MAchine for Chemical Simulations
 *
 * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 * Copyright (c) 2001-2004, The GROMACS development team,
 * check out http://www.gromacs.org for more information.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * If you want to redistribute modifications, please consider that
 * scientific software is very special. Version control is crucial -
 * bugs must be traceable. We will be happy to consider code for
 * inclusion in the official distribution, but derived work must not
 * be called official GROMACS. Details are found in the README & COPYING
 * files - if they are missing, get the official version at www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the papers on the package - you can find them in the top README file.
 *
 * For more info, check our website at http://www.gromacs.org
 *
 * GROwing Monsters And Cloning Shrimps
 */
/* IMPORTANT FOR DEVELOPERS:
 *
 * Triclinic pme stuff isn't entirely trivial, and we've experienced
 * some bugs during development (many of them due to me). To avoid
 * this in the future, please check the following things if you make
 * changes in this file:
 *
 * 1. You should obtain identical (at least to the PME precision)
 *    energies, forces, and virial for
 *    a rectangular box and a triclinic one where the z (or y) axis is
 *    tilted a whole box side. For instance you could use these boxes:
 *
 *    rectangular       triclinic
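 *     2  0  0           2  0  0
 *     0  2  0           0  2  0
 *     0  0  6           2  2  6
 *
 *    (illustrative values: the triclinic box equals the rectangular one
 *     with the z axis tilted by a whole box side in both x and y)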
 *
 * 2. You should check the energy conservation in a triclinic box.
 *
 * It might seem like overkill, but better safe than sorry.
 */
#include <stdio.h>
#include <string.h>
#include <math.h>

#include "typedefs.h"
#include "smalloc.h"
#include "vec.h"
#include "gmxcomplex.h"
#include "gmx_fatal.h"
#include "gmx_wallcycle.h"
#include "gmx_parallel_3dfft.h"
#include "gmx_cyclecounter.h"
/* Single precision, with SSE2 or higher available */
#if defined(GMX_X86_SSE2) && !defined(GMX_DOUBLE)

#include "gmx_x86_sse2.h"
#include "gmx_math_x86_sse2_single.h"

#define PME_SSE
/* Some old AMD processors could have problems with unaligned loads+stores */
#define PME_SSE_UNALIGNED
#endif
/* #define PRT_FORCE */
/* conditions for on-the-fly time measurement */
/* #define TAKETIME (step > 1 && timesteps < 10) */
#define TAKETIME FALSE

/* #define PME_TIME_THREADS */

#ifdef GMX_DOUBLE
#define mpi_type MPI_DOUBLE
#else
#define mpi_type MPI_FLOAT
#endif

/* GMX_CACHE_SEP should be a multiple of 16 to preserve alignment */
#define GMX_CACHE_SEP 64

/* We only define a maximum to be able to use local arrays without allocation.
 * An order larger than 12 should never be needed, even for test cases.
 * If needed it can be changed here.
 */
#define PME_ORDER_MAX 12
/* Internal data structures */
typedef struct {
    int send_index0;
    int send_nindex;
    int recv_index0;
    int recv_nindex;
} pme_grid_comm_t;

typedef struct {
#ifdef GMX_MPI
    MPI_Comm mpi_comm;
#endif
    int   nnodes,nodeid;
    int  *s2g0,*s2g1;
    int   noverlap_nodes;
    int  *send_id,*recv_id;
    pme_grid_comm_t *comm_data;
    real *sendbuf,*recvbuf;
} pme_overlap_t;

typedef struct {
    int *n;      /* Cumulative counts of the number of particles per thread */
    int nalloc;  /* Allocation size of i */
    int *i;      /* Particle indices ordered on thread index (n) */
} thread_plist_t;

typedef struct {
    int       n;
    int      *ind;
    splinevec theta;
    splinevec dtheta;
} splinedata_t;

typedef struct {
#ifdef GMX_MPI
    MPI_Comm mpi_comm;
#endif
    int   dimind;           /* The index of the dimension, 0=x, 1=y */
    int   nslab,nodeid,maxshift;

    int  *node_dest;        /* The nodes to send x and q to with DD */
    int  *node_src;         /* The nodes to receive x and q from with DD */
    int  *buf_index;        /* Index for commnode into the buffers */

    int  *pd;               /* The PME task index for each particle */
    int  *count;            /* The number of atoms to send to each node */
    int **count_thread;
    int  *rcount;           /* The number of atoms to receive */

    int   n,nalloc;
    rvec *x,*f;
    real *q;
    gmx_bool bSpread;       /* These coordinates are used for spreading */
    int   pme_order;
    ivec *idx;
    rvec *fractx;           /* Fractional coordinate relative to the
                             * lower cell boundary
                             */
    int   nthread;
    int  *thread_idx;       /* Which thread should spread which charge */
    thread_plist_t *thread_plist;
    splinedata_t   *spline;
} pme_atomcomm_t;

typedef struct {
    ivec  ci;     /* The spatial location of this grid       */
    ivec  n;      /* The size of *grid, including order-1    */
    ivec  offset; /* The grid offset from the full node grid */
    int   order;  /* PME spreading order                     */
    real *grid;   /* The grid local thread, size n           */
} pmegrid_t;

typedef struct {
    pmegrid_t  grid;         /* The full node grid (non thread-local)            */
    int        nthread;      /* The number of threads operating on this grid     */
    ivec       nc;           /* The local spatial decomposition over the threads */
    pmegrid_t *grid_th;      /* Array of grids for each thread                   */
    int      **g2t;          /* The grid to thread index                         */
    ivec       nthread_comm; /* The number of threads to communicate with        */
} pmegrids_t;

typedef struct {
#ifdef PME_SSE
    /* Masks for SSE aligned spreading and gathering */
    __m128 mask_SSE0[6],mask_SSE1[6];
#else
    int dummy; /* C89 requires that struct has at least one member */
#endif
} pme_spline_work_t;

typedef struct {
    /* Work data for solve_pme */
    int    nalloc;
    real  *mhx,*mhy,*mhz,*m2;
    real  *denom,*tmp1,*eterm,*m2inv;
    real   energy;
    matrix vir;
} pme_work_t;
typedef struct gmx_pme {
    int  ndecompdim;        /* The number of decomposition dimensions */
    int  nodeid;            /* Our nodeid in mpi->mpi_comm */
    int  nodeid_major;
    int  nodeid_minor;
    int  nnodes;            /* The number of nodes doing PME */
    int  nnodes_major;
    int  nnodes_minor;
#ifdef GMX_MPI
    MPI_Comm mpi_comm;
    MPI_Comm mpi_comm_d[2]; /* Indexed on dimension, 0=x, 1=y */
    MPI_Datatype rvec_mpi;  /* the pme vector's MPI type */
#endif
    int  nthread;           /* The number of threads doing PME */

    gmx_bool bPPnode;       /* Node also does particle-particle forces */
    gmx_bool bFEP;          /* Compute Free energy contribution */
    real epsilon_r;
    int  nkx,nky,nkz;       /* Grid dimensions */
    gmx_bool bP3M;          /* Do P3M: optimize the influence function */
    int  pme_order;

    pmegrids_t pmegridA;    /* Grids on which we do spreading/interpolation, includes overlap */
    pmegrids_t pmegridB;

    /* The PME charge spreading grid sizes/strides, includes pme_order-1 */
    int  pmegrid_nx,pmegrid_ny,pmegrid_nz;
    /* pmegrid_nz might be larger than strictly necessary to ensure
     * memory alignment, pmegrid_nz_base gives the real base size.
     */
    int  pmegrid_nz_base;
    /* The local PME grid starting indices */
    int  pmegrid_start_ix,pmegrid_start_iy,pmegrid_start_iz;

    /* Work data for spreading and gathering */
    pme_spline_work_t *spline_work;

    real *fftgridA;         /* Grids for FFT. With 1D FFT decomposition this can be a pointer */
    real *fftgridB;         /* inside the interpolation grid, but separate for 2D PME decomp. */
    int   fftgrid_nx,fftgrid_ny,fftgrid_nz;

    t_complex *cfftgridA;   /* Grids for complex FFT data */
    t_complex *cfftgridB;
    int   cfftgrid_nx,cfftgrid_ny,cfftgrid_nz;

    gmx_parallel_3dfft_t pfft_setupA;
    gmx_parallel_3dfft_t pfft_setupB;

    int  *nnx,*nny,*nnz;
    real *fshx,*fshy,*fshz;

    pme_atomcomm_t atc[2];  /* Indexed on decomposition index */
    matrix recipbox;
    splinevec bsp_mod;

    pme_overlap_t overlap[2]; /* Indexed on dimension, 0=x, 1=y */

    pme_atomcomm_t atc_energy; /* Only for gmx_pme_calc_energy */

    rvec *bufv;             /* Communication buffer */
    real *bufr;             /* Communication buffer */
    int   buf_nalloc;       /* The communication buffer size */

    /* Thread local work data for solve_pme */
    pme_work_t *work;

    /* Work data for PME_redist */
    gmx_bool redist_init;
    int  *scounts,*rcounts,*sdispls,*rdispls,*sidx;
    int  *idxa;
    real *redist_buf;
    int   redist_buf_nalloc;

    /* Work data for sum_qgrid */
    real *sum_qgrid_tmp;
    real *sum_qgrid_dd_tmp;
} t_gmx_pme;
static void calc_interpolation_idx(gmx_pme_t pme,pme_atomcomm_t *atc,
                                   int start,int end,int thread)
{
    int  i;
    int  *idxptr,tix,tiy,tiz;
    real *xptr,*fptr,tx,ty,tz;
    real rxx,ryx,ryy,rzx,rzy,rzz;
    int  nx,ny,nz;
    int  start_ix,start_iy,start_iz;
    int  *g2tx,*g2ty,*g2tz;
    gmx_bool bThreads;
    int  *thread_idx=NULL;
    thread_plist_t *tpl=NULL;
    int  *tpl_n=NULL;
    int  thread_i;

    nx = pme->nkx;
    ny = pme->nky;
    nz = pme->nkz;

    start_ix = pme->pmegrid_start_ix;
    start_iy = pme->pmegrid_start_iy;
    start_iz = pme->pmegrid_start_iz;

    rxx = pme->recipbox[XX][XX];
    ryx = pme->recipbox[YY][XX];
    ryy = pme->recipbox[YY][YY];
    rzx = pme->recipbox[ZZ][XX];
    rzy = pme->recipbox[ZZ][YY];
    rzz = pme->recipbox[ZZ][ZZ];

    g2tx = pme->pmegridA.g2t[XX];
    g2ty = pme->pmegridA.g2t[YY];
    g2tz = pme->pmegridA.g2t[ZZ];

    bThreads = (atc->nthread > 1);
    if (bThreads)
    {
        thread_idx = atc->thread_idx;

        tpl   = &atc->thread_plist[thread];
        tpl_n = tpl->n;
        for(i=0; i<atc->nthread; i++)
        {
            tpl_n[i] = 0;
        }
    }

    for(i=start; i<end; i++) {
        xptr   = atc->x[i];
        idxptr = atc->idx[i];
        fptr   = atc->fractx[i];

        /* Fractional coordinates along box vectors, add 2.0 to make 100% sure we are positive for triclinic boxes */
        tx = nx * ( xptr[XX] * rxx + xptr[YY] * ryx + xptr[ZZ] * rzx + 2.0 );
        ty = ny * (                  xptr[YY] * ryy + xptr[ZZ] * rzy + 2.0 );
        tz = nz * (                                   xptr[ZZ] * rzz + 2.0 );
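        /* The +2.0 shift matters because the (int) truncation below
         * rounds toward zero: fractional coordinates can be slightly
         * negative for triclinic boxes, and truncating a negative value
         * would round the wrong way. After pbc wrapping they are > -2,
         * so adding 2 guarantees a positive argument.
         */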
        tix = (int)(tx);
        tiy = (int)(ty);
        tiz = (int)(tz);

        /* Because decomposition only occurs in x and y,
         * we never have a fraction correction in z.
         */
        fptr[XX] = tx - tix + pme->fshx[tix];
        fptr[YY] = ty - tiy + pme->fshy[tiy];
        fptr[ZZ] = tz - tiz;

        idxptr[XX] = pme->nnx[tix];
        idxptr[YY] = pme->nny[tiy];
        idxptr[ZZ] = pme->nnz[tiz];

#ifdef DEBUG
        range_check(idxptr[XX],0,pme->pmegrid_nx);
        range_check(idxptr[YY],0,pme->pmegrid_ny);
        range_check(idxptr[ZZ],0,pme->pmegrid_nz);
#endif

        if (bThreads)
        {
            thread_i      = g2tx[idxptr[XX]] + g2ty[idxptr[YY]] + g2tz[idxptr[ZZ]];
            thread_idx[i] = thread_i;
            tpl_n[thread_i]++;
        }
    }
    if (bThreads)
    {
        /* Make a list of particle indices sorted on thread */

        /* Get the cumulative count */
        for(i=1; i<atc->nthread; i++)
        {
            tpl_n[i] += tpl_n[i-1];
        }
        /* The current implementation distributes particles equally
         * over the threads, so we could actually allocate for that
         * in pme_realloc_atomcomm_things.
         */
        if (tpl_n[atc->nthread-1] > tpl->nalloc)
        {
            tpl->nalloc = over_alloc_large(tpl_n[atc->nthread-1]);
            srenew(tpl->i,tpl->nalloc);
        }
        /* Set tpl_n to the cumulative start */
        for(i=atc->nthread-1; i>=1; i--)
        {
            tpl_n[i] = tpl_n[i-1];
        }
        tpl_n[0] = 0;

        /* Fill our thread local array with indices sorted on thread */
        for(i=start; i<end; i++)
        {
            tpl->i[tpl_n[atc->thread_idx[i]]++] = i;
        }
        /* Now tpl_n contains the cumulative count again */
    }
}
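/* The passes above (histogram, cumulative sum, shift, stable scatter)
 * form a small counting sort. A minimal standalone sketch of the same
 * pattern (illustrative only, not used by this file; key[] plays the
 * role of thread_idx):
 */
static void counting_sort_by_key(int n,const int key[],int nkey,
                                 int count[],int out[])
{
    int i;

    for(i=0; i<nkey; i++)    { count[i] = 0; }             /* reset           */
    for(i=0; i<n; i++)       { count[key[i]]++; }          /* histogram       */
    for(i=1; i<nkey; i++)    { count[i] += count[i-1]; }   /* cumulative ends */
    for(i=nkey-1; i>=1; i--) { count[i] = count[i-1]; }    /* shift to starts */
    count[0] = 0;
    for(i=0; i<n; i++)       { out[count[key[i]]++] = i; } /* stable scatter  */
}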
static void make_thread_local_ind(pme_atomcomm_t *atc,
                                  int thread,splinedata_t *spline)
{
    int n,t,i,start,end;
    thread_plist_t *tpl;

    /* Combine the indices made by each thread into one index */

    n = 0;
    start = 0;
    for(t=0; t<atc->nthread; t++)
    {
        tpl = &atc->thread_plist[t];
        /* Copy our part (start - end) from the list of thread t */
        if (thread > 0)
        {
            start = tpl->n[thread-1];
        }
        end = tpl->n[thread];
        for(i=start; i<end; i++)
        {
            spline->ind[n++] = tpl->i[i];
        }
    }

    spline->n = n;
}
static void pme_calc_pidx(int start, int end,
                          matrix recipbox, rvec x[],
                          pme_atomcomm_t *atc, int *count)
{
    int   nslab,i;
    int   si;
    real *xptr,s;
    real  rxx,ryx,rzx,ryy,rzy;
    int  *pd;

    /* Calculate PME task index (pidx) for each grid index.
     * Here we always assign equally sized slabs to each node
     * for load balancing reasons (the PME grid spacing is not used).
     */

    nslab = atc->nslab;
    pd    = atc->pd;

    /* Reset the count */
    for(i=0; i<nslab; i++)
    {
        count[i] = 0;
    }

    if (atc->dimind == 0)
    {
        rxx = recipbox[XX][XX];
        ryx = recipbox[YY][XX];
        rzx = recipbox[ZZ][XX];
        /* Calculate the node index in x-dimension */
        for(i=start; i<end; i++)
        {
            xptr = x[i];
            /* Fractional coordinates along box vectors */
            s = nslab*(xptr[XX]*rxx + xptr[YY]*ryx + xptr[ZZ]*rzx);
            si = (int)(s + 2*nslab) % nslab;
            pd[i] = si;
            count[si]++;
        }
    }
    else
    {
        ryy = recipbox[YY][YY];
        rzy = recipbox[ZZ][YY];
        /* Calculate the node index in y-dimension */
        for(i=start; i<end; i++)
        {
            xptr = x[i];
            /* Fractional coordinates along box vectors */
            s = nslab*(xptr[YY]*ryy + xptr[ZZ]*rzy);
            si = (int)(s + 2*nslab) % nslab;
            pd[i] = si;
            count[si]++;
        }
    }
}
static void pme_calc_pidx_wrapper(int natoms, matrix recipbox, rvec x[],
                                  pme_atomcomm_t *atc)
{
    int nthread,thread,slab;

    nthread = atc->nthread;

#pragma omp parallel for num_threads(nthread) schedule(static)
    for(thread=0; thread<nthread; thread++)
    {
        pme_calc_pidx(natoms* thread   /nthread,
                      natoms*(thread+1)/nthread,
                      recipbox,x,atc,atc->count_thread[thread]);
    }
    /* Non-parallel reduction, since nslab is small */

    for(thread=1; thread<nthread; thread++)
    {
        for(slab=0; slab<atc->nslab; slab++)
        {
            atc->count_thread[0][slab] += atc->count_thread[thread][slab];
        }
    }
}
static void pme_realloc_splinedata(splinedata_t *spline, pme_atomcomm_t *atc)
{
    int i,d;

    srenew(spline->ind,atc->nalloc);
    /* Initialize the index to identity so it works without threads */
    for(i=0; i<atc->nalloc; i++)
    {
        spline->ind[i] = i;
    }

    for(d=0; d<DIM; d++)
    {
        srenew(spline->theta[d] ,atc->pme_order*atc->nalloc);
        srenew(spline->dtheta[d],atc->pme_order*atc->nalloc);
    }
}
static void pme_realloc_atomcomm_things(pme_atomcomm_t *atc)
{
    int nalloc_old,i;

    /* We have to avoid a NULL pointer for atc->x to avoid
     * possible fatal errors in MPI routines.
     */
    if (atc->n > atc->nalloc || atc->nalloc == 0)
    {
        nalloc_old  = atc->nalloc;
        atc->nalloc = over_alloc_dd(max(atc->n,1));

        if (atc->nslab > 1) {
            srenew(atc->x,atc->nalloc);
            srenew(atc->q,atc->nalloc);
            srenew(atc->f,atc->nalloc);
            for(i=nalloc_old; i<atc->nalloc; i++)
            {
                clear_rvec(atc->f[i]);
            }
        }
        if (atc->bSpread) {
            srenew(atc->fractx,atc->nalloc);
            srenew(atc->idx   ,atc->nalloc);

            if (atc->nthread > 1)
            {
                srenew(atc->thread_idx,atc->nalloc);
            }

            for(i=0; i<atc->nthread; i++)
            {
                pme_realloc_splinedata(&atc->spline[i],atc);
            }
        }
    }
}
static void pmeredist_pd(gmx_pme_t pme, gmx_bool forw,
                         int n, gmx_bool bXF, rvec *x_f, real *charge,
                         pme_atomcomm_t *atc)
/* Redistribute particle data for PME calculation */
/* domain decomposition by x coordinate           */
{
    int i, ii;

    if(FALSE == pme->redist_init) {
        snew(pme->scounts,atc->nslab);
        snew(pme->rcounts,atc->nslab);
        snew(pme->sdispls,atc->nslab);
        snew(pme->rdispls,atc->nslab);
        snew(pme->sidx,atc->nslab);
        pme->redist_init = TRUE;
    }
    if (n > pme->redist_buf_nalloc) {
        pme->redist_buf_nalloc = over_alloc_dd(n);
        srenew(pme->redist_buf,pme->redist_buf_nalloc*DIM);
    }

    pme->idxa = atc->pd;

#ifdef GMX_MPI
    if (forw && bXF) {
        /* forward, redistribution from pp to pme */

        /* Calculate send counts and exchange them with other nodes */
        for(i=0; (i<atc->nslab); i++) pme->scounts[i]=0;
        for(i=0; (i<n); i++) pme->scounts[pme->idxa[i]]++;
        MPI_Alltoall( pme->scounts, 1, MPI_INT, pme->rcounts, 1, MPI_INT, atc->mpi_comm);

        /* Calculate send and receive displacements and index into send
           buffer */
        pme->sdispls[0]=0;
        pme->rdispls[0]=0;
        pme->sidx[0]=0;
        for(i=1; i<atc->nslab; i++) {
            pme->sdispls[i]=pme->sdispls[i-1]+pme->scounts[i-1];
            pme->rdispls[i]=pme->rdispls[i-1]+pme->rcounts[i-1];
            pme->sidx[i]=pme->sdispls[i];
        }
        /* Total # of particles to be received */
        atc->n = pme->rdispls[atc->nslab-1] + pme->rcounts[atc->nslab-1];

        pme_realloc_atomcomm_things(atc);

        /* Copy particle coordinates into send buffer and exchange */
        for(i=0; (i<n); i++) {
            ii=DIM*pme->sidx[pme->idxa[i]];
            pme->sidx[pme->idxa[i]]++;
            pme->redist_buf[ii+XX]=x_f[i][XX];
            pme->redist_buf[ii+YY]=x_f[i][YY];
            pme->redist_buf[ii+ZZ]=x_f[i][ZZ];
        }
        MPI_Alltoallv(pme->redist_buf, pme->scounts, pme->sdispls,
                      pme->rvec_mpi, atc->x, pme->rcounts, pme->rdispls,
                      pme->rvec_mpi, atc->mpi_comm);
    }
    if (forw) {
        /* Copy charge into send buffer and exchange */
        for(i=0; i<atc->nslab; i++) pme->sidx[i]=pme->sdispls[i];
        for(i=0; (i<n); i++) {
            ii=pme->sidx[pme->idxa[i]];
            pme->sidx[pme->idxa[i]]++;
            pme->redist_buf[ii]=charge[i];
        }
        MPI_Alltoallv(pme->redist_buf, pme->scounts, pme->sdispls, mpi_type,
                      atc->q, pme->rcounts, pme->rdispls, mpi_type,
                      atc->mpi_comm);
    }
    else { /* backward, redistribution from pme to pp */
        MPI_Alltoallv(atc->f, pme->rcounts, pme->rdispls, pme->rvec_mpi,
                      pme->redist_buf, pme->scounts, pme->sdispls,
                      pme->rvec_mpi, atc->mpi_comm);

        /* Copy data from receive buffer */
        for(i=0; i<atc->nslab; i++)
            pme->sidx[i] = pme->sdispls[i];
        for(i=0; (i<n); i++) {
            ii = DIM*pme->sidx[pme->idxa[i]];
            x_f[i][XX] += pme->redist_buf[ii+XX];
            x_f[i][YY] += pme->redist_buf[ii+YY];
            x_f[i][ZZ] += pme->redist_buf[ii+ZZ];
            pme->sidx[pme->idxa[i]]++;
        }
    }
#endif
}
static void pme_dd_sendrecv(pme_atomcomm_t *atc,
                            gmx_bool bBackward,int shift,
                            void *buf_s,int nbyte_s,
                            void *buf_r,int nbyte_r)
{
#ifdef GMX_MPI
    int dest,src;
    MPI_Status stat;

    if (bBackward == FALSE) {
        dest = atc->node_dest[shift];
        src  = atc->node_src[shift];
    } else {
        dest = atc->node_src[shift];
        src  = atc->node_dest[shift];
    }

    if (nbyte_s > 0 && nbyte_r > 0) {
        MPI_Sendrecv(buf_s,nbyte_s,MPI_BYTE,
                     dest,shift,
                     buf_r,nbyte_r,MPI_BYTE,
                     src,shift,
                     atc->mpi_comm,&stat);
    } else if (nbyte_s > 0) {
        MPI_Send(buf_s,nbyte_s,MPI_BYTE,
                 dest,shift,
                 atc->mpi_comm);
    } else if (nbyte_r > 0) {
        MPI_Recv(buf_r,nbyte_r,MPI_BYTE,
                 src,shift,
                 atc->mpi_comm,&stat);
    }
#endif
}
static void dd_pmeredist_x_q(gmx_pme_t pme,
                             int n, gmx_bool bX, rvec *x, real *charge,
                             pme_atomcomm_t *atc)
{
    int *commnode,*buf_index;
    int nnodes_comm,i,nsend,local_pos,buf_pos,node,scount,rcount;

    commnode  = atc->node_dest;
    buf_index = atc->buf_index;

    nnodes_comm = min(2*atc->maxshift,atc->nslab-1);

    nsend = 0;
    for(i=0; i<nnodes_comm; i++) {
        buf_index[commnode[i]] = nsend;
        nsend += atc->count[commnode[i]];
    }
    if (bX) {
        if (atc->count[atc->nodeid] + nsend != n)
            gmx_fatal(FARGS,"%d particles communicated to PME node %d are more than 2/3 times the cut-off out of the domain decomposition cell of their charge group in dimension %c.\n"
                      "This usually means that your system is not well equilibrated.",
                      n - (atc->count[atc->nodeid] + nsend),
                      pme->nodeid,'x'+atc->dimind);

        if (nsend > pme->buf_nalloc) {
            pme->buf_nalloc = over_alloc_dd(nsend);
            srenew(pme->bufv,pme->buf_nalloc);
            srenew(pme->bufr,pme->buf_nalloc);
        }

        atc->n = atc->count[atc->nodeid];
        for(i=0; i<nnodes_comm; i++) {
            scount = atc->count[commnode[i]];
            /* Communicate the count */
            if (debug)
                fprintf(debug,"dimind %d PME node %d send to node %d: %d\n",
                        atc->dimind,atc->nodeid,commnode[i],scount);
            pme_dd_sendrecv(atc,FALSE,i,
                            &scount,sizeof(int),
                            &atc->rcount[i],sizeof(int));
            atc->n += atc->rcount[i];
        }

        pme_realloc_atomcomm_things(atc);
    }

    local_pos = 0;
    for(i=0; i<n; i++) {
        node = atc->pd[i];
        if (node == atc->nodeid) {
            /* Copy direct to the receive buffer */
            if (bX) {
                copy_rvec(x[i],atc->x[local_pos]);
            }
            atc->q[local_pos] = charge[i];
            local_pos++;
        } else {
            /* Copy to the send buffer */
            if (bX) {
                copy_rvec(x[i],pme->bufv[buf_index[node]]);
            }
            pme->bufr[buf_index[node]] = charge[i];
            buf_index[node]++;
        }
    }

    buf_pos = 0;
    for(i=0; i<nnodes_comm; i++) {
        scount = atc->count[commnode[i]];
        rcount = atc->rcount[i];
        if (scount > 0 || rcount > 0) {
            if (bX) {
                /* Communicate the coordinates */
                pme_dd_sendrecv(atc,FALSE,i,
                                pme->bufv[buf_pos],scount*sizeof(rvec),
                                atc->x[local_pos],rcount*sizeof(rvec));
            }
            /* Communicate the charges */
            pme_dd_sendrecv(atc,FALSE,i,
                            pme->bufr+buf_pos,scount*sizeof(real),
                            atc->q+local_pos,rcount*sizeof(real));
            buf_pos   += scount;
            local_pos += atc->rcount[i];
        }
    }
}
static void dd_pmeredist_f(gmx_pme_t pme, pme_atomcomm_t *atc,
                           int n, rvec *f,
                           gmx_bool bAddF)
{
    int *commnode,*buf_index;
    int nnodes_comm,local_pos,buf_pos,i,scount,rcount,node;

    commnode  = atc->node_dest;
    buf_index = atc->buf_index;

    nnodes_comm = min(2*atc->maxshift,atc->nslab-1);

    local_pos = atc->count[atc->nodeid];
    buf_pos = 0;
    for(i=0; i<nnodes_comm; i++) {
        scount = atc->rcount[i];
        rcount = atc->count[commnode[i]];
        if (scount > 0 || rcount > 0) {
            /* Communicate the forces */
            pme_dd_sendrecv(atc,TRUE,i,
                            atc->f[local_pos],scount*sizeof(rvec),
                            pme->bufv[buf_pos],rcount*sizeof(rvec));
            local_pos += scount;
        }
        buf_index[commnode[i]] = buf_pos;
        buf_pos += rcount;
    }

    local_pos = 0;
    if (bAddF) {
        for(i=0; i<n; i++) {
            node = atc->pd[i];
            if (node == atc->nodeid) {
                /* Add from the local force array */
                rvec_inc(f[i],atc->f[local_pos]);
                local_pos++;
            } else {
                /* Add from the receive buffer */
                rvec_inc(f[i],pme->bufv[buf_index[node]]);
                buf_index[node]++;
            }
        }
    } else {
        for(i=0; i<n; i++) {
            node = atc->pd[i];
            if (node == atc->nodeid) {
                /* Copy from the local force array */
                copy_rvec(atc->f[local_pos],f[i]);
                local_pos++;
            } else {
                /* Copy from the receive buffer */
                copy_rvec(pme->bufv[buf_index[node]],f[i]);
                buf_index[node]++;
            }
        }
    }
}
static void
gmx_sum_qgrid_dd(gmx_pme_t pme, real *grid, int direction)
{
#ifdef GMX_MPI
    pme_overlap_t *overlap;
    int send_index0,send_nindex;
    int recv_index0,recv_nindex;
    MPI_Status stat;
    int i,j,k,ix,iy,iz,icnt;
    int ipulse,send_id,recv_id,datasize;
    real *p;
    real *sendptr,*recvptr;

    /* Start with minor-rank communication. This is a bit of a pain since it is not contiguous */
    overlap = &pme->overlap[1];

    for(ipulse=0;ipulse<overlap->noverlap_nodes;ipulse++)
    {
        /* Since we have already (un)wrapped the overlap in the z-dimension,
         * we only have to communicate 0 to nkz (not pmegrid_nz).
         */
        if (direction==GMX_SUM_QGRID_FORWARD)
        {
            send_id = overlap->send_id[ipulse];
            recv_id = overlap->recv_id[ipulse];
            send_index0 = overlap->comm_data[ipulse].send_index0;
            send_nindex = overlap->comm_data[ipulse].send_nindex;
            recv_index0 = overlap->comm_data[ipulse].recv_index0;
            recv_nindex = overlap->comm_data[ipulse].recv_nindex;
        }
        else
        {
            send_id = overlap->recv_id[ipulse];
            recv_id = overlap->send_id[ipulse];
            send_index0 = overlap->comm_data[ipulse].recv_index0;
            send_nindex = overlap->comm_data[ipulse].recv_nindex;
            recv_index0 = overlap->comm_data[ipulse].send_index0;
            recv_nindex = overlap->comm_data[ipulse].send_nindex;
        }

        /* Copy data to contiguous send buffer */
        if (debug)
        {
            fprintf(debug,"PME send node %d %d -> %d grid start %d Communicating %d to %d\n",
                    pme->nodeid,overlap->nodeid,send_id,
                    pme->pmegrid_start_iy,
                    send_index0-pme->pmegrid_start_iy,
                    send_index0-pme->pmegrid_start_iy+send_nindex);
        }
        icnt = 0;
        for(i=0;i<pme->pmegrid_nx;i++)
        {
            ix = i;
            for(j=0;j<send_nindex;j++)
            {
                iy = j + send_index0 - pme->pmegrid_start_iy;
                for(k=0;k<pme->nkz;k++)
                {
                    iz = k;
                    overlap->sendbuf[icnt++] = grid[ix*(pme->pmegrid_ny*pme->pmegrid_nz)+iy*(pme->pmegrid_nz)+iz];
                }
            }
        }

        datasize = pme->pmegrid_nx * pme->nkz;

        MPI_Sendrecv(overlap->sendbuf,send_nindex*datasize,GMX_MPI_REAL,
                     send_id,ipulse,
                     overlap->recvbuf,recv_nindex*datasize,GMX_MPI_REAL,
                     recv_id,ipulse,
                     overlap->mpi_comm,&stat);

        /* Get data from contiguous recv buffer */
        if (debug)
        {
            fprintf(debug,"PME recv node %d %d <- %d grid start %d Communicating %d to %d\n",
                    pme->nodeid,overlap->nodeid,recv_id,
                    pme->pmegrid_start_iy,
                    recv_index0-pme->pmegrid_start_iy,
                    recv_index0-pme->pmegrid_start_iy+recv_nindex);
        }
        icnt = 0;
        for(i=0;i<pme->pmegrid_nx;i++)
        {
            ix = i;
            for(j=0;j<recv_nindex;j++)
            {
                iy = j + recv_index0 - pme->pmegrid_start_iy;
                for(k=0;k<pme->nkz;k++)
                {
                    iz = k;
                    if(direction==GMX_SUM_QGRID_FORWARD)
                    {
                        grid[ix*(pme->pmegrid_ny*pme->pmegrid_nz)+iy*(pme->pmegrid_nz)+iz] += overlap->recvbuf[icnt++];
                    }
                    else
                    {
                        grid[ix*(pme->pmegrid_ny*pme->pmegrid_nz)+iy*(pme->pmegrid_nz)+iz]  = overlap->recvbuf[icnt++];
                    }
                }
            }
        }
    }

    /* Major dimension is easier, no copying required,
     * but we might have to sum to separate array.
     * Since we don't copy, we have to communicate up to pmegrid_nz,
     * not nkz as for the minor direction.
     */
    overlap = &pme->overlap[0];

    for(ipulse=0;ipulse<overlap->noverlap_nodes;ipulse++)
    {
        if(direction==GMX_SUM_QGRID_FORWARD)
        {
            send_id = overlap->send_id[ipulse];
            recv_id = overlap->recv_id[ipulse];
            send_index0 = overlap->comm_data[ipulse].send_index0;
            send_nindex = overlap->comm_data[ipulse].send_nindex;
            recv_index0 = overlap->comm_data[ipulse].recv_index0;
            recv_nindex = overlap->comm_data[ipulse].recv_nindex;
            recvptr = overlap->recvbuf;
        }
        else
        {
            send_id = overlap->recv_id[ipulse];
            recv_id = overlap->send_id[ipulse];
            send_index0 = overlap->comm_data[ipulse].recv_index0;
            send_nindex = overlap->comm_data[ipulse].recv_nindex;
            recv_index0 = overlap->comm_data[ipulse].send_index0;
            recv_nindex = overlap->comm_data[ipulse].send_nindex;
            recvptr = grid + (recv_index0-pme->pmegrid_start_ix)*(pme->pmegrid_ny*pme->pmegrid_nz);
        }

        sendptr  = grid + (send_index0-pme->pmegrid_start_ix)*(pme->pmegrid_ny*pme->pmegrid_nz);
        datasize = pme->pmegrid_ny * pme->pmegrid_nz;

        if (debug)
        {
            fprintf(debug,"PME send node %d %d -> %d grid start %d Communicating %d to %d\n",
                    pme->nodeid,overlap->nodeid,send_id,
                    pme->pmegrid_start_ix,
                    send_index0-pme->pmegrid_start_ix,
                    send_index0-pme->pmegrid_start_ix+send_nindex);
            fprintf(debug,"PME recv node %d %d <- %d grid start %d Communicating %d to %d\n",
                    pme->nodeid,overlap->nodeid,recv_id,
                    pme->pmegrid_start_ix,
                    recv_index0-pme->pmegrid_start_ix,
                    recv_index0-pme->pmegrid_start_ix+recv_nindex);
        }

        MPI_Sendrecv(sendptr,send_nindex*datasize,GMX_MPI_REAL,
                     send_id,ipulse,
                     recvptr,recv_nindex*datasize,GMX_MPI_REAL,
                     recv_id,ipulse,
                     overlap->mpi_comm,&stat);

        /* ADD data from contiguous recv buffer */
        if(direction==GMX_SUM_QGRID_FORWARD)
        {
            p = grid + (recv_index0-pme->pmegrid_start_ix)*(pme->pmegrid_ny*pme->pmegrid_nz);
            for(i=0;i<recv_nindex*datasize;i++)
            {
                p[i] += overlap->recvbuf[i];
            }
        }
    }
#endif
}
static void
copy_pmegrid_to_fftgrid(gmx_pme_t pme, real *pmegrid, real *fftgrid)
{
    ivec local_fft_ndata,local_fft_offset,local_fft_size;
    ivec local_pme_size;
    int ix,iy,iz;
    int pmeidx,fftidx;
#ifdef DEBUG_PME
    FILE *fp,*fp2;
    char fn[STRLEN],format[STRLEN];
    real val;
#endif

    /* Dimensions should be identical for A/B grid, so we just use A here */
    gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
                                   local_fft_ndata,
                                   local_fft_offset,
                                   local_fft_size);

    local_pme_size[0] = pme->pmegrid_nx;
    local_pme_size[1] = pme->pmegrid_ny;
    local_pme_size[2] = pme->pmegrid_nz;

#ifdef DEBUG_PME
    sprintf(fn,"pmegrid%d.pdb",pme->nodeid);
    fp = ffopen(fn,"w");
    sprintf(fn,"pmegrid%d.txt",pme->nodeid);
    fp2 = ffopen(fn,"w");
    sprintf(format,"%s%s\n",pdbformat,"%6.2f%6.2f");
#endif

    /* The fftgrid is always 'justified' to the lower-left corner of the PME grid,
       the offset is identical, and the PME grid always has more data (due to overlap)
     */
    for(ix=0;ix<local_fft_ndata[XX];ix++)
    {
        for(iy=0;iy<local_fft_ndata[YY];iy++)
        {
            for(iz=0;iz<local_fft_ndata[ZZ];iz++)
            {
                pmeidx = ix*(local_pme_size[YY]*local_pme_size[ZZ])+iy*(local_pme_size[ZZ])+iz;
                fftidx = ix*(local_fft_size[YY]*local_fft_size[ZZ])+iy*(local_fft_size[ZZ])+iz;
                fftgrid[fftidx] = pmegrid[pmeidx];
#ifdef DEBUG_PME
                val = 100*pmegrid[pmeidx];
                if (pmegrid[pmeidx] != 0)
                {
                    fprintf(fp,format,"ATOM",pmeidx,"CA","GLY",' ',pmeidx,' ',
                            5.0*ix,5.0*iy,5.0*iz,1.0,val);
                    fprintf(fp2,"%-12s  %5d  %5d  %5d  %12.5e\n",
                            "qgrid",
                            pme->pmegrid_start_ix + ix,
                            pme->pmegrid_start_iy + iy,
                            pme->pmegrid_start_iz + iz,
                            pmegrid[pmeidx]);
                }
#endif
            }
        }
    }
#ifdef DEBUG_PME
    ffclose(fp);
    ffclose(fp2);
#endif
}
#ifdef PME_TIME_THREADS
static gmx_cycles_t omp_cyc_start()
{
    return gmx_cycles_read();
}

static gmx_cycles_t omp_cyc_end(gmx_cycles_t c)
{
    return gmx_cycles_read() - c;
}
#endif
static void
copy_fftgrid_to_pmegrid(gmx_pme_t pme, const real *fftgrid, real *pmegrid,
                        int nthread,int thread)
{
    ivec local_fft_ndata,local_fft_offset,local_fft_size;
    ivec local_pme_size;
    int  ixy0,ixy1,ixy,ix,iy,iz;
    int  pmeidx,fftidx;
#ifdef PME_TIME_THREADS
    gmx_cycles_t c1;
    static double cs1=0;
#endif

#ifdef PME_TIME_THREADS
    c1 = omp_cyc_start();
#endif
    /* Dimensions should be identical for A/B grid, so we just use A here */
    gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
                                   local_fft_ndata,
                                   local_fft_offset,
                                   local_fft_size);

    local_pme_size[0] = pme->pmegrid_nx;
    local_pme_size[1] = pme->pmegrid_ny;
    local_pme_size[2] = pme->pmegrid_nz;

    /* The fftgrid is always 'justified' to the lower-left corner of the PME grid,
       the offset is identical, and the PME grid always has more data (due to overlap)
     */
    ixy0 = ((thread  )*local_fft_ndata[XX]*local_fft_ndata[YY])/nthread;
    ixy1 = ((thread+1)*local_fft_ndata[XX]*local_fft_ndata[YY])/nthread;

    for(ixy=ixy0;ixy<ixy1;ixy++)
    {
        ix = ixy/local_fft_ndata[YY];
        iy = ixy - ix*local_fft_ndata[YY];

        pmeidx = (ix*local_pme_size[YY] + iy)*local_pme_size[ZZ];
        fftidx = (ix*local_fft_size[YY] + iy)*local_fft_size[ZZ];
        for(iz=0;iz<local_fft_ndata[ZZ];iz++)
        {
            pmegrid[pmeidx+iz] = fftgrid[fftidx+iz];
        }
    }

#ifdef PME_TIME_THREADS
    c1 = omp_cyc_end(c1);
    cs1 += (double)c1;
    printf("copy %.2f\n",cs1*1e-9);
#endif
}
static void
wrap_periodic_pmegrid(gmx_pme_t pme, real *pmegrid)
{
    int nx,ny,nz,pnx,pny,pnz,ny_x,overlap,ix,iy,iz;

    nx = pme->nkx;
    ny = pme->nky;
    nz = pme->nkz;

    pnx = pme->pmegrid_nx;
    pny = pme->pmegrid_ny;
    pnz = pme->pmegrid_nz;

    overlap = pme->pme_order - 1;
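    /* Example: pme_order=4 gives overlap=3, so grid lines nz..nz+2
     * (and similarly in y and x) hold contributions that are folded
     * back onto lines 0..2 below.
     */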
    /* Add periodic overlap in z */
    for(ix=0; ix<pme->pmegrid_nx; ix++)
    {
        for(iy=0; iy<pme->pmegrid_ny; iy++)
        {
            for(iz=0; iz<overlap; iz++)
            {
                pmegrid[(ix*pny+iy)*pnz+iz] +=
                    pmegrid[(ix*pny+iy)*pnz+nz+iz];
            }
        }
    }

    if (pme->nnodes_minor == 1)
    {
        for(ix=0; ix<pme->pmegrid_nx; ix++)
        {
            for(iy=0; iy<overlap; iy++)
            {
                for(iz=0; iz<nz; iz++)
                {
                    pmegrid[(ix*pny+iy)*pnz+iz] +=
                        pmegrid[(ix*pny+ny+iy)*pnz+iz];
                }
            }
        }
    }

    if (pme->nnodes_major == 1)
    {
        ny_x = (pme->nnodes_minor == 1 ? ny : pme->pmegrid_ny);

        for(ix=0; ix<overlap; ix++)
        {
            for(iy=0; iy<ny_x; iy++)
            {
                for(iz=0; iz<nz; iz++)
                {
                    pmegrid[(ix*pny+iy)*pnz+iz] +=
                        pmegrid[((nx+ix)*pny+iy)*pnz+iz];
                }
            }
        }
    }
}
static void
unwrap_periodic_pmegrid(gmx_pme_t pme, real *pmegrid)
{
    int nx,ny,nz,pnx,pny,pnz,ny_x,overlap,ix;

    nx = pme->nkx;
    ny = pme->nky;
    nz = pme->nkz;

    pnx = pme->pmegrid_nx;
    pny = pme->pmegrid_ny;
    pnz = pme->pmegrid_nz;

    overlap = pme->pme_order - 1;

    if (pme->nnodes_major == 1)
    {
        ny_x = (pme->nnodes_minor == 1 ? ny : pme->pmegrid_ny);

        for(ix=0; ix<overlap; ix++)
        {
            int iy,iz;

            for(iy=0; iy<ny_x; iy++)
            {
                for(iz=0; iz<nz; iz++)
                {
                    pmegrid[((nx+ix)*pny+iy)*pnz+iz] =
                        pmegrid[(ix*pny+iy)*pnz+iz];
                }
            }
        }
    }

    if (pme->nnodes_minor == 1)
    {
#pragma omp parallel for num_threads(pme->nthread) schedule(static)
        for(ix=0; ix<pme->pmegrid_nx; ix++)
        {
            int iy,iz;

            for(iy=0; iy<overlap; iy++)
            {
                for(iz=0; iz<nz; iz++)
                {
                    pmegrid[(ix*pny+ny+iy)*pnz+iz] =
                        pmegrid[(ix*pny+iy)*pnz+iz];
                }
            }
        }
    }

    /* Copy periodic overlap in z */
#pragma omp parallel for num_threads(pme->nthread) schedule(static)
    for(ix=0; ix<pme->pmegrid_nx; ix++)
    {
        int iy,iz;

        for(iy=0; iy<pme->pmegrid_ny; iy++)
        {
            for(iz=0; iz<overlap; iz++)
            {
                pmegrid[(ix*pny+iy)*pnz+nz+iz] =
                    pmegrid[(ix*pny+iy)*pnz+iz];
            }
        }
    }
}
static void clear_grid(int nx,int ny,int nz,real *grid,
                       ivec fs,int *flag,
                       int fx,int fy,int fz,
                       int order)
{
    int nc,ncz;
    int fsx,fsy,fsz,gx,gy,gz,g0x,g0y,x,y,z;
    int flind;

    nc  = 2 + (order - 2)/FLBS;
    ncz = 2 + (order - 2)/FLBSZ;

    for(fsx=fx; fsx<fx+nc; fsx++)
    {
        for(fsy=fy; fsy<fy+nc; fsy++)
        {
            for(fsz=fz; fsz<fz+ncz; fsz++)
            {
                flind = (fsx*fs[YY] + fsy)*fs[ZZ] + fsz;
                if (flag[flind] == 0)
                {
                    gx = fsx*FLBS;
                    gy = fsy*FLBS;
                    gz = fsz*FLBSZ;
                    g0x = (gx*ny + gy)*nz + gz;
                    for(x=0; x<FLBS; x++)
                    {
                        g0y = g0x;
                        for(y=0; y<FLBS; y++)
                        {
                            for(z=0; z<FLBSZ; z++)
                            {
                                grid[g0y+z] = 0;
                            }
                            g0y += nz;
                        }
                        g0x += ny*nz;
                    }

                    flag[flind] = 1;
                }
            }
        }
    }
}
/* This has to be a macro to enable full compiler optimization with xlC (and probably others too) */
#define DO_BSPLINE(order)                              \
    for(ithx=0; (ithx<order); ithx++)                  \
    {                                                  \
        index_x = (i0+ithx)*pny*pnz;                   \
        valx    = qn*thx[ithx];                        \
                                                       \
        for(ithy=0; (ithy<order); ithy++)              \
        {                                              \
            valxy    = valx*thy[ithy];                 \
            index_xy = index_x+(j0+ithy)*pnz;          \
                                                       \
            for(ithz=0; (ithz<order); ithz++)          \
            {                                          \
                index_xyz        = index_xy+(k0+ithz); \
                grid[index_xyz] += valxy*thz[ithz];    \
            }                                          \
        }                                              \
    }
static void spread_q_bsplines_thread(pmegrid_t *pmegrid,
                                     pme_atomcomm_t *atc, splinedata_t *spline,
                                     pme_spline_work_t *work)
{
    /* spread charges from home atoms to local grid */
    real *grid;
    int  b,i,nn,n,ithx,ithy,ithz,i0,j0,k0;
    int  *idxptr;
    int  order,norder,index_x,index_xy,index_xyz;
    real valx,valxy,qn;
    real *thx,*thy,*thz;
    int  localsize, bndsize;
    int  pnx,pny,pnz,ndatatot;
    int  offx,offy,offz;

    pnx = pmegrid->n[XX];
    pny = pmegrid->n[YY];
    pnz = pmegrid->n[ZZ];

    offx = pmegrid->offset[XX];
    offy = pmegrid->offset[YY];
    offz = pmegrid->offset[ZZ];

    ndatatot = pnx*pny*pnz;
    grid = pmegrid->grid;
    for(i=0;i<ndatatot;i++)
    {
        grid[i] = 0;
    }

    order = pmegrid->order;

    for(nn=0; nn<spline->n; nn++)
    {
        n  = spline->ind[nn];
        qn = atc->q[n];

        if (qn != 0)
        {
            idxptr = atc->idx[n];
            norder = nn*order;

            i0 = idxptr[XX] - offx;
            j0 = idxptr[YY] - offy;
            k0 = idxptr[ZZ] - offz;

            thx = spline->theta[XX] + norder;
            thy = spline->theta[YY] + norder;
            thz = spline->theta[ZZ] + norder;

            switch (order) {
            case 4:
#ifdef PME_SSE
#ifdef PME_SSE_UNALIGNED
#define PME_SPREAD_SSE_ORDER4
#else
#define PME_SPREAD_SSE_ALIGNED
#define PME_ORDER 4
#endif
#include "pme_sse_single.h"
#else
                DO_BSPLINE(4);
#endif
                break;
            case 5:
#ifdef PME_SSE
#define PME_SPREAD_SSE_ALIGNED
#define PME_ORDER 5
#include "pme_sse_single.h"
#else
                DO_BSPLINE(5);
#endif
                break;
            default:
                DO_BSPLINE(order);
                break;
            }
        }
    }
}
static void set_grid_alignment(int *pmegrid_nz,int pme_order)
{
#ifdef PME_SSE
    if (pme_order == 5
#ifndef PME_SSE_UNALIGNED
        || pme_order == 4
#endif
        )
    {
        /* Round nz up to a multiple of 4 to ensure alignment */
        *pmegrid_nz = ((*pmegrid_nz + 3) & ~3);
    }
#endif
}
static void set_gridsize_alignment(int *gridsize,int pme_order)
{
#ifdef PME_SSE
#ifndef PME_SSE_UNALIGNED
    if (pme_order == 4)
    {
        /* Add extra elements to ensure aligned operations do not go
         * beyond the allocated grid size.
         * Note that for pme_order=5, the pme grid z-size alignment
         * ensures that we will not go beyond the grid size.
         */
        *gridsize += 4;
    }
#endif
#endif
}
static void pmegrid_init(pmegrid_t *grid,
                         int cx, int cy, int cz,
                         int x0, int y0, int z0,
                         int x1, int y1, int z1,
                         gmx_bool set_alignment,
                         int pme_order,
                         real *ptr)
{
    int nz,gridsize;

    grid->ci[XX] = cx;
    grid->ci[YY] = cy;
    grid->ci[ZZ] = cz;
    grid->offset[XX] = x0;
    grid->offset[YY] = y0;
    grid->offset[ZZ] = z0;
    grid->n[XX]      = x1 - x0 + pme_order - 1;
    grid->n[YY]      = y1 - y0 + pme_order - 1;
    grid->n[ZZ]      = z1 - z0 + pme_order - 1;

    nz = grid->n[ZZ];
    set_grid_alignment(&nz,pme_order);
    if (set_alignment)
    {
        grid->n[ZZ] = nz;
    }
    else if (nz != grid->n[ZZ])
    {
        gmx_incons("pmegrid_init call with an unaligned z size");
    }

    grid->order = pme_order;
    if (ptr == NULL)
    {
        gridsize = grid->n[XX]*grid->n[YY]*grid->n[ZZ];
        set_gridsize_alignment(&gridsize,pme_order);
        snew_aligned(grid->grid,gridsize,16);
    }
    else
    {
        grid->grid = ptr;
    }
}
static int div_round_up(int enumerator,int denominator)
{
    return (enumerator + denominator - 1)/denominator;
}
static void make_subgrid_division(const ivec n,int ovl,int nthread,
                                  ivec nsub)
{
    int gsize_opt,gsize;
    int nsx,nsy,nsz;
    char *env;

    gsize_opt = -1;
    for(nsx=1; nsx<=nthread; nsx++)
    {
        if (nthread % nsx == 0)
        {
            for(nsy=1; nsy<=nthread; nsy++)
            {
                if (nsx*nsy <= nthread && nthread % (nsx*nsy) == 0)
                {
                    nsz = nthread/(nsx*nsy);

                    /* Determine the number of grid points per thread */
                    gsize =
                        (div_round_up(n[XX],nsx) + ovl)*
                        (div_round_up(n[YY],nsy) + ovl)*
                        (div_round_up(n[ZZ],nsz) + ovl);

                    /* Minimize the number of grid points per thread
                     * and, secondarily, the number of cuts in minor dimensions.
                     */
                    if (gsize_opt == -1 ||
                        gsize < gsize_opt ||
                        (gsize == gsize_opt &&
                         (nsz < nsub[ZZ] || (nsz == nsub[ZZ] && nsy < nsub[YY]))))
                    {
                        nsub[XX] = nsx;
                        nsub[YY] = nsy;
                        nsub[ZZ] = nsz;
                        gsize_opt = gsize;
                    }
                }
            }
        }
    }

    env = getenv("GMX_PME_THREAD_DIVISION");
    if (env != NULL)
    {
        sscanf(env,"%d %d %d",&nsub[XX],&nsub[YY],&nsub[ZZ]);
    }

    if (nsub[XX]*nsub[YY]*nsub[ZZ] != nthread)
    {
        gmx_fatal(FARGS,"PME grid thread division (%d x %d x %d) does not match the total number of threads (%d)",nsub[XX],nsub[YY],nsub[ZZ],nthread);
    }
}
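/* Illustrative case: n = {32,32,32}, ovl = 3, nthread = 6 gives
 * nsub = {3,2,1}: several divisions tie on grid points per thread
 * (e.g. {1,2,3}), and the secondary criterion above then picks the
 * one with the fewest cuts in the minor (z, then y) dimensions.
 */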
static void pmegrids_init(pmegrids_t *grids,
                          int nx,int ny,int nz,int nz_base,
                          int pme_order,
                          int nthread,
                          int overlap_x,
                          int overlap_y)
{
    ivec n,n_base,g0,g1;
    int t,x,y,z,d,i,tfac;
    int max_comm_lines;

    n[XX] = nx - (pme_order - 1);
    n[YY] = ny - (pme_order - 1);
    n[ZZ] = nz - (pme_order - 1);

    copy_ivec(n,n_base);
    n_base[ZZ] = nz_base;

    pmegrid_init(&grids->grid,0,0,0,0,0,0,n[XX],n[YY],n[ZZ],FALSE,pme_order,
                 NULL);

    grids->nthread = nthread;

    make_subgrid_division(n_base,pme_order-1,grids->nthread,grids->nc);

    if (grids->nthread > 1)
    {
        ivec nst;
        int gridsize;
        real *grid_all;

        for(d=0; d<DIM; d++)
        {
            nst[d] = div_round_up(n[d],grids->nc[d]) + pme_order - 1;
        }
        set_grid_alignment(&nst[ZZ],pme_order);

        if (debug)
        {
            fprintf(debug,"pmegrid thread local division: %d x %d x %d\n",
                    grids->nc[XX],grids->nc[YY],grids->nc[ZZ]);
            fprintf(debug,"pmegrid %d %d %d max thread pmegrid %d %d %d\n",
                    nx,ny,nz,
                    nst[XX],nst[YY],nst[ZZ]);
        }

        snew(grids->grid_th,grids->nthread);
        t = 0;
        gridsize = nst[XX]*nst[YY]*nst[ZZ];
        set_gridsize_alignment(&gridsize,pme_order);
        snew_aligned(grid_all,
                     grids->nthread*gridsize+(grids->nthread+1)*GMX_CACHE_SEP,
                     16);

        for(x=0; x<grids->nc[XX]; x++)
        {
            for(y=0; y<grids->nc[YY]; y++)
            {
                for(z=0; z<grids->nc[ZZ]; z++)
                {
                    pmegrid_init(&grids->grid_th[t],
                                 x,y,z,
                                 (n[XX]*(x  ))/grids->nc[XX],
                                 (n[YY]*(y  ))/grids->nc[YY],
                                 (n[ZZ]*(z  ))/grids->nc[ZZ],
                                 (n[XX]*(x+1))/grids->nc[XX],
                                 (n[YY]*(y+1))/grids->nc[YY],
                                 (n[ZZ]*(z+1))/grids->nc[ZZ],
                                 TRUE,
                                 pme_order,
                                 grid_all+GMX_CACHE_SEP+t*(gridsize+GMX_CACHE_SEP));
                    t++;
                }
            }
        }
    }

    snew(grids->g2t,DIM);
    tfac = 1;
    for(d=DIM-1; d>=0; d--)
    {
        snew(grids->g2t[d],n[d]);
        t = 0;
        for(i=0; i<n[d]; i++)
        {
            /* The second check should match the parameters
             * of the pmegrid_init call above.
             */
            while (t + 1 < grids->nc[d] && i >= (n[d]*(t+1))/grids->nc[d])
            {
                t++;
            }
            grids->g2t[d][i] = t*tfac;
        }

        tfac *= grids->nc[d];

        switch (d)
        {
        case XX: max_comm_lines = overlap_x;     break;
        case YY: max_comm_lines = overlap_y;     break;
        case ZZ: max_comm_lines = pme_order - 1; break;
        }
        grids->nthread_comm[d] = 0;
        while ((n[d]*grids->nthread_comm[d])/grids->nc[d] < max_comm_lines)
        {
            grids->nthread_comm[d]++;
        }
        if (debug != NULL)
        {
            fprintf(debug,"pmegrid thread grid communication range in %c: %d\n",
                    'x'+d,grids->nthread_comm[d]);
        }
        /* It should be possible to make grids->nthread_comm[d]==grids->nc[d]
         * work, but this is not a problematic restriction.
         */
        if (grids->nc[d] > 1 && grids->nthread_comm[d] > grids->nc[d])
        {
            gmx_fatal(FARGS,"Too many threads for PME (%d) compared to the number of grid lines, reduce the number of threads doing PME",grids->nthread);
        }
    }
}
static void pmegrids_destroy(pmegrids_t *grids)
{
    int t;

    if (grids->grid.grid != NULL)
    {
        sfree(grids->grid.grid);

        if (grids->nthread > 0)
        {
            for(t=0; t<grids->nthread; t++)
            {
                sfree(grids->grid_th[t].grid);
            }
            sfree(grids->grid_th);
        }
    }
}
static void realloc_work(pme_work_t *work,int nkx)
{
    if (nkx > work->nalloc)
    {
        work->nalloc = nkx;
        srenew(work->mhx  ,work->nalloc);
        srenew(work->mhy  ,work->nalloc);
        srenew(work->mhz  ,work->nalloc);
        srenew(work->m2   ,work->nalloc);
        /* Allocate an aligned pointer for SSE operations, including 3 extra
         * elements at the end since SSE operates on 4 elements at a time.
         */
        sfree_aligned(work->denom);
        sfree_aligned(work->tmp1);
        sfree_aligned(work->eterm);
        snew_aligned(work->denom,work->nalloc+3,16);
        snew_aligned(work->tmp1 ,work->nalloc+3,16);
        snew_aligned(work->eterm,work->nalloc+3,16);
        srenew(work->m2inv,work->nalloc);
    }
}
static void free_work(pme_work_t *work)
{
    sfree(work->mhx);
    sfree(work->mhy);
    sfree(work->mhz);
    sfree(work->m2);
    sfree_aligned(work->denom);
    sfree_aligned(work->tmp1);
    sfree_aligned(work->eterm);
    sfree(work->m2inv);
}
#ifdef PME_SSE
/* Calculate exponentials through SSE in float precision */
inline static void calc_exponentials(int start, int end, real f, real *d_aligned, real *r_aligned, real *e_aligned)
{
    const __m128 two = _mm_set_ps(2.0f,2.0f,2.0f,2.0f);
    __m128 f_sse;
    __m128 lu;
    __m128 tmp_d1,d_inv,tmp_r,tmp_e;
    int kx;

    f_sse = _mm_load1_ps(&f);
    for(kx=0; kx<end; kx+=4)
    {
        tmp_d1 = _mm_load_ps(d_aligned+kx);
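        /* _mm_rcp_ps only returns a ~12-bit reciprocal estimate; the
         * Newton-Raphson step below, d_inv = lu*(2 - lu*d), roughly
         * doubles the number of correct bits, which is sufficient in
         * single precision.
         */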
        lu     = _mm_rcp_ps(tmp_d1);
        d_inv  = _mm_mul_ps(lu,_mm_sub_ps(two,_mm_mul_ps(lu,tmp_d1)));
        tmp_r  = _mm_load_ps(r_aligned+kx);
        tmp_r  = gmx_mm_exp_ps(tmp_r);
        tmp_e  = _mm_mul_ps(f_sse,d_inv);
        tmp_e  = _mm_mul_ps(tmp_e,tmp_r);
        _mm_store_ps(e_aligned+kx,tmp_e);
    }
}
#else
inline static void calc_exponentials(int start, int end, real f, real *d, real *r, real *e)
{
    int kx;

    for(kx=start; kx<end; kx++)
    {
        d[kx] = 1.0/d[kx];
    }
    for(kx=start; kx<end; kx++)
    {
        r[kx] = exp(r[kx]);
    }
    for(kx=start; kx<end; kx++)
    {
        e[kx] = f*r[kx]*d[kx];
    }
}
#endif
static int solve_pme_yzx(gmx_pme_t pme,t_complex *grid,
                         real ewaldcoeff,real vol,
                         gmx_bool bEnerVir,
                         int nthread,int thread)
{
    /* do recip sum over local cells in grid */
    /* y major, z middle, x minor or continuous */
    t_complex *p0;
    int     kx,ky,kz,maxkx,maxky,maxkz;
    int     nx,ny,nz,iyz0,iyz1,iyz,iy,iz,kxstart,kxend;
    real    mx,my,mz;
    real    factor=M_PI*M_PI/(ewaldcoeff*ewaldcoeff);
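    /* Reciprocal-space sum (cf. Essmann et al., J. Chem. Phys. 103, 8577 (1995)):
     *   E_rec = 1/(2 pi V) sum_{m != 0} exp(-pi^2 m^2/beta^2)/m^2 * B(m) |S(m)|^2
     * Here factor = pi^2/beta^2, the B-spline moduli bsp_mod play the
     * role of B(m), and the exp()/denom terms are evaluated vectorized
     * in calc_exponentials().
     */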
    real    ets2,struct2,vfactor,ets2vf;
    real    d1,d2,energy=0;
    real    by,bz;
    real    virxx=0,virxy=0,virxz=0,viryy=0,viryz=0,virzz=0;
    real    rxx,ryx,ryy,rzx,rzy,rzz;
    pme_work_t *work;
    real    *mhx,*mhy,*mhz,*m2,*denom,*tmp1,*eterm,*m2inv;
    real    mhxk,mhyk,mhzk,m2k;
    real    corner_fac;
    ivec    complex_order;
    ivec    local_ndata,local_offset,local_size;
    real    elfac;

    elfac = ONE_4PI_EPS0/pme->epsilon_r;

    nx = pme->nkx;
    ny = pme->nky;
    nz = pme->nkz;

    /* Dimensions should be identical for A/B grid, so we just use A here */
    gmx_parallel_3dfft_complex_limits(pme->pfft_setupA,
                                      complex_order,
                                      local_ndata,
                                      local_offset,
                                      local_size);

    rxx = pme->recipbox[XX][XX];
    ryx = pme->recipbox[YY][XX];
    ryy = pme->recipbox[YY][YY];
    rzx = pme->recipbox[ZZ][XX];
    rzy = pme->recipbox[ZZ][YY];
    rzz = pme->recipbox[ZZ][ZZ];

    maxkx = (nx+1)/2;
    maxky = (ny+1)/2;
    maxkz = nz/2+1;

    work  = &pme->work[thread];
    mhx   = work->mhx;
    mhy   = work->mhy;
    mhz   = work->mhz;
    m2    = work->m2;
    denom = work->denom;
    tmp1  = work->tmp1;
    eterm = work->eterm;
    m2inv = work->m2inv;

    iyz0 = local_ndata[YY]*local_ndata[ZZ]* thread   /nthread;
    iyz1 = local_ndata[YY]*local_ndata[ZZ]*(thread+1)/nthread;

    for(iyz=iyz0; iyz<iyz1; iyz++)
    {
        iy = iyz/local_ndata[ZZ];
        iz = iyz - iy*local_ndata[ZZ];

        ky = iy + local_offset[YY];

        if (ky < maxky)
        {
            my = ky;
        }
        else
        {
            my = (ky - ny);
        }

        by = M_PI*vol*pme->bsp_mod[YY][ky];

        kz = iz + local_offset[ZZ];

        mz = kz;

        bz = pme->bsp_mod[ZZ][kz];

        /* 0.5 correction for corner points */
        corner_fac = 1;
        if (kz == 0 || kz == (nz+1)/2)
        {
            corner_fac = 0.5;
        }

        p0 = grid + iy*local_size[ZZ]*local_size[XX] + iz*local_size[XX];

        /* We should skip the k-space point (0,0,0) */
        if (local_offset[XX] > 0 || ky > 0 || kz > 0)
        {
            kxstart = local_offset[XX];
        }
        else
        {
            kxstart = local_offset[XX] + 1;
            p0++;
        }
        kxend = local_offset[XX] + local_ndata[XX];

        if (bEnerVir)
        {
            /* More expensive inner loop, especially because of the storage
             * of the mh elements in arrays.
             * Because x is the minor grid index, all mh elements
             * depend on kx for triclinic unit cells.
             */

            /* Two explicit loops to avoid a conditional inside the loop */
            for(kx=kxstart; kx<maxkx; kx++)
            {
                mx = kx;

                mhxk      = mx * rxx;
                mhyk      = mx * ryx + my * ryy;
                mhzk      = mx * rzx + my * rzy + mz * rzz;
                m2k       = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
                mhx[kx]   = mhxk;
                mhy[kx]   = mhyk;
                mhz[kx]   = mhzk;
                m2[kx]    = m2k;
                denom[kx] = m2k*bz*by*pme->bsp_mod[XX][kx];
                tmp1[kx]  = -factor*m2k;
            }

            for(kx=maxkx; kx<kxend; kx++)
            {
                mx = (kx - nx);

                mhxk      = mx * rxx;
                mhyk      = mx * ryx + my * ryy;
                mhzk      = mx * rzx + my * rzy + mz * rzz;
                m2k       = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
                mhx[kx]   = mhxk;
                mhy[kx]   = mhyk;
                mhz[kx]   = mhzk;
                m2[kx]    = m2k;
                denom[kx] = m2k*bz*by*pme->bsp_mod[XX][kx];
                tmp1[kx]  = -factor*m2k;
            }

            for(kx=kxstart; kx<kxend; kx++)
            {
                m2inv[kx] = 1.0/m2[kx];
            }

            calc_exponentials(kxstart,kxend,elfac,denom,tmp1,eterm);

            for(kx=kxstart; kx<kxend; kx++,p0++)
            {
                d1 = p0->re;
                d2 = p0->im;

                p0->re = d1*eterm[kx];
                p0->im = d2*eterm[kx];

                struct2 = 2.0*(d1*d1+d2*d2);

                tmp1[kx] = eterm[kx]*struct2;
            }

            for(kx=kxstart; kx<kxend; kx++)
            {
                ets2    = corner_fac*tmp1[kx];
                vfactor = (factor*m2[kx] + 1.0)*2.0*m2inv[kx];
                energy += ets2;

                ets2vf  = ets2*vfactor;
                virxx  += ets2vf*mhx[kx]*mhx[kx] - ets2;
                virxy  += ets2vf*mhx[kx]*mhy[kx];
                virxz  += ets2vf*mhx[kx]*mhz[kx];
                viryy  += ets2vf*mhy[kx]*mhy[kx] - ets2;
                viryz  += ets2vf*mhy[kx]*mhz[kx];
                virzz  += ets2vf*mhz[kx]*mhz[kx] - ets2;
            }
        }
        else
        {
            /* We don't need to calculate the energy and the virial.
             * In this case the triclinic overhead is small.
             */

            /* Two explicit loops to avoid a conditional inside the loop */
            for(kx=kxstart; kx<maxkx; kx++)
            {
                mx = kx;

                mhxk      = mx * rxx;
                mhyk      = mx * ryx + my * ryy;
                mhzk      = mx * rzx + my * rzy + mz * rzz;
                m2k       = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
                denom[kx] = m2k*bz*by*pme->bsp_mod[XX][kx];
                tmp1[kx]  = -factor*m2k;
            }

            for(kx=maxkx; kx<kxend; kx++)
            {
                mx = (kx - nx);

                mhxk      = mx * rxx;
                mhyk      = mx * ryx + my * ryy;
                mhzk      = mx * rzx + my * rzy + mz * rzz;
                m2k       = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
                denom[kx] = m2k*bz*by*pme->bsp_mod[XX][kx];
                tmp1[kx]  = -factor*m2k;
            }

            calc_exponentials(kxstart,kxend,elfac,denom,tmp1,eterm);

            for(kx=kxstart; kx<kxend; kx++,p0++)
            {
                d1 = p0->re;
                d2 = p0->im;

                p0->re = d1*eterm[kx];
                p0->im = d2*eterm[kx];
            }
        }
    }

    if (bEnerVir)
    {
        /* Update virial with local values.
         * The virial is symmetric by definition.
         * this virial seems ok for isotropic scaling, but I'm
         * experiencing problems on semiisotropic membranes.
         * IS THAT COMMENT STILL VALID??? (DvdS, 2001/02/07).
         */
        work->vir[XX][XX] = 0.25*virxx;
        work->vir[YY][YY] = 0.25*viryy;
        work->vir[ZZ][ZZ] = 0.25*virzz;
        work->vir[XX][YY] = work->vir[YY][XX] = 0.25*virxy;
        work->vir[XX][ZZ] = work->vir[ZZ][XX] = 0.25*virxz;
        work->vir[YY][ZZ] = work->vir[ZZ][YY] = 0.25*viryz;

        /* This energy should be corrected for a charged system */
        work->energy = 0.5*energy;
    }

    /* Return the loop count */
    return local_ndata[YY]*local_ndata[XX];
}
static void get_pme_ener_vir(const gmx_pme_t pme,int nthread,
                             real *mesh_energy,matrix vir)
{
    /* This function sums output over threads
     * and should therefore only be called after thread synchronization.
     */
    int thread;

    *mesh_energy = pme->work[0].energy;
    copy_mat(pme->work[0].vir,vir);

    for(thread=1; thread<nthread; thread++)
    {
        *mesh_energy += pme->work[thread].energy;
        m_add(vir,pme->work[thread].vir,vir);
    }
}
#define DO_FSPLINE(order)                             \
    for(ithx=0; (ithx<order); ithx++)                 \
    {                                                 \
        index_x = (i0+ithx)*pny*pnz;                  \
        tx      = thx[ithx];                          \
        dx      = dthx[ithx];                         \
                                                      \
        for(ithy=0; (ithy<order); ithy++)             \
        {                                             \
            index_xy = index_x+(j0+ithy)*pnz;         \
            ty       = thy[ithy];                     \
            dy       = dthy[ithy];                    \
            fxy1     = fz1 = 0;                       \
                                                      \
            for(ithz=0; (ithz<order); ithz++)         \
            {                                         \
                gval  = grid[index_xy+(k0+ithz)];     \
                fxy1 += thz[ithz]*gval;               \
                fz1  += dthz[ithz]*gval;              \
            }                                         \
            fx += dx*ty*fxy1;                         \
            fy += tx*dy*fxy1;                         \
            fz += tx*ty*fz1;                          \
        }                                             \
    }
static void gather_f_bsplines(gmx_pme_t pme,real *grid,
                              gmx_bool bClearF,pme_atomcomm_t *atc,
                              splinedata_t *spline,
                              real scale)
{
    /* sum forces for local particles */
    int  nn,n,ithx,ithy,ithz,i0,j0,k0;
    int  index_x,index_xy;
    int  nx,ny,nz,pnx,pny,pnz;
    int  *idxptr;
    real tx,ty,dx,dy,qn;
    real fx,fy,fz,gval;
    real fxy1,fz1;
    real *thx,*thy,*thz,*dthx,*dthy,*dthz;
    int  norder;
    real rxx,ryx,ryy,rzx,rzy,rzz;
    int  order;

    pme_spline_work_t *work;

    work = pme->spline_work;

    order = pme->pme_order;
    thx   = spline->theta[XX];
    thy   = spline->theta[YY];
    thz   = spline->theta[ZZ];
    dthx  = spline->dtheta[XX];
    dthy  = spline->dtheta[YY];
    dthz  = spline->dtheta[ZZ];
    nx    = pme->nkx;
    ny    = pme->nky;
    nz    = pme->nkz;
    pnx   = pme->pmegrid_nx;
    pny   = pme->pmegrid_ny;
    pnz   = pme->pmegrid_nz;

    rxx   = pme->recipbox[XX][XX];
    ryx   = pme->recipbox[YY][XX];
    ryy   = pme->recipbox[YY][YY];
    rzx   = pme->recipbox[ZZ][XX];
    rzy   = pme->recipbox[ZZ][YY];
    rzz   = pme->recipbox[ZZ][ZZ];

    for(nn=0; nn<spline->n; nn++)
    {
        n  = spline->ind[nn];
        qn = scale*atc->q[n];

        if (bClearF)
        {
            atc->f[n][XX] = 0;
            atc->f[n][YY] = 0;
            atc->f[n][ZZ] = 0;
        }
        if (qn != 0)
        {
            fx     = 0;
            fy     = 0;
            fz     = 0;
            idxptr = atc->idx[n];
            norder = nn*order;

            i0 = idxptr[XX];
            j0 = idxptr[YY];
            k0 = idxptr[ZZ];

            /* Pointer arithmetic alert, next six statements */
            thx  = spline->theta[XX] + norder;
            thy  = spline->theta[YY] + norder;
            thz  = spline->theta[ZZ] + norder;
            dthx = spline->dtheta[XX] + norder;
            dthy = spline->dtheta[YY] + norder;
            dthz = spline->dtheta[ZZ] + norder;

            switch (order) {
            case 4:
#ifdef PME_SSE
#ifdef PME_SSE_UNALIGNED
#define PME_GATHER_F_SSE_ORDER4
#else
#define PME_GATHER_F_SSE_ALIGNED
#define PME_ORDER 4
#endif
#include "pme_sse_single.h"
#else
                DO_FSPLINE(4);
#endif
                break;
            case 5:
#ifdef PME_SSE
#define PME_GATHER_F_SSE_ALIGNED
#define PME_ORDER 5
#include "pme_sse_single.h"
#else
                DO_FSPLINE(5);
#endif
                break;
            default:
                DO_FSPLINE(order);
                break;
            }
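            /* Convert the gradient with respect to fractional
             * coordinates (fx,fy,fz) to a Cartesian force using the
             * reciprocal box; only its lower triangle is nonzero for
             * GROMACS box conventions.
             */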
            atc->f[n][XX] += -qn*( fx*nx*rxx );
            atc->f[n][YY] += -qn*( fx*nx*ryx + fy*ny*ryy );
            atc->f[n][ZZ] += -qn*( fx*nx*rzx + fy*ny*rzy + fz*nz*rzz );
        }
    }
    /* Since the energy and not forces are interpolated
     * the net force might not be exactly zero.
     * This can be solved by also interpolating F, but
     * that comes at a cost.
     * A better hack is to remove the net force every
     * step, but that must be done at a higher level
     * since this routine doesn't see all atoms if running
     * in parallel. Don't know how important it is?  EL 990726
     */
}
static real gather_energy_bsplines(gmx_pme_t pme,real *grid,
                                   pme_atomcomm_t *atc)
{
    splinedata_t *spline;
    int  n,ithx,ithy,ithz,i0,j0,k0;
    int  index_x,index_xy;
    int  *idxptr;
    real energy,pot,tx,ty,qn,gval;
    real *thx,*thy,*thz;
    int  norder;
    int  order;

    spline = &atc->spline[0];

    order = pme->pme_order;

    energy = 0;
    for(n=0; (n<atc->n); n++) {
        qn = atc->q[n];

        if (qn != 0) {
            idxptr = atc->idx[n];
            norder = n*order;

            i0 = idxptr[XX];
            j0 = idxptr[YY];
            k0 = idxptr[ZZ];

            /* Pointer arithmetic alert, next three statements */
            thx = spline->theta[XX] + norder;
            thy = spline->theta[YY] + norder;
            thz = spline->theta[ZZ] + norder;

            pot = 0;
            for(ithx=0; (ithx<order); ithx++)
            {
                index_x = (i0+ithx)*pme->pmegrid_ny*pme->pmegrid_nz;
                tx      = thx[ithx];

                for(ithy=0; (ithy<order); ithy++)
                {
                    index_xy = index_x+(j0+ithy)*pme->pmegrid_nz;
                    ty       = thy[ithy];

                    for(ithz=0; (ithz<order); ithz++)
                    {
                        gval = grid[index_xy+(k0+ithz)];
                        pot += tx*ty*thz[ithz]*gval;
                    }
                }
            }

            energy += pot*qn;
        }
    }

    return energy;
}
/* Macro to force loop unrolling by fixing order.
 * This gives a significant performance gain.
 */
#define CALC_SPLINE(order)                                            \
    {                                                                 \
        int j,k,l;                                                    \
        real dr,div;                                                  \
        real data[PME_ORDER_MAX];                                     \
        real ddata[PME_ORDER_MAX];                                    \
                                                                      \
        for(j=0; (j<DIM); j++)                                        \
        {                                                             \
            dr = xptr[j];                                             \
                                                                      \
            /* dr is relative offset from lower cell limit */         \
            data[order-1] = 0;                                        \
            data[1]       = dr;                                       \
            data[0]       = 1 - dr;                                   \
                                                                      \
            for(k=3; (k<order); k++)                                  \
            {                                                         \
                div = 1.0/(k - 1.0);                                  \
                data[k-1] = div*dr*data[k-2];                         \
                for(l=1; (l<(k-1)); l++)                              \
                {                                                     \
                    data[k-l-1] = div*((dr+l)*data[k-l-2]+(k-l-dr)*   \
                                       data[k-l-1]);                  \
                }                                                     \
                data[0] = div*(1-dr)*data[0];                         \
            }                                                         \
            /* differentiate */                                       \
            ddata[0] = -data[0];                                      \
            for(k=1; (k<order); k++)                                  \
            {                                                         \
                ddata[k] = data[k-1] - data[k];                       \
            }                                                         \
                                                                      \
            div = 1.0/(order - 1);                                    \
            data[order-1] = div*dr*data[order-2];                     \
            for(l=1; (l<(order-1)); l++)                              \
            {                                                         \
                data[order-l-1] = div*((dr+l)*data[order-l-2]+        \
                                       (order-l-dr)*data[order-l-1]); \
            }                                                         \
            data[0] = div*(1 - dr)*data[0];                           \
                                                                      \
            for(k=0; k<order; k++)                                    \
            {                                                         \
                theta[j][i*order+k]  = data[k];                       \
                dtheta[j][i*order+k] = ddata[k];                      \
            }                                                         \
        }                                                             \
    }
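/* For reference: a standalone sketch of the same B-spline recursion for
 * a single fractional coordinate dr in [0,1) (illustrative only, not
 * used by the dispatch in make_bsplines below; assumes
 * 3 <= order <= PME_ORDER_MAX).
 */
static void calc_bspline_1d(real dr,int order,
                            real data[PME_ORDER_MAX],
                            real ddata[PME_ORDER_MAX])
{
    int  k,l;
    real div;

    /* Order 2: weights (1-dr, dr) */
    data[order-1] = 0;
    data[1]       = dr;
    data[0]       = 1 - dr;

    /* Raise the order to order-1 (Essmann et al. recursion) */
    for(k=3; k<order; k++)
    {
        div       = 1.0/(k - 1.0);
        data[k-1] = div*dr*data[k-2];
        for(l=1; l<(k-1); l++)
        {
            data[k-l-1] = div*((dr+l)*data[k-l-2]+(k-l-dr)*data[k-l-1]);
        }
        data[0] = div*(1-dr)*data[0];
    }

    /* Derivative from the order-1 weights:
     * dM_n(u)/du = M_{n-1}(u) - M_{n-1}(u-1)
     */
    ddata[0] = -data[0];
    for(k=1; k<order; k++)
    {
        ddata[k] = data[k-1] - data[k];
    }

    /* Final recursion step up to the requested order */
    div           = 1.0/(order - 1);
    data[order-1] = div*dr*data[order-2];
    for(l=1; l<(order-1); l++)
    {
        data[order-l-1] = div*((dr+l)*data[order-l-2]+
                               (order-l-dr)*data[order-l-1]);
    }
    data[0] = div*(1 - dr)*data[0];
}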
void make_bsplines(splinevec theta,splinevec dtheta,int order,
                   rvec fractx[],int nr,int ind[],real charge[],
                   gmx_bool bFreeEnergy)
{
    /* construct splines for local atoms */
    int  i,ii;
    real *xptr;

    for(i=0; i<nr; i++)
    {
        /* With free energy we do not use the charge check.
         * In most cases this will be more efficient than calling make_bsplines
         * twice, since usually more than half the particles have charges.
         */
        ii   = ind[i];
        xptr = fractx[ii];
        if (bFreeEnergy || charge[ii] != 0.0) {
            switch(order) {
            case 4:  CALC_SPLINE(4);     break;
            case 5:  CALC_SPLINE(5);     break;
            default: CALC_SPLINE(order); break;
            }
        }
    }
}
void make_dft_mod(real *mod,real *data,int ndata)
{
    int i,j;
    real sc,ss,arg;

    for(i=0;i<ndata;i++) {
        sc=ss=0;
        for(j=0;j<ndata;j++) {
            arg=(2.0*M_PI*i*j)/ndata;
            sc+=data[j]*cos(arg);
            ss+=data[j]*sin(arg);
        }
        mod[i]=sc*sc+ss*ss;
    }

    for(i=0;i<ndata;i++)
    {
        if(mod[i]<1e-7)
        {
            mod[i]=(mod[i-1]+mod[i+1])*0.5;
        }
    }
}
static void make_bspline_moduli(splinevec bsp_mod,
                                int nx,int ny,int nz,int order)
{
    int nmax=max(nx,max(ny,nz));
    real *data,*ddata,*bsp_data;
    int i,k,l;
    real div;

    snew(data,order);
    snew(ddata,order);
    snew(bsp_data,nmax);

    data[order-1]=0;
    data[1]=0;
    data[0]=1;

    for(k=3;k<order;k++) {
        div=1.0/(k-1.0);
        data[k-1]=0;
        for(l=1;l<(k-1);l++)
            data[k-l-1]=div*(l*data[k-l-2]+(k-l)*data[k-l-1]);
        data[0]=div*data[0];
    }
    /* differentiate */
    ddata[0]=-data[0];
    for(k=1;k<order;k++)
        ddata[k]=data[k-1]-data[k];

    div=1.0/(order-1);
    data[order-1]=0;
    for(l=1;l<(order-1);l++)
        data[order-l-1]=div*(l*data[order-l-2]+(order-l)*data[order-l-1]);
    data[0]=div*data[0];

    for(i=0;i<nmax;i++)
        bsp_data[i]=0;
    for(i=1;i<=order;i++)
        bsp_data[i]=data[i-1];

    make_dft_mod(bsp_mod[XX],bsp_data,nx);
    make_dft_mod(bsp_mod[YY],bsp_data,ny);
    make_dft_mod(bsp_mod[ZZ],bsp_data,nz);

    sfree(data);
    sfree(ddata);
    sfree(bsp_data);
}
/* Return the P3M optimal influence function */
static double do_p3m_influence(double z, int order)
{
    double z2,z4;

    z2 = z*z;
    z4 = z2*z2;

    /* The formula and most constants can be found in:
     * Ballenegger et al., JCTC 8, 936 (2012)
     */
    switch(order)
    {
    case 2:
        return 1.0 - 2.0*z2/3.0;
    case 3:
        return 1.0 - z2 + 2.0*z4/15.0;
    case 4:
        return 1.0 - 4.0*z2/3.0 + 2.0*z4/5.0 + 4.0*z2*z4/315.0;
    case 5:
        return 1.0 - 5.0*z2/3.0 + 7.0*z4/9.0 - 17.0*z2*z4/189.0 + 2.0*z4*z4/2835.0;
    case 6:
        return 1.0 - 2.0*z2 + 19.0*z4/15.0 - 256.0*z2*z4/945.0 + 62.0*z4*z4/4725.0 + 4.0*z2*z4*z4/155925.0;
    case 7:
        return 1.0 - 7.0*z2/3.0 + 28.0*z4/15.0 - 16.0*z2*z4/27.0 + 26.0*z4*z4/405.0 - 2.0*z2*z4*z4/1485.0 + 4.0*z4*z4*z4/6081075.0;
    case 8:
        return 1.0 - 8.0*z2/3.0 + 116.0*z4/45.0 - 344.0*z2*z4/315.0 + 914.0*z4*z4/4725.0 - 248.0*z4*z4*z2/22275.0 + 21844.0*z4*z4*z4/212837625.0 - 8.0*z4*z4*z4*z2/638512875.0;
    }

    return 0.0;
}
/* Calculate the P3M B-spline moduli for one dimension */
static void make_p3m_bspline_moduli_dim(real *bsp_mod,int n,int order)
{
    double zarg,zai,sinzai,infl;
    int    maxk,i;

    if (order > 8)
    {
        gmx_fatal(FARGS,"The current P3M code only supports orders up to 8");
    }

    zarg = M_PI/n;

    maxk = (n + 1)/2;

    for(i=-maxk; i<0; i++)
    {
        zai          = zarg*i;
        sinzai       = sin(zai);
        infl         = do_p3m_influence(sinzai,order);
        bsp_mod[n+i] = infl*infl*pow(sinzai/zai,-2.0*order);
    }
    bsp_mod[0] = 1.0;
    for(i=1; i<maxk; i++)
    {
        zai        = zarg*i;
        sinzai     = sin(zai);
        infl       = do_p3m_influence(sinzai,order);
        bsp_mod[i] = infl*infl*pow(sinzai/zai,-2.0*order);
    }
}
/* Calculate the P3M B-spline moduli */
static void make_p3m_bspline_moduli(splinevec bsp_mod,
                                    int nx,int ny,int nz,int order)
{
    make_p3m_bspline_moduli_dim(bsp_mod[XX],nx,order);
    make_p3m_bspline_moduli_dim(bsp_mod[YY],ny,order);
    make_p3m_bspline_moduli_dim(bsp_mod[ZZ],nz,order);
}
static void setup_coordinate_communication(pme_atomcomm_t *atc)
{
    int nslab,n,i;
    int fw,bw;

    nslab = atc->nslab;

    n = 0;
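    /* Interleave the destinations: nearest slabs first, alternating
     * between the forward and the backward neighbor, so the
     * shortest-range exchanges come first in node_dest/node_src.
     */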
    for(i=1; i<=nslab/2; i++) {
        fw = (atc->nodeid + i) % nslab;
        bw = (atc->nodeid - i + nslab) % nslab;
        if (n < nslab - 1) {
            atc->node_dest[n] = fw;
            atc->node_src[n]  = bw;
            n++;
        }
        if (n < nslab - 1) {
            atc->node_dest[n] = bw;
            atc->node_src[n]  = fw;
            n++;
        }
    }
}
int gmx_pme_destroy(FILE *log,gmx_pme_t *pmedata)
{
    int thread;

    if (NULL != log)
    {
        fprintf(log,"Destroying PME data structures.\n");
    }

    sfree((*pmedata)->nnx);
    sfree((*pmedata)->nny);
    sfree((*pmedata)->nnz);

    pmegrids_destroy(&(*pmedata)->pmegridA);

    sfree((*pmedata)->fftgridA);
    sfree((*pmedata)->cfftgridA);
    gmx_parallel_3dfft_destroy((*pmedata)->pfft_setupA);

    if ((*pmedata)->pmegridB.grid.grid != NULL)
    {
        pmegrids_destroy(&(*pmedata)->pmegridB);
        sfree((*pmedata)->fftgridB);
        sfree((*pmedata)->cfftgridB);
        gmx_parallel_3dfft_destroy((*pmedata)->pfft_setupB);
    }
    for(thread=0; thread<(*pmedata)->nthread; thread++)
    {
        free_work(&(*pmedata)->work[thread]);
    }
    sfree((*pmedata)->work);

    sfree(*pmedata);
    *pmedata = NULL;

    return 0;
}
static int mult_up(int n,int f)
{
    return ((n + f - 1)/f)*f;
}
static double pme_load_imbalance(gmx_pme_t pme)
{
    int    nma,nmi;
    double n1,n2,n3;

    nma = pme->nnodes_major;
    nmi = pme->nnodes_minor;

    n1 = mult_up(pme->nkx,nma)*mult_up(pme->nky,nmi)*pme->nkz;
    n2 = mult_up(pme->nkx,nma)*mult_up(pme->nkz,nmi)*pme->nky;
    n3 = mult_up(pme->nky,nma)*mult_up(pme->nkz,nmi)*pme->nkx;

    /* pme_solve is roughly double the cost of an fft */

    return (n1 + n2 + 3*n3)/(double)(6*pme->nkx*pme->nky*pme->nkz);
}
static void init_atomcomm(gmx_pme_t pme,pme_atomcomm_t *atc, t_commrec *cr,
                          int dimind,gmx_bool bSpread)
{
    int thread;

    atc->dimind = dimind;
    atc->nslab  = 1;
    atc->nodeid = 0;
#ifdef GMX_MPI
    if (pme->nnodes > 1)
    {
        atc->mpi_comm = pme->mpi_comm_d[dimind];
        MPI_Comm_size(atc->mpi_comm,&atc->nslab);
        MPI_Comm_rank(atc->mpi_comm,&atc->nodeid);
    }
    if (debug)
    {
        fprintf(debug,"For PME atom communication in dimind %d: nslab %d rank %d\n",atc->dimind,atc->nslab,atc->nodeid);
    }
#endif

    atc->bSpread   = bSpread;
    atc->pme_order = pme->pme_order;

    if (atc->nslab > 1)
    {
        /* These three allocations are not required for particle decomp. */
        snew(atc->node_dest,atc->nslab);
        snew(atc->node_src,atc->nslab);
        setup_coordinate_communication(atc);

        snew(atc->count_thread,pme->nthread);
        for(thread=0; thread<pme->nthread; thread++)
        {
            snew(atc->count_thread[thread],atc->nslab);
        }
        atc->count = atc->count_thread[0];
        snew(atc->rcount,atc->nslab);
        snew(atc->buf_index,atc->nslab);
    }

    atc->nthread = pme->nthread;
    if (atc->nthread > 1)
    {
        snew(atc->thread_plist,atc->nthread);
    }
    snew(atc->spline,atc->nthread);
    for(thread=0; thread<atc->nthread; thread++)
    {
        if (atc->nthread > 1)
        {
            snew(atc->thread_plist[thread].n,atc->nthread+2*GMX_CACHE_SEP);
            atc->thread_plist[thread].n += GMX_CACHE_SEP;
        }
    }
}
static void
init_overlap_comm(pme_overlap_t *  ol,
                  int              norder,
#ifdef GMX_MPI
                  MPI_Comm         comm,
#endif
                  int              nnodes,
                  int              nodeid,
                  int              ndata,
                  int              commplainsize)
{
    int b,i;
    pme_grid_comm_t *pgc;
    gmx_bool bCont;
    int fft_start,fft_end,send_index1,recv_index1;

#ifdef GMX_MPI
    ol->mpi_comm = comm;
#endif

    ol->nnodes = nnodes;
    ol->nodeid = nodeid;

    /* Linear translation of the PME grid won't affect reciprocal space
     * calculations, so to optimize we only interpolate "upwards",
     * which also means we only have to consider overlap in one direction.
     * I.e., particles on this node might also be spread to grid indices
     * that belong to higher nodes (modulo nnodes)
     */

    snew(ol->s2g0,ol->nnodes+1);
    snew(ol->s2g1,ol->nnodes);
    if (debug) { fprintf(debug,"PME slab boundaries:"); }
    for(i=0; i<nnodes; i++)
    {
        /* s2g0 the local interpolation grid start.
         * s2g1 the local interpolation grid end.
         * Because grid overlap communication only goes forward,
         * the grid slabs for the fft's should be rounded down.
         */
        ol->s2g0[i] = ( i   *ndata + 0       )/nnodes;
        ol->s2g1[i] = ((i+1)*ndata + nnodes-1)/nnodes + norder - 1;

        if (debug)
        {
            fprintf(debug,"  %3d %3d",ol->s2g0[i],ol->s2g1[i]);
        }
    }
    ol->s2g0[nnodes] = ndata;
    if (debug) { fprintf(debug,"\n"); }
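    /* Worked example (hypothetical numbers): ndata=15, nnodes=4, norder=4
     * gives s2g0 = {0,3,7,11,15} and s2g1 = {7,11,15,18}: slab i owns
     * grid lines [s2g0[i],s2g0[i+1]) for the fft's, but interpolation
     * for its particles can touch lines up to s2g1[i]-1, reaching into
     * the following slab(s).
     */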
    /* Determine with how many nodes we need to communicate the grid overlap */
    b = 0;
    do
    {
        b++;
        bCont = FALSE;
        for(i=0; i<nnodes; i++)
        {
            if ((i+b <  nnodes && ol->s2g1[i] > ol->s2g0[i+b]) ||
                (i+b >= nnodes && ol->s2g1[i] > ol->s2g0[i+b-nnodes] + ndata))
            {
                bCont = TRUE;
            }
        }
    }
    while (bCont && b < nnodes);
    ol->noverlap_nodes = b - 1;

    snew(ol->send_id,ol->noverlap_nodes);
    snew(ol->recv_id,ol->noverlap_nodes);
    for(b=0; b<ol->noverlap_nodes; b++)
    {
        ol->send_id[b] = (ol->nodeid + (b + 1)) % ol->nnodes;
        ol->recv_id[b] = (ol->nodeid - (b + 1) + ol->nnodes) % ol->nnodes;
    }
    snew(ol->comm_data, ol->noverlap_nodes);

    for(b=0; b<ol->noverlap_nodes; b++)
    {
        pgc = &ol->comm_data[b];
        /* Send */
        fft_start = ol->s2g0[ol->send_id[b]];
        fft_end   = ol->s2g0[ol->send_id[b]+1];
        if (ol->send_id[b] < nodeid)
        {
            fft_start += ndata;
            fft_end   += ndata;
        }
        send_index1      = ol->s2g1[nodeid];
        send_index1      = min(send_index1,fft_end);
        pgc->send_index0 = fft_start;
        pgc->send_nindex = max(0,send_index1 - pgc->send_index0);

        /* We always start receiving to the first index of our slab */
        fft_start   = ol->s2g0[ol->nodeid];
        fft_end     = ol->s2g0[ol->nodeid+1];
        recv_index1 = ol->s2g1[ol->recv_id[b]];
        if (ol->recv_id[b] > nodeid)
        {
            recv_index1 -= ndata;
        }
        recv_index1      = min(recv_index1,fft_end);
        pgc->recv_index0 = fft_start;
        pgc->recv_nindex = max(0,recv_index1 - pgc->recv_index0);
    }

    /* For a non-divisible grid we need pme_order instead of pme_order-1 */
    snew(ol->sendbuf,norder*commplainsize);
    snew(ol->recvbuf,norder*commplainsize);
}
static void
make_gridindex5_to_localindex(int n,int local_start,int local_range,
                              int **global_to_local,
                              real **fraction_shift)
{
    int i;
    int *gtl;
    real *fsh;

    snew(gtl,5*n);
    snew(fsh,5*n);
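    /* The tables cover 5*n entries because calc_interpolation_idx()
     * adds 2.0 to the fractional coordinate before truncating, so raw
     * grid indices can lie anywhere in [0,5n) before being reduced
     * into the local grid here.
     */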
2824 for(i=0; (i<5*n); i++)
2826 /* Determine the global to local grid index */
2827 gtl[i] = (i - local_start + n) % n;
2828 /* For coordinates that fall within the local grid the fraction
2829 * is correct, we don't need to shift it.
2832 if (local_range < n)
2834 /* Due to rounding issues i could be 1 beyond the lower or
2835 * upper boundary of the local grid. Correct the index for this.
2836 * If we shift the index, we need to shift the fraction by
2837 * the same amount in the other direction to not affect
2839 * Note that due to this shifting the weights at the end of
2840 * the spline might change, but that will only involve values
2841 * between zero and values close to the precision of a real,
2842 * which is anyhow the accuracy of the whole mesh calculation.
2844 /* With local_range=0 we should not change i=local_start */
2845 if (i % n != local_start)
2852 else if (gtl[i] == local_range)
2854 gtl[i] = local_range - 1;
2861 *global_to_local = gtl;
2862 *fraction_shift = fsh;
2865 static pme_spline_work_t *make_pme_spline_work(int order)
2867 pme_spline_work_t *work;
2874 snew_aligned(work,1,16);
2876 zero_SSE = _mm_setzero_ps();
2878 /* Generate bit masks to mask out the unused grid entries,
2879 * as we only operate on order of the 8 grid entries that are
2880      * loaded into 2 SSE float registers.
2882 for(of=0; of<8-(order-1); of++)
2886 tmp[i] = (i >= of && i < of+order ? 1 : 0);
2888 work->mask_SSE0[of] = _mm_loadu_ps(tmp);
2889 work->mask_SSE1[of] = _mm_loadu_ps(tmp+4);
2890 work->mask_SSE0[of] = _mm_cmpgt_ps(work->mask_SSE0[of],zero_SSE);
2891 work->mask_SSE1[of] = _mm_cmpgt_ps(work->mask_SSE1[of],zero_SSE);
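/* Example of the masks generated above for order=4 (illustrative):
 * of runs from 0 to 8-(4-1)-1 = 4, and for of=1 the temporary array is
 *   tmp = { 0, 1, 1, 1, 1, 0, 0, 0 }
 * so mask_SSE0[1] selects elements 1-3 of the first register and
 * mask_SSE1[1] selects element 0 of the second. The _mm_cmpgt_ps turns
 * the 0/1 floats into all-zero/all-ones bit masks usable for blending.
 */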
2901 gmx_pme_check_grid_restrictions(FILE *fplog,char dim,int nnodes,int *nk)
2905 if (*nk % nnodes != 0)
2907 nk_new = nnodes*(*nk/nnodes + 1);
2909 if (2*nk_new >= 3*(*nk))
2911 gmx_fatal(FARGS,"The PME grid size in dim %c (%d) is not divisble by the number of nodes doing PME in dim %c (%d). The grid size would have to be increased by more than 50%% to make the grid divisible. Change the total number of nodes or the number of domain decomposition cells in x or the PME grid %c dimension (and the cut-off).",
2912 dim,*nk,dim,nnodes,dim);
2917         fprintf(fplog,"\nNOTE: The PME grid size in dim %c (%d) is not divisible by the number of nodes doing PME in dim %c (%d). Increasing the PME grid size in dim %c to %d. This will increase the accuracy and will not decrease the performance significantly on this number of nodes. For optimal performance change the total number of nodes or the number of domain decomposition cells in x or the PME grid %c dimension (and the cut-off).\n\n",
2918 dim,*nk,dim,nnodes,dim,nk_new,dim);
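/* Numeric sketch of the adjustment above (hypothetical values): with
 * *nk=25 and nnodes=4, nk_new = 4*(25/4 + 1) = 28; since 2*28 < 3*25
 * this is accepted and the grid is grown to 28 lines. With *nk=10 and
 * nnodes=9, nk_new = 18 and 2*18 >= 3*10, so the fatal error triggers
 * instead of silently growing the grid by more than 50%.
 */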
2925 int gmx_pme_init(gmx_pme_t * pmedata,
2931 gmx_bool bFreeEnergy,
2932 gmx_bool bReproducible,
2937 pme_atomcomm_t *atc;
2941 fprintf(debug,"Creating PME data structures.\n");
2944 pme->redist_init = FALSE;
2945 pme->sum_qgrid_tmp = NULL;
2946 pme->sum_qgrid_dd_tmp = NULL;
2947 pme->buf_nalloc = 0;
2948 pme->redist_buf_nalloc = 0;
2951 pme->bPPnode = TRUE;
2953 pme->nnodes_major = nnodes_major;
2954 pme->nnodes_minor = nnodes_minor;
2957 if (nnodes_major*nnodes_minor > 1)
2959 pme->mpi_comm = cr->mpi_comm_mygroup;
2961 MPI_Comm_rank(pme->mpi_comm,&pme->nodeid);
2962 MPI_Comm_size(pme->mpi_comm,&pme->nnodes);
2963 if (pme->nnodes != nnodes_major*nnodes_minor)
2965 gmx_incons("PME node count mismatch");
2970 pme->mpi_comm = MPI_COMM_NULL;
2974 if (pme->nnodes == 1)
2977 pme->mpi_comm_d[0] = MPI_COMM_NULL;
2978 pme->mpi_comm_d[1] = MPI_COMM_NULL;
2980 pme->ndecompdim = 0;
2981 pme->nodeid_major = 0;
2982 pme->nodeid_minor = 0;
2984 pme->mpi_comm_d[0] = pme->mpi_comm_d[1] = MPI_COMM_NULL;
2989 if (nnodes_minor == 1)
2992 pme->mpi_comm_d[0] = pme->mpi_comm;
2993 pme->mpi_comm_d[1] = MPI_COMM_NULL;
2995 pme->ndecompdim = 1;
2996 pme->nodeid_major = pme->nodeid;
2997 pme->nodeid_minor = 0;
3000 else if (nnodes_major == 1)
3003 pme->mpi_comm_d[0] = MPI_COMM_NULL;
3004 pme->mpi_comm_d[1] = pme->mpi_comm;
3006 pme->ndecompdim = 1;
3007 pme->nodeid_major = 0;
3008 pme->nodeid_minor = pme->nodeid;
3012 if (pme->nnodes % nnodes_major != 0)
3014 gmx_incons("For 2D PME decomposition, #PME nodes must be divisible by the number of nodes in the major dimension");
3016 pme->ndecompdim = 2;
3019 MPI_Comm_split(pme->mpi_comm,pme->nodeid % nnodes_minor,
3020 pme->nodeid,&pme->mpi_comm_d[0]); /* My communicator along major dimension */
3021 MPI_Comm_split(pme->mpi_comm,pme->nodeid/nnodes_minor,
3022 pme->nodeid,&pme->mpi_comm_d[1]); /* My communicator along minor dimension */
3024 MPI_Comm_rank(pme->mpi_comm_d[0],&pme->nodeid_major);
3025 MPI_Comm_size(pme->mpi_comm_d[0],&pme->nnodes_major);
3026 MPI_Comm_rank(pme->mpi_comm_d[1],&pme->nodeid_minor);
3027 MPI_Comm_size(pme->mpi_comm_d[1],&pme->nnodes_minor);
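/* Example of the 2D rank layout created above (illustrative): with
 * nnodes_major=3 and nnodes_minor=2, node ids 0..5 map to
 *   (major,minor) = (0,0) (0,1) (1,0) (1,1) (2,0) (2,1)
 * comm_d[0] (color nodeid%2) groups {0,2,4} and {1,3,5} along the major
 * dimension; comm_d[1] (color nodeid/2) groups {0,1}, {2,3} and {4,5}.
 */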
3030 pme->bPPnode = (cr->duty & DUTY_PP);
3033 pme->nthread = nthread;
3035 if (ir->ePBC == epbcSCREW)
3037 gmx_fatal(FARGS,"pme does not (yet) work with pbc = screw");
3040 pme->bFEP = ((ir->efep != efepNO) && bFreeEnergy);
3044 pme->bP3M = (ir->coulombtype == eelP3M_AD || getenv("GMX_PME_P3M") != NULL);
3045 pme->pme_order = ir->pme_order;
3046 pme->epsilon_r = ir->epsilon_r;
3048 if (pme->pme_order > PME_ORDER_MAX)
3050 gmx_fatal(FARGS,"pme_order (%d) is larger than the maximum allowed value (%d). Modify and recompile the code if you really need such a high order.",
3051 pme->pme_order,PME_ORDER_MAX);
3054 /* Currently pme.c supports only the fft5d FFT code.
3055 * Therefore the grid always needs to be divisible by nnodes.
3056 * When the old 1D code is also supported again, change this check.
3058 * This check should be done before calling gmx_pme_init
3059     * and fplog should be passed instead of stderr.
3061 if (pme->ndecompdim >= 2)
3063 if (pme->ndecompdim >= 1)
3066 gmx_pme_check_grid_restrictions(pme->nodeid==0 ? stderr : NULL,
3067 'x',nnodes_major,&pme->nkx);
3068 gmx_pme_check_grid_restrictions(pme->nodeid==0 ? stderr : NULL,
3069 'y',nnodes_minor,&pme->nky);
3073 if (pme->nkx <= pme->pme_order*(pme->nnodes_major > 1 ? 2 : 1) ||
3074 pme->nky <= pme->pme_order*(pme->nnodes_minor > 1 ? 2 : 1) ||
3075 pme->nkz <= pme->pme_order)
3077 gmx_fatal(FARGS,"The pme grid dimensions need to be larger than pme_order (%d) and in parallel larger than 2*pme_ordern for x and/or y",pme->pme_order);
3080 if (pme->nnodes > 1) {
3084 MPI_Type_contiguous(DIM, mpi_type, &(pme->rvec_mpi));
3085 MPI_Type_commit(&(pme->rvec_mpi));
3088 /* Note that the charge spreading and force gathering, which usually
3089 * takes about the same amount of time as FFT+solve_pme,
3090 * is always fully load balanced
3091 * (unless the charge distribution is inhomogeneous).
3094 imbal = pme_load_imbalance(pme);
3095 if (imbal >= 1.2 && pme->nodeid_major == 0 && pme->nodeid_minor == 0)
3099 "NOTE: The load imbalance in PME FFT and solve is %d%%.\n"
3100 " For optimal PME load balancing\n"
3101 " PME grid_x (%d) and grid_y (%d) should be divisible by #PME_nodes_x (%d)\n"
3102 " and PME grid_y (%d) and grid_z (%d) should be divisible by #PME_nodes_y (%d)\n"
3104 (int)((imbal-1)*100 + 0.5),
3105 pme->nkx,pme->nky,pme->nnodes_major,
3106 pme->nky,pme->nkz,pme->nnodes_minor);
3110     /* For a non-divisible grid we need pme_order instead of pme_order-1 */
3111 /* In sum_qgrid_dd x overlap is copied in place: take padding into account.
3112 * y is always copied through a buffer: we don't need padding in z,
3113 * but we do need the overlap in x because of the communication order.
3115 init_overlap_comm(&pme->overlap[0],pme->pme_order,
3119 pme->nnodes_major,pme->nodeid_major,
3121 (div_round_up(pme->nky,pme->nnodes_minor)+pme->pme_order)*(pme->nkz+pme->pme_order-1));
3123 init_overlap_comm(&pme->overlap[1],pme->pme_order,
3127 pme->nnodes_minor,pme->nodeid_minor,
3129 (div_round_up(pme->nkx,pme->nnodes_major)+pme->pme_order)*pme->nkz);
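/* Sketch of the buffer sizing above (illustrative numbers): with nky=24,
 * nnodes_minor=3, nkz=20 and pme_order=4, the first call's data plane is
 * (div_round_up(24,3)+4)*(20+4-1) = 12*23 = 276 reals, and
 * init_overlap_comm allocates pme_order such planes for sendbuf/recvbuf,
 * since for a non-divisible grid the overlap can be pme_order lines deep.
 */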
3131 /* Check for a limitation of the (current) sum_fftgrid_dd code */
3132 if (pme->nthread > 1 &&
3133 (pme->overlap[0].noverlap_nodes > 1 ||
3134 pme->overlap[1].noverlap_nodes > 1))
3136 gmx_fatal(FARGS,"With threads the number of grid lines per node along x and or y should be pme_order (%d) or more or exactly pme_order-1",pme->pme_order);
3139 snew(pme->bsp_mod[XX],pme->nkx);
3140 snew(pme->bsp_mod[YY],pme->nky);
3141 snew(pme->bsp_mod[ZZ],pme->nkz);
3143 /* The required size of the interpolation grid, including overlap.
3144 * The allocated size (pmegrid_n?) might be slightly larger.
3146 pme->pmegrid_nx = pme->overlap[0].s2g1[pme->nodeid_major] -
3147 pme->overlap[0].s2g0[pme->nodeid_major];
3148 pme->pmegrid_ny = pme->overlap[1].s2g1[pme->nodeid_minor] -
3149 pme->overlap[1].s2g0[pme->nodeid_minor];
3150 pme->pmegrid_nz_base = pme->nkz;
3151 pme->pmegrid_nz = pme->pmegrid_nz_base + pme->pme_order - 1;
3152 set_grid_alignment(&pme->pmegrid_nz,pme->pme_order);
3154 pme->pmegrid_start_ix = pme->overlap[0].s2g0[pme->nodeid_major];
3155 pme->pmegrid_start_iy = pme->overlap[1].s2g0[pme->nodeid_minor];
3156 pme->pmegrid_start_iz = 0;
3158 make_gridindex5_to_localindex(pme->nkx,
3159 pme->pmegrid_start_ix,
3160 pme->pmegrid_nx - (pme->pme_order-1),
3161 &pme->nnx,&pme->fshx);
3162 make_gridindex5_to_localindex(pme->nky,
3163 pme->pmegrid_start_iy,
3164 pme->pmegrid_ny - (pme->pme_order-1),
3165 &pme->nny,&pme->fshy);
3166 make_gridindex5_to_localindex(pme->nkz,
3167 pme->pmegrid_start_iz,
3168 pme->pmegrid_nz_base,
3169 &pme->nnz,&pme->fshz);
3171 pmegrids_init(&pme->pmegridA,
3172 pme->pmegrid_nx,pme->pmegrid_ny,pme->pmegrid_nz,
3173 pme->pmegrid_nz_base,
3176 pme->overlap[0].s2g1[pme->nodeid_major]-pme->overlap[0].s2g0[pme->nodeid_major+1],
3177 pme->overlap[1].s2g1[pme->nodeid_minor]-pme->overlap[1].s2g0[pme->nodeid_minor+1]);
3179 pme->spline_work = make_pme_spline_work(pme->pme_order);
3181 ndata[0] = pme->nkx;
3182 ndata[1] = pme->nky;
3183 ndata[2] = pme->nkz;
3185 /* This routine will allocate the grid data to fit the FFTs */
3186 gmx_parallel_3dfft_init(&pme->pfft_setupA,ndata,
3187 &pme->fftgridA,&pme->cfftgridA,
3189 pme->overlap[0].s2g0,pme->overlap[1].s2g0,
3190 bReproducible,pme->nthread);
3194 pmegrids_init(&pme->pmegridB,
3195 pme->pmegrid_nx,pme->pmegrid_ny,pme->pmegrid_nz,
3196 pme->pmegrid_nz_base,
3199 pme->nkx % pme->nnodes_major != 0,
3200 pme->nky % pme->nnodes_minor != 0);
3202 gmx_parallel_3dfft_init(&pme->pfft_setupB,ndata,
3203 &pme->fftgridB,&pme->cfftgridB,
3205 pme->overlap[0].s2g0,pme->overlap[1].s2g0,
3206 bReproducible,pme->nthread);
3210 pme->pmegridB.grid.grid = NULL;
3211 pme->fftgridB = NULL;
3212 pme->cfftgridB = NULL;
3217 /* Use plain SPME B-spline interpolation */
3218 make_bspline_moduli(pme->bsp_mod,pme->nkx,pme->nky,pme->nkz,pme->pme_order);
3222 /* Use the P3M grid-optimized influence function */
3223 make_p3m_bspline_moduli(pme->bsp_mod,pme->nkx,pme->nky,pme->nkz,pme->pme_order);
3226 /* Use atc[0] for spreading */
3227 init_atomcomm(pme,&pme->atc[0],cr,nnodes_major > 1 ? 0 : 1,TRUE);
3228 if (pme->ndecompdim >= 2)
3230 init_atomcomm(pme,&pme->atc[1],cr,1,FALSE);
3233 if (pme->nnodes == 1) {
3234 pme->atc[0].n = homenr;
3235 pme_realloc_atomcomm_things(&pme->atc[0]);
3241 /* Use fft5d, order after FFT is y major, z, x minor */
3243 snew(pme->work,pme->nthread);
3244 for(thread=0; thread<pme->nthread; thread++)
3246 realloc_work(&pme->work[thread],pme->nkx);
3256 static void copy_local_grid(gmx_pme_t pme,
3257 pmegrids_t *pmegrids,int thread,real *fftgrid)
3259 ivec local_fft_ndata,local_fft_offset,local_fft_size;
3263 int offx,offy,offz,x,y,z,i0,i0t;
3268 gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
3272 fft_my = local_fft_size[YY];
3273 fft_mz = local_fft_size[ZZ];
3275 pmegrid = &pmegrids->grid_th[thread];
3277 nsx = pmegrid->n[XX];
3278 nsy = pmegrid->n[YY];
3279 nsz = pmegrid->n[ZZ];
3281 for(d=0; d<DIM; d++)
3283 nf[d] = min(pmegrid->n[d] - (pmegrid->order - 1),
3284 local_fft_ndata[d] - pmegrid->offset[d]);
3287 offx = pmegrid->offset[XX];
3288 offy = pmegrid->offset[YY];
3289 offz = pmegrid->offset[ZZ];
3291 /* Directly copy the non-overlapping parts of the local grids.
3292 * This also initializes the full grid.
3294 grid_th = pmegrid->grid;
3295 for(x=0; x<nf[XX]; x++)
3297 for(y=0; y<nf[YY]; y++)
3299 i0 = ((offx + x)*fft_my + (offy + y))*fft_mz + offz;
3300 i0t = (x*nsy + y)*nsz;
3301 for(z=0; z<nf[ZZ]; z++)
3303 fftgrid[i0+z] = grid_th[i0t+z];
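/* The index arithmetic above is plain row-major 3D flattening: for a grid
 * stored as [x][y][z] with pitches (fft_my,fft_mz), element (x,y,z) lives
 * at (x*fft_my + y)*fft_mz + z; e.g. with fft_my=8, fft_mz=16 the point
 * (2,3,5) is at (2*8+3)*16+5 = 309. The thread-local grid uses its own
 * pitches (nsy,nsz), hence the separate index i0t.
 */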
3309 static void print_sendbuf(gmx_pme_t pme,real *sendbuf)
3311 ivec local_fft_ndata,local_fft_offset,local_fft_size;
3312 pme_overlap_t *overlap;
3316 gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
3320 /* Major dimension */
3321 overlap = &pme->overlap[0];
3323 nind = overlap->comm_data[0].send_nindex;
3325 for(y=0; y<local_fft_ndata[YY]; y++) {
3331 for(x=0; x<nind; x++) {
3332 for(y=0; y<local_fft_ndata[YY]; y++) {
3334 for(z=0; z<local_fft_ndata[ZZ]; z++) {
3335 if (sendbuf[i] != 0) n++;
3345 reduce_threadgrid_overlap(gmx_pme_t pme,
3346 const pmegrids_t *pmegrids,int thread,
3347 real *fftgrid,real *commbuf_x,real *commbuf_y)
3349 ivec local_fft_ndata,local_fft_offset,local_fft_size;
3350 int fft_nx,fft_ny,fft_nz;
3355 int offx,offy,offz,x,y,z,i0,i0t;
3356 int sx,sy,sz,fx,fy,fz,tx1,ty1,tz1,ox,oy,oz;
3357 gmx_bool bClearBufX,bClearBufY,bClearBufXY,bClearBuf;
3358 gmx_bool bCommX,bCommY;
3361 const pmegrid_t *pmegrid,*pmegrid_g,*pmegrid_f;
3362 const real *grid_th;
3365 gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
3369 fft_nx = local_fft_ndata[XX];
3370 fft_ny = local_fft_ndata[YY];
3371 fft_nz = local_fft_ndata[ZZ];
3373 fft_my = local_fft_size[YY];
3374 fft_mz = local_fft_size[ZZ];
3376     /* This routine is called when all threads have finished spreading.
3377 * Here each thread sums grid contributions calculated by other threads
3378 * to the thread local grid volume.
3379 * To minimize the number of grid copying operations,
3380      * this routine sums immediately from the pmegrid to the fftgrid.
3383 /* Determine which part of the full node grid we should operate on,
3384 * this is our thread local part of the full grid.
3386 pmegrid = &pmegrids->grid_th[thread];
3388 for(d=0; d<DIM; d++)
3390 ne[d] = min(pmegrid->offset[d]+pmegrid->n[d]-(pmegrid->order-1),
3391 local_fft_ndata[d]);
3394 offx = pmegrid->offset[XX];
3395 offy = pmegrid->offset[YY];
3396 offz = pmegrid->offset[ZZ];
3403 /* Now loop over all the thread data blocks that contribute
3404 * to the grid region we (our thread) are operating on.
3406     /* Note that fft_nx/ny is equal to the number of grid points
3407 * between the first point of our node grid and the one of the next node.
3409 for(sx=0; sx>=-pmegrids->nthread_comm[XX]; sx--)
3411 fx = pmegrid->ci[XX] + sx;
3415 fx += pmegrids->nc[XX];
3417 bCommX = (pme->nnodes_major > 1);
3419 pmegrid_g = &pmegrids->grid_th[fx*pmegrids->nc[YY]*pmegrids->nc[ZZ]];
3420 ox += pmegrid_g->offset[XX];
3423 tx1 = min(ox + pmegrid_g->n[XX],ne[XX]);
3427 tx1 = min(ox + pmegrid_g->n[XX],pme->pme_order);
3430 for(sy=0; sy>=-pmegrids->nthread_comm[YY]; sy--)
3432 fy = pmegrid->ci[YY] + sy;
3436 fy += pmegrids->nc[YY];
3438 bCommY = (pme->nnodes_minor > 1);
3440 pmegrid_g = &pmegrids->grid_th[fy*pmegrids->nc[ZZ]];
3441 oy += pmegrid_g->offset[YY];
3444 ty1 = min(oy + pmegrid_g->n[YY],ne[YY]);
3448 ty1 = min(oy + pmegrid_g->n[YY],pme->pme_order);
3451 for(sz=0; sz>=-pmegrids->nthread_comm[ZZ]; sz--)
3453 fz = pmegrid->ci[ZZ] + sz;
3457 fz += pmegrids->nc[ZZ];
3460 pmegrid_g = &pmegrids->grid_th[fz];
3461 oz += pmegrid_g->offset[ZZ];
3462 tz1 = min(oz + pmegrid_g->n[ZZ],ne[ZZ]);
3464 if (sx == 0 && sy == 0 && sz == 0)
3466 /* We have already added our local contribution
3467 * before calling this routine, so skip it here.
3472 thread_f = (fx*pmegrids->nc[YY] + fy)*pmegrids->nc[ZZ] + fz;
3474 pmegrid_f = &pmegrids->grid_th[thread_f];
3476 grid_th = pmegrid_f->grid;
3478 nsx = pmegrid_f->n[XX];
3479 nsy = pmegrid_f->n[YY];
3480 nsz = pmegrid_f->n[ZZ];
3482 #ifdef DEBUG_PME_REDUCE
3483 printf("n%d t%d add %d %2d %2d %2d %2d %2d %2d %2d-%2d %2d-%2d, %2d-%2d %2d-%2d, %2d-%2d %2d-%2d\n",
3484 pme->nodeid,thread,thread_f,
3485 pme->pmegrid_start_ix,
3486 pme->pmegrid_start_iy,
3487 pme->pmegrid_start_iz,
3489 offx-ox,tx1-ox,offx,tx1,
3490 offy-oy,ty1-oy,offy,ty1,
3491 offz-oz,tz1-oz,offz,tz1);
3494 if (!(bCommX || bCommY))
3496 /* Copy from the thread local grid to the node grid */
3497 for(x=offx; x<tx1; x++)
3499 for(y=offy; y<ty1; y++)
3501 i0 = (x*fft_my + y)*fft_mz;
3502 i0t = ((x - ox)*nsy + (y - oy))*nsz - oz;
3503 for(z=offz; z<tz1; z++)
3505 fftgrid[i0+z] += grid_th[i0t+z];
3512 /* The order of this conditional decides
3513 * where the corner volume gets stored with x+y decomp.
3517 commbuf = commbuf_y;
3518 buf_my = ty1 - offy;
3521 /* We index commbuf modulo the local grid size */
3522 commbuf += buf_my*fft_nx*fft_nz;
3524 bClearBuf = bClearBufXY;
3525 bClearBufXY = FALSE;
3529 bClearBuf = bClearBufY;
3535 commbuf = commbuf_x;
3537 bClearBuf = bClearBufX;
3541 /* Copy to the communication buffer */
3542 for(x=offx; x<tx1; x++)
3544 for(y=offy; y<ty1; y++)
3546 i0 = (x*buf_my + y)*fft_nz;
3547 i0t = ((x - ox)*nsy + (y - oy))*nsz - oz;
3551 /* First access of commbuf, initialize it */
3552 for(z=offz; z<tz1; z++)
3554 commbuf[i0+z] = grid_th[i0t+z];
3559 for(z=offz; z<tz1; z++)
3561 commbuf[i0+z] += grid_th[i0t+z];
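/* The bClearBuf logic above is a "first writer stores, later writers
 * accumulate" pattern; in isolation it looks like this (sketch only):
 *
 *   if (bClearBuf) { for(z=z0; z<z1; z++) { buf[z]  = src[z]; } }
 *   else           { for(z=z0; z<z1; z++) { buf[z] += src[z]; } }
 *
 * which avoids a separate pass that zeroes the communication buffers.
 */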
3573 static void sum_fftgrid_dd(gmx_pme_t pme,real *fftgrid)
3575 ivec local_fft_ndata,local_fft_offset,local_fft_size;
3576 pme_overlap_t *overlap;
3578 int recv_index0,recv_nindex;
3582 int ipulse,send_id,recv_id,datasize,gridsize,size_yx;
3583 real *sendptr,*recvptr;
3584 int x,y,z,indg,indb;
3586 /* Note that this routine is only used for forward communication.
3587 * Since the force gathering, unlike the charge spreading,
3588 * can be trivially parallelized over the particles,
3589 * the backwards process is much simpler and can use the "old"
3590 * communication setup.
3593 gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
3598 /* Currently supports only a single communication pulse */
3600 /* for(ipulse=0;ipulse<overlap->noverlap_nodes;ipulse++) */
3601 if (pme->nnodes_minor > 1)
3603         /* Minor dimension */
3604 overlap = &pme->overlap[1];
3606 if (pme->nnodes_major > 1)
3608 size_yx = pme->overlap[0].comm_data[0].send_nindex;
3614 datasize = (local_fft_ndata[XX]+size_yx)*local_fft_ndata[ZZ];
3618 send_id = overlap->send_id[ipulse];
3619 recv_id = overlap->recv_id[ipulse];
3620 send_nindex = overlap->comm_data[ipulse].send_nindex;
3621 /* recv_index0 = overlap->comm_data[ipulse].recv_index0; */
3623 recv_nindex = overlap->comm_data[ipulse].recv_nindex;
3625 sendptr = overlap->sendbuf;
3626 recvptr = overlap->recvbuf;
3629 printf("node %d comm %2d x %2d x %2d\n",pme->nodeid,
3630 local_fft_ndata[XX]+size_yx,send_nindex,local_fft_ndata[ZZ]);
3631 printf("node %d send %f, %f\n",pme->nodeid,
3632 sendptr[0],sendptr[send_nindex*datasize-1]);
3636 MPI_Sendrecv(sendptr,send_nindex*datasize,GMX_MPI_REAL,
3638 recvptr,recv_nindex*datasize,GMX_MPI_REAL,
3640 overlap->mpi_comm,&stat);
3643 for(x=0; x<local_fft_ndata[XX]; x++)
3645 for(y=0; y<recv_nindex; y++)
3647 indg = (x*local_fft_size[YY] + y)*local_fft_size[ZZ];
3648 indb = (x*recv_nindex + y)*local_fft_ndata[ZZ];
3649 for(z=0; z<local_fft_ndata[ZZ]; z++)
3651 fftgrid[indg+z] += recvptr[indb+z];
3655 if (pme->nnodes_major > 1)
3657 sendptr = pme->overlap[0].sendbuf;
3658 for(x=0; x<size_yx; x++)
3660 for(y=0; y<recv_nindex; y++)
3662 indg = (x*local_fft_ndata[YY] + y)*local_fft_ndata[ZZ];
3663 indb = ((local_fft_ndata[XX] + x)*recv_nindex +y)*local_fft_ndata[ZZ];
3664 for(z=0; z<local_fft_ndata[ZZ]; z++)
3666 sendptr[indg+z] += recvptr[indb+z];
3673 /* for(ipulse=0;ipulse<overlap->noverlap_nodes;ipulse++) */
3674 if (pme->nnodes_major > 1)
3676 /* Major dimension */
3677 overlap = &pme->overlap[0];
3679 datasize = local_fft_ndata[YY]*local_fft_ndata[ZZ];
3680 gridsize = local_fft_size[YY] *local_fft_size[ZZ];
3684 send_id = overlap->send_id[ipulse];
3685 recv_id = overlap->recv_id[ipulse];
3686 send_nindex = overlap->comm_data[ipulse].send_nindex;
3687 /* recv_index0 = overlap->comm_data[ipulse].recv_index0; */
3689 recv_nindex = overlap->comm_data[ipulse].recv_nindex;
3691 sendptr = overlap->sendbuf;
3692 recvptr = overlap->recvbuf;
3696 fprintf(debug,"PME fftgrid comm %2d x %2d x %2d\n",
3697 send_nindex,local_fft_ndata[YY],local_fft_ndata[ZZ]);
3701 MPI_Sendrecv(sendptr,send_nindex*datasize,GMX_MPI_REAL,
3703 recvptr,recv_nindex*datasize,GMX_MPI_REAL,
3705 overlap->mpi_comm,&stat);
3708 for(x=0; x<recv_nindex; x++)
3710 for(y=0; y<local_fft_ndata[YY]; y++)
3712 indg = (x*local_fft_size[YY] + y)*local_fft_size[ZZ];
3713 indb = (x*local_fft_ndata[YY] + y)*local_fft_ndata[ZZ];
3714 for(z=0; z<local_fft_ndata[ZZ]; z++)
3716 fftgrid[indg+z] += recvptr[indb+z];
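/* Note the two strides used above: indg walks the padded FFT grid
 * (local_fft_size pitches), while indb walks the packed receive buffer
 * (local_fft_ndata pitches). E.g. if ndata[ZZ]=20 but size[ZZ]=24, each
 * received row of 20 reals lands in a 24-real grid row, leaving the 4
 * padding entries untouched.
 */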
3724 static void spread_on_grid(gmx_pme_t pme,
3725 pme_atomcomm_t *atc,pmegrids_t *grids,
3726 gmx_bool bCalcSplines,gmx_bool bSpread,
3730 #ifdef PME_TIME_THREADS
3731 gmx_cycles_t c1,c2,c3,ct1a,ct1b,ct1c;
3732 static double cs1=0,cs2=0,cs3=0;
3733 static double cs1a[6]={0,0,0,0,0,0};
3737 nthread = pme->nthread;
3740 #ifdef PME_TIME_THREADS
3741 c1 = omp_cyc_start();
3745 #pragma omp parallel for num_threads(nthread) schedule(static)
3746 for(thread=0; thread<nthread; thread++)
3750 start = atc->n* thread /nthread;
3751 end = atc->n*(thread+1)/nthread;
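/* The static partition above gives thread t the index range
 * [n*t/nthread, n*(t+1)/nthread); e.g. n=10 and nthread=4 yield the
 * ranges 0-2, 2-5, 5-7, 7-10, which differ in size by at most one atom.
 */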
3753 /* Compute fftgrid index for all atoms,
3754 * with help of some extra variables.
3756 calc_interpolation_idx(pme,atc,start,end,thread);
3759 #ifdef PME_TIME_THREADS
3760 c1 = omp_cyc_end(c1);
3764 #ifdef PME_TIME_THREADS
3765 c2 = omp_cyc_start();
3767 #pragma omp parallel for num_threads(nthread) schedule(static)
3768 for(thread=0; thread<nthread; thread++)
3770 splinedata_t *spline;
3773 /* make local bsplines */
3774 if (grids == NULL || grids->nthread == 1)
3776 spline = &atc->spline[0];
3780 grid = &grids->grid;
3784 spline = &atc->spline[thread];
3786 make_thread_local_ind(atc,thread,spline);
3788 grid = &grids->grid_th[thread];
3793 make_bsplines(spline->theta,spline->dtheta,pme->pme_order,
3794 atc->fractx,spline->n,spline->ind,atc->q,pme->bFEP);
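/* For reference, make_bsplines evaluates the order-n cardinal B-spline
 * weights by the standard recursion (a sketch, not the exact code):
 *
 *   M_2(x) = 1 - |x - 1|                                for 0 <= x <= 2
 *   M_n(x) = ( x*M_{n-1}(x) + (n - x)*M_{n-1}(x-1) ) / (n - 1)
 *
 * so for pme_order=4 each charge gets 4 weights per dimension from its
 * grid fraction, plus the derivatives dtheta used in force gathering.
 */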
3799 /* put local atoms on grid. */
3800 #ifdef PME_TIME_SPREAD
3801 ct1a = omp_cyc_start();
3803 spread_q_bsplines_thread(grid,atc,spline,pme->spline_work);
3805 if (grids->nthread > 1)
3807 copy_local_grid(pme,grids,thread,fftgrid);
3809 #ifdef PME_TIME_SPREAD
3810 ct1a = omp_cyc_end(ct1a);
3811 cs1a[thread] += (double)ct1a;
3815 #ifdef PME_TIME_THREADS
3816 c2 = omp_cyc_end(c2);
3820 if (bSpread && grids->nthread > 1)
3822 #ifdef PME_TIME_THREADS
3823 c3 = omp_cyc_start();
3825 #pragma omp parallel for num_threads(grids->nthread) schedule(static)
3826 for(thread=0; thread<grids->nthread; thread++)
3828 reduce_threadgrid_overlap(pme,grids,thread,
3830 pme->overlap[0].sendbuf,
3831 pme->overlap[1].sendbuf);
3832 #ifdef PRINT_PME_SENDBUF
3833 print_sendbuf(pme,pme->overlap[0].sendbuf);
3836 #ifdef PME_TIME_THREADS
3837 c3 = omp_cyc_end(c3);
3841 if (pme->nnodes > 1)
3843 /* Communicate the overlapping part of the fftgrid */
3844 sum_fftgrid_dd(pme,fftgrid);
3848 #ifdef PME_TIME_THREADS
3852 printf("idx %.2f spread %.2f red %.2f",
3853 cs1*1e-9,cs2*1e-9,cs3*1e-9);
3854 #ifdef PME_TIME_SPREAD
3855 for(thread=0; thread<nthread; thread++)
3856 printf(" %.2f",cs1a[thread]*1e-9);
3864 static void dump_grid(FILE *fp,
3865 int sx,int sy,int sz,int nx,int ny,int nz,
3866 int my,int mz,const real *g)
3876 fprintf(fp,"%2d %2d %2d %6.3f\n",
3877 sx+x,sy+y,sz+z,g[(x*my + y)*mz + z]);
3883 static void dump_local_fftgrid(gmx_pme_t pme,const real *fftgrid)
3885 ivec local_fft_ndata,local_fft_offset,local_fft_size;
3887 gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
3893 pme->pmegrid_start_ix,
3894 pme->pmegrid_start_iy,
3895 pme->pmegrid_start_iz,
3896 pme->pmegrid_nx-pme->pme_order+1,
3897 pme->pmegrid_ny-pme->pme_order+1,
3898 pme->pmegrid_nz-pme->pme_order+1,
3905 void gmx_pme_calc_energy(gmx_pme_t pme,int n,rvec *x,real *q,real *V)
3907 pme_atomcomm_t *atc;
3910 if (pme->nnodes > 1)
3912 gmx_incons("gmx_pme_calc_energy called in parallel");
3916 gmx_incons("gmx_pme_calc_energy with free energy");
3919 atc = &pme->atc_energy;
3921 if (atc->spline == NULL)
3923 snew(atc->spline,atc->nthread);
3926 atc->bSpread = TRUE;
3927 atc->pme_order = pme->pme_order;
3929 pme_realloc_atomcomm_things(atc);
3933 /* We only use the A-charges grid */
3934 grid = &pme->pmegridA;
3936 spread_on_grid(pme,atc,NULL,TRUE,FALSE,pme->fftgridA);
3938 *V = gather_energy_bsplines(pme,grid->grid.grid,atc);
3942 static void reset_pmeonly_counters(t_commrec *cr,gmx_wallcycle_t wcycle,
3943 t_nrnb *nrnb,t_inputrec *ir, gmx_large_int_t step_rel)
3945 /* Reset all the counters related to performance over the run */
3946 wallcycle_stop(wcycle,ewcRUN);
3947 wallcycle_reset_all(wcycle);
3949 ir->init_step += step_rel;
3950 ir->nsteps -= step_rel;
3951 wallcycle_start(wcycle,ewcRUN);
3955 int gmx_pmeonly(gmx_pme_t pme,
3956 t_commrec *cr, t_nrnb *nrnb,
3957 gmx_wallcycle_t wcycle,
3958 real ewaldcoeff, gmx_bool bGatherOnly,
3961 gmx_pme_pp_t pme_pp;
3964 rvec *x_pp=NULL,*f_pp=NULL;
3965 real *chargeA=NULL,*chargeB=NULL;
3967 int maxshift_x=0,maxshift_y=0;
3968 real energy,dvdlambda;
3973 gmx_large_int_t step,step_rel;
3976 pme_pp = gmx_pme_pp_init(cr);
3981 do /****** this is a quasi-loop over time steps! */
3983 /* Domain decomposition */
3984 natoms = gmx_pme_recv_q_x(pme_pp,
3985 &chargeA,&chargeB,box,&x_pp,&f_pp,
3986 &maxshift_x,&maxshift_y,
3992 /* We should stop: break out of the loop */
3996 step_rel = step - ir->init_step;
3999 wallcycle_start(wcycle,ewcRUN);
4001 wallcycle_start(wcycle,ewcPMEMESH);
4005 gmx_pme_do(pme,0,natoms,x_pp,f_pp,chargeA,chargeB,box,
4006 cr,maxshift_x,maxshift_y,nrnb,wcycle,vir,ewaldcoeff,
4007 &energy,lambda,&dvdlambda,
4008 GMX_PME_DO_ALL_F | (bEnerVir ? GMX_PME_CALC_ENER_VIR : 0));
4010 cycles = wallcycle_stop(wcycle,ewcPMEMESH);
4012 gmx_pme_send_force_vir_ener(pme_pp,
4013 f_pp,vir,energy,dvdlambda,
4018 if (step_rel == wcycle_get_reset_counters(wcycle))
4020 /* Reset all the counters related to performance over the run */
4021 reset_pmeonly_counters(cr,wcycle,nrnb,ir,step_rel);
4022 wcycle_set_reset_counters(wcycle, 0);
4025 } /***** end of quasi-loop, we stop with the break above */
4031 int gmx_pme_do(gmx_pme_t pme,
4032 int start, int homenr,
4034 real *chargeA, real *chargeB,
4035 matrix box, t_commrec *cr,
4036 int maxshift_x, int maxshift_y,
4037 t_nrnb *nrnb, gmx_wallcycle_t wcycle,
4038 matrix vir, real ewaldcoeff,
4039 real *energy, real lambda,
4040 real *dvdlambda, int flags)
4042 int q,d,i,j,ntot,npme;
4045 pme_atomcomm_t *atc=NULL;
4046 pmegrids_t *pmegrid=NULL;
4050 real *charge=NULL,*q_d;
4054 gmx_parallel_3dfft_t pfft_setup;
4056 t_complex * cfftgrid;
4058 const gmx_bool bCalcEnerVir = flags & GMX_PME_CALC_ENER_VIR;
4059 const gmx_bool bCalcF = flags & GMX_PME_CALC_F;
4061 assert(pme->nnodes > 0);
4062 assert(pme->nnodes == 1 || pme->ndecompdim > 0);
4064 if (pme->nnodes > 1) {
4067 if (atc->npd > atc->pd_nalloc) {
4068 atc->pd_nalloc = over_alloc_dd(atc->npd);
4069 srenew(atc->pd,atc->pd_nalloc);
4071 atc->maxshift = (atc->dimind==0 ? maxshift_x : maxshift_y);
4075 /* This could be necessary for TPI */
4076 pme->atc[0].n = homenr;
4079 for(q=0; q<(pme->bFEP ? 2 : 1); q++) {
4081 pmegrid = &pme->pmegridA;
4082 fftgrid = pme->fftgridA;
4083 cfftgrid = pme->cfftgridA;
4084 pfft_setup = pme->pfft_setupA;
4085 charge = chargeA+start;
4087 pmegrid = &pme->pmegridB;
4088 fftgrid = pme->fftgridB;
4089 cfftgrid = pme->cfftgridB;
4090 pfft_setup = pme->pfft_setupB;
4091 charge = chargeB+start;
4093 grid = pmegrid->grid.grid;
4094 /* Unpack structure */
4096 fprintf(debug,"PME: nnodes = %d, nodeid = %d\n",
4097 cr->nnodes,cr->nodeid);
4098 fprintf(debug,"Grid = %p\n",(void*)grid);
4100 gmx_fatal(FARGS,"No grid!");
4104 m_inv_ur0(box,pme->recipbox);
4106 if (pme->nnodes == 1) {
4108 if (DOMAINDECOMP(cr)) {
4110 pme_realloc_atomcomm_things(atc);
4116 wallcycle_start(wcycle,ewcPME_REDISTXF);
4117 for(d=pme->ndecompdim-1; d>=0; d--)
4119 if (d == pme->ndecompdim-1)
4127 n_d = pme->atc[d+1].n;
4133 if (atc->npd > atc->pd_nalloc) {
4134 atc->pd_nalloc = over_alloc_dd(atc->npd);
4135 srenew(atc->pd,atc->pd_nalloc);
4137 atc->maxshift = (atc->dimind==0 ? maxshift_x : maxshift_y);
4138 pme_calc_pidx_wrapper(n_d,pme->recipbox,x_d,atc);
4141 /* Redistribute x (only once) and qA or qB */
4142 if (DOMAINDECOMP(cr)) {
4143 dd_pmeredist_x_q(pme, n_d, q==0, x_d, q_d, atc);
4145 pmeredist_pd(pme, TRUE, n_d, q==0, x_d, q_d, atc);
4150 wallcycle_stop(wcycle,ewcPME_REDISTXF);
4154 fprintf(debug,"Node= %6d, pme local particles=%6d\n",
4157 if (flags & GMX_PME_SPREAD_Q)
4159 wallcycle_start(wcycle,ewcPME_SPREADGATHER);
4161 /* Spread the charges on a grid */
4162 spread_on_grid(pme,&pme->atc[0],pmegrid,q==0,TRUE,fftgrid);
4166 inc_nrnb(nrnb,eNR_WEIGHTS,DIM*atc->n);
4168 inc_nrnb(nrnb,eNR_SPREADQBSP,
4169 pme->pme_order*pme->pme_order*pme->pme_order*atc->n);
4171 if (pme->nthread == 1)
4173 wrap_periodic_pmegrid(pme,grid);
4175 /* sum contributions to local grid from other nodes */
4177 if (pme->nnodes > 1)
4179 gmx_sum_qgrid_dd(pme,grid,GMX_SUM_QGRID_FORWARD);
4184 copy_pmegrid_to_fftgrid(pme,grid,fftgrid);
4187 wallcycle_stop(wcycle,ewcPME_SPREADGATHER);
4190 dump_local_fftgrid(pme,fftgrid);
4195 /* Here we start a large thread parallel region */
4196 #pragma omp parallel for num_threads(pme->nthread) schedule(static)
4197 for(thread=0; thread<pme->nthread; thread++)
4199 if (flags & GMX_PME_SOLVE)
4206 wallcycle_start(wcycle,ewcPME_FFT);
4208 gmx_parallel_3dfft_execute(pfft_setup,GMX_FFT_REAL_TO_COMPLEX,
4209 fftgrid,cfftgrid,thread,wcycle);
4212 wallcycle_stop(wcycle,ewcPME_FFT);
4216 /* solve in k-space for our local cells */
4219 wallcycle_start(wcycle,ewcPME_SOLVE);
4222 solve_pme_yzx(pme,cfftgrid,ewaldcoeff,
4223 box[XX][XX]*box[YY][YY]*box[ZZ][ZZ],
4225 pme->nthread,thread);
4228 wallcycle_stop(wcycle,ewcPME_SOLVE);
4230 inc_nrnb(nrnb,eNR_SOLVEPME,loop_count);
4240 wallcycle_start(wcycle,ewcPME_FFT);
4242 gmx_parallel_3dfft_execute(pfft_setup,GMX_FFT_COMPLEX_TO_REAL,
4243 cfftgrid,fftgrid,thread,wcycle);
4246 wallcycle_stop(wcycle,ewcPME_FFT);
4250 if (pme->nodeid == 0)
4252 ntot = pme->nkx*pme->nky*pme->nkz;
4253 npme = ntot*log((real)ntot)/log(2.0);
4254 inc_nrnb(nrnb,eNR_FFT,2*npme);
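/* Worked example of the flop-count estimate above: a 64x64x64 grid has
 * ntot = 262144 points and log2(ntot) = 18, so npme is about 4.7e6 per
 * 3D FFT; the factor 2 counts the forward plus the backward transform.
 */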
4257 wallcycle_start(wcycle,ewcPME_SPREADGATHER);
4260 copy_fftgrid_to_pmegrid(pme,fftgrid,grid,pme->nthread,thread);
4263 /* End of thread parallel section.
4264 * With MPI we have to synchronize here before gmx_sum_qgrid_dd.
4269 /* distribute local grid to all nodes */
4271 if (pme->nnodes > 1) {
4272 gmx_sum_qgrid_dd(pme,grid,GMX_SUM_QGRID_BACKWARD);
4277 unwrap_periodic_pmegrid(pme,grid);
4279 /* interpolate forces for our local atoms */
4283 /* If we are running without parallelization,
4284 * atc->f is the actual force array, not a buffer,
4285 * therefore we should not clear it.
4287 bClearF = (q == 0 && PAR(cr));
4288 #pragma omp parallel for num_threads(pme->nthread) schedule(static)
4289 for(thread=0; thread<pme->nthread; thread++)
4291 gather_f_bsplines(pme,grid,bClearF,atc,
4292 &atc->spline[thread],
4293 pme->bFEP ? (q==0 ? 1.0-lambda : lambda) : 1.0);
4298 inc_nrnb(nrnb,eNR_GATHERFBSP,
4299 pme->pme_order*pme->pme_order*pme->pme_order*pme->atc[0].n);
4300 wallcycle_stop(wcycle,ewcPME_SPREADGATHER);
4305 /* This should only be called on the master thread
4306 * and after the threads have synchronized.
4308 get_pme_ener_vir(pme,pme->nthread,&energy_AB[q],vir_AB[q]);
4312 if (bCalcF && pme->nnodes > 1) {
4313 wallcycle_start(wcycle,ewcPME_REDISTXF);
4314 for(d=0; d<pme->ndecompdim; d++)
4317 if (d == pme->ndecompdim - 1)
4324 n_d = pme->atc[d+1].n;
4325 f_d = pme->atc[d+1].f;
4327 if (DOMAINDECOMP(cr)) {
4328 dd_pmeredist_f(pme,atc,n_d,f_d,
4329 d==pme->ndecompdim-1 && pme->bPPnode);
4331 pmeredist_pd(pme, FALSE, n_d, TRUE, f_d, NULL, atc);
4335 wallcycle_stop(wcycle,ewcPME_REDISTXF);
4342 *energy = energy_AB[0];
4343 m_add(vir,vir_AB[0],vir);
4345 *energy = (1.0-lambda)*energy_AB[0] + lambda*energy_AB[1];
4346 *dvdlambda += energy_AB[1] - energy_AB[0];
4347 for(i=0; i<DIM; i++)
4349 for(j=0; j<DIM; j++)
4351 vir[i][j] += (1.0-lambda)*vir_AB[0][i][j] +
4352 lambda*vir_AB[1][i][j];
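/* The mixing above is the standard linear free-energy interpolation of
 * the two end states computed on the A and B charge grids:
 *
 *   E(lambda)   = (1-lambda)*E_A + lambda*E_B
 *   dE/dlambda  = E_B - E_A
 *   vir(lambda) = (1-lambda)*vir_A + lambda*vir_B
 */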
4364 fprintf(debug,"PME mesh energy: %g\n",*energy);