1 /* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
4 * This source code is part of
8 * GROningen MAchine for Chemical Simulations
11 * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
12 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
13 * Copyright (c) 2001-2004, The GROMACS development team,
14 * check out http://www.gromacs.org for more information.
16 * This program is free software; you can redistribute it and/or
17 * modify it under the terms of the GNU General Public License
18 * as published by the Free Software Foundation; either version 2
19 * of the License, or (at your option) any later version.
21 * If you want to redistribute modifications, please consider that
22 * scientific software is very special. Version control is crucial -
23 * bugs must be traceable. We will be happy to consider code for
24 * inclusion in the official distribution, but derived work must not
25 * be called official GROMACS. Details are found in the README & COPYING
26 * files - if they are missing, get the official version at www.gromacs.org.
28 * To help us fund GROMACS development, we humbly ask that you cite
29 * the papers on the package - you can find them in the top README file.
31 * For more info, check our website at http://www.gromacs.org
34 * GROwing Monsters And Cloning Shrimps
36 /* IMPORTANT FOR DEVELOPERS:
38 * Triclinic pme stuff isn't entirely trivial, and we've experienced
39 * some bugs during development (many of them due to me). To avoid
40 * this in the future, please check the following things if you make
41 * changes in this file:
43 * 1. You should obtain identical (at least to the PME precision)
44 * energies, forces, and virial for
45 * a rectangular box and a triclinic one where the z (or y) axis is
46 * tilted by a whole box side. For instance you could use these boxes:
48 * rectangular triclinic
53 * 2. You should check the energy conservation in a triclinic box.
55 * It might seem like overkill, but better safe than sorry.
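*    As an illustration only (not necessarily the original example values):
*    a cubic box diag(L,L,L) and the same box with its third vector replaced
*    by (L,0,L), i.e. the z axis tilted by a whole box side in x, form such
*    a rectangular/triclinic test pair.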
77 #include "gmxcomplex.h"
81 #include "gmx_fatal.h"
87 #include "gmx_wallcycle.h"
88 #include "gmx_parallel_3dfft.h"
90 #include "gmx_cyclecounter.h"
93 /* Single precision, with SSE2 or higher available */
94 #if defined(GMX_X86_SSE2) && !defined(GMX_DOUBLE)
96 #include "gmx_x86_sse2.h"
97 #include "gmx_math_x86_sse2_single.h"
100 /* Some old AMD processors could have problems with unaligned loads+stores */
102 #define PME_SSE_UNALIGNED
107 /* #define PRT_FORCE */
108 /* conditions for on-the-fly time measurement */
109 /* #define TAKETIME (step > 1 && timesteps < 10) */
110 #define TAKETIME FALSE
112 /* #define PME_TIME_THREADS */
115 #define mpi_type MPI_DOUBLE
117 #define mpi_type MPI_FLOAT
120 /* GMX_CACHE_SEP should be a multiple of 16 to preserve alignment */
121 #define GMX_CACHE_SEP 64
123 /* We only define a maximum to be able to use local arrays without allocation.
124 * An order larger than 12 should never be needed, even for test cases.
125 * If needed it can be changed here.
127 #define PME_ORDER_MAX 12
129 /* Internal datastructures */
145 int *send_id,*recv_id;
146 pme_grid_comm_t *comm_data;
152 int *n; /* Cumulative counts of the number of particles per thread */
153 int nalloc; /* Allocation size of i */
154 int *i; /* Particle indices ordered on thread index (n) */
165 int dimind; /* The index of the dimension, 0=x, 1=y */
172 int *node_dest; /* The nodes to send x and q to with DD */
173 int *node_src; /* The nodes to receive x and q from with DD */
174 int *buf_index; /* Index for commnode into the buffers */
181 int *count; /* The number of atoms to send to each node */
183 int *rcount; /* The number of atoms to receive */
190 gmx_bool bSpread; /* These coordinates are used for spreading */
193 rvec *fractx; /* Fractional coordinate relative to the
194 * lower cell boundary
197 int *thread_idx; /* Which thread should spread which charge */
198 thread_plist_t *thread_plist;
199 splinedata_t *spline;
206 ivec ci; /* The spatial location of this grid */
207 ivec n; /* The size of *grid, including order-1 */
208 ivec offset; /* The grid offset from the full node grid */
209 int order; /* PME spreading order */
210 real *grid; /* The local grid for this thread, size n */
214 pmegrid_t grid; /* The full node grid (non thread-local) */
215 int nthread; /* The number of threads operating on this grid */
216 ivec nc; /* The local spatial decomposition over the threads */
217 pmegrid_t *grid_th; /* Array of grids for each thread */
218 int **g2t; /* The grid to thread index */
219 ivec nthread_comm; /* The number of threads to communicate with */
225 /* Masks for SSE aligned spreading and gathering */
226 __m128 mask_SSE0[6],mask_SSE1[6];
228 int dummy; /* C89 requires that struct has at least one member */
233 /* work data for solve_pme */
249 typedef struct gmx_pme {
250 int ndecompdim; /* The number of decomposition dimensions */
251 int nodeid; /* Our nodeid in mpi->mpi_comm */
254 int nnodes; /* The number of nodes doing PME */
259 MPI_Comm mpi_comm_d[2]; /* Indexed on dimension, 0=x, 1=y */
261 MPI_Datatype rvec_mpi; /* the pme vector's MPI type */
264 int nthread; /* The number of threads doing PME */
266 gmx_bool bPPnode; /* Node also does particle-particle forces */
267 gmx_bool bFEP; /* Compute Free energy contribution */
268 int nkx,nky,nkz; /* Grid dimensions */
269 gmx_bool bP3M; /* Do P3M: optimize the influence function */
273 pmegrids_t pmegridA; /* Grids on which we do spreading/interpolation, includes overlap */
275 /* The PME charge spreading grid sizes/strides, includes pme_order-1 */
276 int pmegrid_nx,pmegrid_ny,pmegrid_nz;
277 /* pmegrid_nz might be larger than strictly necessary to ensure
278 * memory alignment, pmegrid_nz_base gives the real base size.
281 /* The local PME grid starting indices */
282 int pmegrid_start_ix,pmegrid_start_iy,pmegrid_start_iz;
284 /* Work data for spreading and gathering */
285 pme_spline_work_t spline_work;
287 real *fftgridA; /* Grids for FFT. With 1D FFT decomposition this can be a pointer */
288 real *fftgridB; /* inside the interpolation grid, but separate for 2D PME decomp. */
289 int fftgrid_nx,fftgrid_ny,fftgrid_nz;
291 t_complex *cfftgridA; /* Grids for complex FFT data */
292 t_complex *cfftgridB;
293 int cfftgrid_nx,cfftgrid_ny,cfftgrid_nz;
295 gmx_parallel_3dfft_t pfft_setupA;
296 gmx_parallel_3dfft_t pfft_setupB;
299 real *fshx,*fshy,*fshz;
301 pme_atomcomm_t atc[2]; /* Indexed on decomposition index */
305 pme_overlap_t overlap[2]; /* Indexed on dimension, 0=x, 1=y */
307 pme_atomcomm_t atc_energy; /* Only for gmx_pme_calc_energy */
309 rvec *bufv; /* Communication buffer */
310 real *bufr; /* Communication buffer */
311 int buf_nalloc; /* The communication buffer size */
313 /* thread local work data for solve_pme */
316 /* Work data for PME_redist */
317 gmx_bool redist_init;
325 int redist_buf_nalloc;
327 /* Work data for sum_qgrid */
328 real * sum_qgrid_tmp;
329 real * sum_qgrid_dd_tmp;
333 static void calc_interpolation_idx(gmx_pme_t pme,pme_atomcomm_t *atc,
334 int start,int end,int thread)
337 int *idxptr,tix,tiy,tiz;
338 real *xptr,*fptr,tx,ty,tz;
339 real rxx,ryx,ryy,rzx,rzy,rzz;
341 int start_ix,start_iy,start_iz;
342 int *g2tx,*g2ty,*g2tz;
344 int *thread_idx=NULL;
345 thread_plist_t *tpl=NULL;
353 start_ix = pme->pmegrid_start_ix;
354 start_iy = pme->pmegrid_start_iy;
355 start_iz = pme->pmegrid_start_iz;
357 rxx = pme->recipbox[XX][XX];
358 ryx = pme->recipbox[YY][XX];
359 ryy = pme->recipbox[YY][YY];
360 rzx = pme->recipbox[ZZ][XX];
361 rzy = pme->recipbox[ZZ][YY];
362 rzz = pme->recipbox[ZZ][ZZ];
364 g2tx = pme->pmegridA.g2t[XX];
365 g2ty = pme->pmegridA.g2t[YY];
366 g2tz = pme->pmegridA.g2t[ZZ];
368 bThreads = (atc->nthread > 1);
371 thread_idx = atc->thread_idx;
373 tpl = &atc->thread_plist[thread];
375 for(i=0; i<atc->nthread; i++)
381 for(i=start; i<end; i++) {
383 idxptr = atc->idx[i];
384 fptr = atc->fractx[i];
386 /* Fractional coordinates along box vectors, add 2.0 to make 100% sure we are positive for triclinic boxes */
387 tx = nx * ( xptr[XX] * rxx + xptr[YY] * ryx + xptr[ZZ] * rzx + 2.0 );
388 ty = ny * ( xptr[YY] * ryy + xptr[ZZ] * rzy + 2.0 );
389 tz = nz * ( xptr[ZZ] * rzz + 2.0 );
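/* The fractional coordinate along a box vector can be slightly negative for
 * triclinic boxes; the +2.0 keeps the argument of the integer truncation that
 * follows positive, and the two-box shift it introduces is absorbed by the
 * nnx/nny/nnz lookup tables, which span several box images
 * (see make_gridindex5_to_localindex).
 */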
395 /* Because decomposition only occurs in x and y,
396 * we never have a fraction correction in z.
398 fptr[XX] = tx - tix + pme->fshx[tix];
399 fptr[YY] = ty - tiy + pme->fshy[tiy];
402 idxptr[XX] = pme->nnx[tix];
403 idxptr[YY] = pme->nny[tiy];
404 idxptr[ZZ] = pme->nnz[tiz];
407 range_check(idxptr[XX],0,pme->pmegrid_nx);
408 range_check(idxptr[YY],0,pme->pmegrid_ny);
409 range_check(idxptr[ZZ],0,pme->pmegrid_nz);
414 thread_i = g2tx[idxptr[XX]] + g2ty[idxptr[YY]] + g2tz[idxptr[ZZ]];
415 thread_idx[i] = thread_i;
422 /* Make a list of particle indices sorted on thread */
424 /* Get the cumulative count */
425 for(i=1; i<atc->nthread; i++)
427 tpl_n[i] += tpl_n[i-1];
429 /* The current implementation distributes particles equally
430 * over the threads, so we could actually allocate for that
431 * in pme_realloc_atomcomm_things.
433 if (tpl_n[atc->nthread-1] > tpl->nalloc)
435 tpl->nalloc = over_alloc_large(tpl_n[atc->nthread-1]);
436 srenew(tpl->i,tpl->nalloc);
438 /* Set tpl_n to the cumulative start */
439 for(i=atc->nthread-1; i>=1; i--)
441 tpl_n[i] = tpl_n[i-1];
445 /* Fill our thread local array with indices sorted on thread */
446 for(i=start; i<end; i++)
448 tpl->i[tpl_n[atc->thread_idx[i]]++] = i;
450 /* Now tpl_n contains the cumulative count again */
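/* In effect this is a counting sort on the thread index: the first pass
 * counted the particles per destination thread, the prefix sum turned those
 * counts into bucket start offsets, and the loop above scattered each
 * particle index into its bucket while advancing the offsets.
 */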
454 static void make_thread_local_ind(pme_atomcomm_t *atc,
455 int thread,splinedata_t *spline)
460 /* Combine the indices made by each thread into one index */
464 for(t=0; t<atc->nthread; t++)
466 tpl = &atc->thread_plist[t];
467 /* Copy our part (start - end) from the list of thread t */
470 start = tpl->n[thread-1];
472 end = tpl->n[thread];
473 for(i=start; i<end; i++)
475 spline->ind[n++] = tpl->i[i];
483 static void pme_calc_pidx(int start, int end,
484 matrix recipbox, rvec x[],
485 pme_atomcomm_t *atc, int *count)
490 real rxx,ryx,rzx,ryy,rzy;
493 /* Calculate the PME task index (pidx) for each particle.
494 * Here we always assign equally sized slabs to each node
495 * for load balancing reasons (the PME grid spacing is not used).
501 /* Reset the count */
502 for(i=0; i<nslab; i++)
507 if (atc->dimind == 0)
509 rxx = recipbox[XX][XX];
510 ryx = recipbox[YY][XX];
511 rzx = recipbox[ZZ][XX];
512 /* Calculate the node index in x-dimension */
513 for(i=start; i<end; i++)
516 /* Fractional coordinates along box vectors */
517 s = nslab*(xptr[XX]*rxx + xptr[YY]*ryx + xptr[ZZ]*rzx);
518 si = (int)(s + 2*nslab) % nslab;
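/* s can be slightly negative for triclinic boxes, so 2*nslab is added before
 * the truncating cast to keep the operand of the modulo non-negative
 * (in C the sign of % follows the dividend).
 */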
525 ryy = recipbox[YY][YY];
526 rzy = recipbox[ZZ][YY];
527 /* Calculate the node index in y-dimension */
528 for(i=start; i<end; i++)
531 /* Fractional coordinates along box vectors */
532 s = nslab*(xptr[YY]*ryy + xptr[ZZ]*rzy);
533 si = (int)(s + 2*nslab) % nslab;
540 static void pme_calc_pidx_wrapper(int natoms, matrix recipbox, rvec x[],
543 int nthread,thread,slab;
545 nthread = atc->nthread;
547 #pragma omp parallel for num_threads(nthread) schedule(static)
548 for(thread=0; thread<nthread; thread++)
550 pme_calc_pidx(natoms* thread /nthread,
551 natoms*(thread+1)/nthread,
552 recipbox,x,atc,atc->count_thread[thread]);
554 /* Non-parallel reduction, since nslab is small */
556 for(thread=1; thread<nthread; thread++)
558 for(slab=0; slab<atc->nslab; slab++)
560 atc->count_thread[0][slab] += atc->count_thread[thread][slab];
565 static void pme_realloc_splinedata(splinedata_t *spline, pme_atomcomm_t *atc)
569 srenew(spline->ind,atc->nalloc);
570 /* Initialize the index to identity so it works without threads */
571 for(i=0; i<atc->nalloc; i++)
578 srenew(spline->theta[d] ,atc->pme_order*atc->nalloc);
579 srenew(spline->dtheta[d],atc->pme_order*atc->nalloc);
583 static void pme_realloc_atomcomm_things(pme_atomcomm_t *atc)
585 int nalloc_old,i,j,nalloc_tpl;
587 /* We have to avoid a NULL pointer for atc->x to prevent
588 * possible fatal errors in MPI routines.
590 if (atc->n > atc->nalloc || atc->nalloc == 0)
592 nalloc_old = atc->nalloc;
593 atc->nalloc = over_alloc_dd(max(atc->n,1));
595 if (atc->nslab > 1) {
596 srenew(atc->x,atc->nalloc);
597 srenew(atc->q,atc->nalloc);
598 srenew(atc->f,atc->nalloc);
599 for(i=nalloc_old; i<atc->nalloc; i++)
601 clear_rvec(atc->f[i]);
605 srenew(atc->fractx,atc->nalloc);
606 srenew(atc->idx ,atc->nalloc);
608 if (atc->nthread > 1)
610 srenew(atc->thread_idx,atc->nalloc);
613 for(i=0; i<atc->nthread; i++)
615 pme_realloc_splinedata(&atc->spline[i],atc);
621 static void pmeredist_pd(gmx_pme_t pme, gmx_bool forw,
622 int n, gmx_bool bXF, rvec *x_f, real *charge,
624 /* Redistribute particle data for PME calculation */
625 /* domain decomposition by x coordinate */
630 if(FALSE == pme->redist_init) {
631 snew(pme->scounts,atc->nslab);
632 snew(pme->rcounts,atc->nslab);
633 snew(pme->sdispls,atc->nslab);
634 snew(pme->rdispls,atc->nslab);
635 snew(pme->sidx,atc->nslab);
636 pme->redist_init = TRUE;
638 if (n > pme->redist_buf_nalloc) {
639 pme->redist_buf_nalloc = over_alloc_dd(n);
640 srenew(pme->redist_buf,pme->redist_buf_nalloc*DIM);
647 /* forward, redistribution from pp to pme */
649 /* Calculate send counts and exchange them with other nodes */
650 for(i=0; (i<atc->nslab); i++) pme->scounts[i]=0;
651 for(i=0; (i<n); i++) pme->scounts[pme->idxa[i]]++;
652 MPI_Alltoall( pme->scounts, 1, MPI_INT, pme->rcounts, 1, MPI_INT, atc->mpi_comm);
654 /* Calculate send and receive displacements and index into send
659 for(i=1; i<atc->nslab; i++) {
660 pme->sdispls[i]=pme->sdispls[i-1]+pme->scounts[i-1];
661 pme->rdispls[i]=pme->rdispls[i-1]+pme->rcounts[i-1];
662 pme->sidx[i]=pme->sdispls[i];
664 /* Total # of particles to be received */
665 atc->n = pme->rdispls[atc->nslab-1] + pme->rcounts[atc->nslab-1];
667 pme_realloc_atomcomm_things(atc);
669 /* Copy particle coordinates into send buffer and exchange */
670 for(i=0; (i<n); i++) {
671 ii=DIM*pme->sidx[pme->idxa[i]];
672 pme->sidx[pme->idxa[i]]++;
673 pme->redist_buf[ii+XX]=x_f[i][XX];
674 pme->redist_buf[ii+YY]=x_f[i][YY];
675 pme->redist_buf[ii+ZZ]=x_f[i][ZZ];
677 MPI_Alltoallv(pme->redist_buf, pme->scounts, pme->sdispls,
678 pme->rvec_mpi, atc->x, pme->rcounts, pme->rdispls,
679 pme->rvec_mpi, atc->mpi_comm);
682 /* Copy charge into send buffer and exchange */
683 for(i=0; i<atc->nslab; i++) pme->sidx[i]=pme->sdispls[i];
684 for(i=0; (i<n); i++) {
685 ii=pme->sidx[pme->idxa[i]];
686 pme->sidx[pme->idxa[i]]++;
687 pme->redist_buf[ii]=charge[i];
689 MPI_Alltoallv(pme->redist_buf, pme->scounts, pme->sdispls, mpi_type,
690 atc->q, pme->rcounts, pme->rdispls, mpi_type,
693 else { /* backward, redistribution from pme to pp */
694 MPI_Alltoallv(atc->f, pme->rcounts, pme->rdispls, pme->rvec_mpi,
695 pme->redist_buf, pme->scounts, pme->sdispls,
696 pme->rvec_mpi, atc->mpi_comm);
698 /* Copy data from receive buffer */
699 for(i=0; i<atc->nslab; i++)
700 pme->sidx[i] = pme->sdispls[i];
701 for(i=0; (i<n); i++) {
702 ii = DIM*pme->sidx[pme->idxa[i]];
703 x_f[i][XX] += pme->redist_buf[ii+XX];
704 x_f[i][YY] += pme->redist_buf[ii+YY];
705 x_f[i][ZZ] += pme->redist_buf[ii+ZZ];
706 pme->sidx[pme->idxa[i]]++;
712 static void pme_dd_sendrecv(pme_atomcomm_t *atc,
713 gmx_bool bBackward,int shift,
714 void *buf_s,int nbyte_s,
715 void *buf_r,int nbyte_r)
721 if (bBackward == FALSE) {
722 dest = atc->node_dest[shift];
723 src = atc->node_src[shift];
725 dest = atc->node_src[shift];
726 src = atc->node_dest[shift];
729 if (nbyte_s > 0 && nbyte_r > 0) {
730 MPI_Sendrecv(buf_s,nbyte_s,MPI_BYTE,
732 buf_r,nbyte_r,MPI_BYTE,
734 atc->mpi_comm,&stat);
735 } else if (nbyte_s > 0) {
736 MPI_Send(buf_s,nbyte_s,MPI_BYTE,
739 } else if (nbyte_r > 0) {
740 MPI_Recv(buf_r,nbyte_r,MPI_BYTE,
742 atc->mpi_comm,&stat);
747 static void dd_pmeredist_x_q(gmx_pme_t pme,
748 int n, gmx_bool bX, rvec *x, real *charge,
751 int *commnode,*buf_index;
752 int nnodes_comm,i,nsend,local_pos,buf_pos,node,scount,rcount;
754 commnode = atc->node_dest;
755 buf_index = atc->buf_index;
757 nnodes_comm = min(2*atc->maxshift,atc->nslab-1);
760 for(i=0; i<nnodes_comm; i++) {
761 buf_index[commnode[i]] = nsend;
762 nsend += atc->count[commnode[i]];
765 if (atc->count[atc->nodeid] + nsend != n)
766 gmx_fatal(FARGS,"%d particles communicated to PME node %d are more than 2/3 times the cut-off out of the domain decomposition cell of their charge group in dimension %c.\n"
767 "This usually means that your system is not well equilibrated.",
768 n - (atc->count[atc->nodeid] + nsend),
769 pme->nodeid,'x'+atc->dimind);
771 if (nsend > pme->buf_nalloc) {
772 pme->buf_nalloc = over_alloc_dd(nsend);
773 srenew(pme->bufv,pme->buf_nalloc);
774 srenew(pme->bufr,pme->buf_nalloc);
777 atc->n = atc->count[atc->nodeid];
778 for(i=0; i<nnodes_comm; i++) {
779 scount = atc->count[commnode[i]];
780 /* Communicate the count */
782 fprintf(debug,"dimind %d PME node %d send to node %d: %d\n",
783 atc->dimind,atc->nodeid,commnode[i],scount);
784 pme_dd_sendrecv(atc,FALSE,i,
786 &atc->rcount[i],sizeof(int));
787 atc->n += atc->rcount[i];
790 pme_realloc_atomcomm_things(atc);
796 if (node == atc->nodeid) {
797 /* Copy direct to the receive buffer */
799 copy_rvec(x[i],atc->x[local_pos]);
801 atc->q[local_pos] = charge[i];
804 /* Copy to the send buffer */
806 copy_rvec(x[i],pme->bufv[buf_index[node]]);
808 pme->bufr[buf_index[node]] = charge[i];
814 for(i=0; i<nnodes_comm; i++) {
815 scount = atc->count[commnode[i]];
816 rcount = atc->rcount[i];
817 if (scount > 0 || rcount > 0) {
819 /* Communicate the coordinates */
820 pme_dd_sendrecv(atc,FALSE,i,
821 pme->bufv[buf_pos],scount*sizeof(rvec),
822 atc->x[local_pos],rcount*sizeof(rvec));
824 /* Communicate the charges */
825 pme_dd_sendrecv(atc,FALSE,i,
826 pme->bufr+buf_pos,scount*sizeof(real),
827 atc->q+local_pos,rcount*sizeof(real));
829 local_pos += atc->rcount[i];
834 static void dd_pmeredist_f(gmx_pme_t pme, pme_atomcomm_t *atc,
838 int *commnode,*buf_index;
839 int nnodes_comm,local_pos,buf_pos,i,scount,rcount,node;
841 commnode = atc->node_dest;
842 buf_index = atc->buf_index;
844 nnodes_comm = min(2*atc->maxshift,atc->nslab-1);
846 local_pos = atc->count[atc->nodeid];
848 for(i=0; i<nnodes_comm; i++) {
849 scount = atc->rcount[i];
850 rcount = atc->count[commnode[i]];
851 if (scount > 0 || rcount > 0) {
852 /* Communicate the forces */
853 pme_dd_sendrecv(atc,TRUE,i,
854 atc->f[local_pos],scount*sizeof(rvec),
855 pme->bufv[buf_pos],rcount*sizeof(rvec));
858 buf_index[commnode[i]] = buf_pos;
868 if (node == atc->nodeid)
870 /* Add from the local force array */
871 rvec_inc(f[i],atc->f[local_pos]);
876 /* Add from the receive buffer */
877 rvec_inc(f[i],pme->bufv[buf_index[node]]);
887 if (node == atc->nodeid)
889 /* Copy from the local force array */
890 copy_rvec(atc->f[local_pos],f[i]);
895 /* Copy from the receive buffer */
896 copy_rvec(pme->bufv[buf_index[node]],f[i]);
905 gmx_sum_qgrid_dd(gmx_pme_t pme, real *grid, int direction)
907 pme_overlap_t *overlap;
908 int send_index0,send_nindex;
909 int recv_index0,recv_nindex;
911 int i,j,k,ix,iy,iz,icnt;
912 int ipulse,send_id,recv_id,datasize;
914 real *sendptr,*recvptr;
916 /* Start with minor-rank communication. This is a bit of a pain since it is not contiguous */
917 overlap = &pme->overlap[1];
919 for(ipulse=0;ipulse<overlap->noverlap_nodes;ipulse++)
921 /* Since we have already (un)wrapped the overlap in the z-dimension,
922 * we only have to communicate 0 to nkz (not pmegrid_nz).
924 if (direction==GMX_SUM_QGRID_FORWARD)
926 send_id = overlap->send_id[ipulse];
927 recv_id = overlap->recv_id[ipulse];
928 send_index0 = overlap->comm_data[ipulse].send_index0;
929 send_nindex = overlap->comm_data[ipulse].send_nindex;
930 recv_index0 = overlap->comm_data[ipulse].recv_index0;
931 recv_nindex = overlap->comm_data[ipulse].recv_nindex;
935 send_id = overlap->recv_id[ipulse];
936 recv_id = overlap->send_id[ipulse];
937 send_index0 = overlap->comm_data[ipulse].recv_index0;
938 send_nindex = overlap->comm_data[ipulse].recv_nindex;
939 recv_index0 = overlap->comm_data[ipulse].send_index0;
940 recv_nindex = overlap->comm_data[ipulse].send_nindex;
943 /* Copy data to contiguous send buffer */
946 fprintf(debug,"PME send node %d %d -> %d grid start %d Communicating %d to %d\n",
947 pme->nodeid,overlap->nodeid,send_id,
948 pme->pmegrid_start_iy,
949 send_index0-pme->pmegrid_start_iy,
950 send_index0-pme->pmegrid_start_iy+send_nindex);
953 for(i=0;i<pme->pmegrid_nx;i++)
956 for(j=0;j<send_nindex;j++)
958 iy = j + send_index0 - pme->pmegrid_start_iy;
959 for(k=0;k<pme->nkz;k++)
962 overlap->sendbuf[icnt++] = grid[ix*(pme->pmegrid_ny*pme->pmegrid_nz)+iy*(pme->pmegrid_nz)+iz];
967 datasize = pme->pmegrid_nx * pme->nkz;
969 MPI_Sendrecv(overlap->sendbuf,send_nindex*datasize,GMX_MPI_REAL,
971 overlap->recvbuf,recv_nindex*datasize,GMX_MPI_REAL,
973 overlap->mpi_comm,&stat);
975 /* Get data from contiguous recv buffer */
978 fprintf(debug,"PME recv node %d %d <- %d grid start %d Communicating %d to %d\n",
979 pme->nodeid,overlap->nodeid,recv_id,
980 pme->pmegrid_start_iy,
981 recv_index0-pme->pmegrid_start_iy,
982 recv_index0-pme->pmegrid_start_iy+recv_nindex);
985 for(i=0;i<pme->pmegrid_nx;i++)
988 for(j=0;j<recv_nindex;j++)
990 iy = j + recv_index0 - pme->pmegrid_start_iy;
991 for(k=0;k<pme->nkz;k++)
994 if(direction==GMX_SUM_QGRID_FORWARD)
996 grid[ix*(pme->pmegrid_ny*pme->pmegrid_nz)+iy*(pme->pmegrid_nz)+iz] += overlap->recvbuf[icnt++];
1000 grid[ix*(pme->pmegrid_ny*pme->pmegrid_nz)+iy*(pme->pmegrid_nz)+iz] = overlap->recvbuf[icnt++];
1007 /* Major dimension is easier, no copying required,
1008 * but we might have to sum to a separate array.
1009 * Since we don't copy, we have to communicate up to pmegrid_nz,
1010 * not nkz as for the minor direction.
1012 overlap = &pme->overlap[0];
1014 for(ipulse=0;ipulse<overlap->noverlap_nodes;ipulse++)
1016 if(direction==GMX_SUM_QGRID_FORWARD)
1018 send_id = overlap->send_id[ipulse];
1019 recv_id = overlap->recv_id[ipulse];
1020 send_index0 = overlap->comm_data[ipulse].send_index0;
1021 send_nindex = overlap->comm_data[ipulse].send_nindex;
1022 recv_index0 = overlap->comm_data[ipulse].recv_index0;
1023 recv_nindex = overlap->comm_data[ipulse].recv_nindex;
1024 recvptr = overlap->recvbuf;
1028 send_id = overlap->recv_id[ipulse];
1029 recv_id = overlap->send_id[ipulse];
1030 send_index0 = overlap->comm_data[ipulse].recv_index0;
1031 send_nindex = overlap->comm_data[ipulse].recv_nindex;
1032 recv_index0 = overlap->comm_data[ipulse].send_index0;
1033 recv_nindex = overlap->comm_data[ipulse].send_nindex;
1034 recvptr = grid + (recv_index0-pme->pmegrid_start_ix)*(pme->pmegrid_ny*pme->pmegrid_nz);
1037 sendptr = grid + (send_index0-pme->pmegrid_start_ix)*(pme->pmegrid_ny*pme->pmegrid_nz);
1038 datasize = pme->pmegrid_ny * pme->pmegrid_nz;
1042 fprintf(debug,"PME send node %d %d -> %d grid start %d Communicating %d to %d\n",
1043 pme->nodeid,overlap->nodeid,send_id,
1044 pme->pmegrid_start_ix,
1045 send_index0-pme->pmegrid_start_ix,
1046 send_index0-pme->pmegrid_start_ix+send_nindex);
1047 fprintf(debug,"PME recv node %d %d <- %d grid start %d Communicating %d to %d\n",
1048 pme->nodeid,overlap->nodeid,recv_id,
1049 pme->pmegrid_start_ix,
1050 recv_index0-pme->pmegrid_start_ix,
1051 recv_index0-pme->pmegrid_start_ix+recv_nindex);
1054 MPI_Sendrecv(sendptr,send_nindex*datasize,GMX_MPI_REAL,
1056 recvptr,recv_nindex*datasize,GMX_MPI_REAL,
1058 overlap->mpi_comm,&stat);
1060 /* ADD data from contiguous recv buffer */
1061 if(direction==GMX_SUM_QGRID_FORWARD)
1063 p = grid + (recv_index0-pme->pmegrid_start_ix)*(pme->pmegrid_ny*pme->pmegrid_nz);
1064 for(i=0;i<recv_nindex*datasize;i++)
1066 p[i] += overlap->recvbuf[i];
1075 copy_pmegrid_to_fftgrid(gmx_pme_t pme, real *pmegrid, real *fftgrid)
1077 ivec local_fft_ndata,local_fft_offset,local_fft_size;
1078 ivec local_pme_size;
1082 /* Dimensions should be identical for A/B grid, so we just use A here */
1083 gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
1088 local_pme_size[0] = pme->pmegrid_nx;
1089 local_pme_size[1] = pme->pmegrid_ny;
1090 local_pme_size[2] = pme->pmegrid_nz;
1092 /* The fftgrid is always 'justified' to the lower-left corner of the PME grid,
1093 the offset is identical, and the PME grid always has more data (due to overlap)
1098 char fn[STRLEN],format[STRLEN];
1100 sprintf(fn,"pmegrid%d.pdb",pme->nodeid);
1101 fp = ffopen(fn,"w");
1102 sprintf(fn,"pmegrid%d.txt",pme->nodeid);
1103 fp2 = ffopen(fn,"w");
1104 sprintf(format,"%s%s\n",pdbformat,"%6.2f%6.2f");
1107 for(ix=0;ix<local_fft_ndata[XX];ix++)
1109 for(iy=0;iy<local_fft_ndata[YY];iy++)
1111 for(iz=0;iz<local_fft_ndata[ZZ];iz++)
1113 pmeidx = ix*(local_pme_size[YY]*local_pme_size[ZZ])+iy*(local_pme_size[ZZ])+iz;
1114 fftidx = ix*(local_fft_size[YY]*local_fft_size[ZZ])+iy*(local_fft_size[ZZ])+iz;
1115 fftgrid[fftidx] = pmegrid[pmeidx];
1117 val = 100*pmegrid[pmeidx];
1118 if (pmegrid[pmeidx] != 0)
1119 fprintf(fp,format,"ATOM",pmeidx,"CA","GLY",' ',pmeidx,' ',
1120 5.0*ix,5.0*iy,5.0*iz,1.0,val);
1121 if (pmegrid[pmeidx] != 0)
1122 fprintf(fp2,"%-12s %5d %5d %5d %12.5e\n",
1124 pme->pmegrid_start_ix + ix,
1125 pme->pmegrid_start_iy + iy,
1126 pme->pmegrid_start_iz + iz,
1141 static gmx_cycles_t omp_cyc_start()
1143 return gmx_cycles_read();
1146 static gmx_cycles_t omp_cyc_end(gmx_cycles_t c)
1148 return gmx_cycles_read() - c;
1153 copy_fftgrid_to_pmegrid(gmx_pme_t pme, const real *fftgrid, real *pmegrid,
1154 int nthread,int thread)
1156 ivec local_fft_ndata,local_fft_offset,local_fft_size;
1157 ivec local_pme_size;
1158 int ixy0,ixy1,ixy,ix,iy,iz;
1160 #ifdef PME_TIME_THREADS
1162 static double cs1=0;
1166 #ifdef PME_TIME_THREADS
1167 c1 = omp_cyc_start();
1169 /* Dimensions should be identical for A/B grid, so we just use A here */
1170 gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
1175 local_pme_size[0] = pme->pmegrid_nx;
1176 local_pme_size[1] = pme->pmegrid_ny;
1177 local_pme_size[2] = pme->pmegrid_nz;
1179 /* The fftgrid is always 'justified' to the lower-left corner of the PME grid,
1180 the offset is identical, and the PME grid always has more data (due to overlap)
1182 ixy0 = ((thread )*local_fft_ndata[XX]*local_fft_ndata[YY])/nthread;
1183 ixy1 = ((thread+1)*local_fft_ndata[XX]*local_fft_ndata[YY])/nthread;
1185 for(ixy=ixy0;ixy<ixy1;ixy++)
1187 ix = ixy/local_fft_ndata[YY];
1188 iy = ixy - ix*local_fft_ndata[YY];
1190 pmeidx = (ix*local_pme_size[YY] + iy)*local_pme_size[ZZ];
1191 fftidx = (ix*local_fft_size[YY] + iy)*local_fft_size[ZZ];
1192 for(iz=0;iz<local_fft_ndata[ZZ];iz++)
1194 pmegrid[pmeidx+iz] = fftgrid[fftidx+iz];
1198 #ifdef PME_TIME_THREADS
1199 c1 = omp_cyc_end(c1);
1204 printf("copy %.2f\n",cs1*1e-9);
1213 wrap_periodic_pmegrid(gmx_pme_t pme, real *pmegrid)
1215 int nx,ny,nz,pnx,pny,pnz,ny_x,overlap,ix,iy,iz;
1221 pnx = pme->pmegrid_nx;
1222 pny = pme->pmegrid_ny;
1223 pnz = pme->pmegrid_nz;
1225 overlap = pme->pme_order - 1;
1227 /* Add periodic overlap in z */
1228 for(ix=0; ix<pme->pmegrid_nx; ix++)
1230 for(iy=0; iy<pme->pmegrid_ny; iy++)
1232 for(iz=0; iz<overlap; iz++)
1234 pmegrid[(ix*pny+iy)*pnz+iz] +=
1235 pmegrid[(ix*pny+iy)*pnz+nz+iz];
1240 if (pme->nnodes_minor == 1)
1242 for(ix=0; ix<pme->pmegrid_nx; ix++)
1244 for(iy=0; iy<overlap; iy++)
1246 for(iz=0; iz<nz; iz++)
1248 pmegrid[(ix*pny+iy)*pnz+iz] +=
1249 pmegrid[(ix*pny+ny+iy)*pnz+iz];
1255 if (pme->nnodes_major == 1)
1257 ny_x = (pme->nnodes_minor == 1 ? ny : pme->pmegrid_ny);
1259 for(ix=0; ix<overlap; ix++)
1261 for(iy=0; iy<ny_x; iy++)
1263 for(iz=0; iz<nz; iz++)
1265 pmegrid[(ix*pny+iy)*pnz+iz] +=
1266 pmegrid[((nx+ix)*pny+iy)*pnz+iz];
1275 unwrap_periodic_pmegrid(gmx_pme_t pme, real *pmegrid)
1277 int nx,ny,nz,pnx,pny,pnz,ny_x,overlap,ix;
1283 pnx = pme->pmegrid_nx;
1284 pny = pme->pmegrid_ny;
1285 pnz = pme->pmegrid_nz;
1287 overlap = pme->pme_order - 1;
1289 if (pme->nnodes_major == 1)
1291 ny_x = (pme->nnodes_minor == 1 ? ny : pme->pmegrid_ny);
1293 for(ix=0; ix<overlap; ix++)
1297 for(iy=0; iy<ny_x; iy++)
1299 for(iz=0; iz<nz; iz++)
1301 pmegrid[((nx+ix)*pny+iy)*pnz+iz] =
1302 pmegrid[(ix*pny+iy)*pnz+iz];
1308 if (pme->nnodes_minor == 1)
1310 #pragma omp parallel for num_threads(pme->nthread) schedule(static)
1311 for(ix=0; ix<pme->pmegrid_nx; ix++)
1315 for(iy=0; iy<overlap; iy++)
1317 for(iz=0; iz<nz; iz++)
1319 pmegrid[(ix*pny+ny+iy)*pnz+iz] =
1320 pmegrid[(ix*pny+iy)*pnz+iz];
1326 /* Copy periodic overlap in z */
1327 #pragma omp parallel for num_threads(pme->nthread) schedule(static)
1328 for(ix=0; ix<pme->pmegrid_nx; ix++)
1332 for(iy=0; iy<pme->pmegrid_ny; iy++)
1334 for(iz=0; iz<overlap; iz++)
1336 pmegrid[(ix*pny+iy)*pnz+nz+iz] =
1337 pmegrid[(ix*pny+iy)*pnz+iz];
1343 static void clear_grid(int nx,int ny,int nz,real *grid,
1345 int fx,int fy,int fz,
1349 int fsx,fsy,fsz,gx,gy,gz,g0x,g0y,x,y,z;
1352 nc = 2 + (order - 2)/FLBS;
1353 ncz = 2 + (order - 2)/FLBSZ;
1355 for(fsx=fx; fsx<fx+nc; fsx++)
1357 for(fsy=fy; fsy<fy+nc; fsy++)
1359 for(fsz=fz; fsz<fz+ncz; fsz++)
1361 flind = (fsx*fs[YY] + fsy)*fs[ZZ] + fsz;
1362 if (flag[flind] == 0)
1367 g0x = (gx*ny + gy)*nz + gz;
1368 for(x=0; x<FLBS; x++)
1371 for(y=0; y<FLBS; y++)
1373 for(z=0; z<FLBSZ; z++)
1389 /* This has to be a macro to enable full compiler optimization with xlC (and probably others too) */
1390 #define DO_BSPLINE(order) \
1391 for(ithx=0; (ithx<order); ithx++) \
1393 index_x = (i0+ithx)*pny*pnz; \
1394 valx = qn*thx[ithx]; \
1396 for(ithy=0; (ithy<order); ithy++) \
1398 valxy = valx*thy[ithy]; \
1399 index_xy = index_x+(j0+ithy)*pnz; \
1401 for(ithz=0; (ithz<order); ithz++) \
1403 index_xyz = index_xy+(k0+ithz); \
1404 grid[index_xyz] += valxy*thz[ithz]; \
1410 static void spread_q_bsplines_thread(pmegrid_t *pmegrid,
1411 pme_atomcomm_t *atc, splinedata_t *spline,
1412 pme_spline_work_t *work)
1415 /* spread charges from home atoms to local grid */
1418 int b,i,nn,n,ithx,ithy,ithz,i0,j0,k0;
1420 int order,norder,index_x,index_xy,index_xyz;
1422 real *thx,*thy,*thz;
1423 int localsize, bndsize;
1424 int pnx,pny,pnz,ndatatot;
1427 pnx = pmegrid->n[XX];
1428 pny = pmegrid->n[YY];
1429 pnz = pmegrid->n[ZZ];
1431 offx = pmegrid->offset[XX];
1432 offy = pmegrid->offset[YY];
1433 offz = pmegrid->offset[ZZ];
1435 ndatatot = pnx*pny*pnz;
1436 grid = pmegrid->grid;
1437 for(i=0;i<ndatatot;i++)
1442 order = pmegrid->order;
1444 for(nn=0; nn<spline->n; nn++)
1446 n = spline->ind[nn];
1451 idxptr = atc->idx[n];
1454 i0 = idxptr[XX] - offx;
1455 j0 = idxptr[YY] - offy;
1456 k0 = idxptr[ZZ] - offz;
1458 thx = spline->theta[XX] + norder;
1459 thy = spline->theta[YY] + norder;
1460 thz = spline->theta[ZZ] + norder;
1465 #ifdef PME_SSE_UNALIGNED
1466 #define PME_SPREAD_SSE_ORDER4
1468 #define PME_SPREAD_SSE_ALIGNED
1471 #include "pme_sse_single.h"
1478 #define PME_SPREAD_SSE_ALIGNED
1480 #include "pme_sse_single.h"
1493 static void set_grid_alignment(int *pmegrid_nz,int pme_order)
1497 #ifndef PME_SSE_UNALIGNED
1502 /* Round nz up to a multiple of 4 to ensure alignment */
1503 *pmegrid_nz = ((*pmegrid_nz + 3) & ~3);
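/* Example: nz = 13 becomes (13+3) & ~3 = 16, while nz = 16 stays unchanged;
 * the bit-mask form works because the alignment (4) is a power of two.
 */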
1508 static void set_gridsize_alignment(int *gridsize,int pme_order)
1511 #ifndef PME_SSE_UNALIGNED
1514 /* Add extra elements to ensure aligned operations do not go
1515 * beyond the allocated grid size.
1516 * Note that for pme_order=5, the pme grid z-size alignment
1517 * ensures that we will not go beyond the grid size.
1525 static void pmegrid_init(pmegrid_t *grid,
1526 int cx, int cy, int cz,
1527 int x0, int y0, int z0,
1528 int x1, int y1, int z1,
1529 gmx_bool set_alignment,
1538 grid->offset[XX] = x0;
1539 grid->offset[YY] = y0;
1540 grid->offset[ZZ] = z0;
1541 grid->n[XX] = x1 - x0 + pme_order - 1;
1542 grid->n[YY] = y1 - y0 + pme_order - 1;
1543 grid->n[ZZ] = z1 - z0 + pme_order - 1;
1546 set_grid_alignment(&nz,pme_order);
1551 else if (nz != grid->n[ZZ])
1553 gmx_incons("pmegrid_init call with an unaligned z size");
1556 grid->order = pme_order;
1559 gridsize = grid->n[XX]*grid->n[YY]*grid->n[ZZ];
1560 set_gridsize_alignment(&gridsize,pme_order);
1561 snew_aligned(grid->grid,gridsize,16);
1569 static int div_round_up(int numerator,int denominator)
1571 return (numerator + denominator - 1)/denominator;
1574 static void make_subgrid_division(const ivec n,int ovl,int nthread,
1577 int gsize_opt,gsize;
1582 for(nsx=1; nsx<=nthread; nsx++)
1584 if (nthread % nsx == 0)
1586 for(nsy=1; nsy<=nthread; nsy++)
1588 if (nsx*nsy <= nthread && nthread % (nsx*nsy) == 0)
1590 nsz = nthread/(nsx*nsy);
1592 /* Determine the number of grid points per thread */
1594 (div_round_up(n[XX],nsx) + ovl)*
1595 (div_round_up(n[YY],nsy) + ovl)*
1596 (div_round_up(n[ZZ],nsz) + ovl);
1598 /* Minimize the number of grid points per thread
1599 * and, secondarily, the number of cuts in minor dimensions.
1601 if (gsize_opt == -1 ||
1602 gsize < gsize_opt ||
1603 (gsize == gsize_opt &&
1604 (nsz < nsub[ZZ] || (nsz == nsub[ZZ] && nsy < nsub[YY]))))
1616 env = getenv("GMX_PME_THREAD_DIVISION");
1619 sscanf(env,"%d %d %d",&nsub[XX],&nsub[YY],&nsub[ZZ]);
1622 if (nsub[XX]*nsub[YY]*nsub[ZZ] != nthread)
1624 gmx_fatal(FARGS,"PME grid thread division (%d x %d x %d) does not match the total number of threads (%d)",nsub[XX],nsub[YY],nsub[ZZ],nthread);
1628 static void pmegrids_init(pmegrids_t *grids,
1629 int nx,int ny,int nz,int nz_base,
1635 ivec n,n_base,g0,g1;
1636 int t,x,y,z,d,i,tfac;
1639 n[XX] = nx - (pme_order - 1);
1640 n[YY] = ny - (pme_order - 1);
1641 n[ZZ] = nz - (pme_order - 1);
1643 copy_ivec(n,n_base);
1644 n_base[ZZ] = nz_base;
1646 pmegrid_init(&grids->grid,0,0,0,0,0,0,n[XX],n[YY],n[ZZ],FALSE,pme_order,
1649 grids->nthread = nthread;
1651 make_subgrid_division(n_base,pme_order-1,grids->nthread,grids->nc);
1653 if (grids->nthread > 1)
1659 for(d=0; d<DIM; d++)
1661 nst[d] = div_round_up(n[d],grids->nc[d]) + pme_order - 1;
1663 set_grid_alignment(&nst[ZZ],pme_order);
1667 fprintf(debug,"pmegrid thread local division: %d x %d x %d\n",
1668 grids->nc[XX],grids->nc[YY],grids->nc[ZZ]);
1669 fprintf(debug,"pmegrid %d %d %d max thread pmegrid %d %d %d\n",
1671 nst[XX],nst[YY],nst[ZZ]);
1674 snew(grids->grid_th,grids->nthread);
1676 gridsize = nst[XX]*nst[YY]*nst[ZZ];
1677 set_gridsize_alignment(&gridsize,pme_order);
1678 snew_aligned(grid_all,
1679 grids->nthread*gridsize+(grids->nthread+1)*GMX_CACHE_SEP,
1682 for(x=0; x<grids->nc[XX]; x++)
1684 for(y=0; y<grids->nc[YY]; y++)
1686 for(z=0; z<grids->nc[ZZ]; z++)
1688 pmegrid_init(&grids->grid_th[t],
1690 (n[XX]*(x ))/grids->nc[XX],
1691 (n[YY]*(y ))/grids->nc[YY],
1692 (n[ZZ]*(z ))/grids->nc[ZZ],
1693 (n[XX]*(x+1))/grids->nc[XX],
1694 (n[YY]*(y+1))/grids->nc[YY],
1695 (n[ZZ]*(z+1))/grids->nc[ZZ],
1698 grid_all+GMX_CACHE_SEP+t*(gridsize+GMX_CACHE_SEP));
1705 snew(grids->g2t,DIM);
1707 for(d=DIM-1; d>=0; d--)
1709 snew(grids->g2t[d],n[d]);
1711 for(i=0; i<n[d]; i++)
1713 /* The second check should match the parameters
1714 * of the pmegrid_init call above.
1716 while (t + 1 < grids->nc[d] && i >= (n[d]*(t+1))/grids->nc[d])
1720 grids->g2t[d][i] = t*tfac;
1723 tfac *= grids->nc[d];
1727 case XX: max_comm_lines = overlap_x; break;
1728 case YY: max_comm_lines = overlap_y; break;
1729 case ZZ: max_comm_lines = pme_order - 1; break;
1731 grids->nthread_comm[d] = 0;
1732 while ((n[d]*grids->nthread_comm[d])/grids->nc[d] < max_comm_lines)
1734 grids->nthread_comm[d]++;
1738 fprintf(debug,"pmegrid thread grid communication range in %c: %d\n",
1739 'x'+d,grids->nthread_comm[d]);
1741 /* It should be possible to make grids->nthread_comm[d]==grids->nc[d]
1742 * work, but this is not a problematic restriction.
1744 if (grids->nc[d] > 1 && grids->nthread_comm[d] > grids->nc[d])
1746 gmx_fatal(FARGS,"Too many threads for PME (%d) compared to the number of grid lines, reduce the number of threads doing PME",grids->nthread);
1752 static void pmegrids_destroy(pmegrids_t *grids)
1756 if (grids->grid.grid != NULL)
1758 sfree(grids->grid.grid);
1760 if (grids->nthread > 0)
1762 for(t=0; t<grids->nthread; t++)
1764 sfree(grids->grid_th[t].grid);
1766 sfree(grids->grid_th);
1772 static void realloc_work(pme_work_t *work,int nkx)
1774 if (nkx > work->nalloc)
1777 srenew(work->mhx ,work->nalloc);
1778 srenew(work->mhy ,work->nalloc);
1779 srenew(work->mhz ,work->nalloc);
1780 srenew(work->m2 ,work->nalloc);
1781 /* Allocate an aligned pointer for SSE operations, including 3 extra
1782 * elements at the end since SSE operates on 4 elements at a time.
1784 sfree_aligned(work->denom);
1785 sfree_aligned(work->tmp1);
1786 sfree_aligned(work->eterm);
1787 snew_aligned(work->denom,work->nalloc+3,16);
1788 snew_aligned(work->tmp1 ,work->nalloc+3,16);
1789 snew_aligned(work->eterm,work->nalloc+3,16);
1790 srenew(work->m2inv,work->nalloc);
1795 static void free_work(pme_work_t *work)
1801 sfree_aligned(work->denom);
1802 sfree_aligned(work->tmp1);
1803 sfree_aligned(work->eterm);
1809 /* Calculate exponentials through SSE in float precision */
1810 inline static void calc_exponentials(int start, int end, real f, real *d_aligned, real *r_aligned, real *e_aligned)
1813 const __m128 two = _mm_set_ps(2.0f,2.0f,2.0f,2.0f);
1816 __m128 tmp_d1,d_inv,tmp_r,tmp_e;
1818 f_sse = _mm_load1_ps(&f);
1819 for(kx=0; kx<end; kx+=4)
1821 tmp_d1 = _mm_load_ps(d_aligned+kx);
1822 lu = _mm_rcp_ps(tmp_d1);
1823 d_inv = _mm_mul_ps(lu,_mm_sub_ps(two,_mm_mul_ps(lu,tmp_d1)));
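/* _mm_rcp_ps only gives ~12 bits of precision; the multiply above is one
 * Newton-Raphson step, x1 = x0*(2 - d*x0), which refines 1/d to close to
 * full single precision.
 */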
1824 tmp_r = _mm_load_ps(r_aligned+kx);
1825 tmp_r = gmx_mm_exp_ps(tmp_r);
1826 tmp_e = _mm_mul_ps(f_sse,d_inv);
1827 tmp_e = _mm_mul_ps(tmp_e,tmp_r);
1828 _mm_store_ps(e_aligned+kx,tmp_e);
1833 inline static void calc_exponentials(int start, int end, real f, real *d, real *r, real *e)
1836 for(kx=start; kx<end; kx++)
1840 for(kx=start; kx<end; kx++)
1844 for(kx=start; kx<end; kx++)
1846 e[kx] = f*r[kx]*d[kx];
1852 static int solve_pme_yzx(gmx_pme_t pme,t_complex *grid,
1853 real ewaldcoeff,real vol,
1855 int nthread,int thread)
1857 /* do recip sum over local cells in grid */
1858 /* y major, z middle, x minor or continuous */
1860 int kx,ky,kz,maxkx,maxky,maxkz;
1861 int nx,ny,nz,iyz0,iyz1,iyz,iy,iz,kxstart,kxend;
1863 real factor=M_PI*M_PI/(ewaldcoeff*ewaldcoeff);
1864 real ets2,struct2,vfactor,ets2vf;
1865 real d1,d2,energy=0;
1867 real virxx=0,virxy=0,virxz=0,viryy=0,viryz=0,virzz=0;
1868 real rxx,ryx,ryy,rzx,rzy,rzz;
1870 real *mhx,*mhy,*mhz,*m2,*denom,*tmp1,*eterm,*m2inv;
1871 real mhxk,mhyk,mhzk,m2k;
1874 ivec local_ndata,local_offset,local_size;
1877 elfac = ONE_4PI_EPS0/pme->epsilon_r;
1883 /* Dimensions should be identical for A/B grid, so we just use A here */
1884 gmx_parallel_3dfft_complex_limits(pme->pfft_setupA,
1890 rxx = pme->recipbox[XX][XX];
1891 ryx = pme->recipbox[YY][XX];
1892 ryy = pme->recipbox[YY][YY];
1893 rzx = pme->recipbox[ZZ][XX];
1894 rzy = pme->recipbox[ZZ][YY];
1895 rzz = pme->recipbox[ZZ][ZZ];
1901 work = &pme->work[thread];
1906 denom = work->denom;
1908 eterm = work->eterm;
1909 m2inv = work->m2inv;
1911 iyz0 = local_ndata[YY]*local_ndata[ZZ]* thread /nthread;
1912 iyz1 = local_ndata[YY]*local_ndata[ZZ]*(thread+1)/nthread;
1914 for(iyz=iyz0; iyz<iyz1; iyz++)
1916 iy = iyz/local_ndata[ZZ];
1917 iz = iyz - iy*local_ndata[ZZ];
1919 ky = iy + local_offset[YY];
1930 by = M_PI*vol*pme->bsp_mod[YY][ky];
1932 kz = iz + local_offset[ZZ];
1936 bz = pme->bsp_mod[ZZ][kz];
1938 /* 0.5 correction for corner points */
1940 if (kz == 0 || kz == (nz+1)/2)
1945 p0 = grid + iy*local_size[ZZ]*local_size[XX] + iz*local_size[XX];
1947 /* We should skip the k-space point (0,0,0) */
1948 if (local_offset[XX] > 0 || ky > 0 || kz > 0)
1950 kxstart = local_offset[XX];
1954 kxstart = local_offset[XX] + 1;
1957 kxend = local_offset[XX] + local_ndata[XX];
1961 /* More expensive inner loop, especially because of the storage
1962 * of the mh elements in arrays.
1963 * Because x is the minor grid index, all mh elements
1964 * depend on kx for triclinic unit cells.
1967 /* Two explicit loops to avoid a conditional inside the loop */
1968 for(kx=kxstart; kx<maxkx; kx++)
1973 mhyk = mx * ryx + my * ryy;
1974 mhzk = mx * rzx + my * rzy + mz * rzz;
1975 m2k = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
1980 denom[kx] = m2k*bz*by*pme->bsp_mod[XX][kx];
1981 tmp1[kx] = -factor*m2k;
1984 for(kx=maxkx; kx<kxend; kx++)
1989 mhyk = mx * ryx + my * ryy;
1990 mhzk = mx * rzx + my * rzy + mz * rzz;
1991 m2k = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
1996 denom[kx] = m2k*bz*by*pme->bsp_mod[XX][kx];
1997 tmp1[kx] = -factor*m2k;
2000 for(kx=kxstart; kx<kxend; kx++)
2002 m2inv[kx] = 1.0/m2[kx];
2005 calc_exponentials(kxstart,kxend,elfac,denom,tmp1,eterm);
2007 for(kx=kxstart; kx<kxend; kx++,p0++)
2012 p0->re = d1*eterm[kx];
2013 p0->im = d2*eterm[kx];
2015 struct2 = 2.0*(d1*d1+d2*d2);
2017 tmp1[kx] = eterm[kx]*struct2;
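/* The factor 2.0 in struct2 presumably accounts for the -k half of
 * reciprocal space that the real-to-complex FFT does not store explicitly;
 * corner_fac (0.5) undoes it for the kz planes that are their own
 * conjugate (kz == 0 and kz == (nz+1)/2).
 */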
2020 for(kx=kxstart; kx<kxend; kx++)
2022 ets2 = corner_fac*tmp1[kx];
2023 vfactor = (factor*m2[kx] + 1.0)*2.0*m2inv[kx];
2026 ets2vf = ets2*vfactor;
2027 virxx += ets2vf*mhx[kx]*mhx[kx] - ets2;
2028 virxy += ets2vf*mhx[kx]*mhy[kx];
2029 virxz += ets2vf*mhx[kx]*mhz[kx];
2030 viryy += ets2vf*mhy[kx]*mhy[kx] - ets2;
2031 viryz += ets2vf*mhy[kx]*mhz[kx];
2032 virzz += ets2vf*mhz[kx]*mhz[kx] - ets2;
2037 /* We don't need to calculate the energy and the virial.
2038 * In this case the triclinic overhead is small.
2041 /* Two explicit loops to avoid a conditional inside the loop */
2043 for(kx=kxstart; kx<maxkx; kx++)
2048 mhyk = mx * ryx + my * ryy;
2049 mhzk = mx * rzx + my * rzy + mz * rzz;
2050 m2k = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
2051 denom[kx] = m2k*bz*by*pme->bsp_mod[XX][kx];
2052 tmp1[kx] = -factor*m2k;
2055 for(kx=maxkx; kx<kxend; kx++)
2060 mhyk = mx * ryx + my * ryy;
2061 mhzk = mx * rzx + my * rzy + mz * rzz;
2062 m2k = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
2063 denom[kx] = m2k*bz*by*pme->bsp_mod[XX][kx];
2064 tmp1[kx] = -factor*m2k;
2067 calc_exponentials(kxstart,kxend,elfac,denom,tmp1,eterm);
2069 for(kx=kxstart; kx<kxend; kx++,p0++)
2074 p0->re = d1*eterm[kx];
2075 p0->im = d2*eterm[kx];
2082 /* Update virial with local values.
2083 * The virial is symmetric by definition.
2084 * This virial seems OK for isotropic scaling, but I'm
2085 * experiencing problems on semiisotropic membranes.
2086 * IS THAT COMMENT STILL VALID??? (DvdS, 2001/02/07).
2088 work->vir[XX][XX] = 0.25*virxx;
2089 work->vir[YY][YY] = 0.25*viryy;
2090 work->vir[ZZ][ZZ] = 0.25*virzz;
2091 work->vir[XX][YY] = work->vir[YY][XX] = 0.25*virxy;
2092 work->vir[XX][ZZ] = work->vir[ZZ][XX] = 0.25*virxz;
2093 work->vir[YY][ZZ] = work->vir[ZZ][YY] = 0.25*viryz;
2095 /* This energy should be corrected for a charged system */
2096 work->energy = 0.5*energy;
2099 /* Return the loop count */
2100 return local_ndata[YY]*local_ndata[XX];
2103 static void get_pme_ener_vir(const gmx_pme_t pme,int nthread,
2104 real *mesh_energy,matrix vir)
2106 /* This function sums output over threads
2107 * and should therefore only be called after thread synchronization.
2111 *mesh_energy = pme->work[0].energy;
2112 copy_mat(pme->work[0].vir,vir);
2114 for(thread=1; thread<nthread; thread++)
2116 *mesh_energy += pme->work[thread].energy;
2117 m_add(vir,pme->work[thread].vir,vir);
2121 #define DO_FSPLINE(order) \
2122 for(ithx=0; (ithx<order); ithx++) \
2124 index_x = (i0+ithx)*pny*pnz; \
2128 for(ithy=0; (ithy<order); ithy++) \
2130 index_xy = index_x+(j0+ithy)*pnz; \
2135 for(ithz=0; (ithz<order); ithz++) \
2137 gval = grid[index_xy+(k0+ithz)]; \
2138 fxy1 += thz[ithz]*gval; \
2139 fz1 += dthz[ithz]*gval; \
2148 static void gather_f_bsplines(gmx_pme_t pme,real *grid,
2149 gmx_bool bClearF,pme_atomcomm_t *atc,
2150 splinedata_t *spline,
2153 /* sum forces for local particles */
2154 int nn,n,ithx,ithy,ithz,i0,j0,k0;
2155 int index_x,index_xy;
2156 int nx,ny,nz,pnx,pny,pnz;
2158 real tx,ty,dx,dy,qn;
2161 real *thx,*thy,*thz,*dthx,*dthy,*dthz;
2163 real rxx,ryx,ryy,rzx,rzy,rzz;
2166 pme_spline_work_t *work;
2168 work = &pme->spline_work;
2170 order = pme->pme_order;
2171 thx = spline->theta[XX];
2172 thy = spline->theta[YY];
2173 thz = spline->theta[ZZ];
2174 dthx = spline->dtheta[XX];
2175 dthy = spline->dtheta[YY];
2176 dthz = spline->dtheta[ZZ];
2180 pnx = pme->pmegrid_nx;
2181 pny = pme->pmegrid_ny;
2182 pnz = pme->pmegrid_nz;
2184 rxx = pme->recipbox[XX][XX];
2185 ryx = pme->recipbox[YY][XX];
2186 ryy = pme->recipbox[YY][YY];
2187 rzx = pme->recipbox[ZZ][XX];
2188 rzy = pme->recipbox[ZZ][YY];
2189 rzz = pme->recipbox[ZZ][ZZ];
2191 for(nn=0; nn<spline->n; nn++)
2193 n = spline->ind[nn];
2194 qn = scale*atc->q[n];
2207 idxptr = atc->idx[n];
2214 /* Pointer arithmetic alert, next six statements */
2215 thx = spline->theta[XX] + norder;
2216 thy = spline->theta[YY] + norder;
2217 thz = spline->theta[ZZ] + norder;
2218 dthx = spline->dtheta[XX] + norder;
2219 dthy = spline->dtheta[YY] + norder;
2220 dthz = spline->dtheta[ZZ] + norder;
2225 #ifdef PME_SSE_UNALIGNED
2226 #define PME_GATHER_F_SSE_ORDER4
2228 #define PME_GATHER_F_SSE_ALIGNED
2231 #include "pme_sse_single.h"
2238 #define PME_GATHER_F_SSE_ALIGNED
2240 #include "pme_sse_single.h"
2250 atc->f[n][XX] += -qn*( fx*nx*rxx );
2251 atc->f[n][YY] += -qn*( fx*nx*ryx + fy*ny*ryy );
2252 atc->f[n][ZZ] += -qn*( fx*nx*rzx + fy*ny*rzy + fz*nz*rzz );
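/* The force is minus the gradient of the interpolated energy: fx, fy and fz
 * are derivatives with respect to the fractional grid coordinates, so they
 * are scaled by the grid dimensions and mapped to Cartesian components
 * through the reciprocal box, which is triangular (hence the growing number
 * of terms per component).
 */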
2255 /* Since the energy and not the forces are interpolated,
2256 * the net force might not be exactly zero.
2257 * This can be solved by also interpolating F, but
2258 * that comes at a cost.
2259 * A better hack is to remove the net force every
2260 * step, but that must be done at a higher level
2261 * since this routine doesn't see all atoms if running
2262 * in parallel. Don't know how important it is? EL 990726
2267 static real gather_energy_bsplines(gmx_pme_t pme,real *grid,
2268 pme_atomcomm_t *atc)
2270 splinedata_t *spline;
2271 int n,ithx,ithy,ithz,i0,j0,k0;
2272 int index_x,index_xy;
2274 real energy,pot,tx,ty,qn,gval;
2275 real *thx,*thy,*thz;
2279 spline = &atc->spline[0];
2281 order = pme->pme_order;
2284 for(n=0; (n<atc->n); n++) {
2288 idxptr = atc->idx[n];
2295 /* Pointer arithmetic alert, next three statements */
2296 thx = spline->theta[XX] + norder;
2297 thy = spline->theta[YY] + norder;
2298 thz = spline->theta[ZZ] + norder;
2301 for(ithx=0; (ithx<order); ithx++)
2303 index_x = (i0+ithx)*pme->pmegrid_ny*pme->pmegrid_nz;
2306 for(ithy=0; (ithy<order); ithy++)
2308 index_xy = index_x+(j0+ithy)*pme->pmegrid_nz;
2311 for(ithz=0; (ithz<order); ithz++)
2313 gval = grid[index_xy+(k0+ithz)];
2314 pot += tx*ty*thz[ithz]*gval;
2327 /* Macro to force loop unrolling by fixing order.
2328 * This gives a significant performance gain.
2330 #define CALC_SPLINE(order) \
2334 real data[PME_ORDER_MAX]; \
2335 real ddata[PME_ORDER_MAX]; \
2337 for(j=0; (j<DIM); j++) \
2341 /* dr is relative offset from lower cell limit */ \
2342 data[order-1] = 0; \
2346 for(k=3; (k<order); k++) \
2348 div = 1.0/(k - 1.0); \
2349 data[k-1] = div*dr*data[k-2]; \
2350 for(l=1; (l<(k-1)); l++) \
2352 data[k-l-1] = div*((dr+l)*data[k-l-2]+(k-l-dr)* \
2355 data[0] = div*(1-dr)*data[0]; \
2357 /* differentiate */ \
2358 ddata[0] = -data[0]; \
2359 for(k=1; (k<order); k++) \
2361 ddata[k] = data[k-1] - data[k]; \
2364 div = 1.0/(order - 1); \
2365 data[order-1] = div*dr*data[order-2]; \
2366 for(l=1; (l<(order-1)); l++) \
2368 data[order-l-1] = div*((dr+l)*data[order-l-2]+ \
2369 (order-l-dr)*data[order-l-1]); \
2371 data[0] = div*(1 - dr)*data[0]; \
2373 for(k=0; k<order; k++) \
2375 theta[j][i*order+k] = data[k]; \
2376 dtheta[j][i*order+k] = ddata[k]; \
2381 void make_bsplines(splinevec theta,splinevec dtheta,int order,
2382 rvec fractx[],int nr,int ind[],real charge[],
2383 gmx_bool bFreeEnergy)
2385 /* construct splines for local atoms */
2391 /* With free energy we do not use the charge check.
2392 * In most cases this will be more efficient than calling make_bsplines
2393 * twice, since usually more than half the particles have charges.
2396 if (bFreeEnergy || charge[ii] != 0.0) {
2399 case 4: CALC_SPLINE(4); break;
2400 case 5: CALC_SPLINE(5); break;
2401 default: CALC_SPLINE(order); break;
2408 void make_dft_mod(real *mod,real *data,int ndata)
2413 for(i=0;i<ndata;i++) {
2415 for(j=0;j<ndata;j++) {
2416 arg=(2.0*M_PI*i*j)/ndata;
2417 sc+=data[j]*cos(arg);
2418 ss+=data[j]*sin(arg);
2422 for(i=0;i<ndata;i++)
2424 mod[i]=(mod[i-1]+mod[i+1])*0.5;
2428 static void make_bspline_moduli(splinevec bsp_mod,
2429 int nx,int ny,int nz,int order)
2431 int nmax=max(nx,max(ny,nz));
2432 real *data,*ddata,*bsp_data;
2438 snew(bsp_data,nmax);
2444 for(k=3;k<order;k++) {
2447 for(l=1;l<(k-1);l++)
2448 data[k-l-1]=div*(l*data[k-l-2]+(k-l)*data[k-l-1]);
2449 data[0]=div*data[0];
2453 for(k=1;k<order;k++)
2454 ddata[k]=data[k-1]-data[k];
2457 for(l=1;l<(order-1);l++)
2458 data[order-l-1]=div*(l*data[order-l-2]+(order-l)*data[order-l-1]);
2459 data[0]=div*data[0];
2463 for(i=1;i<=order;i++)
2464 bsp_data[i]=data[i-1];
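/* bsp_data now holds the cardinal B-spline of the requested order sampled at
 * the integer points and zero elsewhere; the modulus of its DFT (computed
 * below) enters the denominator of the reciprocal-space influence function
 * in solve_pme_yzx.
 */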
2466 make_dft_mod(bsp_mod[XX],bsp_data,nx);
2467 make_dft_mod(bsp_mod[YY],bsp_data,ny);
2468 make_dft_mod(bsp_mod[ZZ],bsp_data,nz);
2476 /* Return the P3M optimal influence function */
2477 static double do_p3m_influence(double z, int order)
2484 /* The formula and most constants can be found in:
2485 * Ballenegger et al., JCTC 8, 936 (2012)
2490 return 1.0 - 2.0*z2/3.0;
2493 return 1.0 - z2 + 2.0*z4/15.0;
2496 return 1.0 - 4.0*z2/3.0 + 2.0*z4/5.0 + 4.0*z2*z4/315.0;
2499 return 1.0 - 5.0*z2/3.0 + 7.0*z4/9.0 - 17.0*z2*z4/189.0 + 2.0*z4*z4/2835.0;
2502 return 1.0 - 2.0*z2 + 19.0*z4/15.0 - 256.0*z2*z4/945.0 + 62.0*z4*z4/4725.0 + 4.0*z2*z4*z4/155925.0;
2505 return 1.0 - 7.0*z2/3.0 + 28.0*z4/15.0 - 16.0*z2*z4/27.0 + 26.0*z4*z4/405.0 - 2.0*z2*z4*z4/1485.0 + 4.0*z4*z4*z4/6081075.0;
2507 return 1.0 - 8.0*z2/3.0 + 116.0*z4/45.0 - 344.0*z2*z4/315.0 + 914.0*z4*z4/4725.0 - 248.0*z4*z4*z2/22275.0 + 21844.0*z4*z4*z4/212837625.0 - 8.0*z4*z4*z4*z2/638512875.0;
2514 /* Calculate the P3M B-spline moduli for one dimension */
2515 static void make_p3m_bspline_moduli_dim(real *bsp_mod,int n,int order)
2517 double zarg,zai,sinzai,infl;
2522 gmx_fatal(FARGS,"The current P3M code only supports orders up to 8");
2529 for(i=-maxk; i<0; i++)
2533 infl = do_p3m_influence(sinzai,order);
2534 bsp_mod[n+i] = infl*infl*pow(sinzai/zai,-2.0*order);
2537 for(i=1; i<maxk; i++)
2541 infl = do_p3m_influence(sinzai,order);
2542 bsp_mod[i] = infl*infl*pow(sinzai/zai,-2.0*order);
2546 /* Calculate the P3M B-spline moduli */
2547 static void make_p3m_bspline_moduli(splinevec bsp_mod,
2548 int nx,int ny,int nz,int order)
2550 make_p3m_bspline_moduli_dim(bsp_mod[XX],nx,order);
2551 make_p3m_bspline_moduli_dim(bsp_mod[YY],ny,order);
2552 make_p3m_bspline_moduli_dim(bsp_mod[ZZ],nz,order);
2556 static void setup_coordinate_communication(pme_atomcomm_t *atc)
2564 for(i=1; i<=nslab/2; i++) {
2565 fw = (atc->nodeid + i) % nslab;
2566 bw = (atc->nodeid - i + nslab) % nslab;
2567 if (n < nslab - 1) {
2568 atc->node_dest[n] = fw;
2569 atc->node_src[n] = bw;
2572 if (n < nslab - 1) {
2573 atc->node_dest[n] = bw;
2574 atc->node_src[n] = fw;
2580 int gmx_pme_destroy(FILE *log,gmx_pme_t *pmedata)
2586 fprintf(log,"Destroying PME data structures.\n");
2589 sfree((*pmedata)->nnx);
2590 sfree((*pmedata)->nny);
2591 sfree((*pmedata)->nnz);
2593 pmegrids_destroy(&(*pmedata)->pmegridA);
2595 sfree((*pmedata)->fftgridA);
2596 sfree((*pmedata)->cfftgridA);
2597 gmx_parallel_3dfft_destroy((*pmedata)->pfft_setupA);
2599 if ((*pmedata)->pmegridB.grid.grid != NULL)
2601 pmegrids_destroy(&(*pmedata)->pmegridB);
2602 sfree((*pmedata)->fftgridB);
2603 sfree((*pmedata)->cfftgridB);
2604 gmx_parallel_3dfft_destroy((*pmedata)->pfft_setupB);
2606 for(thread=0; thread<(*pmedata)->nthread; thread++)
2608 free_work(&(*pmedata)->work[thread]);
2610 sfree((*pmedata)->work);
2618 static int mult_up(int n,int f)
2620 return ((n + f - 1)/f)*f;
2624 static double pme_load_imbalance(gmx_pme_t pme)
2629 nma = pme->nnodes_major;
2630 nmi = pme->nnodes_minor;
2632 n1 = mult_up(pme->nkx,nma)*mult_up(pme->nky,nmi)*pme->nkz;
2633 n2 = mult_up(pme->nkx,nma)*mult_up(pme->nkz,nmi)*pme->nky;
2634 n3 = mult_up(pme->nky,nma)*mult_up(pme->nkz,nmi)*pme->nkx;
2636 /* pme_solve is roughly double the cost of an fft */
2638 return (n1 + n2 + 3*n3)/(double)(6*pme->nkx*pme->nky*pme->nkz);
2641 static void init_atomcomm(gmx_pme_t pme,pme_atomcomm_t *atc, t_commrec *cr,
2642 int dimind,gmx_bool bSpread)
2646 atc->dimind = dimind;
2651 if (pme->nnodes > 1)
2653 atc->mpi_comm = pme->mpi_comm_d[dimind];
2654 MPI_Comm_size(atc->mpi_comm,&atc->nslab);
2655 MPI_Comm_rank(atc->mpi_comm,&atc->nodeid);
2659 fprintf(debug,"For PME atom communication in dimind %d: nslab %d rank %d\n",atc->dimind,atc->nslab,atc->nodeid);
2663 atc->bSpread = bSpread;
2664 atc->pme_order = pme->pme_order;
2668 /* These three allocations are not required for particle decomp. */
2669 snew(atc->node_dest,atc->nslab);
2670 snew(atc->node_src,atc->nslab);
2671 setup_coordinate_communication(atc);
2673 snew(atc->count_thread,pme->nthread);
2674 for(thread=0; thread<pme->nthread; thread++)
2676 snew(atc->count_thread[thread],atc->nslab);
2678 atc->count = atc->count_thread[0];
2679 snew(atc->rcount,atc->nslab);
2680 snew(atc->buf_index,atc->nslab);
2683 atc->nthread = pme->nthread;
2684 if (atc->nthread > 1)
2686 snew(atc->thread_plist,atc->nthread);
2688 snew(atc->spline,atc->nthread);
2689 for(thread=0; thread<atc->nthread; thread++)
2691 if (atc->nthread > 1)
2693 snew(atc->thread_plist[thread].n,atc->nthread+2*GMX_CACHE_SEP);
2694 atc->thread_plist[thread].n += GMX_CACHE_SEP;
2700 init_overlap_comm(pme_overlap_t * ol,
2710 int lbnd,rbnd,maxlr,b,i;
2713 pme_grid_comm_t *pgc;
2715 int fft_start,fft_end,send_index1,recv_index1;
2718 ol->mpi_comm = comm;
2721 ol->nnodes = nnodes;
2722 ol->nodeid = nodeid;
2724 /* Linear translation of the PME grid won't affect reciprocal space
2725 * calculations, so to optimize we only interpolate "upwards",
2726 * which also means we only have to consider overlap in one direction.
2727 * I.e., particles on this node might also be spread to grid indices
2728 * that belong to higher nodes (modulo nnodes)
2731 snew(ol->s2g0,ol->nnodes+1);
2732 snew(ol->s2g1,ol->nnodes);
2733 if (debug) { fprintf(debug,"PME slab boundaries:"); }
2734 for(i=0; i<nnodes; i++)
2736 /* s2g0 is the local interpolation grid start.
2737 * s2g1 is the local interpolation grid end.
2738 * Because grid overlap communication only goes forward,
2739 * the grid slabs for the FFTs should be rounded down.
2741 ol->s2g0[i] = ( i *ndata + 0 )/nnodes;
2742 ol->s2g1[i] = ((i+1)*ndata + nnodes-1)/nnodes + norder - 1;
2746 fprintf(debug," %3d %3d",ol->s2g0[i],ol->s2g1[i]);
2749 ol->s2g0[nnodes] = ndata;
2750 if (debug) { fprintf(debug,"\n"); }
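/* Worked example: ndata=30, nnodes=4, norder=4 gives
 * s2g0 = {0,7,15,22,30} and s2g1 = {11,18,26,33}: each slab owns roughly
 * ndata/nnodes FFT grid lines but spreads charges onto up to norder
 * (norder-1 for a divisible grid) lines beyond its upper boundary; that
 * excess is the overlap communicated by the code below.
 */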
2752 /* Determine with how many nodes we need to communicate the grid overlap */
2758 for(i=0; i<nnodes; i++)
2760 if ((i+b < nnodes && ol->s2g1[i] > ol->s2g0[i+b]) ||
2761 (i+b >= nnodes && ol->s2g1[i] > ol->s2g0[i+b-nnodes] + ndata))
2767 while (bCont && b < nnodes);
2768 ol->noverlap_nodes = b - 1;
2770 snew(ol->send_id,ol->noverlap_nodes);
2771 snew(ol->recv_id,ol->noverlap_nodes);
2772 for(b=0; b<ol->noverlap_nodes; b++)
2774 ol->send_id[b] = (ol->nodeid + (b + 1)) % ol->nnodes;
2775 ol->recv_id[b] = (ol->nodeid - (b + 1) + ol->nnodes) % ol->nnodes;
2777 snew(ol->comm_data, ol->noverlap_nodes);
2779 for(b=0; b<ol->noverlap_nodes; b++)
2781 pgc = &ol->comm_data[b];
2783 fft_start = ol->s2g0[ol->send_id[b]];
2784 fft_end = ol->s2g0[ol->send_id[b]+1];
2785 if (ol->send_id[b] < nodeid)
2790 send_index1 = ol->s2g1[nodeid];
2791 send_index1 = min(send_index1,fft_end);
2792 pgc->send_index0 = fft_start;
2793 pgc->send_nindex = max(0,send_index1 - pgc->send_index0);
2795 /* We always start receiving at the first index of our slab */
2796 fft_start = ol->s2g0[ol->nodeid];
2797 fft_end = ol->s2g0[ol->nodeid+1];
2798 recv_index1 = ol->s2g1[ol->recv_id[b]];
2799 if (ol->recv_id[b] > nodeid)
2801 recv_index1 -= ndata;
2803 recv_index1 = min(recv_index1,fft_end);
2804 pgc->recv_index0 = fft_start;
2805 pgc->recv_nindex = max(0,recv_index1 - pgc->recv_index0);
2808 /* For a non-divisible grid we need pme_order instead of pme_order-1 */
2809 snew(ol->sendbuf,norder*commplainsize);
2810 snew(ol->recvbuf,norder*commplainsize);
2814 make_gridindex5_to_localindex(int n,int local_start,int local_range,
2815 int **global_to_local,
2816 real **fraction_shift)
2824 for(i=0; (i<5*n); i++)
2826 /* Determine the global to local grid index */
2827 gtl[i] = (i - local_start + n) % n;
2828 /* For coordinates that fall within the local grid the fraction
2829 * is correct; we don't need to shift it.
2832 if (local_range < n)
2834 /* Due to rounding issues i could be 1 beyond the lower or
2835 * upper boundary of the local grid. Correct the index for this.
2836 * If we shift the index, we need to shift the fraction by
2837 * the same amount in the other direction to not affect
2839 * Note that due to this shifting the weights at the end of
2840 * the spline might change, but that will only involve values
2841 * between zero and values close to the precision of a real,
2842 * which is anyhow the accuracy of the whole mesh calculation.
2844 /* With local_range=0 we should not change i=local_start */
2845 if (i % n != local_start)
2852 else if (gtl[i] == local_range)
2854 gtl[i] = local_range - 1;
2861 *global_to_local = gtl;
2862 *fraction_shift = fsh;
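/* Illustrative sketch (not part of the original source): the wrap of a
 * global grid index (from the 5n-sized table) onto the local grid as done
 * above. For a hypothetical grid of n=10 points with local_start=4,
 * global index 4 maps to local 0, index 9 to local 5, and index 14
 * (= 4 modulo 10) wraps back to local 0.
 */
static int global_to_local_example(int i, int n, int local_start)
{
    /* same formula as used for gtl[i] above; i may run up to 5*n-1 */
    return (i - local_start + n) % n;
}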
2865 static void sse_mask_init(pme_spline_work_t *work,int order)
2872 zero_SSE = _mm_setzero_ps();
2874 for(of=0; of<8-(order-1); of++)
2878 tmp[i] = (i >= of && i < of+order ? 1 : 0);
2880 work->mask_SSE0[of] = _mm_loadu_ps(tmp);
2881 work->mask_SSE1[of] = _mm_loadu_ps(tmp+4);
2882 work->mask_SSE0[of] = _mm_cmpgt_ps(work->mask_SSE0[of],zero_SSE);
2883 work->mask_SSE1[of] = _mm_cmpgt_ps(work->mask_SSE1[of],zero_SSE);
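/* Illustrative sketch (not part of the original source): the scalar rule
 * behind the SSE masks built above. For a hypothetical order=4 and offset
 * of=1, the 8-wide window keeps elements 1..4, i.e.
 * tmp = {0,1,1,1, 1,0,0,0}; mask_SSE0 then covers elements 0-3 and
 * mask_SSE1 elements 4-7, masking out the padding around the spline.
 */
static int spline_element_active(int i, int of, int order)
{
    return (i >= of && i < of + order) ? 1 : 0;
}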
2889 gmx_pme_check_grid_restrictions(FILE *fplog,char dim,int nnodes,int *nk)
2893 if (*nk % nnodes != 0)
2895 nk_new = nnodes*(*nk/nnodes + 1);
2897 if (2*nk_new >= 3*(*nk))
2899          gmx_fatal(FARGS,"The PME grid size in dim %c (%d) is not divisible by the number of nodes doing PME in dim %c (%d). The grid size would have to be increased by more than 50%% to make the grid divisible. Change the total number of nodes or the number of domain decomposition cells in x or the PME grid %c dimension (and the cut-off).",
2900 dim,*nk,dim,nnodes,dim);
2905          fprintf(fplog,"\nNOTE: The PME grid size in dim %c (%d) is not divisible by the number of nodes doing PME in dim %c (%d). Increasing the PME grid size in dim %c to %d. This will increase the accuracy and will not decrease the performance significantly on this number of nodes. For optimal performance change the total number of nodes or the number of domain decomposition cells in x or the PME grid %c dimension (and the cut-off).\n\n",
2906 dim,*nk,dim,nnodes,dim,nk_new,dim);
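/* Illustrative sketch (not part of the original source): the rounding rule
 * applied above, for two hypothetical cases.
 *   nk=25, nnodes=8: nk_new = 8*(25/8 + 1) = 32 and 2*32 < 3*25,
 *                    so the grid is enlarged to 32 with only a note.
 *   nk=9,  nnodes=8: nk_new = 16, but 2*16 >= 3*9, i.e. more than a
 *                    50% increase, so gmx_fatal is called instead.
 */
static int round_grid_up_example(int nk, int nnodes)
{
    return nnodes*(nk/nnodes + 1);
}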
2913 int gmx_pme_init(gmx_pme_t * pmedata,
2919 gmx_bool bFreeEnergy,
2920 gmx_bool bReproducible,
2925 pme_atomcomm_t *atc;
2929 fprintf(debug,"Creating PME data structures.\n");
2932 pme->redist_init = FALSE;
2933 pme->sum_qgrid_tmp = NULL;
2934 pme->sum_qgrid_dd_tmp = NULL;
2935 pme->buf_nalloc = 0;
2936 pme->redist_buf_nalloc = 0;
2939 pme->bPPnode = TRUE;
2941 pme->nnodes_major = nnodes_major;
2942 pme->nnodes_minor = nnodes_minor;
2945 if (nnodes_major*nnodes_minor > 1)
2947 pme->mpi_comm = cr->mpi_comm_mygroup;
2949 MPI_Comm_rank(pme->mpi_comm,&pme->nodeid);
2950 MPI_Comm_size(pme->mpi_comm,&pme->nnodes);
2951 if (pme->nnodes != nnodes_major*nnodes_minor)
2953 gmx_incons("PME node count mismatch");
2958 pme->mpi_comm = MPI_COMM_NULL;
2962 if (pme->nnodes == 1)
2964 pme->ndecompdim = 0;
2965 pme->nodeid_major = 0;
2966 pme->nodeid_minor = 0;
2968 pme->mpi_comm_d[0] = pme->mpi_comm_d[1] = MPI_COMM_NULL;
2973 if (nnodes_minor == 1)
2976 pme->mpi_comm_d[0] = pme->mpi_comm;
2977 pme->mpi_comm_d[1] = MPI_COMM_NULL;
2979 pme->ndecompdim = 1;
2980 pme->nodeid_major = pme->nodeid;
2981 pme->nodeid_minor = 0;
2984 else if (nnodes_major == 1)
2987 pme->mpi_comm_d[0] = MPI_COMM_NULL;
2988 pme->mpi_comm_d[1] = pme->mpi_comm;
2990 pme->ndecompdim = 1;
2991 pme->nodeid_major = 0;
2992 pme->nodeid_minor = pme->nodeid;
2996 if (pme->nnodes % nnodes_major != 0)
2998 gmx_incons("For 2D PME decomposition, #PME nodes must be divisible by the number of nodes in the major dimension");
3000 pme->ndecompdim = 2;
3003 MPI_Comm_split(pme->mpi_comm,pme->nodeid % nnodes_minor,
3004 pme->nodeid,&pme->mpi_comm_d[0]); /* My communicator along major dimension */
3005 MPI_Comm_split(pme->mpi_comm,pme->nodeid/nnodes_minor,
3006 pme->nodeid,&pme->mpi_comm_d[1]); /* My communicator along minor dimension */
3008 MPI_Comm_rank(pme->mpi_comm_d[0],&pme->nodeid_major);
3009 MPI_Comm_size(pme->mpi_comm_d[0],&pme->nnodes_major);
3010 MPI_Comm_rank(pme->mpi_comm_d[1],&pme->nodeid_minor);
3011 MPI_Comm_size(pme->mpi_comm_d[1],&pme->nnodes_minor);
3014 pme->bPPnode = (cr->duty & DUTY_PP);
3017 pme->nthread = nthread;
3019 if (ir->ePBC == epbcSCREW)
3021 gmx_fatal(FARGS,"pme does not (yet) work with pbc = screw");
3024 pme->bFEP = ((ir->efep != efepNO) && bFreeEnergy);
3028 pme->bP3M = (ir->coulombtype == eelP3M_AD || getenv("GMX_PME_P3M") != NULL);
3029 pme->pme_order = ir->pme_order;
3030 pme->epsilon_r = ir->epsilon_r;
3032 if (pme->pme_order > PME_ORDER_MAX)
3034 gmx_fatal(FARGS,"pme_order (%d) is larger than the maximum allowed value (%d). Modify and recompile the code if you really need such a high order.",
3035 pme->pme_order,PME_ORDER_MAX);
3038 /* Currently pme.c supports only the fft5d FFT code.
3039 * Therefore the grid always needs to be divisible by nnodes.
3040 * When the old 1D code is also supported again, change this check.
3042 * This check should be done before calling gmx_pme_init
3043      * and fplog should be passed instead of stderr.
3045 if (pme->ndecompdim >= 2)
3047 if (pme->ndecompdim >= 1)
3050 gmx_pme_check_grid_restrictions(pme->nodeid==0 ? stderr : NULL,
3051 'x',nnodes_major,&pme->nkx);
3052 gmx_pme_check_grid_restrictions(pme->nodeid==0 ? stderr : NULL,
3053 'y',nnodes_minor,&pme->nky);
3057 if (pme->nkx <= pme->pme_order*(pme->nnodes_major > 1 ? 2 : 1) ||
3058 pme->nky <= pme->pme_order*(pme->nnodes_minor > 1 ? 2 : 1) ||
3059 pme->nkz <= pme->pme_order)
3061          gmx_fatal(FARGS,"The pme grid dimensions need to be larger than pme_order (%d) and in parallel larger than 2*pme_order for x and/or y",pme->pme_order);
3064 if (pme->nnodes > 1) {
3068 MPI_Type_contiguous(DIM, mpi_type, &(pme->rvec_mpi));
3069 MPI_Type_commit(&(pme->rvec_mpi));
3072 /* Note that the charge spreading and force gathering, which usually
3073 * takes about the same amount of time as FFT+solve_pme,
3074 * is always fully load balanced
3075 * (unless the charge distribution is inhomogeneous).
3078 imbal = pme_load_imbalance(pme);
3079 if (imbal >= 1.2 && pme->nodeid_major == 0 && pme->nodeid_minor == 0)
3083 "NOTE: The load imbalance in PME FFT and solve is %d%%.\n"
3084 " For optimal PME load balancing\n"
3085 " PME grid_x (%d) and grid_y (%d) should be divisible by #PME_nodes_x (%d)\n"
3086 " and PME grid_y (%d) and grid_z (%d) should be divisible by #PME_nodes_y (%d)\n"
3088 (int)((imbal-1)*100 + 0.5),
3089 pme->nkx,pme->nky,pme->nnodes_major,
3090 pme->nky,pme->nkz,pme->nnodes_minor);
3094      /* For a non-divisible grid we need pme_order instead of pme_order-1 */
3095 /* In sum_qgrid_dd x overlap is copied in place: take padding into account.
3096 * y is always copied through a buffer: we don't need padding in z,
3097 * but we do need the overlap in x because of the communication order.
3099 init_overlap_comm(&pme->overlap[0],pme->pme_order,
3103 pme->nnodes_major,pme->nodeid_major,
3105 (div_round_up(pme->nky,pme->nnodes_minor)+pme->pme_order)*(pme->nkz+pme->pme_order-1));
3107 init_overlap_comm(&pme->overlap[1],pme->pme_order,
3111 pme->nnodes_minor,pme->nodeid_minor,
3113 (div_round_up(pme->nkx,pme->nnodes_major)+pme->pme_order)*pme->nkz);
3115 /* Check for a limitation of the (current) sum_fftgrid_dd code */
3116 if (pme->nthread > 1 &&
3117 (pme->overlap[0].noverlap_nodes > 1 ||
3118 pme->overlap[1].noverlap_nodes > 1))
3120          gmx_fatal(FARGS,"With threads the number of grid lines per node along x and/or y should be pme_order (%d) or more, or exactly pme_order-1",pme->pme_order);
3123 snew(pme->bsp_mod[XX],pme->nkx);
3124 snew(pme->bsp_mod[YY],pme->nky);
3125 snew(pme->bsp_mod[ZZ],pme->nkz);
3127 /* The required size of the interpolation grid, including overlap.
3128 * The allocated size (pmegrid_n?) might be slightly larger.
3130 pme->pmegrid_nx = pme->overlap[0].s2g1[pme->nodeid_major] -
3131 pme->overlap[0].s2g0[pme->nodeid_major];
3132 pme->pmegrid_ny = pme->overlap[1].s2g1[pme->nodeid_minor] -
3133 pme->overlap[1].s2g0[pme->nodeid_minor];
3134 pme->pmegrid_nz_base = pme->nkz;
3135 pme->pmegrid_nz = pme->pmegrid_nz_base + pme->pme_order - 1;
3136 set_grid_alignment(&pme->pmegrid_nz,pme->pme_order);
3138 pme->pmegrid_start_ix = pme->overlap[0].s2g0[pme->nodeid_major];
3139 pme->pmegrid_start_iy = pme->overlap[1].s2g0[pme->nodeid_minor];
3140 pme->pmegrid_start_iz = 0;
3142 make_gridindex5_to_localindex(pme->nkx,
3143 pme->pmegrid_start_ix,
3144 pme->pmegrid_nx - (pme->pme_order-1),
3145 &pme->nnx,&pme->fshx);
3146 make_gridindex5_to_localindex(pme->nky,
3147 pme->pmegrid_start_iy,
3148 pme->pmegrid_ny - (pme->pme_order-1),
3149 &pme->nny,&pme->fshy);
3150 make_gridindex5_to_localindex(pme->nkz,
3151 pme->pmegrid_start_iz,
3152 pme->pmegrid_nz_base,
3153 &pme->nnz,&pme->fshz);
3155 pmegrids_init(&pme->pmegridA,
3156 pme->pmegrid_nx,pme->pmegrid_ny,pme->pmegrid_nz,
3157 pme->pmegrid_nz_base,
3160 pme->overlap[0].s2g1[pme->nodeid_major]-pme->overlap[0].s2g0[pme->nodeid_major+1],
3161 pme->overlap[1].s2g1[pme->nodeid_minor]-pme->overlap[1].s2g0[pme->nodeid_minor+1]);
3163 sse_mask_init(&pme->spline_work,pme->pme_order);
3165 ndata[0] = pme->nkx;
3166 ndata[1] = pme->nky;
3167 ndata[2] = pme->nkz;
3169 /* This routine will allocate the grid data to fit the FFTs */
3170 gmx_parallel_3dfft_init(&pme->pfft_setupA,ndata,
3171 &pme->fftgridA,&pme->cfftgridA,
3173 pme->overlap[0].s2g0,pme->overlap[1].s2g0,
3174 bReproducible,pme->nthread);
3178 pmegrids_init(&pme->pmegridB,
3179 pme->pmegrid_nx,pme->pmegrid_ny,pme->pmegrid_nz,
3180 pme->pmegrid_nz_base,
3183 pme->nkx % pme->nnodes_major != 0,
3184 pme->nky % pme->nnodes_minor != 0);
3186 gmx_parallel_3dfft_init(&pme->pfft_setupB,ndata,
3187 &pme->fftgridB,&pme->cfftgridB,
3189 pme->overlap[0].s2g0,pme->overlap[1].s2g0,
3190 bReproducible,pme->nthread);
3194 pme->pmegridB.grid.grid = NULL;
3195 pme->fftgridB = NULL;
3196 pme->cfftgridB = NULL;
3201 /* Use plain SPME B-spline interpolation */
3202 make_bspline_moduli(pme->bsp_mod,pme->nkx,pme->nky,pme->nkz,pme->pme_order);
3206 /* Use the P3M grid-optimized influence function */
3207 make_p3m_bspline_moduli(pme->bsp_mod,pme->nkx,pme->nky,pme->nkz,pme->pme_order);
3210 /* Use atc[0] for spreading */
3211 init_atomcomm(pme,&pme->atc[0],cr,nnodes_major > 1 ? 0 : 1,TRUE);
3212 if (pme->ndecompdim >= 2)
3214 init_atomcomm(pme,&pme->atc[1],cr,1,FALSE);
3217 if (pme->nnodes == 1) {
3218 pme->atc[0].n = homenr;
3219 pme_realloc_atomcomm_things(&pme->atc[0]);
3225 /* Use fft5d, order after FFT is y major, z, x minor */
3227 snew(pme->work,pme->nthread);
3228 for(thread=0; thread<pme->nthread; thread++)
3230 realloc_work(&pme->work[thread],pme->nkx);
3240 static void copy_local_grid(gmx_pme_t pme,
3241 pmegrids_t *pmegrids,int thread,real *fftgrid)
3243 ivec local_fft_ndata,local_fft_offset,local_fft_size;
3247 int offx,offy,offz,x,y,z,i0,i0t;
3252 gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
3256 fft_my = local_fft_size[YY];
3257 fft_mz = local_fft_size[ZZ];
3259 pmegrid = &pmegrids->grid_th[thread];
3261 nsx = pmegrid->n[XX];
3262 nsy = pmegrid->n[YY];
3263 nsz = pmegrid->n[ZZ];
3265 for(d=0; d<DIM; d++)
3267 nf[d] = min(pmegrid->n[d] - (pmegrid->order - 1),
3268 local_fft_ndata[d] - pmegrid->offset[d]);
3271 offx = pmegrid->offset[XX];
3272 offy = pmegrid->offset[YY];
3273 offz = pmegrid->offset[ZZ];
3275 /* Directly copy the non-overlapping parts of the local grids.
3276 * This also initializes the full grid.
3278 grid_th = pmegrid->grid;
3279 for(x=0; x<nf[XX]; x++)
3281 for(y=0; y<nf[YY]; y++)
3283 i0 = ((offx + x)*fft_my + (offy + y))*fft_mz + offz;
3284 i0t = (x*nsy + y)*nsz;
3285 for(z=0; z<nf[ZZ]; z++)
3287 fftgrid[i0+z] = grid_th[i0t+z];
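/* Illustrative sketch (not part of the original source): the row-major
 * addressing used when copying the thread-local grid into the FFT grid
 * above. For hypothetical padded y/z dimensions my and mz, element (x,y,z)
 * sits at index (x*my + y)*mz + z, so each inner loop above copies one
 * contiguous line along z.
 */
static int grid_index_example(int x, int y, int z, int my, int mz)
{
    return (x*my + y)*mz + z;
}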
3293 static void print_sendbuf(gmx_pme_t pme,real *sendbuf)
3295 ivec local_fft_ndata,local_fft_offset,local_fft_size;
3296 pme_overlap_t *overlap;
3300 gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
3304 /* Major dimension */
3305 overlap = &pme->overlap[0];
3307 nind = overlap->comm_data[0].send_nindex;
3309 for(y=0; y<local_fft_ndata[YY]; y++) {
3315 for(x=0; x<nind; x++) {
3316 for(y=0; y<local_fft_ndata[YY]; y++) {
3318 for(z=0; z<local_fft_ndata[ZZ]; z++) {
3319 if (sendbuf[i] != 0) n++;
3329 reduce_threadgrid_overlap(gmx_pme_t pme,
3330 const pmegrids_t *pmegrids,int thread,
3331 real *fftgrid,real *commbuf_x,real *commbuf_y)
3333 ivec local_fft_ndata,local_fft_offset,local_fft_size;
3334 int fft_nx,fft_ny,fft_nz;
3339 int offx,offy,offz,x,y,z,i0,i0t;
3340 int sx,sy,sz,fx,fy,fz,tx1,ty1,tz1,ox,oy,oz;
3341 gmx_bool bClearBufX,bClearBufY,bClearBufXY,bClearBuf;
3342 gmx_bool bCommX,bCommY;
3345 const pmegrid_t *pmegrid,*pmegrid_g,*pmegrid_f;
3346 const real *grid_th;
3349 gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
3353 fft_nx = local_fft_ndata[XX];
3354 fft_ny = local_fft_ndata[YY];
3355 fft_nz = local_fft_ndata[ZZ];
3357 fft_my = local_fft_size[YY];
3358 fft_mz = local_fft_size[ZZ];
3360      /* This routine is called when all threads have finished spreading.
3361      * Here each thread sums the grid contributions calculated by other threads
3362      * into its own thread-local part of the grid.
3363      * To minimize the number of grid copying operations,
3364      * this routine sums directly from the pmegrid into the fftgrid.
3367      /* Determine which part of the full node grid we should operate on:
3368      * this is our thread-local part of the full grid.
3370 pmegrid = &pmegrids->grid_th[thread];
3372 for(d=0; d<DIM; d++)
3374 ne[d] = min(pmegrid->offset[d]+pmegrid->n[d]-(pmegrid->order-1),
3375 local_fft_ndata[d]);
3378 offx = pmegrid->offset[XX];
3379 offy = pmegrid->offset[YY];
3380 offz = pmegrid->offset[ZZ];
3387 /* Now loop over all the thread data blocks that contribute
3388 * to the grid region we (our thread) are operating on.
3390      /* Note that fft_nx/y is equal to the number of grid points
3391      * between the first point of our node grid and that of the next node.
3393 for(sx=0; sx>=-pmegrids->nthread_comm[XX]; sx--)
3395 fx = pmegrid->ci[XX] + sx;
3399 fx += pmegrids->nc[XX];
3401 bCommX = (pme->nnodes_major > 1);
3403 pmegrid_g = &pmegrids->grid_th[fx*pmegrids->nc[YY]*pmegrids->nc[ZZ]];
3404 ox += pmegrid_g->offset[XX];
3407 tx1 = min(ox + pmegrid_g->n[XX],ne[XX]);
3411 tx1 = min(ox + pmegrid_g->n[XX],pme->pme_order);
3414 for(sy=0; sy>=-pmegrids->nthread_comm[YY]; sy--)
3416 fy = pmegrid->ci[YY] + sy;
3420 fy += pmegrids->nc[YY];
3422 bCommY = (pme->nnodes_minor > 1);
3424 pmegrid_g = &pmegrids->grid_th[fy*pmegrids->nc[ZZ]];
3425 oy += pmegrid_g->offset[YY];
3428 ty1 = min(oy + pmegrid_g->n[YY],ne[YY]);
3432 ty1 = min(oy + pmegrid_g->n[YY],pme->pme_order);
3435 for(sz=0; sz>=-pmegrids->nthread_comm[ZZ]; sz--)
3437 fz = pmegrid->ci[ZZ] + sz;
3441 fz += pmegrids->nc[ZZ];
3444 pmegrid_g = &pmegrids->grid_th[fz];
3445 oz += pmegrid_g->offset[ZZ];
3446 tz1 = min(oz + pmegrid_g->n[ZZ],ne[ZZ]);
3448 if (sx == 0 && sy == 0 && sz == 0)
3450 /* We have already added our local contribution
3451 * before calling this routine, so skip it here.
3456 thread_f = (fx*pmegrids->nc[YY] + fy)*pmegrids->nc[ZZ] + fz;
3458 pmegrid_f = &pmegrids->grid_th[thread_f];
3460 grid_th = pmegrid_f->grid;
3462 nsx = pmegrid_f->n[XX];
3463 nsy = pmegrid_f->n[YY];
3464 nsz = pmegrid_f->n[ZZ];
3466 #ifdef DEBUG_PME_REDUCE
3467 printf("n%d t%d add %d %2d %2d %2d %2d %2d %2d %2d-%2d %2d-%2d, %2d-%2d %2d-%2d, %2d-%2d %2d-%2d\n",
3468 pme->nodeid,thread,thread_f,
3469 pme->pmegrid_start_ix,
3470 pme->pmegrid_start_iy,
3471 pme->pmegrid_start_iz,
3473 offx-ox,tx1-ox,offx,tx1,
3474 offy-oy,ty1-oy,offy,ty1,
3475 offz-oz,tz1-oz,offz,tz1);
3478 if (!(bCommX || bCommY))
3480 /* Copy from the thread local grid to the node grid */
3481 for(x=offx; x<tx1; x++)
3483 for(y=offy; y<ty1; y++)
3485 i0 = (x*fft_my + y)*fft_mz;
3486 i0t = ((x - ox)*nsy + (y - oy))*nsz - oz;
3487 for(z=offz; z<tz1; z++)
3489 fftgrid[i0+z] += grid_th[i0t+z];
3496 /* The order of this conditional decides
3497              * where the corner volume gets stored with x+y decomposition.
3501 commbuf = commbuf_y;
3502 buf_my = ty1 - offy;
3505 /* We index commbuf modulo the local grid size */
3506 commbuf += buf_my*fft_nx*fft_nz;
3508 bClearBuf = bClearBufXY;
3509 bClearBufXY = FALSE;
3513 bClearBuf = bClearBufY;
3519 commbuf = commbuf_x;
3521 bClearBuf = bClearBufX;
3525 /* Copy to the communication buffer */
3526 for(x=offx; x<tx1; x++)
3528 for(y=offy; y<ty1; y++)
3530 i0 = (x*buf_my + y)*fft_nz;
3531 i0t = ((x - ox)*nsy + (y - oy))*nsz - oz;
3535 /* First access of commbuf, initialize it */
3536 for(z=offz; z<tz1; z++)
3538 commbuf[i0+z] = grid_th[i0t+z];
3543 for(z=offz; z<tz1; z++)
3545 commbuf[i0+z] += grid_th[i0t+z];
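/* Illustrative sketch (not part of the original source): the thread overlap
 * reduction above, reduced to one dimension. Assume each of nthread threads
 * owns a slab of `slab` points of a periodic grid, and spreading with a
 * spline of width `order` also wrote order-1 points past the slab end into
 * the thread-local copy grid_th[t] (which thus holds slab+order-1 values).
 * Because spreading only goes "upwards", each slab only has to pull in the
 * spill of its left neighbour.
 */
static void reduce_overlap_1d_example(int nthread, int slab, int order,
                                      real **grid_th, real *grid)
{
    int t, j;

    for (t = 0; t < nthread; t++)
    {
        int left = (t - 1 + nthread) % nthread;

        /* our own, non-overlapping contribution */
        for (j = 0; j < slab; j++)
        {
            grid[t*slab + j] = grid_th[t][j];
        }
        /* the contribution the left neighbour spilled into our slab */
        for (j = 0; j < order - 1; j++)
        {
            grid[t*slab + j] += grid_th[left][slab + j];
        }
    }
}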
3557 static void sum_fftgrid_dd(gmx_pme_t pme,real *fftgrid)
3559 ivec local_fft_ndata,local_fft_offset,local_fft_size;
3560 pme_overlap_t *overlap;
3562 int recv_index0,recv_nindex;
3566 int ipulse,send_id,recv_id,datasize,gridsize,size_yx;
3567 real *sendptr,*recvptr;
3568 int x,y,z,indg,indb;
3570 /* Note that this routine is only used for forward communication.
3571 * Since the force gathering, unlike the charge spreading,
3572 * can be trivially parallelized over the particles,
3573 * the backwards process is much simpler and can use the "old"
3574 * communication setup.
3577 gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
3582 /* Currently supports only a single communication pulse */
3584 /* for(ipulse=0;ipulse<overlap->noverlap_nodes;ipulse++) */
3585 if (pme->nnodes_minor > 1)
3587          /* Minor dimension */
3588 overlap = &pme->overlap[1];
3590 if (pme->nnodes_major > 1)
3592 size_yx = pme->overlap[0].comm_data[0].send_nindex;
3598 datasize = (local_fft_ndata[XX]+size_yx)*local_fft_ndata[ZZ];
3602 send_id = overlap->send_id[ipulse];
3603 recv_id = overlap->recv_id[ipulse];
3604 send_nindex = overlap->comm_data[ipulse].send_nindex;
3605 /* recv_index0 = overlap->comm_data[ipulse].recv_index0; */
3607 recv_nindex = overlap->comm_data[ipulse].recv_nindex;
3609 sendptr = overlap->sendbuf;
3610 recvptr = overlap->recvbuf;
3613 printf("node %d comm %2d x %2d x %2d\n",pme->nodeid,
3614 local_fft_ndata[XX]+size_yx,send_nindex,local_fft_ndata[ZZ]);
3615 printf("node %d send %f, %f\n",pme->nodeid,
3616 sendptr[0],sendptr[send_nindex*datasize-1]);
3620 MPI_Sendrecv(sendptr,send_nindex*datasize,GMX_MPI_REAL,
3622 recvptr,recv_nindex*datasize,GMX_MPI_REAL,
3624 overlap->mpi_comm,&stat);
3627 for(x=0; x<local_fft_ndata[XX]; x++)
3629 for(y=0; y<recv_nindex; y++)
3631 indg = (x*local_fft_size[YY] + y)*local_fft_size[ZZ];
3632 indb = (x*recv_nindex + y)*local_fft_ndata[ZZ];
3633 for(z=0; z<local_fft_ndata[ZZ]; z++)
3635 fftgrid[indg+z] += recvptr[indb+z];
3639 if (pme->nnodes_major > 1)
3641 sendptr = pme->overlap[0].sendbuf;
3642 for(x=0; x<size_yx; x++)
3644 for(y=0; y<recv_nindex; y++)
3646 indg = (x*local_fft_ndata[YY] + y)*local_fft_ndata[ZZ];
3647                      indb = ((local_fft_ndata[XX] + x)*recv_nindex + y)*local_fft_ndata[ZZ];
3648 for(z=0; z<local_fft_ndata[ZZ]; z++)
3650 sendptr[indg+z] += recvptr[indb+z];
3657 /* for(ipulse=0;ipulse<overlap->noverlap_nodes;ipulse++) */
3658 if (pme->nnodes_major > 1)
3660 /* Major dimension */
3661 overlap = &pme->overlap[0];
3663 datasize = local_fft_ndata[YY]*local_fft_ndata[ZZ];
3664 gridsize = local_fft_size[YY] *local_fft_size[ZZ];
3668 send_id = overlap->send_id[ipulse];
3669 recv_id = overlap->recv_id[ipulse];
3670 send_nindex = overlap->comm_data[ipulse].send_nindex;
3671 /* recv_index0 = overlap->comm_data[ipulse].recv_index0; */
3673 recv_nindex = overlap->comm_data[ipulse].recv_nindex;
3675 sendptr = overlap->sendbuf;
3676 recvptr = overlap->recvbuf;
3680 fprintf(debug,"PME fftgrid comm %2d x %2d x %2d\n",
3681 send_nindex,local_fft_ndata[YY],local_fft_ndata[ZZ]);
3685 MPI_Sendrecv(sendptr,send_nindex*datasize,GMX_MPI_REAL,
3687 recvptr,recv_nindex*datasize,GMX_MPI_REAL,
3689 overlap->mpi_comm,&stat);
3692 for(x=0; x<recv_nindex; x++)
3694 for(y=0; y<local_fft_ndata[YY]; y++)
3696 indg = (x*local_fft_size[YY] + y)*local_fft_size[ZZ];
3697 indb = (x*local_fft_ndata[YY] + y)*local_fft_ndata[ZZ];
3698 for(z=0; z<local_fft_ndata[ZZ]; z++)
3700 fftgrid[indg+z] += recvptr[indb+z];
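/* Illustrative sketch (not part of the original source): the two index
 * strides used in the receive loops above. The FFT grid is padded, so
 * element (x,y,z) lives at (x*size_y + y)*size_z + z, while the
 * communication buffer is packed using the actual data dimensions. Adding
 * the received overlap therefore walks both index spaces in step, one
 * z-line at a time.
 */
static void add_packed_to_padded_example(int nx, int ny, int nz,
                                         int size_y, int size_z,
                                         const real *packed, real *padded)
{
    int x, y, z;

    for (x = 0; x < nx; x++)
    {
        for (y = 0; y < ny; y++)
        {
            int indg = (x*size_y + y)*size_z; /* padded FFT grid    */
            int indb = (x*ny     + y)*nz;     /* packed recv buffer */

            for (z = 0; z < nz; z++)
            {
                padded[indg + z] += packed[indb + z];
            }
        }
    }
}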
3708 static void spread_on_grid(gmx_pme_t pme,
3709 pme_atomcomm_t *atc,pmegrids_t *grids,
3710 gmx_bool bCalcSplines,gmx_bool bSpread,
3714 #ifdef PME_TIME_THREADS
3715 gmx_cycles_t c1,c2,c3,ct1a,ct1b,ct1c;
3716 static double cs1=0,cs2=0,cs3=0;
3717 static double cs1a[6]={0,0,0,0,0,0};
3721 nthread = pme->nthread;
3724 #ifdef PME_TIME_THREADS
3725 c1 = omp_cyc_start();
3729 #pragma omp parallel for num_threads(nthread) schedule(static)
3730 for(thread=0; thread<nthread; thread++)
3734 start = atc->n* thread /nthread;
3735 end = atc->n*(thread+1)/nthread;
3737          /* Compute the fftgrid index for all atoms,
3738          * with the help of some extra variables.
3740 calc_interpolation_idx(pme,atc,start,end,thread);
3743 #ifdef PME_TIME_THREADS
3744 c1 = omp_cyc_end(c1);
3748 #ifdef PME_TIME_THREADS
3749 c2 = omp_cyc_start();
3751 #pragma omp parallel for num_threads(nthread) schedule(static)
3752 for(thread=0; thread<nthread; thread++)
3754 splinedata_t *spline;
3757 /* make local bsplines */
3758 if (grids == NULL || grids->nthread == 1)
3760 spline = &atc->spline[0];
3764 grid = &grids->grid;
3768 spline = &atc->spline[thread];
3770 make_thread_local_ind(atc,thread,spline);
3772 grid = &grids->grid_th[thread];
3777 make_bsplines(spline->theta,spline->dtheta,pme->pme_order,
3778 atc->fractx,spline->n,spline->ind,atc->q,pme->bFEP);
3783 /* put local atoms on grid. */
3784 #ifdef PME_TIME_SPREAD
3785 ct1a = omp_cyc_start();
3787 spread_q_bsplines_thread(grid,atc,spline,&pme->spline_work);
3789 if (grids->nthread > 1)
3791 copy_local_grid(pme,grids,thread,fftgrid);
3793 #ifdef PME_TIME_SPREAD
3794 ct1a = omp_cyc_end(ct1a);
3795 cs1a[thread] += (double)ct1a;
3799 #ifdef PME_TIME_THREADS
3800 c2 = omp_cyc_end(c2);
3804 if (bSpread && grids->nthread > 1)
3806 #ifdef PME_TIME_THREADS
3807 c3 = omp_cyc_start();
3809 #pragma omp parallel for num_threads(grids->nthread) schedule(static)
3810 for(thread=0; thread<grids->nthread; thread++)
3812 reduce_threadgrid_overlap(pme,grids,thread,
3814 pme->overlap[0].sendbuf,
3815 pme->overlap[1].sendbuf);
3816 #ifdef PRINT_PME_SENDBUF
3817 print_sendbuf(pme,pme->overlap[0].sendbuf);
3820 #ifdef PME_TIME_THREADS
3821 c3 = omp_cyc_end(c3);
3825 if (pme->nnodes > 1)
3827 /* Communicate the overlapping part of the fftgrid */
3828 sum_fftgrid_dd(pme,fftgrid);
3832 #ifdef PME_TIME_THREADS
3836 printf("idx %.2f spread %.2f red %.2f",
3837 cs1*1e-9,cs2*1e-9,cs3*1e-9);
3838 #ifdef PME_TIME_SPREAD
3839 for(thread=0; thread<nthread; thread++)
3840 printf(" %.2f",cs1a[thread]*1e-9);
3848 static void dump_grid(FILE *fp,
3849 int sx,int sy,int sz,int nx,int ny,int nz,
3850 int my,int mz,const real *g)
3860 fprintf(fp,"%2d %2d %2d %6.3f\n",
3861 sx+x,sy+y,sz+z,g[(x*my + y)*mz + z]);
3867 static void dump_local_fftgrid(gmx_pme_t pme,const real *fftgrid)
3869 ivec local_fft_ndata,local_fft_offset,local_fft_size;
3871 gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
3877 pme->pmegrid_start_ix,
3878 pme->pmegrid_start_iy,
3879 pme->pmegrid_start_iz,
3880 pme->pmegrid_nx-pme->pme_order+1,
3881 pme->pmegrid_ny-pme->pme_order+1,
3882 pme->pmegrid_nz-pme->pme_order+1,
3889 void gmx_pme_calc_energy(gmx_pme_t pme,int n,rvec *x,real *q,real *V)
3891 pme_atomcomm_t *atc;
3894 if (pme->nnodes > 1)
3896 gmx_incons("gmx_pme_calc_energy called in parallel");
3900 gmx_incons("gmx_pme_calc_energy with free energy");
3903 atc = &pme->atc_energy;
3905 if (atc->spline == NULL)
3907 snew(atc->spline,atc->nthread);
3910 atc->bSpread = TRUE;
3911 atc->pme_order = pme->pme_order;
3913 pme_realloc_atomcomm_things(atc);
3917 /* We only use the A-charges grid */
3918 grid = &pme->pmegridA;
3920 spread_on_grid(pme,atc,NULL,TRUE,FALSE,pme->fftgridA);
3922 *V = gather_energy_bsplines(pme,grid->grid.grid,atc);
3926 static void reset_pmeonly_counters(t_commrec *cr,gmx_wallcycle_t wcycle,
3927 t_nrnb *nrnb,t_inputrec *ir, gmx_large_int_t step_rel)
3929 /* Reset all the counters related to performance over the run */
3930 wallcycle_stop(wcycle,ewcRUN);
3931 wallcycle_reset_all(wcycle);
3933 ir->init_step += step_rel;
3934 ir->nsteps -= step_rel;
3935 wallcycle_start(wcycle,ewcRUN);
3939 int gmx_pmeonly(gmx_pme_t pme,
3940 t_commrec *cr, t_nrnb *nrnb,
3941 gmx_wallcycle_t wcycle,
3942 real ewaldcoeff, gmx_bool bGatherOnly,
3945 gmx_pme_pp_t pme_pp;
3948 rvec *x_pp=NULL,*f_pp=NULL;
3949 real *chargeA=NULL,*chargeB=NULL;
3951 int maxshift_x=0,maxshift_y=0;
3952 real energy,dvdlambda;
3957 gmx_large_int_t step,step_rel;
3960 pme_pp = gmx_pme_pp_init(cr);
3965 do /****** this is a quasi-loop over time steps! */
3967 /* Domain decomposition */
3968 natoms = gmx_pme_recv_q_x(pme_pp,
3969 &chargeA,&chargeB,box,&x_pp,&f_pp,
3970 &maxshift_x,&maxshift_y,
3976 /* We should stop: break out of the loop */
3980 step_rel = step - ir->init_step;
3983 wallcycle_start(wcycle,ewcRUN);
3985 wallcycle_start(wcycle,ewcPMEMESH);
3989 gmx_pme_do(pme,0,natoms,x_pp,f_pp,chargeA,chargeB,box,
3990 cr,maxshift_x,maxshift_y,nrnb,wcycle,vir,ewaldcoeff,
3991 &energy,lambda,&dvdlambda,
3992 GMX_PME_DO_ALL_F | (bEnerVir ? GMX_PME_CALC_ENER_VIR : 0));
3994 cycles = wallcycle_stop(wcycle,ewcPMEMESH);
3996 gmx_pme_send_force_vir_ener(pme_pp,
3997 f_pp,vir,energy,dvdlambda,
4002 if (step_rel == wcycle_get_reset_counters(wcycle))
4004 /* Reset all the counters related to performance over the run */
4005 reset_pmeonly_counters(cr,wcycle,nrnb,ir,step_rel);
4006 wcycle_set_reset_counters(wcycle, 0);
4009 } /***** end of quasi-loop, we stop with the break above */
4015 int gmx_pme_do(gmx_pme_t pme,
4016 int start, int homenr,
4018 real *chargeA, real *chargeB,
4019 matrix box, t_commrec *cr,
4020 int maxshift_x, int maxshift_y,
4021 t_nrnb *nrnb, gmx_wallcycle_t wcycle,
4022 matrix vir, real ewaldcoeff,
4023 real *energy, real lambda,
4024 real *dvdlambda, int flags)
4026 int q,d,i,j,ntot,npme;
4029 pme_atomcomm_t *atc=NULL;
4030 pmegrids_t *pmegrid=NULL;
4034 real *charge=NULL,*q_d;
4038 gmx_parallel_3dfft_t pfft_setup;
4040 t_complex * cfftgrid;
4042 const gmx_bool bCalcEnerVir = flags & GMX_PME_CALC_ENER_VIR;
4043 const gmx_bool bCalcF = flags & GMX_PME_CALC_F;
4045 assert(pme->nnodes > 0);
4046 assert(pme->nnodes == 1 || pme->ndecompdim > 0);
4048 if (pme->nnodes > 1) {
4051 if (atc->npd > atc->pd_nalloc) {
4052 atc->pd_nalloc = over_alloc_dd(atc->npd);
4053 srenew(atc->pd,atc->pd_nalloc);
4055 atc->maxshift = (atc->dimind==0 ? maxshift_x : maxshift_y);
4059 /* This could be necessary for TPI */
4060 pme->atc[0].n = homenr;
4063 for(q=0; q<(pme->bFEP ? 2 : 1); q++) {
4065 pmegrid = &pme->pmegridA;
4066 fftgrid = pme->fftgridA;
4067 cfftgrid = pme->cfftgridA;
4068 pfft_setup = pme->pfft_setupA;
4069 charge = chargeA+start;
4071 pmegrid = &pme->pmegridB;
4072 fftgrid = pme->fftgridB;
4073 cfftgrid = pme->cfftgridB;
4074 pfft_setup = pme->pfft_setupB;
4075 charge = chargeB+start;
4077 grid = pmegrid->grid.grid;
4078 /* Unpack structure */
4080 fprintf(debug,"PME: nnodes = %d, nodeid = %d\n",
4081 cr->nnodes,cr->nodeid);
4082 fprintf(debug,"Grid = %p\n",(void*)grid);
4084 gmx_fatal(FARGS,"No grid!");
4088 m_inv_ur0(box,pme->recipbox);
4090 if (pme->nnodes == 1) {
4092 if (DOMAINDECOMP(cr)) {
4094 pme_realloc_atomcomm_things(atc);
4100 wallcycle_start(wcycle,ewcPME_REDISTXF);
4101 for(d=pme->ndecompdim-1; d>=0; d--)
4103 if (d == pme->ndecompdim-1)
4111 n_d = pme->atc[d+1].n;
4117 if (atc->npd > atc->pd_nalloc) {
4118 atc->pd_nalloc = over_alloc_dd(atc->npd);
4119 srenew(atc->pd,atc->pd_nalloc);
4121 atc->maxshift = (atc->dimind==0 ? maxshift_x : maxshift_y);
4122 pme_calc_pidx_wrapper(n_d,pme->recipbox,x_d,atc);
4125 /* Redistribute x (only once) and qA or qB */
4126 if (DOMAINDECOMP(cr)) {
4127 dd_pmeredist_x_q(pme, n_d, q==0, x_d, q_d, atc);
4129 pmeredist_pd(pme, TRUE, n_d, q==0, x_d, q_d, atc);
4134 wallcycle_stop(wcycle,ewcPME_REDISTXF);
4138 fprintf(debug,"Node= %6d, pme local particles=%6d\n",
4141 if (flags & GMX_PME_SPREAD_Q)
4143 wallcycle_start(wcycle,ewcPME_SPREADGATHER);
4145 /* Spread the charges on a grid */
4146 spread_on_grid(pme,&pme->atc[0],pmegrid,q==0,TRUE,fftgrid);
4150 inc_nrnb(nrnb,eNR_WEIGHTS,DIM*atc->n);
4152 inc_nrnb(nrnb,eNR_SPREADQBSP,
4153 pme->pme_order*pme->pme_order*pme->pme_order*atc->n);
4155 if (pme->nthread == 1)
4157 wrap_periodic_pmegrid(pme,grid);
4159 /* sum contributions to local grid from other nodes */
4161 if (pme->nnodes > 1)
4163 gmx_sum_qgrid_dd(pme,grid,GMX_SUM_QGRID_FORWARD);
4168 copy_pmegrid_to_fftgrid(pme,grid,fftgrid);
4171 wallcycle_stop(wcycle,ewcPME_SPREADGATHER);
4174 dump_local_fftgrid(pme,fftgrid);
4179 /* Here we start a large thread parallel region */
4180 #pragma omp parallel for num_threads(pme->nthread) schedule(static)
4181 for(thread=0; thread<pme->nthread; thread++)
4183 if (flags & GMX_PME_SOLVE)
4190 wallcycle_start(wcycle,ewcPME_FFT);
4192 gmx_parallel_3dfft_execute(pfft_setup,GMX_FFT_REAL_TO_COMPLEX,
4193 fftgrid,cfftgrid,thread,wcycle);
4196 wallcycle_stop(wcycle,ewcPME_FFT);
4200 /* solve in k-space for our local cells */
4203 wallcycle_start(wcycle,ewcPME_SOLVE);
4206 solve_pme_yzx(pme,cfftgrid,ewaldcoeff,
4207 box[XX][XX]*box[YY][YY]*box[ZZ][ZZ],
4209 pme->nthread,thread);
4212 wallcycle_stop(wcycle,ewcPME_SOLVE);
4214 inc_nrnb(nrnb,eNR_SOLVEPME,loop_count);
4224 wallcycle_start(wcycle,ewcPME_FFT);
4226 gmx_parallel_3dfft_execute(pfft_setup,GMX_FFT_COMPLEX_TO_REAL,
4227 cfftgrid,fftgrid,thread,wcycle);
4230 wallcycle_stop(wcycle,ewcPME_FFT);
4234 if (pme->nodeid == 0)
4236 ntot = pme->nkx*pme->nky*pme->nkz;
4237 npme = ntot*log((real)ntot)/log(2.0);
4238 inc_nrnb(nrnb,eNR_FFT,2*npme);
4241 wallcycle_start(wcycle,ewcPME_SPREADGATHER);
4244 copy_fftgrid_to_pmegrid(pme,fftgrid,grid,pme->nthread,thread);
4247 /* End of thread parallel section.
4248 * With MPI we have to synchronize here before gmx_sum_qgrid_dd.
4253 /* distribute local grid to all nodes */
4255 if (pme->nnodes > 1) {
4256 gmx_sum_qgrid_dd(pme,grid,GMX_SUM_QGRID_BACKWARD);
4261 unwrap_periodic_pmegrid(pme,grid);
4263 /* interpolate forces for our local atoms */
4267 /* If we are running without parallelization,
4268 * atc->f is the actual force array, not a buffer,
4269 * therefore we should not clear it.
4271 bClearF = (q == 0 && PAR(cr));
4272 #pragma omp parallel for num_threads(pme->nthread) schedule(static)
4273 for(thread=0; thread<pme->nthread; thread++)
4275 gather_f_bsplines(pme,grid,bClearF,atc,
4276 &atc->spline[thread],
4277 pme->bFEP ? (q==0 ? 1.0-lambda : lambda) : 1.0);
4282 inc_nrnb(nrnb,eNR_GATHERFBSP,
4283 pme->pme_order*pme->pme_order*pme->pme_order*pme->atc[0].n);
4284 wallcycle_stop(wcycle,ewcPME_SPREADGATHER);
4289 /* This should only be called on the master thread
4290 * and after the threads have synchronized.
4292 get_pme_ener_vir(pme,pme->nthread,&energy_AB[q],vir_AB[q]);
4296 if (bCalcF && pme->nnodes > 1) {
4297 wallcycle_start(wcycle,ewcPME_REDISTXF);
4298 for(d=0; d<pme->ndecompdim; d++)
4301 if (d == pme->ndecompdim - 1)
4308 n_d = pme->atc[d+1].n;
4309 f_d = pme->atc[d+1].f;
4311 if (DOMAINDECOMP(cr)) {
4312 dd_pmeredist_f(pme,atc,n_d,f_d,
4313 d==pme->ndecompdim-1 && pme->bPPnode);
4315 pmeredist_pd(pme, FALSE, n_d, TRUE, f_d, NULL, atc);
4319 wallcycle_stop(wcycle,ewcPME_REDISTXF);
4326 *energy = energy_AB[0];
4327 m_add(vir,vir_AB[0],vir);
4329 *energy = (1.0-lambda)*energy_AB[0] + lambda*energy_AB[1];
4330 *dvdlambda += energy_AB[1] - energy_AB[0];
4331 for(i=0; i<DIM; i++)
4333 for(j=0; j<DIM; j++)
4335 vir[i][j] += (1.0-lambda)*vir_AB[0][i][j] +
4336 lambda*vir_AB[1][i][j];
4348 fprintf(debug,"PME mesh energy: %g\n",*energy);
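/* Illustrative sketch (not part of the original source): the linear
 * free-energy interpolation applied above to the A- and B-state energies;
 * the virial is mixed elementwise with the same weights.
 */
static real fep_mix_example(real e_A, real e_B, real lambda,
                            real *dvdlambda)
{
    *dvdlambda += e_B - e_A;                 /* dE/dlambda contribution */
    return (1.0 - lambda)*e_A + lambda*e_B;  /* E(lambda)               */
}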