*
* GROningen MAchine for Chemical Simulations
*
- * VERSION 4.5
* Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
* Copyright (c) 1991-2000, University of Groningen, The Netherlands.
- * Copyright (c) 2001-2008, The GROMACS development team,
+ * Copyright (c) 2001-2012, The GROMACS development team,
* check out http://www.gromacs.org for more information.
* This program is free software; you can redistribute it and/or
#include "tmpi.h"
#endif
+#ifdef GMX_OPENMP
+#define FFT5D_THREADS
+#endif
#ifdef FFT5D_THREADS
#include <omp.h>
/* requires fftw compiled with openmp */
-#define FFT5D_FFTW_THREADS
+/* #define FFT5D_FFTW_THREADS (now set by cmake) */
#endif
#include "fft5d.h"
/* NxMxK the size of the data
* comm communicator to use for fft5d
* P0 number of processors along the 1st axis (can be 0 for automatic)
- * lin is allocated by fft5d because size of array is only known after planning phase */
-fft5d_plan fft5d_plan_3d(int NG, int MG, int KG, MPI_Comm comm[2], int flags, t_complex** rlin, t_complex** rlout)
+ * lin is allocated by fft5d because the array size is only known after the planning phase
+ * rlout2 is only used as an intermediate buffer - it is returned after allocation so that it can be reused for the back transform - it should not be used by the caller
+*/
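+/* A minimal caller sketch (hedged illustration; nthreads, comm and flags are
+ * assumed to be set up elsewhere): the plan is created once outside any
+ * parallel region, while fft5d_execute is meant to be called by every thread
+ * of one parallel region, since the internal barriers assume all nthreads
+ * participate:
+ *
+ *   t_complex *lin,*lout,*lout2,*lout3;
+ *   fft5d_plan p = fft5d_plan_3d(NG,MG,KG,comm,flags,
+ *                                &lin,&lout,&lout2,&lout3,nthreads);
+ *   #pragma omp parallel num_threads(nthreads)
+ *   {
+ *       fft5d_execute(p,omp_get_thread_num(),NULL);
+ *   }
+ *   fft5d_destroy(p);
+ */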
+fft5d_plan fft5d_plan_3d(int NG, int MG, int KG, MPI_Comm comm[2], int flags, t_complex** rlin, t_complex** rlout, t_complex** rlout2, t_complex** rlout3, int nthreads)
{
- int P[2],bMaster,prank[2],i;
+ int P[2],bMaster,prank[2],i,t;
int rNG,rMG,rKG;
int *N0=0, *N1=0, *M0=0, *M1=0, *K0=0, *K1=0, *oN0=0, *oN1=0, *oM0=0, *oM1=0, *oK0=0, *oK1=0;
int N[3],M[3],K[3],pN[3],pM[3],pK[3],oM[3],oK[3],*iNin[3]={0},*oNin[3]={0},*iNout[3]={0},*oNout[3]={0};
int C[3],rC[3],nP[2];
int lsize;
- t_complex *lin=0,*lout=0;
+ t_complex *lin=0,*lout=0,*lout2=0,*lout3=0;
fft5d_plan plan;
int s;
/* comm, prank and P are in the order of the decomposition (plan->cart is in the order of transposes) */
#ifdef GMX_MPI
- if (GMX_PARALLEL_ENV_INITIALIZED && comm[0] != 0)
+ if (GMX_PARALLEL_ENV_INITIALIZED && comm[0] != MPI_COMM_NULL)
{
MPI_Comm_size(comm[0],&P[0]);
MPI_Comm_rank(comm[0],&prank[0]);
prank[0] = 0;
}
#ifdef GMX_MPI
- if (GMX_PARALLEL_ENV_INITIALIZED && comm[1] != 0)
+ if (GMX_PARALLEL_ENV_INITIALIZED && comm[1] != MPI_COMM_NULL)
{
MPI_Comm_size(comm[1],&P[1]);
MPI_Comm_rank(comm[1],&prank[1]);
if (!(flags&FFT5D_NOMALLOC)) {
lin = (t_complex*)gmx_calloc_aligned(sizeof(t_complex) * lsize);
lout = (t_complex*)gmx_calloc_aligned(sizeof(t_complex) * lsize);
+ lout2 = (t_complex*)gmx_calloc_aligned(sizeof(t_complex) * lsize);
+ lout3 = (t_complex*)gmx_calloc_aligned(sizeof(t_complex) * lsize);
} else {
lin = *rlin;
lout = *rlout;
+ lout2 = *rlout2;
+ lout3 = *rlout3;
}
plan = (fft5d_plan)calloc(1,sizeof(struct fft5d_plan_t));
-#ifdef FFT5D_THREADS
-#ifdef FFT5D_FFTW_THREADS
- FFTW(init_threads)();
- int nthreads;
- #pragma omp parallel
- {
- #pragma omp master
- {
- nthreads = omp_get_num_threads();
- }
- }
- if (prank[0] == 0 && prank[1] == 0)
+ if (debug)
{
- printf("Running fftw on %d threads\n",nthreads);
+ fprintf(debug, "Running on %d threads\n",nthreads);
}
- FFTW(plan_with_nthreads)(nthreads);
-#endif
-#endif
-#ifdef GMX_FFT_FFTW3 /*if not FFTW - then we don't do a 3d plan but insead only 1D plans */
- if ((!(flags&FFT5D_INPLACE)) && (!(P[0]>1 || P[1]>1))) { /*don't do 3d plan in parallel or if in_place requested */
+#ifdef GMX_FFT_FFTW3 /*if not FFTW - then we don't do a 3d plan but instead use only 1D plans */
+ /* It is possible to use the 3d plan with OMP threads - but in that case it is not allowed to be called from
+ * within a parallel region. For now deactivated. If it should be supported, it has to be made sure
+ * that the execution of the 3d plan is inside a master/serial block (since it contains its own parallel region)
+ * and that the 3d plan is faster than the 1d plans.
+ */
+ if ((!(flags&FFT5D_INPLACE)) && (!(P[0]>1 || P[1]>1)) && nthreads==1) { /*don't do 3d plan in parallel or if in_place requested */
int fftwflags=FFTW_DESTROY_INPUT;
FFTW(iodim) dims[3];
int inNG=NG,outMG=MG,outKG=KG;
dims[2].os = 1;
}
}
+#ifdef FFT5D_THREADS
+#ifdef FFT5D_FFTW_THREADS
+ FFTW(plan_with_nthreads)(nthreads);
+#endif
+#endif
if ((flags&FFT5D_REALCOMPLEX) && !(flags&FFT5D_BACKWARD)) {
plan->p3d = FFTW(plan_guru_dft_r2c)(/*rank*/ 3, dims,
/*howmany*/ 0, /*howmany_dims*/0 ,
(real*)lin, (FFTW(complex) *)lout,
/*flags*/ fftwflags);
}
+#ifdef FFT5D_THREADS
+#ifdef FFT5D_FFTW_THREADS
+ FFTW(plan_with_nthreads)(1);
+#endif
+#endif
FFTW_UNLOCK;
}
if (!plan->p3d) { /* for decomposition and if 3d plan did not work */
fprintf(debug,"FFT5D: Plan s %d rC %d M %d pK %d C %d lsize %d\n",
s,rC[s],M[s],pK[s],C[s],lsize);
}
- if ((flags&FFT5D_REALCOMPLEX) && ((!(flags&FFT5D_BACKWARD) && s==0) || ((flags&FFT5D_BACKWARD) && s==2))) {
- gmx_fft_init_many_1d_real( &plan->p1d[s], rC[s], pM[s]*pK[s], (flags&FFT5D_NOMEASURE)?GMX_FFT_FLAG_CONSERVATIVE:0 );
- } else {
- gmx_fft_init_many_1d ( &plan->p1d[s], C[s], pM[s]*pK[s], (flags&FFT5D_NOMEASURE)?GMX_FFT_FLAG_CONSERVATIVE:0 );
+ plan->p1d[s] = (gmx_fft_t*)malloc(sizeof(gmx_fft_t)*nthreads);
+
+ /* Make sure that the init routines are only called by one thread at a time and in order
+ (the latter is only important to avoid confusing valgrind)
+ */
+#pragma omp parallel for num_threads(nthreads) schedule(static) ordered
+ for(t=0; t<nthreads; t++)
+ {
+ int tsize = ((t+1)*pM[s]*pK[s]/nthreads)-(t*pM[s]*pK[s]/nthreads);
+
+ if ((flags&FFT5D_REALCOMPLEX) && ((!(flags&FFT5D_BACKWARD) && s==0) || ((flags&FFT5D_BACKWARD) && s==2))) {
+ gmx_fft_init_many_1d_real( &plan->p1d[s][t], rC[s], tsize, (flags&FFT5D_NOMEASURE)?GMX_FFT_FLAG_CONSERVATIVE:0 );
+ } else {
+ gmx_fft_init_many_1d ( &plan->p1d[s][t], C[s], tsize, (flags&FFT5D_NOMEASURE)?GMX_FFT_FLAG_CONSERVATIVE:0 );
+ }
}
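+
+    /* The static split above gives thread t the index range
+     * [t*W/nthreads, (t+1)*W/nthreads) of the W = pM[s]*pK[s] FFT lines;
+     * e.g. for W=10 and nthreads=3 the tsizes come out as 3, 3 and 4,
+     * covering every line exactly once (a hand-checked illustration).
+     */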
}
+
#ifdef GMX_FFT_FFTW3
}
#endif
FFTW_LOCK;
for (s=0;s<2;s++) {
if ((s==0 && !(flags&FFT5D_ORDER_YZ)) || (s==1 && (flags&FFT5D_ORDER_YZ)))
- plan->mpip[s] = FFTW(mpi_plan_many_transpose)(nP[s], nP[s], N[s]*K[s]*pM[s]*2, 1, 1, (real*)lin, (real*)lout, plan->cart[s], FFTW_PATIENT);
+ plan->mpip[s] = FFTW(mpi_plan_many_transpose)(nP[s], nP[s], N[s]*K[s]*pM[s]*2, 1, 1, (real*)lout2, (real*)lout3, plan->cart[s], FFTW_PATIENT);
else
- plan->mpip[s] = FFTW(mpi_plan_many_transpose)(nP[s], nP[s], N[s]*pK[s]*M[s]*2, 1, 1, (real*)lin, (real*)lout, plan->cart[s], FFTW_PATIENT);
+ plan->mpip[s] = FFTW(mpi_plan_many_transpose)(nP[s], nP[s], N[s]*pK[s]*M[s]*2, 1, 1, (real*)lout2, (real*)lout3, plan->cart[s], FFTW_PATIENT);
}
FFTW_UNLOCK;
#endif
plan->lin=lin;
plan->lout=lout;
+ plan->lout2=lout2;
+ plan->lout3=lout3;
plan->NG=NG;plan->MG=MG;plan->KG=KG;
plan->realcomplex=realcomplex;
*/
plan->flags=flags;
+ plan->nthreads=nthreads;
*rlin=lin;
*rlout=lout;
+ *rlout2=lout2;
+ *rlout3=lout3;
return plan;
}
NG, MG, KG are the sizes of the global data*/
static void splitaxes(t_complex* lout,const t_complex* lin,
int maxN,int maxM,int maxK, int pN, int pM, int pK,
- int P,int NG,int *N, int* oN)
+ int P,int NG,int *N, int* oN,int starty,int startz,int endy, int endz)
{
int x,y,z,i;
int in_i,out_i,in_z,out_z,in_y,out_y;
+ int s_y,e_y;
-#ifdef FFT5D_THREADS
- int zi;
-
- /* In the thread parallel case we want to loop over z and i
- * in a single for loop to allow for better load balancing.
- */
-#pragma omp parallel for private(z,in_z,out_z,i,in_i,out_i,y,in_y,out_y,x) schedule(static)
- for (zi=0; zi<pK*P; zi++)
- {
- z = zi/P;
- i = zi - z*P;
-#else
- for (z=0; z<pK; z++) /*3. z l*/
+ for (z=startz; z<endz+1; z++) /*3. z l*/
{
-#endif
- in_z = z*maxN*maxM;
- out_z = z*NG*pM;
+ if (z==startz) {
+ s_y=starty;
+ } else {
+ s_y=0;
+ }
+ if (z==endz) {
+ e_y=endy;
+ } else {
+ e_y=pM;
+ }
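+    /* (starty,startz)..(endy,endz) encode one contiguous run of flat
+     * y + z*pM indices, end exclusive: only the first z-plane starts at
+     * starty and the last one stops before endy; e.g. with pM=4 the flat
+     * range [5,9) covers (z=1, y=1..3) and (z=2, y=0).
+     */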
+ out_z = z*maxN*maxM;
+ in_z = z*NG*pM;
-#ifndef FFT5D_THREADS
for (i=0; i<P; i++) /*index cube along long axis*/
-#endif
{
- in_i = in_z + i*maxN*maxM*maxK;
- out_i = out_z + oN[i];
- for (y=0;y<pM;y++) { /*2. y k*/
- in_y = in_i + y*maxN;
- out_y = out_i + y*NG;
+ out_i = out_z + i*maxN*maxM*maxK;
+ in_i = in_z + oN[i];
+ for (y=s_y;y<e_y;y++) { /*2. y k*/
+ out_y = out_i + y*maxN;
+ in_y = in_i + y*NG;
for (x=0;x<N[i];x++) { /*1. x j*/
- lout[in_y+x] = lin[out_y+x];
+ lout[out_y+x] = lin[in_y+x]; /*in=z*NG*pM+oN[i]+y*NG+x*/
/*after the split it is important that each processor chunk i has size maxN*maxM*maxK and thus all chunks have the same size*/
/*before the split the data is contiguous - thus if different processors get different amounts, oN differs per processor*/
}
the major, middle, minor order is only correct for x,y,z (N,M,K) for the input
N,M,K local dimensions
KG global size*/
-static void joinAxesTrans13(t_complex* lin,const t_complex* lout,
+static void joinAxesTrans13(t_complex* lout,const t_complex* lin,
int maxN,int maxM,int maxK,int pN, int pM, int pK,
- int P,int KG, int* K, int* oK)
+ int P,int KG, int* K, int* oK,int starty, int startx, int endy, int endx)
{
int i,x,y,z;
- int in_i,out_i,in_x,out_x,in_z,out_z;
-
-#ifdef FFT5D_THREADS
- int xi;
+ int out_i,in_i,out_x,in_x,out_z,in_z;
+ int s_y,e_y;
- /* In the thread parallel case we want to loop over x and i
- * in a single for loop to allow for better load balancing.
- */
-#pragma omp parallel for private(x,in_x,out_x,i,in_i,out_i,z,in_z,out_z,y) schedule(static)
- for (xi=0; xi<pN*P; xi++)
- {
- x = xi/P;
- i = xi - x*P;
-#else
- for (x=0;x<pN;x++) /*1.j*/
+ for (x=startx;x<endx+1;x++) /*1.j*/
{
-#endif
- in_x = x*KG*pM;
- out_x = x;
+ if (x==startx)
+ {
+ s_y=starty;
+ }
+ else
+ {
+ s_y=0;
+ }
+ if (x==endx)
+ {
+ e_y=endy;
+ }
+ else
+ {
+ e_y=pM;
+ }
+
+ out_x = x*KG*pM;
+ in_x = x;
-#ifndef FFT5D_THREADS
for (i=0;i<P;i++) /*index cube along long axis*/
-#endif
{
- in_i = in_x + oK[i];
- out_i = out_x + i*maxM*maxN*maxK;
+ out_i = out_x + oK[i];
+ in_i = in_x + i*maxM*maxN*maxK;
for (z=0;z<K[i];z++) /*3.l*/
{
- in_z = in_i + z;
- out_z = out_i + z*maxM*maxN;
- for (y=0;y<pM;y++) { /*2.k*/
- lin[in_z+y*KG] = lout[out_z+y*maxN];
+ out_z = out_i + z;
+ in_z = in_i + z*maxM*maxN;
+ for (y=s_y;y<e_y;y++) /*2.k*/
+ {
+ lout[out_z+y*KG] = lin[in_z+y*maxN]; /*out=x*KG*pM+oK[i]+z+y*KG*/
}
}
}
the minor, middle, major order is only correct for x,y,z (N,M,K) for the input
N,M,K local size
MG, global size*/
-static void joinAxesTrans12(t_complex* lin,const t_complex* lout,int maxN,int maxM,int maxK,int pN, int pM, int pK,
- int P,int MG, int* M, int* oM) {
+static void joinAxesTrans12(t_complex* lout,const t_complex* lin,int maxN,int maxM,int maxK,int pN, int pM, int pK,
+ int P,int MG, int* M, int* oM, int startx, int startz, int endx, int endz) {
int i,z,y,x;
- int in_i,out_i,in_z,out_z,in_x,out_x;
+ int out_i,in_i,out_z,in_z,out_x,in_x;
+ int s_x,e_x;
-#ifdef FFT5D_THREADS
- int zi;
-
- /* In the thread parallel case we want to loop over z and i
- * in a single for loop to allow for better load balancing.
- */
-#pragma omp parallel for private(i,in_i,out_i,z,in_z,out_z,in_x,out_x,x,y) schedule(static)
- for (zi=0; zi<pK*P; zi++)
- {
- z = zi/P;
- i = zi - z*P;
-#else
- for (z=0; z<pK; z++)
+ for (z=startz; z<endz+1; z++)
{
-#endif
- in_z = z*MG*pN;
- out_z = z*maxM*maxN;
+ if (z==startz)
+ {
+ s_x=startx;
+ }
+ else
+ {
+ s_x=0;
+ }
+ if (z==endz)
+ {
+ e_x=endx;
+ }
+ else
+ {
+ e_x=pN;
+ }
+ out_z = z*MG*pN;
+ in_z = z*maxM*maxN;
-#ifndef FFT5D_THREADS
for (i=0; i<P; i++) /*index cube along long axis*/
-#endif
{
- in_i = in_z + oM[i];
- out_i = out_z + i*maxM*maxN*maxK;
- for (x=0;x<pN;x++) {
- in_x = in_i + x*MG;
- out_x = out_i + x;
- for (y=0;y<M[i];y++) {
- lin[in_x+y] = lout[out_x+y*maxN];
+ out_i = out_z + oM[i];
+ in_i = in_z + i*maxM*maxN*maxK;
+ for (x=s_x;x<e_x;x++)
+ {
+ out_x = out_i + x*MG;
+ in_x = in_i + x;
+ for (y=0;y<M[i];y++)
+ {
+ lout[out_x+y] = lin[in_x+y*maxN]; /*out=z*MG*pN+oM[i]+x*MG+y*/
}
}
}
}
static void print_localdata(const t_complex* lin, const char* txt, int s, fft5d_plan plan) {
- int x,y,z,l;
+ int x,y,z,l,t;
int *coor = plan->coor;
int xs[3],xl[3],xc[3],NG[3];
int ll=(plan->flags&FFT5D_REALCOMPLEX)?1:2;
}
}
-void fft5d_execute(fft5d_plan plan,fft5d_time times) {
+void fft5d_execute(fft5d_plan plan,int thread,fft5d_time times) {
t_complex *lin = plan->lin;
t_complex *lout = plan->lout;
+ t_complex *lout2 = plan->lout2;
+ t_complex *lout3 = plan->lout3;
+ t_complex *fftout,*joinin;
- gmx_fft_t *p1d=plan->p1d;
+ gmx_fft_t **p1d=plan->p1d;
#ifdef FFT5D_MPI_TRANSPOSE
FFTW(plan) *mpip=plan->mpip;
#endif
double time_fft=0,time_local=0,time_mpi[2]={0},time=0;
int *N=plan->N,*M=plan->M,*K=plan->K,*pN=plan->pN,*pM=plan->pM,*pK=plan->pK,
*C=plan->C,*P=plan->P,**iNin=plan->iNin,**oNin=plan->oNin,**iNout=plan->iNout,**oNout=plan->oNout;
- int s=0;
+ int s=0,tstart,tend,bParallelDim;
#ifdef GMX_FFT_FFTW3
- if (plan->p3d) {
- if (times!=0)
- time=MPI_Wtime();
- FFTW(execute)(plan->p3d);
- if (times!=0)
- times->fft+=MPI_Wtime()-time;
+ if (plan->p3d)
+ {
+ if (thread == 0)
+ {
+#ifdef NOGMX
+ if (times!=0)
+ {
+ time=MPI_Wtime();
+ }
+#endif
+ FFTW(execute)(plan->p3d);
+#ifdef NOGMX
+ if (times!=0)
+ {
+ times->fft+=MPI_Wtime()-time;
+ }
+#endif
+ }
return;
}
#endif
+ s=0;
+
/*lin: x,y,z*/
- if (plan->flags&FFT5D_DEBUG) print_localdata(lin, "%d %d: copy in lin\n", s, plan);
- for (s=0;s<2;s++) {
- if (times!=0)
- time=MPI_Wtime();
-
- if ((plan->flags&FFT5D_REALCOMPLEX) && !(plan->flags&FFT5D_BACKWARD) && s==0) {
- gmx_fft_many_1d_real(p1d[s],(plan->flags&FFT5D_BACKWARD)?GMX_FFT_COMPLEX_TO_REAL:GMX_FFT_REAL_TO_COMPLEX,lin,lout);
- } else {
- gmx_fft_many_1d( p1d[s],(plan->flags&FFT5D_BACKWARD)?GMX_FFT_BACKWARD:GMX_FFT_FORWARD, lin,lout);
+ if (plan->flags&FFT5D_DEBUG && thread == 0)
+ {
+ print_localdata(lin, "%d %d: copy in lin\n", s, plan);
}
- if (times!=0)
- time_fft+=MPI_Wtime()-time;
-
- if (plan->flags&FFT5D_DEBUG) print_localdata(lout, "%d %d: FFT %d\n", s, plan);
-
+
+ for (s=0;s<2;s++) { /*loop over first two FFT steps (corner rotations)*/
+
#ifdef GMX_MPI
if (GMX_PARALLEL_ENV_INITIALIZED && cart[s] !=0 && P[s]>1 )
{
- if (times!=0)
- time=MPI_Wtime();
- /*prepare for AllToAll
- 1. (most outer) axes (x) is split into P[s] parts of size N[s]
- for sending*/
- splitaxes(lin,lout,N[s],M[s],K[s], pN[s],pM[s],pK[s],P[s],C[s],iNout[s],oNout[s]);
+ bParallelDim = 1;
+ }
+ else
+#endif
+ {
+ bParallelDim = 0;
+ }
- if (times!=0)
+ /* ---------- START FFT ------------ */
+#ifdef NOGMX
+ if (times!=0 && thread == 0)
+ {
+ time=MPI_Wtime();
+ }
+#endif
+
+ if (bParallelDim) {
+ fftout = lout;
+ }
+ else
+ {
+ if (s==0)
+ {
+ fftout = lout3;
+ } else
+ {
+ fftout = lout2;
+ }
+ }
+
+ tstart = (thread*pM[s]*pK[s]/plan->nthreads)*C[s];
+ if ((plan->flags&FFT5D_REALCOMPLEX) && !(plan->flags&FFT5D_BACKWARD) && s==0)
+ {
+ gmx_fft_many_1d_real(p1d[s][thread],(plan->flags&FFT5D_BACKWARD)?GMX_FFT_COMPLEX_TO_REAL:GMX_FFT_REAL_TO_COMPLEX,lin+tstart,fftout+tstart);
+ } else
+ {
+ gmx_fft_many_1d( p1d[s][thread],(plan->flags&FFT5D_BACKWARD)?GMX_FFT_BACKWARD:GMX_FFT_FORWARD, lin+tstart,fftout+tstart);
+
+ }
+
+#ifdef NOGMX
+ if (times != NULL && thread == 0)
+ {
+ time_fft+=MPI_Wtime()-time;
+ }
+#endif
+ if (plan->flags&FFT5D_DEBUG && thread == 0)
+ {
+ print_localdata(lout, "%d %d: FFT %d\n", s, plan);
+ }
+ /* ---------- END FFT ------------ */
+
+ /* ---------- START SPLIT + TRANSPOSE------------ (if parallel in this dimension)*/
+ if (bParallelDim) {
+#ifdef NOGMX
+ if (times != NULL && thread == 0)
{
- time_local+=MPI_Wtime()-time;
-
- /*send, recv*/
time=MPI_Wtime();
}
+#endif
+ /*prepare for AllToAll
+ 1. the outermost axis (x) is split into P[s] parts of size N[s]
+ for sending*/
+ if (pM[s]>0)
+ {
+ tend = ((thread+1)*pM[s]*pK[s]/plan->nthreads);
+ tstart/=C[s];
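+ /* tstart was an element offset for the FFT above; dividing by C[s]
+ turns it back into a flat line index, which %pM[s] and /pM[s] below
+ decode into this thread's (y,z) start/end block for splitaxes. */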
+ splitaxes(lout2,lout,N[s],M[s],K[s], pN[s],pM[s],pK[s],P[s],C[s],iNout[s],oNout[s],tstart%pM[s],tstart/pM[s],tend%pM[s],tend/pM[s]);
+ }
+#pragma omp barrier /*barrier required before AllToAll (all input has to be there) - placed before the timing to make the timing more accurate*/
+#ifdef NOGMX
+ if (times != NULL && thread == 0)
+ {
+ time_local+=MPI_Wtime()-time;
+ }
+#endif
+ /* ---------- END SPLIT , START TRANSPOSE------------ */
+
+ if (thread == 0)
+ {
+#ifdef NOGMX
+ if (times!=0)
+ {
+ time=MPI_Wtime();
+ }
+#else
+ wallcycle_start(times,ewcPME_FFTCOMM);
+#endif
#ifdef FFT5D_MPI_TRANSPOSE
- FFTW(execute)(mpip[s]);
+ FFTW(execute)(mpip[s]);
#else
- if ((s==0 && !(plan->flags&FFT5D_ORDER_YZ)) || (s==1 && (plan->flags&FFT5D_ORDER_YZ)))
- MPI_Alltoall(lin,N[s]*pM[s]*K[s]*sizeof(t_complex)/sizeof(real),GMX_MPI_REAL,lout,N[s]*pM[s]*K[s]*sizeof(t_complex)/sizeof(real),GMX_MPI_REAL,cart[s]);
- else
- MPI_Alltoall(lin,N[s]*M[s]*pK[s]*sizeof(t_complex)/sizeof(real),GMX_MPI_REAL,lout,N[s]*M[s]*pK[s]*sizeof(t_complex)/sizeof(real),GMX_MPI_REAL,cart[s]);
-#endif /*FFT5D_MPI_TRANSPOSE*/
- if (times!=0)
- time_mpi[s]=MPI_Wtime()-time;
- }
+#ifdef GMX_MPI
+ if ((s==0 && !(plan->flags&FFT5D_ORDER_YZ)) || (s==1 && (plan->flags&FFT5D_ORDER_YZ)))
+ MPI_Alltoall(lout2,N[s]*pM[s]*K[s]*sizeof(t_complex)/sizeof(real),GMX_MPI_REAL,lout3,N[s]*pM[s]*K[s]*sizeof(t_complex)/sizeof(real),GMX_MPI_REAL,cart[s]);
+ else
+ MPI_Alltoall(lout2,N[s]*M[s]*pK[s]*sizeof(t_complex)/sizeof(real),GMX_MPI_REAL,lout3,N[s]*M[s]*pK[s]*sizeof(t_complex)/sizeof(real),GMX_MPI_REAL,cart[s]);
+#else
+ gmx_incons("fft5d MPI call without MPI configuration");
#endif /*GMX_MPI*/
+#endif /*FFT5D_MPI_TRANSPOSE*/
+#ifdef NOGMX
+ if (times!=0)
+ {
+ time_mpi[s]=MPI_Wtime()-time;
+ }
+#else
+ wallcycle_stop(times,ewcPME_FFTCOMM);
+#endif
+ } /*master*/
+ } /* bParallelDim */
+#pragma omp barrier /*needed for both the parallel and the non-parallel dimension (threads either have to wait for data from the AlltoAll or from the last FFT)*/
-
- if (times!=0)
+ /* ---------- END SPLIT + TRANSPOSE------------ */
+
+ /* ---------- START JOIN ------------ */
+#ifdef NOGMX
+ if (times != NULL && thread == 0)
+ {
time=MPI_Wtime();
+ }
+#endif
+
+ if (bParallelDim) {
+ joinin = lout3;
+ } else {
+ joinin = fftout;
+ }
/*bring back in matrix form
thus make the new 1st axis contiguous
- also local transpose 1 and 2/3 */
- if ((s==0 && !(plan->flags&FFT5D_ORDER_YZ)) || (s==1 && (plan->flags&FFT5D_ORDER_YZ)))
- joinAxesTrans13(lin,lout,N[s],pM[s],K[s],pN[s],pM[s],pK[s],P[s],C[s+1],iNin[s+1],oNin[s+1]);
- else
- joinAxesTrans12(lin,lout,N[s],M[s],pK[s],pN[s],pM[s],pK[s],P[s],C[s+1],iNin[s+1],oNin[s+1]);
- if (times!=0)
+ also local transpose 1 and 2/3
+ runs on the thread used for the following FFT (thus needing a barrier before but not afterwards)
+ */
+ if ((s==0 && !(plan->flags&FFT5D_ORDER_YZ)) || (s==1 && (plan->flags&FFT5D_ORDER_YZ))) {
+ if (pM[s]>0)
+ {
+ tstart = ( thread *pM[s]*pN[s]/plan->nthreads);
+ tend = ((thread+1)*pM[s]*pN[s]/plan->nthreads);
+ joinAxesTrans13(lin,joinin,N[s],pM[s],K[s],pN[s],pM[s],pK[s],P[s],C[s+1],iNin[s+1],oNin[s+1],tstart%pM[s],tstart/pM[s],tend%pM[s],tend/pM[s]);
+ }
+ }
+ else {
+ if (pN[s]>0)
+ {
+ tstart = ( thread *pK[s]*pN[s]/plan->nthreads);
+ tend = ((thread+1)*pK[s]*pN[s]/plan->nthreads);
+ joinAxesTrans12(lin,joinin,N[s],M[s],pK[s],pN[s],pM[s],pK[s],P[s],C[s+1],iNin[s+1],oNin[s+1],tstart%pN[s],tstart/pN[s],tend%pN[s],tend/pN[s]);
+ }
+ }
+
+#ifdef NOGMX
+ if (times != NULL && thread == 0)
+ {
time_local+=MPI_Wtime()-time;
-
- if (plan->flags&FFT5D_DEBUG) print_localdata(lin, "%d %d: tranposed %d\n", s+1, plan);
-
+ }
+#endif
+ if (plan->flags&FFT5D_DEBUG && thread == 0)
+ {
+ print_localdata(lin, "%d %d: transposed %d\n", s+1, plan);
+ }
+ /* ---------- END JOIN ------------ */
+
/*if (debug) print_localdata(lin, "%d %d: transposed x-z\n", N1, M0, K, ZYX, coor);*/
- }
-
- if (times!=0)
- time=MPI_Wtime();
- if (plan->flags&FFT5D_INPLACE) lout=lin;
+ } /* for(s=0;s<2;s++) */
+#ifdef NOGMX
+ if (times != NULL && thread == 0)
+ {
+ time=MPI_Wtime();
+ }
+#endif
+
+ if (plan->flags&FFT5D_INPLACE) lout=lin; /*in place currently not supported*/
+
+ /* ----------- FFT ----------- */
+ tstart = (thread*pM[s]*pK[s]/plan->nthreads)*C[s];
if ((plan->flags&FFT5D_REALCOMPLEX) && (plan->flags&FFT5D_BACKWARD)) {
- gmx_fft_many_1d_real(p1d[s],(plan->flags&FFT5D_BACKWARD)?GMX_FFT_COMPLEX_TO_REAL:GMX_FFT_REAL_TO_COMPLEX,lin,lout);
+ gmx_fft_many_1d_real(p1d[s][thread],(plan->flags&FFT5D_BACKWARD)?GMX_FFT_COMPLEX_TO_REAL:GMX_FFT_REAL_TO_COMPLEX,lin+tstart,lout+tstart);
} else {
- gmx_fft_many_1d( p1d[s],(plan->flags&FFT5D_BACKWARD)?GMX_FFT_BACKWARD:GMX_FFT_FORWARD, lin,lout);
+ gmx_fft_many_1d( p1d[s][thread],(plan->flags&FFT5D_BACKWARD)?GMX_FFT_BACKWARD:GMX_FFT_FORWARD, lin+tstart,lout+tstart);
}
+ /* ------------ END FFT ---------*/
- if (times!=0)
+#ifdef NOGMX
+ if (times != NULL && thread == 0)
+ {
time_fft+=MPI_Wtime()-time;
- if (plan->flags&FFT5D_DEBUG) print_localdata(lout, "%d %d: FFT %d\n", s, plan);
- /*if (debug) print_localdata(lout, "%d %d: FFT in y\n", N1, M, K0, YZX, coor);*/
-
- if (times!=0) {
+
times->fft+=time_fft;
times->local+=time_local;
times->mpi2+=time_mpi[1];
times->mpi1+=time_mpi[0];
}
+#endif
+
+ if (plan->flags&FFT5D_DEBUG && thread == 0)
+ {
+ print_localdata(lout, "%d %d: FFT %d\n", s, plan);
+ }
}
void fft5d_destroy(fft5d_plan plan) {
- int s;
- for (s=0;s<3;s++) {
- gmx_many_fft_destroy(plan->p1d[s]);
- if (plan->iNin[s]) {
+ int s,t;
+ for (s=0;s<3;s++)
+ {
+ if (plan->p1d[s])
+ {
+ for (t=0;t<plan->nthreads;t++)
+ {
+ gmx_many_fft_destroy(plan->p1d[s][t]);
+ }
+ free(plan->p1d[s]);
+ }
+ if (plan->iNin[s])
+ {
free(plan->iNin[s]);
plan->iNin[s]=0;
}
- if (plan->oNin[s]) {
+ if (plan->oNin[s])
+ {
free(plan->oNin[s]);
plan->oNin[s]=0;
}
- if (plan->iNout[s]) {
+ if (plan->iNout[s])
+ {
free(plan->iNout[s]);
plan->iNout[s]=0;
}
- if (plan->oNout[s]) {
+ if (plan->oNout[s])
+ {
free(plan->oNout[s]);
plan->oNout[s]=0;
}
FFTW_LOCK;
#ifdef FFT5D_MPI_TRANSPOSE
for (s=0;s<2;s++)
+ {
FFTW(destroy_plan)(plan->mpip[s]);
+ }
+ if (plan->p3d)
+ {
+ FFTW(destroy_plan)(plan->p3d);
+ }
#endif /* FFT5D_MPI_TRANSPOSE */
#endif /* GMX_FFT_FFTW3 */
#ifdef FFT5D_THREADS
#ifdef FFT5D_FFTW_THREADS
- FFTW(cleanup_threads)();
+ /*FFTW(cleanup_threads)();*/
#endif
#endif
/*same as fft5d_plan_3d but with a Cartesian communicator and automatic splitting
of processor dimensions*/
-fft5d_plan fft5d_plan_3d_cart(int NG, int MG, int KG, MPI_Comm comm, int P0, int flags, t_complex** rlin, t_complex** rlout) {
+fft5d_plan fft5d_plan_3d_cart(int NG, int MG, int KG, MPI_Comm comm, int P0, int flags, t_complex** rlin, t_complex** rlout, t_complex** rlout2, t_complex** rlout3, int nthreads) {
MPI_Comm cart[2]={0};
#ifdef GMX_MPI
int size=1,prank=0;
MPI_Comm_rank(comm,&prank);
if (P0==0) P0 = lfactor(size);
- if (size%P0!=0) {
+ if (size%P0!=0)
+ {
if (prank==0) printf("FFT5D: WARNING: Number of processors %d not evenly dividable by %d\n",size,P0);
P0 = lfactor(size);
}
MPI_Cart_sub(gcart, rdim1 , &cart[0]);
MPI_Cart_sub(gcart, rdim2 , &cart[1]);
#endif
- return fft5d_plan_3d(NG, MG, KG, cart, flags, rlin, rlout);
+ return fft5d_plan_3d(NG, MG, KG, cart, flags, rlin, rlout,rlout2,rlout3,nthreads);
}
/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
*
- *
+ *
* This source code is part of
- *
+ *
* G R O M A C S
- *
+ *
* GROningen MAchine for Chemical Simulations
- *
+ *
* VERSION 3.2.0
* Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
* Copyright (c) 1991-2000, University of Groningen, The Netherlands.
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version 2
* of the License, or (at your option) any later version.
- *
+ *
* If you want to redistribute modifications, please consider that
* scientific software is very special. Version control is crucial -
* bugs must be traceable. We will be happy to consider code for
* inclusion in the official distribution, but derived work must not
* be called official GROMACS. Details are found in the README & COPYING
* files - if they are missing, get the official version at www.gromacs.org.
- *
+ *
* To help us fund GROMACS development, we humbly ask that you cite
* the papers on the package - you can find them in the top README file.
- *
+ *
* For more info, check our website at http://www.gromacs.org
- *
+ *
* And Hey:
* GROwing Monsters And Cloning Shrimps
*/
*
* It might seem overkill, but better safe than sorry.
* /Erik 001109
- */
+ */
#ifdef HAVE_CONFIG_H
#include <config.h>
#include "tmpi.h"
#endif
+#ifdef GMX_OPENMP
+#include <omp.h>
+#endif
#include <stdio.h>
#include <string.h>
#include "gmx_wallcycle.h"
#include "gmx_parallel_3dfft.h"
#include "pdbio.h"
+#include "gmx_cyclecounter.h"
#if ( !defined(GMX_DOUBLE) && ( defined(GMX_IA32_SSE) || defined(GMX_X86_64_SSE) || defined(GMX_X86_64_SSE2) ) )
#include "gmx_sse2_single.h"
+
+#define PME_SSE
+/* Some old AMD processors could have problems with unaligned loads+stores */
+#ifndef GMX_FAHCORE
+#define PME_SSE_UNALIGNED
+#endif
#endif
#include "mpelogging.h"
/* #define TAKETIME (step > 1 && timesteps < 10) */
#define TAKETIME FALSE
+/* #define PME_TIME_THREADS */
+
#ifdef GMX_DOUBLE
#define mpi_type MPI_DOUBLE
#else
#define mpi_type MPI_FLOAT
#endif
+/* GMX_CACHE_SEP should be a multiple of 16 to preserve alignment */
+#define GMX_CACHE_SEP 64
+
+/* We only define a maximum to be able to use local arrays without allocation.
+ * An order larger than 12 should never be needed, even for test cases.
+ * If needed it can be changed here.
+ */
+#define PME_ORDER_MAX 12
+
/* Internal datastructures */
typedef struct {
int send_index0;
int noverlap_nodes;
int *send_id,*recv_id;
pme_grid_comm_t *comm_data;
+ real *sendbuf;
+ real *recvbuf;
} pme_overlap_t;
+typedef struct {
+ int *n; /* Cumulative counts of the number of particles per thread */
+ int nalloc; /* Allocation size of i */
+ int *i; /* Particle indices ordered on thread index (n) */
+} thread_plist_t;
+
+typedef struct {
+ int n;
+ int *ind;
+ splinevec theta;
+ splinevec dtheta;
+} splinedata_t;
+
typedef struct {
int dimind; /* The index of the dimension, 0=x, 1=y */
int nslab;
int pd_nalloc;
int *pd;
int *count; /* The number of atoms to send to each node */
+ int **count_thread;
int *rcount; /* The number of atoms to receive */
int n;
rvec *x;
real *q;
rvec *f;
- gmx_bool bSpread; /* These coordinates are used for spreading */
+ gmx_bool bSpread; /* These coordinates are used for spreading */
int pme_order;
- splinevec theta,dtheta;
ivec *idx;
rvec *fractx; /* Fractional coordinate relative to the
- * lower cell boundary
+ * lower cell boundary
*/
+ int nthread;
+ int *thread_idx; /* Which thread should spread which charge */
+ thread_plist_t *thread_plist;
+ splinedata_t *spline;
} pme_atomcomm_t;
+#define FLBS 3
+#define FLBSZ 4
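+/* Sizes of the blocks covered by one entry of the clear-flag grid used in
+ * clear_grid() below: FLBS grid lines in x and y, FLBSZ in z (a description
+ * inferred from that usage). */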
+
+typedef struct {
+ ivec ci; /* The spatial location of this grid */
+ ivec n; /* The size of *grid, including order-1 */
+ ivec offset; /* The grid offset from the full node grid */
+ int order; /* PME spreading order */
+ real *grid; /* The grid of this thread, size n */
+} pmegrid_t;
+
+typedef struct {
+ pmegrid_t grid; /* The full node grid (non thread-local) */
+ int nthread; /* The number of threads operating on this grid */
+ ivec nc; /* The local spatial decomposition over the threads */
+ pmegrid_t *grid_th; /* Array of grids for each thread */
+ int **g2t; /* The grid to thread index */
+ ivec nthread_comm; /* The number of threads to communicate with */
+} pmegrids_t;
+
+
+typedef struct {
+#ifdef PME_SSE
+ /* Masks for SSE aligned spreading and gathering */
+ __m128 mask_SSE0[6],mask_SSE1[6];
+#else
+ int dummy; /* C89 requires that struct has at least one member */
+#endif
+} pme_spline_work_t;
+
+typedef struct {
+ /* work data for solve_pme */
+ int nalloc;
+ real * mhx;
+ real * mhy;
+ real * mhz;
+ real * m2;
+ real * denom;
+ real * tmp1_alloc;
+ real * tmp1;
+ real * eterm;
+ real * m2inv;
+
+ real energy;
+ matrix vir;
+} pme_work_t;
+
typedef struct gmx_pme {
int ndecompdim; /* The number of decomposition dimensions */
int nodeid; /* Our nodeid in mpi->mpi_comm */
MPI_Datatype rvec_mpi; /* the pme vector's MPI type */
#endif
- gmx_bool bPPnode; /* Node also does particle-particle forces */
- gmx_bool bFEP; /* Compute Free energy contribution */
+ int nthread; /* The number of threads doing PME */
+
+ gmx_bool bPPnode; /* Node also does particle-particle forces */
+ gmx_bool bFEP; /* Compute Free energy contribution */
int nkx,nky,nkz; /* Grid dimensions */
int pme_order;
- real epsilon_r;
-
- real * pmegridA; /* Grids on which we do spreading/interpolation, includes overlap */
- real * pmegridB;
+ real epsilon_r;
+
+ pmegrids_t pmegridA; /* Grids on which we do spreading/interpolation, includes overlap */
+ pmegrids_t pmegridB;
+ /* The PME charge spreading grid sizes/strides, includes pme_order-1 */
int pmegrid_nx,pmegrid_ny,pmegrid_nz;
- int pmegrid_start_ix,pmegrid_start_iy,pmegrid_start_iz;
-
- real * pmegrid_sendbuf;
- real * pmegrid_recvbuf;
-
+ /* pmegrid_nz might be larger than strictly necessary to ensure
+ * memory alignment, pmegrid_nz_base gives the real base size.
+ */
+ int pmegrid_nz_base;
+ /* The local PME grid starting indices */
+ int pmegrid_start_ix,pmegrid_start_iy,pmegrid_start_iz;
+
+ /* Work data for spreading and gathering */
+ pme_spline_work_t spline_work;
+
real *fftgridA; /* Grids for FFT. With 1D FFT decomposition this can be a pointer */
real *fftgridB; /* inside the interpolation grid, but separate for 2D PME decomp. */
int fftgrid_nx,fftgrid_ny,fftgrid_nz;
-
+
t_complex *cfftgridA; /* Grids for complex FFT data */
- t_complex *cfftgridB;
+ t_complex *cfftgridB;
int cfftgrid_nx,cfftgrid_ny,cfftgrid_nz;
-
+
gmx_parallel_3dfft_t pfft_setupA;
gmx_parallel_3dfft_t pfft_setupB;
-
+
int *nnx,*nny,*nnz;
real *fshx,*fshy,*fshz;
-
+
pme_atomcomm_t atc[2]; /* Indexed on decomposition index */
matrix recipbox;
splinevec bsp_mod;
-
- pme_overlap_t overlap[2]; /* Indexed on dimension, 0=x, 1=y */
+ pme_overlap_t overlap[2]; /* Indexed on dimension, 0=x, 1=y */
pme_atomcomm_t atc_energy; /* Only for gmx_pme_calc_energy */
-
+
rvec *bufv; /* Communication buffer */
real *bufr; /* Communication buffer */
int buf_nalloc; /* The communication buffer size */
- /* work data for solve_pme */
- int work_nalloc;
- real * work_mhx;
- real * work_mhy;
- real * work_mhz;
- real * work_m2;
- real * work_denom;
- real * work_tmp1_alloc;
- real * work_tmp1;
- real * work_m2inv;
+ /* thread local work data for solve_pme */
+ pme_work_t *work;
/* Work data for PME_redist */
- gmx_bool redist_init;
- int * scounts;
+ gmx_bool redist_init;
+ int * scounts;
int * rcounts;
int * sdispls;
int * rdispls;
int * sidx;
- int * idxa;
+ int * idxa;
real * redist_buf;
int redist_buf_nalloc;
-
+
/* Work data for sum_qgrid */
real * sum_qgrid_tmp;
real * sum_qgrid_dd_tmp;
} t_gmx_pme;
-static void calc_interpolation_idx(gmx_pme_t pme,pme_atomcomm_t *atc)
+static void calc_interpolation_idx(gmx_pme_t pme,pme_atomcomm_t *atc,
+ int start,int end,int thread)
{
int i;
int *idxptr,tix,tiy,tiz;
real rxx,ryx,ryy,rzx,rzy,rzz;
int nx,ny,nz;
int start_ix,start_iy,start_iz;
-
+ int *g2tx,*g2ty,*g2tz;
+ gmx_bool bThreads;
+ int *thread_idx=NULL;
+ thread_plist_t *tpl=NULL;
+ int *tpl_n=NULL;
+ int thread_i;
+
nx = pme->nkx;
ny = pme->nky;
nz = pme->nkz;
-
+
start_ix = pme->pmegrid_start_ix;
start_iy = pme->pmegrid_start_iy;
start_iz = pme->pmegrid_start_iz;
-
+
rxx = pme->recipbox[XX][XX];
ryx = pme->recipbox[YY][XX];
ryy = pme->recipbox[YY][YY];
rzx = pme->recipbox[ZZ][XX];
rzy = pme->recipbox[ZZ][YY];
rzz = pme->recipbox[ZZ][ZZ];
-
- for(i=0; (i<atc->n); i++) {
+
+ g2tx = pme->pmegridA.g2t[XX];
+ g2ty = pme->pmegridA.g2t[YY];
+ g2tz = pme->pmegridA.g2t[ZZ];
+
+ bThreads = (atc->nthread > 1);
+ if (bThreads)
+ {
+ thread_idx = atc->thread_idx;
+
+ tpl = &atc->thread_plist[thread];
+ tpl_n = tpl->n;
+ for(i=0; i<atc->nthread; i++)
+ {
+ tpl_n[i] = 0;
+ }
+ }
+
+ for(i=start; i<end; i++) {
xptr = atc->x[i];
idxptr = atc->idx[i];
fptr = atc->fractx[i];
-
+
/* Fractional coordinates along box vectors, add 2.0 to make 100% sure we are positive for triclinic boxes */
tx = nx * ( xptr[XX] * rxx + xptr[YY] * ryx + xptr[ZZ] * rzx + 2.0 );
ty = ny * ( xptr[YY] * ryy + xptr[ZZ] * rzy + 2.0 );
tz = nz * ( xptr[ZZ] * rzz + 2.0 );
-
+
tix = (int)(tx);
tiy = (int)(ty);
tiz = (int)(tz);
-
+
/* Because decomposition only occurs in x and y,
* we never have a fraction correction in z.
*/
fptr[XX] = tx - tix + pme->fshx[tix];
fptr[YY] = ty - tiy + pme->fshy[tiy];
- fptr[ZZ] = tz - tiz;
+ fptr[ZZ] = tz - tiz;
idxptr[XX] = pme->nnx[tix];
idxptr[YY] = pme->nny[tiy];
range_check(idxptr[YY],0,pme->pmegrid_ny);
range_check(idxptr[ZZ],0,pme->pmegrid_nz);
#endif
- }
+
+ if (bThreads)
+ {
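+            /* The g2t tables map a grid index to a per-dimension thread
+             * index, pre-multiplied by the thread-grid strides (see
+             * pmegrids_init), so the three lookups sum directly to the flat
+             * index of the thread owning this grid region. */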
+ thread_i = g2tx[idxptr[XX]] + g2ty[idxptr[YY]] + g2tz[idxptr[ZZ]];
+ thread_idx[i] = thread_i;
+ tpl_n[thread_i]++;
+ }
+ }
+
+ if (bThreads)
+ {
+ /* Make a list of particle indices sorted on thread */
+
+ /* Get the cumulative count */
+ for(i=1; i<atc->nthread; i++)
+ {
+ tpl_n[i] += tpl_n[i-1];
+ }
+ /* The current implementation distributes particles equally
+ * over the threads, so we could actually allocate for that
+ * in pme_realloc_atomcomm_things.
+ */
+ if (tpl_n[atc->nthread-1] > tpl->nalloc)
+ {
+ tpl->nalloc = over_alloc_large(tpl_n[atc->nthread-1]);
+ srenew(tpl->i,tpl->nalloc);
+ }
+ /* Set tpl_n to the cumulative start */
+ for(i=atc->nthread-1; i>=1; i--)
+ {
+ tpl_n[i] = tpl_n[i-1];
+ }
+ tpl_n[0] = 0;
+
+ /* Fill our thread local array with indices sorted on thread */
+ for(i=start; i<end; i++)
+ {
+ tpl->i[tpl_n[atc->thread_idx[i]]++] = i;
+ }
+ /* Now tpl_n contains the cumulative count again */
+ }
}
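
/* A condensed sketch of the two-pass counting sort built above and completed
 * per thread in make_thread_local_ind() below (a hypothetical stand-alone
 * form; the real code splits the passes across threads):
 *
 *   for(i=0; i<n; i++)     count[key[i]]++;            histogram
 *   for(t=1; t<nt; t++)    count[t] += count[t-1];     cumulative counts
 *   for(t=nt-1; t>=1; t--) count[t]  = count[t-1];     shift to start offsets
 *   count[0] = 0;
 *   for(i=0; i<n; i++)     out[count[key[i]]++] = i;   scatter indices
 */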
-static void pme_calc_pidx(int natoms, matrix recipbox, rvec x[],
- pme_atomcomm_t *atc)
+static void make_thread_local_ind(pme_atomcomm_t *atc,
+ int thread,splinedata_t *spline)
+{
+ int n,t,i,start,end;
+ thread_plist_t *tpl;
+
+ /* Combine the indices made by each thread into one index */
+
+ n = 0;
+ start = 0;
+ for(t=0; t<atc->nthread; t++)
+ {
+ tpl = &atc->thread_plist[t];
+ /* Copy our part (start - end) from the list of thread t */
+ if (thread > 0)
+ {
+ start = tpl->n[thread-1];
+ }
+ end = tpl->n[thread];
+ for(i=start; i<end; i++)
+ {
+ spline->ind[n++] = tpl->i[i];
+ }
+ }
+
+ spline->n = n;
+}
+
+
+static void pme_calc_pidx(int start, int end,
+ matrix recipbox, rvec x[],
+ pme_atomcomm_t *atc, int *count)
{
int nslab,i;
int si;
real *xptr,s;
real rxx,ryx,rzx,ryy,rzy;
- int *pd,*count;
+ int *pd;
/* Calculate PME task index (pidx) for each grid index.
* Here we always assign equally sized slabs to each node
* for load balancing reasons (the PME grid spacing is not used).
*/
-
+
nslab = atc->nslab;
pd = atc->pd;
- count = atc->count;
/* Reset the count */
for(i=0; i<nslab; i++)
{
count[i] = 0;
}
-
+
if (atc->dimind == 0)
{
rxx = recipbox[XX][XX];
ryx = recipbox[YY][XX];
rzx = recipbox[ZZ][XX];
/* Calculate the node index in x-dimension */
- for(i=0; (i<natoms); i++)
+ for(i=start; i<end; i++)
{
xptr = x[i];
/* Fractional coordinates along box vectors */
ryy = recipbox[YY][YY];
rzy = recipbox[ZZ][YY];
/* Calculate the node index in y-dimension */
- for(i=0; (i<natoms); i++)
+ for(i=start; i<end; i++)
{
xptr = x[i];
/* Fractional coordinates along box vectors */
}
}
+static void pme_calc_pidx_wrapper(int natoms, matrix recipbox, rvec x[],
+ pme_atomcomm_t *atc)
+{
+ int nthread,thread,slab;
+
+ nthread = atc->nthread;
+
+#pragma omp parallel for num_threads(nthread) schedule(static)
+ for(thread=0; thread<nthread; thread++)
+ {
+ pme_calc_pidx(natoms* thread /nthread,
+ natoms*(thread+1)/nthread,
+ recipbox,x,atc,atc->count_thread[thread]);
+ }
+ /* Non-parallel reduction, since nslab is small */
+
+ for(thread=1; thread<nthread; thread++)
+ {
+ for(slab=0; slab<atc->nslab; slab++)
+ {
+ atc->count_thread[0][slab] += atc->count_thread[thread][slab];
+ }
+ }
+}
+
+static void pme_realloc_splinedata(splinedata_t *spline, pme_atomcomm_t *atc)
+{
+ int i,d;
+
+ srenew(spline->ind,atc->nalloc);
+ /* Initialize the index to identity so it works without threads */
+ for(i=0; i<atc->nalloc; i++)
+ {
+ spline->ind[i] = i;
+ }
+
+ for(d=0;d<DIM;d++)
+ {
+ srenew(spline->theta[d] ,atc->pme_order*atc->nalloc);
+ srenew(spline->dtheta[d],atc->pme_order*atc->nalloc);
+ }
+}
+
static void pme_realloc_atomcomm_things(pme_atomcomm_t *atc)
{
- int nalloc_old,i;
-
+ int nalloc_old,i,j,nalloc_tpl;
+
/* We have to avoid a NULL pointer for atc->x to avoid
* possible fatal errors in MPI routines.
*/
{
nalloc_old = atc->nalloc;
atc->nalloc = over_alloc_dd(max(atc->n,1));
-
+
if (atc->nslab > 1) {
srenew(atc->x,atc->nalloc);
srenew(atc->q,atc->nalloc);
}
}
if (atc->bSpread) {
- for(i=0;i<DIM;i++) {
- srenew(atc->theta[i] ,atc->pme_order*atc->nalloc);
- srenew(atc->dtheta[i],atc->pme_order*atc->nalloc);
- }
- srenew(atc->fractx,atc->nalloc);
+ srenew(atc->fractx,atc->nalloc);
srenew(atc->idx ,atc->nalloc);
+
+ if (atc->nthread > 1)
+ {
+ srenew(atc->thread_idx,atc->nalloc);
+ }
+
+ for(i=0; i<atc->nthread; i++)
+ {
+ pme_realloc_splinedata(&atc->spline[i],atc);
+ }
}
}
}
{
int *idxa;
int i, ii;
-
+
if(FALSE == pme->redist_init) {
snew(pme->scounts,atc->nslab);
snew(pme->rcounts,atc->nslab);
pme->redist_buf_nalloc = over_alloc_dd(n);
srenew(pme->redist_buf,pme->redist_buf_nalloc*DIM);
}
-
+
pme->idxa = atc->pd;
#ifdef GMX_MPI
if (forw && bXF) {
- /* forward, redistribution from pp to pme */
-
+ /* forward, redistribution from pp to pme */
+
/* Calculate send counts and exchange them with other nodes */
for(i=0; (i<atc->nslab); i++) pme->scounts[i]=0;
for(i=0; (i<n); i++) pme->scounts[pme->idxa[i]]++;
MPI_Alltoall( pme->scounts, 1, MPI_INT, pme->rcounts, 1, MPI_INT, atc->mpi_comm);
-
- /* Calculate send and receive displacements and index into send
+
+ /* Calculate send and receive displacements and index into send
buffer */
pme->sdispls[0]=0;
pme->rdispls[0]=0;
}
/* Total # of particles to be received */
atc->n = pme->rdispls[atc->nslab-1] + pme->rcounts[atc->nslab-1];
-
+
pme_realloc_atomcomm_things(atc);
-
+
/* Copy particle coordinates into send buffer and exchange*/
for(i=0; (i<n); i++) {
ii=DIM*pme->sidx[pme->idxa[i]];
pme->redist_buf[ii+YY]=x_f[i][YY];
pme->redist_buf[ii+ZZ]=x_f[i][ZZ];
}
- MPI_Alltoallv(pme->redist_buf, pme->scounts, pme->sdispls,
- pme->rvec_mpi, atc->x, pme->rcounts, pme->rdispls,
+ MPI_Alltoallv(pme->redist_buf, pme->scounts, pme->sdispls,
+ pme->rvec_mpi, atc->x, pme->rcounts, pme->rdispls,
pme->rvec_mpi, atc->mpi_comm);
}
if (forw) {
atc->q, pme->rcounts, pme->rdispls, mpi_type,
atc->mpi_comm);
}
- else { /* backward, redistribution from pme to pp */
+ else { /* backward, redistribution from pme to pp */
MPI_Alltoallv(atc->f, pme->rcounts, pme->rdispls, pme->rvec_mpi,
- pme->redist_buf, pme->scounts, pme->sdispls,
+ pme->redist_buf, pme->scounts, pme->sdispls,
pme->rvec_mpi, atc->mpi_comm);
-
+
/* Copy data from receive buffer */
for(i=0; i<atc->nslab; i++)
pme->sidx[i] = pme->sdispls[i];
pme->sidx[pme->idxa[i]]++;
}
}
-#endif
+#endif
}
static void pme_dd_sendrecv(pme_atomcomm_t *atc,
#ifdef GMX_MPI
int dest,src;
MPI_Status stat;
-
+
if (bBackward == FALSE) {
dest = atc->node_dest[shift];
src = atc->node_src[shift];
dest = atc->node_src[shift];
src = atc->node_dest[shift];
}
-
+
if (nbyte_s > 0 && nbyte_r > 0) {
MPI_Sendrecv(buf_s,nbyte_s,MPI_BYTE,
dest,shift,
#endif
}
-static void dd_pmeredist_x_q(gmx_pme_t pme,
+static void dd_pmeredist_x_q(gmx_pme_t pme,
int n, gmx_bool bX, rvec *x, real *charge,
pme_atomcomm_t *atc)
{
int *commnode,*buf_index;
int nnodes_comm,i,nsend,local_pos,buf_pos,node,scount,rcount;
-
+
commnode = atc->node_dest;
buf_index = atc->buf_index;
-
+
nnodes_comm = min(2*atc->maxshift,atc->nslab-1);
-
+
nsend = 0;
for(i=0; i<nnodes_comm; i++) {
buf_index[commnode[i]] = nsend;
"This usually means that your system is not well equilibrated.",
n - (atc->count[atc->nodeid] + nsend),
pme->nodeid,'x'+atc->dimind);
-
+
if (nsend > pme->buf_nalloc) {
pme->buf_nalloc = over_alloc_dd(nsend);
srenew(pme->bufv,pme->buf_nalloc);
srenew(pme->bufr,pme->buf_nalloc);
}
-
+
atc->n = atc->count[atc->nodeid];
for(i=0; i<nnodes_comm; i++) {
scount = atc->count[commnode[i]];
&atc->rcount[i],sizeof(int));
atc->n += atc->rcount[i];
}
-
+
pme_realloc_atomcomm_things(atc);
}
-
+
local_pos = 0;
for(i=0; i<n; i++) {
node = atc->pd[i];
buf_index[node]++;
}
}
-
+
buf_pos = 0;
for(i=0; i<nnodes_comm; i++) {
scount = atc->count[commnode[i]];
}
#ifdef GMX_MPI
-static void
+static void
gmx_sum_qgrid_dd(gmx_pme_t pme, real *grid, int direction)
{
pme_overlap_t *overlap;
int ipulse,send_id,recv_id,datasize;
real *p;
real *sendptr,*recvptr;
-
+
/* Start with minor-rank communication. This is a bit of a pain since it is not contiguous */
overlap = &pme->overlap[1];
-
+
for(ipulse=0;ipulse<overlap->noverlap_nodes;ipulse++)
{
/* Since we have already (un)wrapped the overlap in the z-dimension,
send_id = overlap->recv_id[ipulse];
recv_id = overlap->send_id[ipulse];
send_index0 = overlap->comm_data[ipulse].recv_index0;
- send_nindex = overlap->comm_data[ipulse].recv_nindex;
+ send_nindex = overlap->comm_data[ipulse].recv_nindex;
recv_index0 = overlap->comm_data[ipulse].send_index0;
recv_nindex = overlap->comm_data[ipulse].send_nindex;
}
for(k=0;k<pme->nkz;k++)
{
iz = k;
- pme->pmegrid_sendbuf[icnt++] = grid[ix*(pme->pmegrid_ny*pme->pmegrid_nz)+iy*(pme->pmegrid_nz)+iz];
+ overlap->sendbuf[icnt++] = grid[ix*(pme->pmegrid_ny*pme->pmegrid_nz)+iy*(pme->pmegrid_nz)+iz];
}
}
}
-
+
datasize = pme->pmegrid_nx * pme->nkz;
-
- MPI_Sendrecv(pme->pmegrid_sendbuf,send_nindex*datasize,GMX_MPI_REAL,
+
+ MPI_Sendrecv(overlap->sendbuf,send_nindex*datasize,GMX_MPI_REAL,
send_id,ipulse,
- pme->pmegrid_recvbuf,recv_nindex*datasize,GMX_MPI_REAL,
+ overlap->recvbuf,recv_nindex*datasize,GMX_MPI_REAL,
recv_id,ipulse,
overlap->mpi_comm,&stat);
-
+
/* Get data from contiguous recv buffer */
if (debug)
{
iz = k;
if(direction==GMX_SUM_QGRID_FORWARD)
{
- grid[ix*(pme->pmegrid_ny*pme->pmegrid_nz)+iy*(pme->pmegrid_nz)+iz] += pme->pmegrid_recvbuf[icnt++];
+ grid[ix*(pme->pmegrid_ny*pme->pmegrid_nz)+iy*(pme->pmegrid_nz)+iz] += overlap->recvbuf[icnt++];
}
else
{
- grid[ix*(pme->pmegrid_ny*pme->pmegrid_nz)+iy*(pme->pmegrid_nz)+iz] = pme->pmegrid_recvbuf[icnt++];
+ grid[ix*(pme->pmegrid_ny*pme->pmegrid_nz)+iy*(pme->pmegrid_nz)+iz] = overlap->recvbuf[icnt++];
}
}
}
}
}
-
+
/* Major dimension is easier, no copying required,
* but we might have to sum to separate array.
* Since we don't copy, we have to communicate up to pmegrid_nz,
* not nkz as for the minor direction.
*/
overlap = &pme->overlap[0];
-
+
for(ipulse=0;ipulse<overlap->noverlap_nodes;ipulse++)
{
if(direction==GMX_SUM_QGRID_FORWARD)
send_nindex = overlap->comm_data[ipulse].send_nindex;
recv_index0 = overlap->comm_data[ipulse].recv_index0;
recv_nindex = overlap->comm_data[ipulse].recv_nindex;
- recvptr = pme->pmegrid_recvbuf;
+ recvptr = overlap->recvbuf;
}
else
{
send_id = overlap->recv_id[ipulse];
recv_id = overlap->send_id[ipulse];
send_index0 = overlap->comm_data[ipulse].recv_index0;
- send_nindex = overlap->comm_data[ipulse].recv_nindex;
+ send_nindex = overlap->comm_data[ipulse].recv_nindex;
recv_index0 = overlap->comm_data[ipulse].send_index0;
recv_nindex = overlap->comm_data[ipulse].send_nindex;
recvptr = grid + (recv_index0-pme->pmegrid_start_ix)*(pme->pmegrid_ny*pme->pmegrid_nz);
}
-
+
sendptr = grid + (send_index0-pme->pmegrid_start_ix)*(pme->pmegrid_ny*pme->pmegrid_nz);
datasize = pme->pmegrid_ny * pme->pmegrid_nz;
recvptr,recv_nindex*datasize,GMX_MPI_REAL,
recv_id,ipulse,
overlap->mpi_comm,&stat);
-
+
/* ADD data from contiguous recv buffer */
if(direction==GMX_SUM_QGRID_FORWARD)
- {
+ {
p = grid + (recv_index0-pme->pmegrid_start_ix)*(pme->pmegrid_ny*pme->pmegrid_nz);
for(i=0;i<recv_nindex*datasize;i++)
{
- p[i] += pme->pmegrid_recvbuf[i];
+ p[i] += overlap->recvbuf[i];
}
}
}
local_fft_ndata,
local_fft_offset,
local_fft_size);
-
+
local_pme_size[0] = pme->pmegrid_nx;
local_pme_size[1] = pme->pmegrid_ny;
local_pme_size[2] = pme->pmegrid_nz;
-
- /* The fftgrid is always 'justified' to the lower-left corner of the PME grid,
+
+ /* The fftgrid is always 'justified' to the lower-left corner of the PME grid,
the offset is identical, and the PME grid always has more data (due to overlap)
*/
{
fp2 = ffopen(fn,"w");
sprintf(format,"%s%s\n",pdbformat,"%6.2f%6.2f");
#endif
+
for(ix=0;ix<local_fft_ndata[XX];ix++)
{
for(iy=0;iy<local_fft_ndata[YY];iy++)
}
+static gmx_cycles_t omp_cyc_start()
+{
+ return gmx_cycles_read();
+}
+
+static gmx_cycles_t omp_cyc_end(gmx_cycles_t c)
+{
+ return gmx_cycles_read() - c;
+}
+
+
static int
-copy_fftgrid_to_pmegrid(gmx_pme_t pme, real *fftgrid, real *pmegrid)
+copy_fftgrid_to_pmegrid(gmx_pme_t pme, const real *fftgrid, real *pmegrid,
+ int nthread,int thread)
{
ivec local_fft_ndata,local_fft_offset,local_fft_size;
ivec local_pme_size;
- int i,ix,iy,iz;
+ int ixy0,ixy1,ixy,ix,iy,iz;
int pmeidx,fftidx;
-
+#ifdef PME_TIME_THREADS
+ gmx_cycles_t c1;
+ static double cs1=0;
+ static int cnt=0;
+#endif
+
+#ifdef PME_TIME_THREADS
+ c1 = omp_cyc_start();
+#endif
/* Dimensions should be identical for A/B grid, so we just use A here */
gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
local_fft_ndata,
local_pme_size[0] = pme->pmegrid_nx;
local_pme_size[1] = pme->pmegrid_ny;
local_pme_size[2] = pme->pmegrid_nz;
-
- /* The fftgrid is always 'justified' to the lower-left corner of the PME grid,
+
+ /* The fftgrid is always 'justified' to the lower-left corner of the PME grid,
the offset is identical, and the PME grid always has more data (due to overlap)
*/
- for(ix=0;ix<local_fft_ndata[XX];ix++)
+ ixy0 = ((thread )*local_fft_ndata[XX]*local_fft_ndata[YY])/nthread;
+ ixy1 = ((thread+1)*local_fft_ndata[XX]*local_fft_ndata[YY])/nthread;
+
+ for(ixy=ixy0;ixy<ixy1;ixy++)
{
- for(iy=0;iy<local_fft_ndata[YY];iy++)
+ ix = ixy/local_fft_ndata[YY];
+ iy = ixy - ix*local_fft_ndata[YY];
+
+ pmeidx = (ix*local_pme_size[YY] + iy)*local_pme_size[ZZ];
+ fftidx = (ix*local_fft_size[YY] + iy)*local_fft_size[ZZ];
+ for(iz=0;iz<local_fft_ndata[ZZ];iz++)
{
- for(iz=0;iz<local_fft_ndata[ZZ];iz++)
- {
- pmeidx = ix*(local_pme_size[YY]*local_pme_size[ZZ])+iy*(local_pme_size[ZZ])+iz;
- fftidx = ix*(local_fft_size[YY]*local_fft_size[ZZ])+iy*(local_fft_size[ZZ])+iz;
- pmegrid[pmeidx] = fftgrid[fftidx];
- }
+ pmegrid[pmeidx+iz] = fftgrid[fftidx+iz];
}
- }
+ }
+
+#ifdef PME_TIME_THREADS
+ c1 = omp_cyc_end(c1);
+ cs1 += (double)c1;
+ cnt++;
+ if (cnt % 20 == 0)
+ {
+ printf("copy %.2f\n",cs1*1e-9);
+ }
+#endif
+
return 0;
}
overlap = pme->pme_order - 1;
/* Add periodic overlap in z */
- for(ix=0; ix<pnx; ix++)
+ for(ix=0; ix<pme->pmegrid_nx; ix++)
{
- for(iy=0; iy<pny; iy++)
+ for(iy=0; iy<pme->pmegrid_ny; iy++)
{
for(iz=0; iz<overlap; iz++)
{
if (pme->nnodes_minor == 1)
{
- for(ix=0; ix<pnx; ix++)
+ for(ix=0; ix<pme->pmegrid_nx; ix++)
{
for(iy=0; iy<overlap; iy++)
{
}
}
}
-
+
if (pme->nnodes_major == 1)
{
- ny_x = (pme->nnodes_minor == 1 ? ny : pny);
+ ny_x = (pme->nnodes_minor == 1 ? ny : pme->pmegrid_ny);
for(ix=0; ix<overlap; ix++)
{
static void
unwrap_periodic_pmegrid(gmx_pme_t pme, real *pmegrid)
{
- int nx,ny,nz,pnx,pny,pnz,ny_x,overlap,ix,iy,iz;
+ int nx,ny,nz,pnx,pny,pnz,ny_x,overlap,ix;
nx = pme->nkx;
ny = pme->nky;
if (pme->nnodes_major == 1)
{
- ny_x = (pme->nnodes_minor == 1 ? ny : pny);
+ ny_x = (pme->nnodes_minor == 1 ? ny : pme->pmegrid_ny);
for(ix=0; ix<overlap; ix++)
{
+ int iy,iz;
+
for(iy=0; iy<ny_x; iy++)
{
for(iz=0; iz<nz; iz++)
if (pme->nnodes_minor == 1)
{
- for(ix=0; ix<pnx; ix++)
+#pragma omp parallel for num_threads(pme->nthread) schedule(static)
+ for(ix=0; ix<pme->pmegrid_nx; ix++)
{
+ int iy,iz;
+
for(iy=0; iy<overlap; iy++)
{
for(iz=0; iz<nz; iz++)
}
/* Copy periodic overlap in z */
- for(ix=0; ix<pnx; ix++)
+#pragma omp parallel for num_threads(pme->nthread) schedule(static)
+ for(ix=0; ix<pme->pmegrid_nx; ix++)
{
- for(iy=0; iy<pny; iy++)
+ int iy,iz;
+
+ for(iy=0; iy<pme->pmegrid_ny; iy++)
{
for(iz=0; iz<overlap; iz++)
{
}
}
+static void clear_grid(int nx,int ny,int nz,real *grid,
+ ivec fs,int *flag,
+ int fx,int fy,int fz,
+ int order)
+{
+ int nc,ncz;
+ int fsx,fsy,fsz,gx,gy,gz,g0x,g0y,x,y,z;
+ int flind;
+
+ nc = 2 + (order - 2)/FLBS;
+ ncz = 2 + (order - 2)/FLBSZ;
+
+ for(fsx=fx; fsx<fx+nc; fsx++)
+ {
+ for(fsy=fy; fsy<fy+nc; fsy++)
+ {
+ for(fsz=fz; fsz<fz+ncz; fsz++)
+ {
+ flind = (fsx*fs[YY] + fsy)*fs[ZZ] + fsz;
+ if (flag[flind] == 0)
+ {
+ gx = fsx*FLBS;
+ gy = fsy*FLBS;
+ gz = fsz*FLBSZ;
+ g0x = (gx*ny + gy)*nz + gz;
+ for(x=0; x<FLBS; x++)
+ {
+ g0y = g0x;
+ for(y=0; y<FLBS; y++)
+ {
+ for(z=0; z<FLBSZ; z++)
+ {
+ grid[g0y+z] = 0;
+ }
+ g0y += nz;
+ }
+ g0x += ny*nz;
+ }
+
+ flag[flind] = 1;
+ }
+ }
+ }
+ }
+}
/* This has to be a macro to enable full compiler optimization with xlC (and probably others too) */
#define DO_BSPLINE(order) \
}
-static void spread_q_bsplines(gmx_pme_t pme, pme_atomcomm_t *atc,
- real *grid)
+static void spread_q_bsplines_thread(pmegrid_t *pmegrid,
+ pme_atomcomm_t *atc, splinedata_t *spline,
+ pme_spline_work_t *work)
{
/* spread charges from home atoms to local grid */
+ real *grid;
pme_overlap_t *ol;
int b,i,nn,n,ithx,ithy,ithz,i0,j0,k0;
int * idxptr;
real valx,valxy,qn;
real *thx,*thy,*thz;
int localsize, bndsize;
-
int pnx,pny,pnz,ndatatot;
-
- pnx = pme->pmegrid_nx;
- pny = pme->pmegrid_ny;
- pnz = pme->pmegrid_nz;
+ int offx,offy,offz;
+
+ pnx = pmegrid->n[XX];
+ pny = pmegrid->n[YY];
+ pnz = pmegrid->n[ZZ];
+
+ offx = pmegrid->offset[XX];
+ offy = pmegrid->offset[YY];
+ offz = pmegrid->offset[ZZ];
+
ndatatot = pnx*pny*pnz;
-
+ grid = pmegrid->grid;
for(i=0;i<ndatatot;i++)
{
grid[i] = 0;
}
- order = pme->pme_order;
+ order = pmegrid->order;
- for(nn=0; (nn<atc->n);nn++)
+ for(nn=0; nn<spline->n; nn++)
{
- n = nn;
- qn = atc->q[n];
+ n = spline->ind[nn];
+ qn = atc->q[n];
- if (qn != 0)
+ if (qn != 0)
{
idxptr = atc->idx[n];
- norder = n*order;
-
- i0 = idxptr[XX];
- j0 = idxptr[YY];
- k0 = idxptr[ZZ];
- thx = atc->theta[XX] + norder;
- thy = atc->theta[YY] + norder;
- thz = atc->theta[ZZ] + norder;
-
+ norder = nn*order;
+
+ i0 = idxptr[XX] - offx;
+ j0 = idxptr[YY] - offy;
+ k0 = idxptr[ZZ] - offz;
+
+ thx = spline->theta[XX] + norder;
+ thy = spline->theta[YY] + norder;
+ thz = spline->theta[ZZ] + norder;
+
switch (order) {
- case 4: DO_BSPLINE(4); break;
- case 5: DO_BSPLINE(5); break;
- default: DO_BSPLINE(order); break;
+ case 4:
+#ifdef PME_SSE
+#ifdef PME_SSE_UNALIGNED
+#define PME_SPREAD_SSE_ORDER4
+#else
+#define PME_SPREAD_SSE_ALIGNED
+#define PME_ORDER 4
+#endif
+#include "pme_sse_single.h"
+#else
+ DO_BSPLINE(4);
+#endif
+ break;
+ case 5:
+#ifdef PME_SSE
+#define PME_SPREAD_SSE_ALIGNED
+#define PME_ORDER 5
+#include "pme_sse_single.h"
+#else
+ DO_BSPLINE(5);
+#endif
+ break;
+ default:
+ DO_BSPLINE(order);
+ break;
}
}
- }
+ }
}
-#if ( !defined(GMX_DOUBLE) && ( defined(GMX_IA32_SSE) || defined(GMX_X86_64_SSE) || defined(GMX_X86_64_SSE2) ) )
- /* Calculate exponentials through SSE in float precision */
-#define CALC_EXPONENTIALS(start,end,r_aligned) \
- { \
- __m128 tmp_sse; \
- for(kx=0; kx<end; kx+=4) \
- { \
- tmp_sse = _mm_load_ps(r_aligned+kx); \
- tmp_sse = gmx_mm_exp_ps(tmp_sse); \
- _mm_store_ps(r_aligned+kx,tmp_sse); \
- } \
+static void alloc_real_aligned(int n,real **ptr_raw,real **ptr)
+{
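+    /* Over-allocate by 8 reals, then advance the returned pointer to the
+     * next 16-byte boundary so that SSE loads/stores on *ptr are aligned. */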
+ snew(*ptr_raw,n+8);
+
+ *ptr = (real *) (((size_t) *ptr_raw + 16) & (~((size_t) 15)));
+
+}
+static void set_grid_alignment(int *pmegrid_nz,int pme_order)
+{
+#ifdef PME_SSE
+ if (pme_order == 5
+#ifndef PME_SSE_UNALIGNED
+ || pme_order == 4
+#endif
+ )
+ {
+ /* Round nz up to a multiple of 4 to ensure alignment */
+ *pmegrid_nz = ((*pmegrid_nz + 3) & ~3);
}
-#else
-#define CALC_EXPONENTIALS(start,end,r) \
- for(kx=start; kx<end; kx++) \
- { \
- r[kx] = exp(r[kx]); \
+#endif
+}
+
+static void set_gridsize_alignment(int *gridsize,int pme_order)
+{
+#ifdef PME_SSE
+#ifndef PME_SSE_UNALIGNED
+ if (pme_order == 4)
+ {
+ /* Add extra elements to ensure that aligned operations do not go
+ * beyond the allocated grid size.
+ * Note that for pme_order=5, the pme grid z-size alignment
+ * ensures that we will not go beyond the grid size.
+ */
+ *gridsize += 4;
}
#endif
+#endif
+}
+static void pmegrid_init(pmegrid_t *grid,
+ int cx, int cy, int cz,
+ int x0, int y0, int z0,
+ int x1, int y1, int z1,
+ gmx_bool set_alignment,
+ int pme_order,
+ real *ptr)
+{
+ int nz,gridsize;
+
+ grid->ci[XX] = cx;
+ grid->ci[YY] = cy;
+ grid->ci[ZZ] = cz;
+ grid->offset[XX] = x0;
+ grid->offset[YY] = y0;
+ grid->offset[ZZ] = z0;
+ grid->n[XX] = x1 - x0 + pme_order - 1;
+ grid->n[YY] = y1 - y0 + pme_order - 1;
+ grid->n[ZZ] = z1 - z0 + pme_order - 1;
+
+ nz = grid->n[ZZ];
+ set_grid_alignment(&nz,pme_order);
+ if (set_alignment)
+ {
+ grid->n[ZZ] = nz;
+ }
+ else if (nz != grid->n[ZZ])
+ {
+ gmx_incons("pmegrid_init call with an unaligned z size");
+ }
-static int solve_pme_yzx(gmx_pme_t pme,t_complex *grid,
- real ewaldcoeff,real vol,
- gmx_bool bEnerVir,real *mesh_energy,matrix vir)
+ grid->order = pme_order;
+ if (ptr == NULL)
+ {
+ gridsize = grid->n[XX]*grid->n[YY]*grid->n[ZZ];
+ set_gridsize_alignment(&gridsize,pme_order);
+ snew_aligned(grid->grid,gridsize,16);
+ }
+ else
+ {
+ grid->grid = ptr;
+ }
+}
+
+static int div_round_up(int enumerator,int denominator)
+{
+ return (enumerator + denominator - 1)/denominator;
+}
+
+static void make_subgrid_division(const ivec n,int ovl,int nthread,
+ ivec nsub)
+{
+ int gsize_opt,gsize;
+ int nsx,nsy,nsz;
+ char *env;
+
+ gsize_opt = -1;
+ for(nsx=1; nsx<=nthread; nsx++)
+ {
+ if (nthread % nsx == 0)
+ {
+ for(nsy=1; nsy<=nthread; nsy++)
+ {
+ if (nsx*nsy <= nthread && nthread % (nsx*nsy) == 0)
+ {
+ nsz = nthread/(nsx*nsy);
+
+ /* Determine the number of grid points per thread */
+ gsize =
+ (div_round_up(n[XX],nsx) + ovl)*
+ (div_round_up(n[YY],nsy) + ovl)*
+ (div_round_up(n[ZZ],nsz) + ovl);
+
+ /* Minimize the number of grid points per thread
+ * and, secondarily, the number of cuts in minor dimensions.
+ */
+ if (gsize_opt == -1 ||
+ gsize < gsize_opt ||
+ (gsize == gsize_opt &&
+ (nsz < nsub[ZZ] || (nsz == nsub[ZZ] && nsy < nsub[YY]))))
+ {
+ nsub[XX] = nsx;
+ nsub[YY] = nsy;
+ nsub[ZZ] = nsz;
+ gsize_opt = gsize;
+ }
+ }
+ }
+ }
+ }
+
+ env = getenv("GMX_PME_THREAD_DIVISION");
+ if (env != NULL)
+ {
+ sscanf(env,"%d %d %d",&nsub[XX],&nsub[YY],&nsub[ZZ]);
+ }
+
+ if (nsub[XX]*nsub[YY]*nsub[ZZ] != nthread)
+ {
+ gmx_fatal(FARGS,"PME grid thread division (%d x %d x %d) does not match the total number of threads (%d)",nsub[XX],nsub[YY],nsub[ZZ],nthread);
+ }
+}
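+
+/* A hand-checked illustration of the search above: for n=(20,20,20), ovl=3
+ * and nthread=4 the per-thread grid volumes include
+ *   1x1x4 -> 23*23*8  = 4232
+ *   1x2x2 -> 23*13*13 = 3887
+ *   2x2x1 -> 13*13*23 = 3887
+ * so the minimum of 3887 is shared; the tie-break preferring fewer cuts in
+ * z, then in y, selects nsub = 2x2x1.
+ */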
+
+static void pmegrids_init(pmegrids_t *grids,
+ int nx,int ny,int nz,int nz_base,
+ int pme_order,
+ int nthread,
+ int overlap_x,
+ int overlap_y)
+{
+ ivec n,n_base,g0,g1;
+ int t,x,y,z,d,i,tfac;
+ int max_comm_lines;
+
+ n[XX] = nx - (pme_order - 1);
+ n[YY] = ny - (pme_order - 1);
+ n[ZZ] = nz - (pme_order - 1);
+
+ copy_ivec(n,n_base);
+ n_base[ZZ] = nz_base;
+
+ pmegrid_init(&grids->grid,0,0,0,0,0,0,n[XX],n[YY],n[ZZ],FALSE,pme_order,
+ NULL);
+
+ grids->nthread = nthread;
+
+ make_subgrid_division(n_base,pme_order-1,grids->nthread,grids->nc);
+
+ if (grids->nthread > 1)
+ {
+ ivec nst;
+ int gridsize;
+ real *grid_all;
+
+ for(d=0; d<DIM; d++)
+ {
+ nst[d] = div_round_up(n[d],grids->nc[d]) + pme_order - 1;
+ }
+ set_grid_alignment(&nst[ZZ],pme_order);
+
+ if (debug)
+ {
+ fprintf(debug,"pmegrid thread local division: %d x %d x %d\n",
+ grids->nc[XX],grids->nc[YY],grids->nc[ZZ]);
+ fprintf(debug,"pmegrid %d %d %d max thread pmegrid %d %d %d\n",
+ nx,ny,nz,
+ nst[XX],nst[YY],nst[ZZ]);
+ }
+
+ snew(grids->grid_th,grids->nthread);
+ t = 0;
+ gridsize = nst[XX]*nst[YY]*nst[ZZ];
+ set_gridsize_alignment(&gridsize,pme_order);
+ snew_aligned(grid_all,
+ grids->nthread*gridsize+(grids->nthread+1)*GMX_CACHE_SEP,
+ 16);
+
+ for(x=0; x<grids->nc[XX]; x++)
+ {
+ for(y=0; y<grids->nc[YY]; y++)
+ {
+ for(z=0; z<grids->nc[ZZ]; z++)
+ {
+ pmegrid_init(&grids->grid_th[t],
+ x,y,z,
+ (n[XX]*(x ))/grids->nc[XX],
+ (n[YY]*(y ))/grids->nc[YY],
+ (n[ZZ]*(z ))/grids->nc[ZZ],
+ (n[XX]*(x+1))/grids->nc[XX],
+ (n[YY]*(y+1))/grids->nc[YY],
+ (n[ZZ]*(z+1))/grids->nc[ZZ],
+ TRUE,
+ pme_order,
+ grid_all+GMX_CACHE_SEP+t*(gridsize+GMX_CACHE_SEP));
+ t++;
+ }
+ }
+ }
+ }
+
+ snew(grids->g2t,DIM);
+ tfac = 1;
+ for(d=DIM-1; d>=0; d--)
+ {
+ snew(grids->g2t[d],n[d]);
+ t = 0;
+ for(i=0; i<n[d]; i++)
+ {
+ /* The second check should match the parameters
+ * of the pmegrid_init call above.
+ */
+ while (t + 1 < grids->nc[d] && i >= (n[d]*(t+1))/grids->nc[d])
+ {
+ t++;
+ }
+ grids->g2t[d][i] = t*tfac;
+ }
+
+ tfac *= grids->nc[d];
+
+ switch (d)
+ {
+ case XX: max_comm_lines = overlap_x; break;
+ case YY: max_comm_lines = overlap_y; break;
+ case ZZ: max_comm_lines = pme_order - 1; break;
+ }
+ grids->nthread_comm[d] = 0;
+ while ((n[d]*grids->nthread_comm[d])/grids->nc[d] < max_comm_lines)
+ {
+ grids->nthread_comm[d]++;
+ }
+ if (debug != NULL)
+ {
+ fprintf(debug,"pmegrid thread grid communication range in %c: %d\n",
+ 'x'+d,grids->nthread_comm[d]);
+ }
+ /* It should be possible to make grids->nthread_comm[d]==grids->nc[d]
+ * work, but this is not a problematic restriction.
+ */
+ if (grids->nc[d] > 1 && grids->nthread_comm[d] > grids->nc[d])
+ {
+ gmx_fatal(FARGS,"Too many threads for PME (%d) compared to the number of grid lines, reduce the number of threads doing PME",grids->nthread);
+ }
+ }
+}
+
+
+static void pmegrids_destroy(pmegrids_t *grids)
+{
+ int t;
+
+ if (grids->grid.grid != NULL)
+ {
+ sfree(grids->grid.grid);
+
+ if (grids->nthread > 0)
+ {
+ for(t=0; t<grids->nthread; t++)
+ {
+ sfree(grids->grid_th[t].grid);
+ }
+ sfree(grids->grid_th);
+ }
+ }
+}
+
+
+static void realloc_work(pme_work_t *work,int nkx)
+{
+ if (nkx > work->nalloc)
+ {
+ work->nalloc = nkx;
+ srenew(work->mhx ,work->nalloc);
+ srenew(work->mhy ,work->nalloc);
+ srenew(work->mhz ,work->nalloc);
+ srenew(work->m2 ,work->nalloc);
+ /* Allocate an aligned pointer for SSE operations, including 3 extra
+ * elements at the end since SSE operates on 4 elements at a time.
+ */
+ sfree_aligned(work->denom);
+ sfree_aligned(work->tmp1);
+ sfree_aligned(work->eterm);
+ snew_aligned(work->denom,work->nalloc+3,16);
+ snew_aligned(work->tmp1 ,work->nalloc+3,16);
+ snew_aligned(work->eterm,work->nalloc+3,16);
+ srenew(work->m2inv,work->nalloc);
+ }
+}
+
+
+static void free_work(pme_work_t *work)
+{
+ sfree(work->mhx);
+ sfree(work->mhy);
+ sfree(work->mhz);
+ sfree(work->m2);
+ sfree_aligned(work->denom);
+ sfree_aligned(work->tmp1);
+ sfree_aligned(work->eterm);
+ sfree(work->m2inv);
+}
+
+
+#ifdef PME_SSE
+ /* Calculate exponentials through SSE in float precision */
+inline static void calc_exponentials(int start, int end, real f, real *d_aligned, real *r_aligned, real *e_aligned)
+{
+ {
+ const __m128 two = _mm_set_ps(2.0f,2.0f,2.0f,2.0f);
+ __m128 f_sse;
+ __m128 lu;
+ __m128 tmp_d1,d_inv,tmp_r,tmp_e;
+ int kx;
+ f_sse = _mm_load1_ps(&f);
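+        /* The loop starts at kx=0 rather than start to keep all SSE
+         * loads 16-byte aligned; the arrays carry 3 elements of padding
+         * (see realloc_work), so the extra elements computed below start
+         * and beyond end are harmless.
+         */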
+ for(kx=0; kx<end; kx+=4)
+ {
+ tmp_d1 = _mm_load_ps(d_aligned+kx);
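+            /* 1/denom from the rcp estimate plus one Newton-Raphson step */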
+ lu = _mm_rcp_ps(tmp_d1);
+ d_inv = _mm_mul_ps(lu,_mm_sub_ps(two,_mm_mul_ps(lu,tmp_d1)));
+ tmp_r = _mm_load_ps(r_aligned+kx);
+ tmp_r = gmx_mm_exp_ps(tmp_r);
+ tmp_e = _mm_mul_ps(f_sse,d_inv);
+ tmp_e = _mm_mul_ps(tmp_e,tmp_r);
+ _mm_store_ps(e_aligned+kx,tmp_e);
+ }
+ }
+}
+#else
+inline static void calc_exponentials(int start, int end, real f, real *d, real *r, real *e)
+{
+ int kx;
+ for(kx=start; kx<end; kx++)
+ {
+ d[kx] = 1.0/d[kx];
+ }
+ for(kx=start; kx<end; kx++)
+ {
+ r[kx] = exp(r[kx]);
+ }
+ for(kx=start; kx<end; kx++)
+ {
+ e[kx] = f*r[kx]*d[kx];
+ }
+}
+#endif
+
+
+static int solve_pme_yzx(gmx_pme_t pme,t_complex *grid,
+ real ewaldcoeff,real vol,
+ gmx_bool bEnerVir,
+ int nthread,int thread)
{
/* do recip sum over local cells in grid */
/* y major, z middle, x minor or continuous */
t_complex *p0;
int kx,ky,kz,maxkx,maxky,maxkz;
- int nx,ny,nz,iy,iz,kxstart,kxend;
+ int nx,ny,nz,iyz0,iyz1,iyz,iy,iz,kxstart,kxend;
real mx,my,mz;
real factor=M_PI*M_PI/(ewaldcoeff*ewaldcoeff);
real ets2,struct2,vfactor,ets2vf;
- real eterm,d1,d2,energy=0;
+ real d1,d2,energy=0;
real by,bz;
real virxx=0,virxy=0,virxz=0,viryy=0,viryz=0,virzz=0;
real rxx,ryx,ryy,rzx,rzy,rzz;
- real *mhx,*mhy,*mhz,*m2,*denom,*tmp1,*m2inv;
+ pme_work_t *work;
+ real *mhx,*mhy,*mhz,*m2,*denom,*tmp1,*eterm,*m2inv;
real mhxk,mhyk,mhzk,m2k;
real corner_fac;
ivec complex_order;
ivec local_ndata,local_offset,local_size;
-
+ real elfac;
+
+ elfac = ONE_4PI_EPS0/pme->epsilon_r;
+
nx = pme->nkx;
ny = pme->nky;
nz = pme->nkz;
-
+
/* Dimensions should be identical for A/B grid, so we just use A here */
gmx_parallel_3dfft_complex_limits(pme->pfft_setupA,
complex_order,
local_ndata,
local_offset,
local_size);
-
+
rxx = pme->recipbox[XX][XX];
ryx = pme->recipbox[YY][XX];
ryy = pme->recipbox[YY][YY];
rzx = pme->recipbox[ZZ][XX];
rzy = pme->recipbox[ZZ][YY];
rzz = pme->recipbox[ZZ][ZZ];
-
+
maxkx = (nx+1)/2;
maxky = (ny+1)/2;
maxkz = nz/2+1;
-
- mhx = pme->work_mhx;
- mhy = pme->work_mhy;
- mhz = pme->work_mhz;
- m2 = pme->work_m2;
- denom = pme->work_denom;
- tmp1 = pme->work_tmp1;
- m2inv = pme->work_m2inv;
- for(iy=0;iy<local_ndata[YY];iy++)
+ work = &pme->work[thread];
+ mhx = work->mhx;
+ mhy = work->mhy;
+ mhz = work->mhz;
+ m2 = work->m2;
+ denom = work->denom;
+ tmp1 = work->tmp1;
+ eterm = work->eterm;
+ m2inv = work->m2inv;
+
+ iyz0 = local_ndata[YY]*local_ndata[ZZ]* thread /nthread;
+ iyz1 = local_ndata[YY]*local_ndata[ZZ]*(thread+1)/nthread;
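+    /* Distribute the (y,z) grid lines of the local complex grid
+     * evenly over the threads; x is the minor, contiguous index.
+     */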
+
+ for(iyz=iyz0; iyz<iyz1; iyz++)
{
+ iy = iyz/local_ndata[ZZ];
+ iz = iyz - iy*local_ndata[ZZ];
+
ky = iy + local_offset[YY];
-
- if (ky < maxky)
+
+ if (ky < maxky)
{
my = ky;
}
- else
+ else
{
my = (ky - ny);
}
-
+
by = M_PI*vol*pme->bsp_mod[YY][ky];
- for(iz=0;iz<local_ndata[ZZ];iz++)
- {
- kz = iz + local_offset[ZZ];
-
- mz = kz;
-
- bz = pme->bsp_mod[ZZ][kz];
-
- /* 0.5 correction for corner points */
- corner_fac = 1;
- if (kz == 0)
- corner_fac = 0.5;
- if (kz == (nz+1)/2)
- corner_fac = 0.5;
-
- p0 = grid + iy*local_size[ZZ]*local_size[XX] + iz*local_size[XX];
-
- /* We should skip the k-space point (0,0,0) */
- if (local_offset[XX] > 0 ||
- local_offset[YY] > 0 || ky > 0 ||
- kz > 0)
+ kz = iz + local_offset[ZZ];
+
+ mz = kz;
+
+ bz = pme->bsp_mod[ZZ][kz];
+
+ /* 0.5 correction for corner points */
+ corner_fac = 1;
+ if (kz == 0 || kz == (nz+1)/2)
+ {
+ corner_fac = 0.5;
+ }
+
+ p0 = grid + iy*local_size[ZZ]*local_size[XX] + iz*local_size[XX];
+
+ /* We should skip the k-space point (0,0,0) */
+ if (local_offset[XX] > 0 || ky > 0 || kz > 0)
+ {
+ kxstart = local_offset[XX];
+ }
+ else
+ {
+ kxstart = local_offset[XX] + 1;
+ p0++;
+ }
+ kxend = local_offset[XX] + local_ndata[XX];
+
+ if (bEnerVir)
+ {
+ /* More expensive inner loop, especially because of the storage
+             * of the mh elements in arrays.
+ * Because x is the minor grid index, all mh elements
+ * depend on kx for triclinic unit cells.
+ */
+
+ /* Two explicit loops to avoid a conditional inside the loop */
+ for(kx=kxstart; kx<maxkx; kx++)
{
- kxstart = local_offset[XX];
+ mx = kx;
+
+ mhxk = mx * rxx;
+ mhyk = mx * ryx + my * ryy;
+ mhzk = mx * rzx + my * rzy + mz * rzz;
+ m2k = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
+ mhx[kx] = mhxk;
+ mhy[kx] = mhyk;
+ mhz[kx] = mhzk;
+ m2[kx] = m2k;
+ denom[kx] = m2k*bz*by*pme->bsp_mod[XX][kx];
+ tmp1[kx] = -factor*m2k;
}
- else
+
+ for(kx=maxkx; kx<kxend; kx++)
{
- kxstart = local_offset[XX] + 1;
- p0++;
+ mx = (kx - nx);
+
+ mhxk = mx * rxx;
+ mhyk = mx * ryx + my * ryy;
+ mhzk = mx * rzx + my * rzy + mz * rzz;
+ m2k = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
+ mhx[kx] = mhxk;
+ mhy[kx] = mhyk;
+ mhz[kx] = mhzk;
+ m2[kx] = m2k;
+ denom[kx] = m2k*bz*by*pme->bsp_mod[XX][kx];
+ tmp1[kx] = -factor*m2k;
}
- kxend = local_offset[XX] + local_ndata[XX];
-
- if (bEnerVir)
+
+ for(kx=kxstart; kx<kxend; kx++)
{
- /* More expensive inner loop, especially because of the storage
- * of the mh elements in array's.
- * Because x is the minor grid index, all mh elements
- * depend on kx for triclinic unit cells.
- */
+ m2inv[kx] = 1.0/m2[kx];
+ }
- /* Two explicit loops to avoid a conditional inside the loop */
- for(kx=kxstart; kx<maxkx; kx++)
- {
- mx = kx;
-
- mhxk = mx * rxx;
- mhyk = mx * ryx + my * ryy;
- mhzk = mx * rzx + my * rzy + mz * rzz;
- m2k = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
- mhx[kx] = mhxk;
- mhy[kx] = mhyk;
- mhz[kx] = mhzk;
- m2[kx] = m2k;
- denom[kx] = m2k*bz*by*pme->bsp_mod[XX][kx];
- tmp1[kx] = -factor*m2k;
- }
-
- for(kx=maxkx; kx<kxend; kx++)
- {
- mx = (kx - nx);
-
- mhxk = mx * rxx;
- mhyk = mx * ryx + my * ryy;
- mhzk = mx * rzx + my * rzy + mz * rzz;
- m2k = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
- mhx[kx] = mhxk;
- mhy[kx] = mhyk;
- mhz[kx] = mhzk;
- m2[kx] = m2k;
- denom[kx] = m2k*bz*by*pme->bsp_mod[XX][kx];
- tmp1[kx] = -factor*m2k;
- }
-
- for(kx=kxstart; kx<kxend; kx++)
- {
- m2inv[kx] = 1.0/m2[kx];
- }
- for(kx=kxstart; kx<kxend; kx++)
- {
- denom[kx] = 1.0/denom[kx];
- }
+ calc_exponentials(kxstart,kxend,elfac,denom,tmp1,eterm);
- CALC_EXPONENTIALS(kxstart,kxend,tmp1);
+ for(kx=kxstart; kx<kxend; kx++,p0++)
+ {
+ d1 = p0->re;
+ d2 = p0->im;
- for(kx=kxstart; kx<kxend; kx++,p0++)
- {
- d1 = p0->re;
- d2 = p0->im;
-
- eterm = ONE_4PI_EPS0/pme->epsilon_r*tmp1[kx]*denom[kx];
-
- p0->re = d1*eterm;
- p0->im = d2*eterm;
-
- struct2 = 2.0*(d1*d1+d2*d2);
-
- tmp1[kx] = eterm*struct2;
- }
-
- for(kx=kxstart; kx<kxend; kx++)
- {
- ets2 = corner_fac*tmp1[kx];
- vfactor = (factor*m2[kx] + 1.0)*2.0*m2inv[kx];
- energy += ets2;
-
- ets2vf = ets2*vfactor;
- virxx += ets2vf*mhx[kx]*mhx[kx] - ets2;
- virxy += ets2vf*mhx[kx]*mhy[kx];
- virxz += ets2vf*mhx[kx]*mhz[kx];
- viryy += ets2vf*mhy[kx]*mhy[kx] - ets2;
- viryz += ets2vf*mhy[kx]*mhz[kx];
- virzz += ets2vf*mhz[kx]*mhz[kx] - ets2;
- }
+ p0->re = d1*eterm[kx];
+ p0->im = d2*eterm[kx];
+
+ struct2 = 2.0*(d1*d1+d2*d2);
+
+ tmp1[kx] = eterm[kx]*struct2;
}
- else
+
+ for(kx=kxstart; kx<kxend; kx++)
{
- /* We don't need to calculate the energy and the virial.
- * In this case the triclinic overhead is small.
- */
+ ets2 = corner_fac*tmp1[kx];
+ vfactor = (factor*m2[kx] + 1.0)*2.0*m2inv[kx];
+ energy += ets2;
+
+ ets2vf = ets2*vfactor;
+ virxx += ets2vf*mhx[kx]*mhx[kx] - ets2;
+ virxy += ets2vf*mhx[kx]*mhy[kx];
+ virxz += ets2vf*mhx[kx]*mhz[kx];
+ viryy += ets2vf*mhy[kx]*mhy[kx] - ets2;
+ viryz += ets2vf*mhy[kx]*mhz[kx];
+ virzz += ets2vf*mhz[kx]*mhz[kx] - ets2;
+ }
+ }
+ else
+ {
+ /* We don't need to calculate the energy and the virial.
+ * In this case the triclinic overhead is small.
+ */
- /* Two explicit loops to avoid a conditional inside the loop */
+ /* Two explicit loops to avoid a conditional inside the loop */
- for(kx=kxstart; kx<maxkx; kx++)
- {
- mx = kx;
-
- mhxk = mx * rxx;
- mhyk = mx * ryx + my * ryy;
- mhzk = mx * rzx + my * rzy + mz * rzz;
- m2k = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
- denom[kx] = m2k*bz*by*pme->bsp_mod[XX][kx];
- tmp1[kx] = -factor*m2k;
- }
-
- for(kx=maxkx; kx<kxend; kx++)
- {
- mx = (kx - nx);
-
- mhxk = mx * rxx;
- mhyk = mx * ryx + my * ryy;
- mhzk = mx * rzx + my * rzy + mz * rzz;
- m2k = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
- denom[kx] = m2k*bz*by*pme->bsp_mod[XX][kx];
- tmp1[kx] = -factor*m2k;
- }
-
- for(kx=kxstart; kx<kxend; kx++)
- {
- denom[kx] = 1.0/denom[kx];
- }
+ for(kx=kxstart; kx<maxkx; kx++)
+ {
+ mx = kx;
+
+ mhxk = mx * rxx;
+ mhyk = mx * ryx + my * ryy;
+ mhzk = mx * rzx + my * rzy + mz * rzz;
+ m2k = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
+ denom[kx] = m2k*bz*by*pme->bsp_mod[XX][kx];
+ tmp1[kx] = -factor*m2k;
+ }
- CALC_EXPONENTIALS(kxstart,kxend,tmp1);
-
- for(kx=kxstart; kx<kxend; kx++,p0++)
- {
- d1 = p0->re;
- d2 = p0->im;
-
- eterm = ONE_4PI_EPS0/pme->epsilon_r*tmp1[kx]*denom[kx];
-
- p0->re = d1*eterm;
- p0->im = d2*eterm;
- }
+ for(kx=maxkx; kx<kxend; kx++)
+ {
+ mx = (kx - nx);
+
+ mhxk = mx * rxx;
+ mhyk = mx * ryx + my * ryy;
+ mhzk = mx * rzx + my * rzy + mz * rzz;
+ m2k = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
+ denom[kx] = m2k*bz*by*pme->bsp_mod[XX][kx];
+ tmp1[kx] = -factor*m2k;
+ }
+
+ calc_exponentials(kxstart,kxend,elfac,denom,tmp1,eterm);
+
+ for(kx=kxstart; kx<kxend; kx++,p0++)
+ {
+ d1 = p0->re;
+ d2 = p0->im;
+
+ p0->re = d1*eterm[kx];
+ p0->im = d2*eterm[kx];
}
}
}
-
+
if (bEnerVir)
{
/* Update virial with local values.
* experiencing problems on semiisotropic membranes.
* IS THAT COMMENT STILL VALID??? (DvdS, 2001/02/07).
*/
- vir[XX][XX] = 0.25*virxx;
- vir[YY][YY] = 0.25*viryy;
- vir[ZZ][ZZ] = 0.25*virzz;
- vir[XX][YY] = vir[YY][XX] = 0.25*virxy;
- vir[XX][ZZ] = vir[ZZ][XX] = 0.25*virxz;
- vir[YY][ZZ] = vir[ZZ][YY] = 0.25*viryz;
-
+ work->vir[XX][XX] = 0.25*virxx;
+ work->vir[YY][YY] = 0.25*viryy;
+ work->vir[ZZ][ZZ] = 0.25*virzz;
+ work->vir[XX][YY] = work->vir[YY][XX] = 0.25*virxy;
+ work->vir[XX][ZZ] = work->vir[ZZ][XX] = 0.25*virxz;
+ work->vir[YY][ZZ] = work->vir[ZZ][YY] = 0.25*viryz;
+
/* This energy should be corrected for a charged system */
- *mesh_energy = 0.5*energy;
+ work->energy = 0.5*energy;
}
/* Return the loop count */
- return local_ndata[YY]*local_ndata[ZZ]*local_ndata[XX];
+ return local_ndata[YY]*local_ndata[XX];
+}
+
+static void get_pme_ener_vir(const gmx_pme_t pme,int nthread,
+ real *mesh_energy,matrix vir)
+{
+ /* This function sums output over threads
+ * and should therefore only be called after thread synchronization.
+ */
+ int thread;
+
+ *mesh_energy = pme->work[0].energy;
+ copy_mat(pme->work[0].vir,vir);
+
+ for(thread=1; thread<nthread; thread++)
+ {
+ *mesh_energy += pme->work[thread].energy;
+ m_add(vir,pme->work[thread].vir,vir);
+ }
+}
+
+static int solve_pme_yzx_wrapper(gmx_pme_t pme,t_complex *grid,
+ real ewaldcoeff,real vol,
+ gmx_bool bEnerVir,real *mesh_energy,matrix vir)
+{
+ int nthread,thread;
+ int nelements=0;
+
+ nthread = pme->nthread;
+
+#pragma omp parallel for num_threads(nthread) schedule(static)
+ for(thread=0; thread<nthread; thread++)
+ {
+ int n;
+
+ n = solve_pme_yzx(pme,grid,ewaldcoeff,vol,bEnerVir,nthread,thread);
+ if (thread == 0)
+ {
+ nelements = n;
+ }
+ }
+
+ if (bEnerVir)
+ {
+ get_pme_ener_vir(pme,nthread,mesh_energy,vir);
+ }
+
+ return nelements;
}
#define DO_FSPLINE(order) \
for(ithx=0; (ithx<order); ithx++) \
-{ \
+{ \
index_x = (i0+ithx)*pny*pnz; \
tx = thx[ithx]; \
dx = dthx[ithx]; \
\
for(ithy=0; (ithy<order); ithy++) \
- { \
+ { \
index_xy = index_x+(j0+ithy)*pnz; \
ty = thy[ithy]; \
dy = dthy[ithy]; \
fxy1 = fz1 = 0; \
\
for(ithz=0; (ithz<order); ithz++) \
- { \
+ { \
gval = grid[index_xy+(k0+ithz)]; \
fxy1 += thz[ithz]*gval; \
fz1 += dthz[ithz]*gval; \
void gather_f_bsplines(gmx_pme_t pme,real *grid,
- gmx_bool bClearF,pme_atomcomm_t *atc,real scale)
+ gmx_bool bClearF,pme_atomcomm_t *atc,
+ splinedata_t *spline,
+ real scale)
{
- /* sum forces for local particles */
+ /* sum forces for local particles */
int nn,n,ithx,ithy,ithz,i0,j0,k0;
int index_x,index_xy;
int nx,ny,nz,pnx,pny,pnz;
int norder;
real rxx,ryx,ryy,rzx,rzy,rzz;
int order;
-
+
+ pme_spline_work_t *work;
+
+ work = &pme->spline_work;
+
order = pme->pme_order;
- thx = atc->theta[XX];
- thy = atc->theta[YY];
- thz = atc->theta[ZZ];
- dthx = atc->dtheta[XX];
- dthy = atc->dtheta[YY];
- dthz = atc->dtheta[ZZ];
+ thx = spline->theta[XX];
+ thy = spline->theta[YY];
+ thz = spline->theta[ZZ];
+ dthx = spline->dtheta[XX];
+ dthy = spline->dtheta[YY];
+ dthz = spline->dtheta[ZZ];
nx = pme->nkx;
ny = pme->nky;
nz = pme->nkz;
pnx = pme->pmegrid_nx;
pny = pme->pmegrid_ny;
pnz = pme->pmegrid_nz;
-
+
rxx = pme->recipbox[XX][XX];
ryx = pme->recipbox[YY][XX];
ryy = pme->recipbox[YY][YY];
rzy = pme->recipbox[ZZ][YY];
rzz = pme->recipbox[ZZ][ZZ];
- for(nn=0; (nn<atc->n); nn++)
+ for(nn=0; nn<spline->n; nn++)
{
- n = nn;
- qn = scale*atc->q[n];
-
- if (bClearF)
+ n = spline->ind[nn];
+            qn = scale*atc->q[n];
+
+ if (bClearF)
{
atc->f[n][XX] = 0;
atc->f[n][YY] = 0;
atc->f[n][ZZ] = 0;
}
- if (qn != 0)
+ if (qn != 0)
{
fx = 0;
fy = 0;
fz = 0;
idxptr = atc->idx[n];
- norder = n*order;
-
- i0 = idxptr[XX];
+ norder = nn*order;
+
+ i0 = idxptr[XX];
j0 = idxptr[YY];
k0 = idxptr[ZZ];
-
+
/* Pointer arithmetic alert, next six statements */
- thx = atc->theta[XX] + norder;
- thy = atc->theta[YY] + norder;
- thz = atc->theta[ZZ] + norder;
- dthx = atc->dtheta[XX] + norder;
- dthy = atc->dtheta[YY] + norder;
- dthz = atc->dtheta[ZZ] + norder;
-
+ thx = spline->theta[XX] + norder;
+ thy = spline->theta[YY] + norder;
+ thz = spline->theta[ZZ] + norder;
+ dthx = spline->dtheta[XX] + norder;
+ dthy = spline->dtheta[YY] + norder;
+ dthz = spline->dtheta[ZZ] + norder;
+
switch (order) {
- case 4: DO_FSPLINE(4); break;
- case 5: DO_FSPLINE(5); break;
- default: DO_FSPLINE(order); break;
+ case 4:
+#ifdef PME_SSE
+#ifdef PME_SSE_UNALIGNED
+#define PME_GATHER_F_SSE_ORDER4
+#else
+#define PME_GATHER_F_SSE_ALIGNED
+#define PME_ORDER 4
+#endif
+#include "pme_sse_single.h"
+#else
+ DO_FSPLINE(4);
+#endif
+ break;
+ case 5:
+#ifdef PME_SSE
+#define PME_GATHER_F_SSE_ALIGNED
+#define PME_ORDER 5
+#include "pme_sse_single.h"
+#else
+ DO_FSPLINE(5);
+#endif
+ break;
+ default:
+ DO_FSPLINE(order);
+ break;
}
atc->f[n][XX] += -qn*( fx*nx*rxx );
*/
}
+
static real gather_energy_bsplines(gmx_pme_t pme,real *grid,
pme_atomcomm_t *atc)
{
+ splinedata_t *spline;
int n,ithx,ithy,ithz,i0,j0,k0;
int index_x,index_xy;
int * idxptr;
real *thx,*thy,*thz;
int norder;
int order;
-
-
+
+ spline = &atc->spline[0];
+
order = pme->pme_order;
-
+
energy = 0;
for(n=0; (n<atc->n); n++) {
qn = atc->q[n];
-
+
if (qn != 0) {
idxptr = atc->idx[n];
norder = n*order;
-
- i0 = idxptr[XX];
+
+ i0 = idxptr[XX];
j0 = idxptr[YY];
k0 = idxptr[ZZ];
-
+
/* Pointer arithmetic alert, next three statements */
- thx = atc->theta[XX] + norder;
- thy = atc->theta[YY] + norder;
- thz = atc->theta[ZZ] + norder;
+ thx = spline->theta[XX] + norder;
+ thy = spline->theta[YY] + norder;
+ thz = spline->theta[ZZ] + norder;
pot = 0;
for(ithx=0; (ithx<order); ithx++)
return energy;
}
+/* Macro to force loop unrolling by fixing order.
+ * This gives a significant performance gain.
+ */
+#define CALC_SPLINE(order) \
+{ \
+ int j,k,l; \
+ real dr,div; \
+ real data[PME_ORDER_MAX]; \
+ real ddata[PME_ORDER_MAX]; \
+ \
+ for(j=0; (j<DIM); j++) \
+ { \
+ dr = xptr[j]; \
+ \
+ /* dr is relative offset from lower cell limit */ \
+ data[order-1] = 0; \
+ data[1] = dr; \
+ data[0] = 1 - dr; \
+ \
+ for(k=3; (k<order); k++) \
+ { \
+ div = 1.0/(k - 1.0); \
+ data[k-1] = div*dr*data[k-2]; \
+ for(l=1; (l<(k-1)); l++) \
+ { \
+ data[k-l-1] = div*((dr+l)*data[k-l-2]+(k-l-dr)* \
+ data[k-l-1]); \
+ } \
+ data[0] = div*(1-dr)*data[0]; \
+ } \
+ /* differentiate */ \
+ ddata[0] = -data[0]; \
+ for(k=1; (k<order); k++) \
+ { \
+ ddata[k] = data[k-1] - data[k]; \
+ } \
+ \
+ div = 1.0/(order - 1); \
+ data[order-1] = div*dr*data[order-2]; \
+ for(l=1; (l<(order-1)); l++) \
+ { \
+ data[order-l-1] = div*((dr+l)*data[order-l-2]+ \
+ (order-l-dr)*data[order-l-1]); \
+ } \
+ data[0] = div*(1 - dr)*data[0]; \
+ \
+ for(k=0; k<order; k++) \
+ { \
+ theta[j][i*order+k] = data[k]; \
+ dtheta[j][i*order+k] = ddata[k]; \
+ } \
+ } \
+}
+
void make_bsplines(splinevec theta,splinevec dtheta,int order,
- rvec fractx[],int nr,real charge[],
+ rvec fractx[],int nr,int ind[],real charge[],
gmx_bool bFreeEnergy)
{
/* construct splines for local atoms */
- int i,j,k,l;
- real dr,div;
- real *data,*ddata,*xptr;
-
- for(i=0; (i<nr); i++) {
+ int i,ii;
+ real *xptr;
+
+ for(i=0; i<nr; i++)
+ {
/* With free energy we do not use the charge check.
* In most cases this will be more efficient than calling make_bsplines
* twice, since usually more than half the particles have charges.
*/
- if (bFreeEnergy || charge[i] != 0.0) {
- xptr = fractx[i];
- for(j=0; (j<DIM); j++) {
- dr = xptr[j];
-
- /* dr is relative offset from lower cell limit */
- data=&(theta[j][i*order]);
- data[order-1]=0;
- data[1]=dr;
- data[0]=1-dr;
-
- for(k=3; (k<order); k++) {
- div=1.0/(k-1.0);
- data[k-1]=div*dr*data[k-2];
- for(l=1; (l<(k-1)); l++) {
- data[k-l-1]=div*((dr+l)*data[k-l-2]+(k-l-dr)*
- data[k-l-1]);
- }
- data[0]=div*(1-dr)*data[0];
- }
- /* differentiate */
- ddata = &(dtheta[j][i*order]);
- ddata[0] = -data[0];
- for(k=1; (k<order); k++) {
- ddata[k]=data[k-1]-data[k];
- }
-
- div=1.0/(order-1);
- data[order-1]=div*dr*data[order-2];
- for(l=1; (l<(order-1)); l++) {
- data[order-l-1]=div*((dr+l)*data[order-l-2]+
- (order-l-dr)*data[order-l-1]);
- }
- data[0]=div*(1-dr)*data[0];
+ ii = ind[i];
+ if (bFreeEnergy || charge[ii] != 0.0) {
+ xptr = fractx[ii];
+ switch(order) {
+ case 4: CALC_SPLINE(4); break;
+ case 5: CALC_SPLINE(5); break;
+ default: CALC_SPLINE(order); break;
}
}
}
{
int i,j;
real sc,ss,arg;
-
+
for(i=0;i<ndata;i++) {
sc=ss=0;
for(j=0;j<ndata;j++) {
real *data,*ddata,*bsp_data;
int i,k,l;
real div;
-
+
snew(data,order);
snew(ddata,order);
snew(bsp_data,nmax);
data[order-1]=0;
data[1]=0;
data[0]=1;
-
+
for(k=3;k<order;k++) {
div=1.0/(k-1.0);
data[k-1]=0;
data[order-1]=0;
for(l=1;l<(order-1);l++)
data[order-l-1]=div*(l*data[order-l-2]+(order-l)*data[order-l-1]);
- data[0]=div*data[0];
+ data[0]=div*data[0];
for(i=0;i<nmax;i++)
bsp_data[i]=0;
for(i=1;i<=order;i++)
bsp_data[i]=data[i-1];
-
+
make_dft_mod(bsp_mod[XX],bsp_data,nx);
make_dft_mod(bsp_mod[YY],bsp_data,ny);
make_dft_mod(bsp_mod[ZZ],bsp_data,nz);
atc->node_dest[n] = fw;
atc->node_src[n] = bw;
n++;
- }
+ }
if (n < nslab - 1) {
atc->node_dest[n] = bw;
atc->node_src[n] = fw;
int gmx_pme_destroy(FILE *log,gmx_pme_t *pmedata)
{
+ int thread;
+
if(NULL != log)
{
fprintf(log,"Destroying PME data structures.\n");
sfree((*pmedata)->nnx);
sfree((*pmedata)->nny);
sfree((*pmedata)->nnz);
-
- sfree((*pmedata)->pmegridA);
+
+ pmegrids_destroy(&(*pmedata)->pmegridA);
+
sfree((*pmedata)->fftgridA);
sfree((*pmedata)->cfftgridA);
gmx_parallel_3dfft_destroy((*pmedata)->pfft_setupA);
-
- if((*pmedata)->pmegridB)
+
+ if ((*pmedata)->pmegridB.grid.grid != NULL)
{
- sfree((*pmedata)->pmegridB);
+ pmegrids_destroy(&(*pmedata)->pmegridB);
sfree((*pmedata)->fftgridB);
sfree((*pmedata)->cfftgridB);
gmx_parallel_3dfft_destroy((*pmedata)->pfft_setupB);
}
- sfree((*pmedata)->work_mhz);
- sfree((*pmedata)->work_m2);
- sfree((*pmedata)->work_denom);
- sfree((*pmedata)->work_tmp1_alloc);
- sfree((*pmedata)->work_m2inv);
-
+ for(thread=0; thread<(*pmedata)->nthread; thread++)
+ {
+ free_work(&(*pmedata)->work[thread]);
+ }
+ sfree((*pmedata)->work);
+
sfree(*pmedata);
*pmedata = NULL;
-
+
return 0;
}
static void init_atomcomm(gmx_pme_t pme,pme_atomcomm_t *atc, t_commrec *cr,
int dimind,gmx_bool bSpread)
{
- int nk,k,s;
+ int nk,k,s,thread;
atc->dimind = dimind;
atc->nslab = 1;
snew(atc->node_dest,atc->nslab);
snew(atc->node_src,atc->nslab);
setup_coordinate_communication(atc);
-
- snew(atc->count,atc->nslab);
+
+ snew(atc->count_thread,pme->nthread);
+ for(thread=0; thread<pme->nthread; thread++)
+ {
+ snew(atc->count_thread[thread],atc->nslab);
+ }
+ atc->count = atc->count_thread[0];
snew(atc->rcount,atc->nslab);
snew(atc->buf_index,atc->nslab);
}
+
+ atc->nthread = pme->nthread;
+ if (atc->nthread > 1)
+ {
+ snew(atc->thread_plist,atc->nthread);
+ }
+ snew(atc->spline,atc->nthread);
+ for(thread=0; thread<atc->nthread; thread++)
+ {
+ if (atc->nthread > 1)
+ {
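+            /* Pad and shift the counts by GMX_CACHE_SEP entries to keep
+             * the counters of different threads on separate cache lines.
+             */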
+ snew(atc->thread_plist[thread].n,atc->nthread+2*GMX_CACHE_SEP);
+ atc->thread_plist[thread].n += GMX_CACHE_SEP;
+ }
+ }
}
-static void
+static void
init_overlap_comm(pme_overlap_t * ol,
int norder,
#ifdef GMX_MPI
- MPI_Comm comm,
+ MPI_Comm comm,
#endif
- int nnodes,
+ int nnodes,
int nodeid,
- int ndata)
+ int ndata,
+ int commplainsize)
{
int lbnd,rbnd,maxlr,b,i;
int exten;
pme_grid_comm_t *pgc;
gmx_bool bCont;
int fft_start,fft_end,send_index1,recv_index1;
-
+
#ifdef GMX_MPI
ol->mpi_comm = comm;
#endif
-
+
ol->nnodes = nnodes;
ol->nodeid = nodeid;
snew(ol->s2g0,ol->nnodes+1);
snew(ol->s2g1,ol->nnodes);
if (debug) { fprintf(debug,"PME slab boundaries:"); }
- for(i=0; i<nnodes; i++)
+ for(i=0; i<nnodes; i++)
{
/* s2g0 the local interpolation grid start.
* s2g1 the local interpolation grid end.
pgc->recv_index0 = fft_start;
pgc->recv_nindex = max(0,recv_index1 - pgc->recv_index0);
}
+
+    /* For a non-divisible grid we need pme_order instead of pme_order-1 */
+ snew(ol->sendbuf,norder*commplainsize);
+ snew(ol->recvbuf,norder*commplainsize);
}
static void
if (gtl[i] == n-1)
{
gtl[i] = 0;
- fsh[i] = -1;
+ fsh[i] = -1;
}
else if (gtl[i] == local_range)
{
*fraction_shift = fsh;
}
+static void sse_mask_init(pme_spline_work_t *work,int order)
+{
+#ifdef PME_SSE
+ float tmp[8];
+ __m128 zero_SSE;
+ int of,i;
+
+ zero_SSE = _mm_setzero_ps();
+
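+    /* For each possible offset, build a mask that selects exactly
+     * 'order' consecutive single-precision elements, used by the
+     * SSE spread/gather kernels (pme_sse_single.h).
+     */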
+ for(of=0; of<8-(order-1); of++)
+ {
+ for(i=0; i<8; i++)
+ {
+ tmp[i] = (i >= of && i < of+order ? 1 : 0);
+ }
+ work->mask_SSE0[of] = _mm_loadu_ps(tmp);
+ work->mask_SSE1[of] = _mm_loadu_ps(tmp+4);
+ work->mask_SSE0[of] = _mm_cmpgt_ps(work->mask_SSE0[of],zero_SSE);
+ work->mask_SSE1[of] = _mm_cmpgt_ps(work->mask_SSE1[of],zero_SSE);
+ }
+#endif
+}
+
static void
gmx_pme_check_grid_restrictions(FILE *fplog,char dim,int nnodes,int *nk)
{
gmx_fatal(FARGS,"The PME grid size in dim %c (%d) is not divisble by the number of nodes doing PME in dim %c (%d). The grid size would have to be increased by more than 50%% to make the grid divisible. Change the total number of nodes or the number of domain decomposition cells in x or the PME grid %c dimension (and the cut-off).",
dim,*nk,dim,nnodes,dim);
}
-
+
if (fplog != NULL)
{
fprintf(fplog,"\nNOTE: The PME grid size in dim %c (%d) is not divisble by the number of nodes doing PME in dim %c (%d). Increasing the PME grid size in dim %c to %d. This will increase the accuracy and will not decrease the performance significantly on this number of nodes. For optimal performance change the total number of nodes or the number of domain decomposition cells in x or the PME grid %c dimension (and the cut-off).\n\n",
dim,*nk,dim,nnodes,dim,nk_new,dim);
}
-
+
*nk = nk_new;
}
}
int nnodes_minor,
t_inputrec * ir,
int homenr,
- gmx_bool bFreeEnergy,
- gmx_bool bReproducible)
+ gmx_bool bFreeEnergy,
+ gmx_bool bReproducible,
+ int nthread)
{
gmx_pme_t pme=NULL;
-
+
pme_atomcomm_t *atc;
- int bufsizex,bufsizey,bufsize;
ivec ndata;
-
+
if (debug)
fprintf(debug,"Creating PME data structures.\n");
snew(pme,1);
-
+
pme->redist_init = FALSE;
pme->sum_qgrid_tmp = NULL;
pme->sum_qgrid_dd_tmp = NULL;
pme->buf_nalloc = 0;
pme->redist_buf_nalloc = 0;
-
+
pme->nnodes = 1;
pme->bPPnode = TRUE;
-
+
pme->nnodes_major = nnodes_major;
pme->nnodes_minor = nnodes_minor;
#ifdef GMX_MPI
- if (nnodes_major*nnodes_minor > 1 && PAR(cr))
+ if (PAR(cr))
{
pme->mpi_comm = cr->mpi_comm_mygroup;
-
+
MPI_Comm_rank(pme->mpi_comm,&pme->nodeid);
MPI_Comm_size(pme->mpi_comm,&pme->nnodes);
- if (pme->nnodes != nnodes_major*nnodes_minor)
- {
- gmx_incons("PME node count mismatch");
- }
}
#endif
pme->ndecompdim = 0;
pme->nodeid_major = 0;
pme->nodeid_minor = 0;
+#ifdef GMX_MPI
+ pme->mpi_comm_d[0] = pme->mpi_comm_d[1] = MPI_COMM_NULL;
+#endif
}
else
{
{
#ifdef GMX_MPI
pme->mpi_comm_d[0] = pme->mpi_comm;
- pme->mpi_comm_d[1] = NULL;
+ pme->mpi_comm_d[1] = MPI_COMM_NULL;
#endif
pme->ndecompdim = 1;
pme->nodeid_major = pme->nodeid;
pme->nodeid_minor = 0;
-
+
}
else if (nnodes_major == 1)
{
#ifdef GMX_MPI
- pme->mpi_comm_d[0] = NULL;
+ pme->mpi_comm_d[0] = MPI_COMM_NULL;
pme->mpi_comm_d[1] = pme->mpi_comm;
#endif
pme->ndecompdim = 1;
pme->nodeid_major = 0;
pme->nodeid_minor = pme->nodeid;
}
- else
+ else
{
if (pme->nnodes % nnodes_major != 0)
{
gmx_incons("For 2D PME decomposition, #PME nodes must be divisible by the number of nodes in the major dimension");
}
pme->ndecompdim = 2;
-
+
#ifdef GMX_MPI
MPI_Comm_split(pme->mpi_comm,pme->nodeid % nnodes_minor,
pme->nodeid,&pme->mpi_comm_d[0]); /* My communicator along major dimension */
MPI_Comm_split(pme->mpi_comm,pme->nodeid/nnodes_minor,
pme->nodeid,&pme->mpi_comm_d[1]); /* My communicator along minor dimension */
-
+
MPI_Comm_rank(pme->mpi_comm_d[0],&pme->nodeid_major);
MPI_Comm_size(pme->mpi_comm_d[0],&pme->nnodes_major);
MPI_Comm_rank(pme->mpi_comm_d[1],&pme->nodeid_minor);
}
pme->bPPnode = (cr->duty & DUTY_PP);
}
-
+
+ pme->nthread = nthread;
+
if (ir->ePBC == epbcSCREW)
{
gmx_fatal(FARGS,"pme does not (yet) work with pbc = screw");
}
-
+
pme->bFEP = ((ir->efep != efepNO) && bFreeEnergy);
pme->nkx = ir->nkx;
pme->nky = ir->nky;
pme->nkz = ir->nkz;
pme->pme_order = ir->pme_order;
pme->epsilon_r = ir->epsilon_r;
-
+
+ if (pme->pme_order > PME_ORDER_MAX)
+ {
+ gmx_fatal(FARGS,"pme_order (%d) is larger than the maximum allowed value (%d). Modify and recompile the code if you really need such a high order.",
+ pme->pme_order,PME_ORDER_MAX);
+ }
+
/* Currently pme.c supports only the fft5d FFT code.
* Therefore the grid always needs to be divisible by nnodes.
* When the old 1D code is also supported again, change this check.
*/
}
- if (pme->nkx <= pme->pme_order*(pme->nnodes_major > 1 ? 2 : 1) ||
- pme->nky <= pme->pme_order*(pme->nnodes_minor > 1 ? 2 : 1) ||
- pme->nkz <= pme->pme_order)
+ if (pme->nkx <= pme->pme_order*(pme->nnodes_major > 1 ? 2 : 1) ||
+ pme->nky <= pme->pme_order*(pme->nnodes_minor > 1 ? 2 : 1) ||
+ pme->nkz <= pme->pme_order)
+ {
+ gmx_fatal(FARGS,"The pme grid dimensions need to be larger than pme_order (%d) and in parallel larger than 2*pme_ordern for x and/or y",pme->pme_order);
+ }
+
+ if (pme->nnodes > 1) {
+ double imbal;
+
+#ifdef GMX_MPI
+ MPI_Type_contiguous(DIM, mpi_type, &(pme->rvec_mpi));
+ MPI_Type_commit(&(pme->rvec_mpi));
+#endif
+
+ /* Note that the charge spreading and force gathering, which usually
+ * takes about the same amount of time as FFT+solve_pme,
+ * is always fully load balanced
+ * (unless the charge distribution is inhomogeneous).
+ */
+
+ imbal = pme_load_imbalance(pme);
+ if (imbal >= 1.2 && pme->nodeid_major == 0 && pme->nodeid_minor == 0)
+ {
+ fprintf(stderr,
+ "\n"
+ "NOTE: The load imbalance in PME FFT and solve is %d%%.\n"
+ " For optimal PME load balancing\n"
+ " PME grid_x (%d) and grid_y (%d) should be divisible by #PME_nodes_x (%d)\n"
+ " and PME grid_y (%d) and grid_z (%d) should be divisible by #PME_nodes_y (%d)\n"
+ "\n",
+ (int)((imbal-1)*100 + 0.5),
+ pme->nkx,pme->nky,pme->nnodes_major,
+ pme->nky,pme->nkz,pme->nnodes_minor);
+ }
+ }
+
+    /* For a non-divisible grid we need pme_order instead of pme_order-1 */
+ /* In sum_qgrid_dd x overlap is copied in place: take padding into account.
+ * y is always copied through a buffer: we don't need padding in z,
+ * but we do need the overlap in x because of the communication order.
+ */
+ init_overlap_comm(&pme->overlap[0],pme->pme_order,
+#ifdef GMX_MPI
+ pme->mpi_comm_d[0],
+#endif
+ pme->nnodes_major,pme->nodeid_major,
+ pme->nkx,
+ (div_round_up(pme->nky,pme->nnodes_minor)+pme->pme_order)*(pme->nkz+pme->pme_order-1));
+
+ init_overlap_comm(&pme->overlap[1],pme->pme_order,
+#ifdef GMX_MPI
+ pme->mpi_comm_d[1],
+#endif
+ pme->nnodes_minor,pme->nodeid_minor,
+ pme->nky,
+ (div_round_up(pme->nkx,pme->nnodes_major)+pme->pme_order)*pme->nkz);
+
+ /* Check for a limitation of the (current) sum_fftgrid_dd code */
+ if (pme->nthread > 1 &&
+ (pme->overlap[0].noverlap_nodes > 1 ||
+ pme->overlap[1].noverlap_nodes > 1))
+ {
+ gmx_fatal(FARGS,"With threads the number of grid lines per node along x and or y should be pme_order (%d) or more or exactly pme_order-1",pme->pme_order);
+ }
+
+ snew(pme->bsp_mod[XX],pme->nkx);
+ snew(pme->bsp_mod[YY],pme->nky);
+ snew(pme->bsp_mod[ZZ],pme->nkz);
+
+ /* The required size of the interpolation grid, including overlap.
+ * The allocated size (pmegrid_n?) might be slightly larger.
+ */
+ pme->pmegrid_nx = pme->overlap[0].s2g1[pme->nodeid_major] -
+ pme->overlap[0].s2g0[pme->nodeid_major];
+ pme->pmegrid_ny = pme->overlap[1].s2g1[pme->nodeid_minor] -
+ pme->overlap[1].s2g0[pme->nodeid_minor];
+ pme->pmegrid_nz_base = pme->nkz;
+ pme->pmegrid_nz = pme->pmegrid_nz_base + pme->pme_order - 1;
+ set_grid_alignment(&pme->pmegrid_nz,pme->pme_order);
+
+ pme->pmegrid_start_ix = pme->overlap[0].s2g0[pme->nodeid_major];
+ pme->pmegrid_start_iy = pme->overlap[1].s2g0[pme->nodeid_minor];
+ pme->pmegrid_start_iz = 0;
+
+ make_gridindex5_to_localindex(pme->nkx,
+ pme->pmegrid_start_ix,
+ pme->pmegrid_nx - (pme->pme_order-1),
+ &pme->nnx,&pme->fshx);
+ make_gridindex5_to_localindex(pme->nky,
+ pme->pmegrid_start_iy,
+ pme->pmegrid_ny - (pme->pme_order-1),
+ &pme->nny,&pme->fshy);
+ make_gridindex5_to_localindex(pme->nkz,
+ pme->pmegrid_start_iz,
+ pme->pmegrid_nz_base,
+ &pme->nnz,&pme->fshz);
+
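+    /* The last two arguments of pmegrids_init are the grid overlaps
+     * with the next node along x and y, which determine the thread
+     * communication ranges.
+     */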
+ pmegrids_init(&pme->pmegridA,
+ pme->pmegrid_nx,pme->pmegrid_ny,pme->pmegrid_nz,
+ pme->pmegrid_nz_base,
+ pme->pme_order,
+ pme->nthread,
+ pme->overlap[0].s2g1[pme->nodeid_major]-pme->overlap[0].s2g0[pme->nodeid_major+1],
+ pme->overlap[1].s2g1[pme->nodeid_minor]-pme->overlap[1].s2g0[pme->nodeid_minor+1]);
+
+ sse_mask_init(&pme->spline_work,pme->pme_order);
+
+ ndata[0] = pme->nkx;
+ ndata[1] = pme->nky;
+ ndata[2] = pme->nkz;
+
+ /* This routine will allocate the grid data to fit the FFTs */
+ gmx_parallel_3dfft_init(&pme->pfft_setupA,ndata,
+ &pme->fftgridA,&pme->cfftgridA,
+ pme->mpi_comm_d,
+ pme->overlap[0].s2g0,pme->overlap[1].s2g0,
+ bReproducible,pme->nthread);
+
+ if (bFreeEnergy)
+ {
+ pmegrids_init(&pme->pmegridB,
+ pme->pmegrid_nx,pme->pmegrid_ny,pme->pmegrid_nz,
+ pme->pmegrid_nz_base,
+ pme->pme_order,
+ pme->nthread,
+ pme->nkx % pme->nnodes_major != 0,
+ pme->nky % pme->nnodes_minor != 0);
+
+ gmx_parallel_3dfft_init(&pme->pfft_setupB,ndata,
+ &pme->fftgridB,&pme->cfftgridB,
+ pme->mpi_comm_d,
+ pme->overlap[0].s2g0,pme->overlap[1].s2g0,
+ bReproducible,pme->nthread);
+ }
+ else
+ {
+ pme->pmegridB.grid.grid = NULL;
+ pme->fftgridB = NULL;
+ pme->cfftgridB = NULL;
+ }
+
+ make_bspline_moduli(pme->bsp_mod,pme->nkx,pme->nky,pme->nkz,pme->pme_order);
+
+ /* Use atc[0] for spreading */
+ init_atomcomm(pme,&pme->atc[0],cr,nnodes_major > 1 ? 0 : 1,TRUE);
+ if (pme->ndecompdim >= 2)
+ {
+ init_atomcomm(pme,&pme->atc[1],cr,1,FALSE);
+ }
+
+ if (pme->nnodes == 1) {
+ pme->atc[0].n = homenr;
+ pme_realloc_atomcomm_things(&pme->atc[0]);
+ }
+
+ {
+ int thread;
+
+ /* Use fft5d, order after FFT is y major, z, x minor */
+
+ snew(pme->work,pme->nthread);
+ for(thread=0; thread<pme->nthread; thread++)
+ {
+ realloc_work(&pme->work[thread],pme->nkx);
+ }
+ }
+
+ *pmedata = pme;
+
+ return 0;
+}
+
+
+static void copy_local_grid(gmx_pme_t pme,
+ pmegrids_t *pmegrids,int thread,real *fftgrid)
+{
+ ivec local_fft_ndata,local_fft_offset,local_fft_size;
+ int fft_my,fft_mz;
+ int nsx,nsy,nsz;
+ ivec nf;
+ int offx,offy,offz,x,y,z,i0,i0t;
+ int d;
+ pmegrid_t *pmegrid;
+ real *grid_th;
+
+ gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
+ local_fft_ndata,
+ local_fft_offset,
+ local_fft_size);
+ fft_my = local_fft_size[YY];
+ fft_mz = local_fft_size[ZZ];
+
+ pmegrid = &pmegrids->grid_th[thread];
+
+ nsx = pmegrid->n[XX];
+ nsy = pmegrid->n[YY];
+ nsz = pmegrid->n[ZZ];
+
+ for(d=0; d<DIM; d++)
+ {
+ nf[d] = min(pmegrid->n[d] - (pmegrid->order - 1),
+ local_fft_ndata[d] - pmegrid->offset[d]);
+ }
+
+ offx = pmegrid->offset[XX];
+ offy = pmegrid->offset[YY];
+ offz = pmegrid->offset[ZZ];
+
+ /* Directly copy the non-overlapping parts of the local grids.
+ * This also initializes the full grid.
+ */
+ grid_th = pmegrid->grid;
+ for(x=0; x<nf[XX]; x++)
+ {
+ for(y=0; y<nf[YY]; y++)
+ {
+ i0 = ((offx + x)*fft_my + (offy + y))*fft_mz + offz;
+ i0t = (x*nsy + y)*nsz;
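+            /* i0 indexes the padded node FFT grid,
+             * i0t the compact thread-local grid.
+             */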
+ for(z=0; z<nf[ZZ]; z++)
+ {
+ fftgrid[i0+z] = grid_th[i0t+z];
+ }
+ }
+ }
+}
+
+static void print_sendbuf(gmx_pme_t pme,real *sendbuf)
+{
+ ivec local_fft_ndata,local_fft_offset,local_fft_size;
+ pme_overlap_t *overlap;
+    int nind;
+ int i,x,y,z,n;
+
+ gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
+ local_fft_ndata,
+ local_fft_offset,
+ local_fft_size);
+ /* Major dimension */
+ overlap = &pme->overlap[0];
+
+ nind = overlap->comm_data[0].send_nindex;
+
+ for(y=0; y<local_fft_ndata[YY]; y++) {
+ printf(" %2d",y);
+ }
+ printf("\n");
+
+ i = 0;
+ for(x=0; x<nind; x++) {
+ for(y=0; y<local_fft_ndata[YY]; y++) {
+ n = 0;
+ for(z=0; z<local_fft_ndata[ZZ]; z++) {
+ if (sendbuf[i] != 0) n++;
+ i++;
+ }
+ printf(" %2d",n);
+ }
+ printf("\n");
+ }
+}
+
+static void
+reduce_threadgrid_overlap(gmx_pme_t pme,
+ const pmegrids_t *pmegrids,int thread,
+ real *fftgrid,real *commbuf_x,real *commbuf_y)
+{
+ ivec local_fft_ndata,local_fft_offset,local_fft_size;
+ int fft_nx,fft_ny,fft_nz;
+ int fft_my,fft_mz;
+ int buf_my=-1;
+ int nsx,nsy,nsz;
+ ivec ne;
+ int offx,offy,offz,x,y,z,i0,i0t;
+ int sx,sy,sz,fx,fy,fz,tx1,ty1,tz1,ox,oy,oz;
+ gmx_bool bClearBufX,bClearBufY,bClearBufXY,bClearBuf;
+ gmx_bool bCommX,bCommY;
+ int d;
+ int thread_f;
+ const pmegrid_t *pmegrid,*pmegrid_g,*pmegrid_f;
+ const real *grid_th;
+ real *commbuf=NULL;
+
+ gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
+ local_fft_ndata,
+ local_fft_offset,
+ local_fft_size);
+ fft_nx = local_fft_ndata[XX];
+ fft_ny = local_fft_ndata[YY];
+ fft_nz = local_fft_ndata[ZZ];
+
+ fft_my = local_fft_size[YY];
+ fft_mz = local_fft_size[ZZ];
+
+    /* This routine is called when all threads have finished spreading.
+     * Here each thread sums grid contributions calculated by other threads
+     * to the thread local grid volume.
+     * To minimize the number of grid copying operations,
+     * this routine sums immediately from the pmegrid to the fftgrid.
+ */
+
+ /* Determine which part of the full node grid we should operate on,
+ * this is our thread local part of the full grid.
+ */
+ pmegrid = &pmegrids->grid_th[thread];
+
+ for(d=0; d<DIM; d++)
+ {
+ ne[d] = min(pmegrid->offset[d]+pmegrid->n[d]-(pmegrid->order-1),
+ local_fft_ndata[d]);
+ }
+
+ offx = pmegrid->offset[XX];
+ offy = pmegrid->offset[YY];
+ offz = pmegrid->offset[ZZ];
+
+
+ bClearBufX = TRUE;
+ bClearBufY = TRUE;
+ bClearBufXY = TRUE;
+
+ /* Now loop over all the thread data blocks that contribute
+ * to the grid region we (our thread) are operating on.
+ */
+    /* Note that fft_nx/y is equal to the number of grid points
+ * between the first point of our node grid and the one of the next node.
+ */
+ for(sx=0; sx>=-pmegrids->nthread_comm[XX]; sx--)
+ {
+ fx = pmegrid->ci[XX] + sx;
+ ox = 0;
+ bCommX = FALSE;
+ if (fx < 0) {
+ fx += pmegrids->nc[XX];
+ ox -= fft_nx;
+ bCommX = (pme->nnodes_major > 1);
+ }
+ pmegrid_g = &pmegrids->grid_th[fx*pmegrids->nc[YY]*pmegrids->nc[ZZ]];
+ ox += pmegrid_g->offset[XX];
+ if (!bCommX)
+ {
+ tx1 = min(ox + pmegrid_g->n[XX],ne[XX]);
+ }
+ else
+ {
+ tx1 = min(ox + pmegrid_g->n[XX],pme->pme_order);
+ }
+
+ for(sy=0; sy>=-pmegrids->nthread_comm[YY]; sy--)
+ {
+ fy = pmegrid->ci[YY] + sy;
+ oy = 0;
+ bCommY = FALSE;
+ if (fy < 0) {
+ fy += pmegrids->nc[YY];
+ oy -= fft_ny;
+ bCommY = (pme->nnodes_minor > 1);
+ }
+ pmegrid_g = &pmegrids->grid_th[fy*pmegrids->nc[ZZ]];
+ oy += pmegrid_g->offset[YY];
+ if (!bCommY)
+ {
+ ty1 = min(oy + pmegrid_g->n[YY],ne[YY]);
+ }
+ else
+ {
+ ty1 = min(oy + pmegrid_g->n[YY],pme->pme_order);
+ }
+
+ for(sz=0; sz>=-pmegrids->nthread_comm[ZZ]; sz--)
+ {
+ fz = pmegrid->ci[ZZ] + sz;
+ oz = 0;
+ if (fz < 0)
+ {
+ fz += pmegrids->nc[ZZ];
+ oz -= fft_nz;
+ }
+ pmegrid_g = &pmegrids->grid_th[fz];
+ oz += pmegrid_g->offset[ZZ];
+ tz1 = min(oz + pmegrid_g->n[ZZ],ne[ZZ]);
+
+ if (sx == 0 && sy == 0 && sz == 0)
+ {
+ /* We have already added our local contribution
+ * before calling this routine, so skip it here.
+ */
+ continue;
+ }
+
+ thread_f = (fx*pmegrids->nc[YY] + fy)*pmegrids->nc[ZZ] + fz;
+
+ pmegrid_f = &pmegrids->grid_th[thread_f];
+
+ grid_th = pmegrid_f->grid;
+
+ nsx = pmegrid_f->n[XX];
+ nsy = pmegrid_f->n[YY];
+ nsz = pmegrid_f->n[ZZ];
+
+#ifdef DEBUG_PME_REDUCE
+ printf("n%d t%d add %d %2d %2d %2d %2d %2d %2d %2d-%2d %2d-%2d, %2d-%2d %2d-%2d, %2d-%2d %2d-%2d\n",
+ pme->nodeid,thread,thread_f,
+ pme->pmegrid_start_ix,
+ pme->pmegrid_start_iy,
+ pme->pmegrid_start_iz,
+ sx,sy,sz,
+ offx-ox,tx1-ox,offx,tx1,
+ offy-oy,ty1-oy,offy,ty1,
+ offz-oz,tz1-oz,offz,tz1);
+#endif
+
+ if (!(bCommX || bCommY))
+ {
+ /* Copy from the thread local grid to the node grid */
+ for(x=offx; x<tx1; x++)
+ {
+ for(y=offy; y<ty1; y++)
+ {
+ i0 = (x*fft_my + y)*fft_mz;
+ i0t = ((x - ox)*nsy + (y - oy))*nsz - oz;
+ for(z=offz; z<tz1; z++)
+ {
+ fftgrid[i0+z] += grid_th[i0t+z];
+ }
+ }
+ }
+ }
+ else
+ {
+ /* The order of this conditional decides
+ * where the corner volume gets stored with x+y decomp.
+ */
+ if (bCommY)
+ {
+ commbuf = commbuf_y;
+ buf_my = ty1 - offy;
+ if (bCommX)
+ {
+ /* We index commbuf modulo the local grid size */
+ commbuf += buf_my*fft_nx*fft_nz;
+
+ bClearBuf = bClearBufXY;
+ bClearBufXY = FALSE;
+ }
+ else
+ {
+ bClearBuf = bClearBufY;
+ bClearBufY = FALSE;
+ }
+ }
+ else
+ {
+ commbuf = commbuf_x;
+ buf_my = fft_ny;
+ bClearBuf = bClearBufX;
+ bClearBufX = FALSE;
+ }
+
+ /* Copy to the communication buffer */
+ for(x=offx; x<tx1; x++)
+ {
+ for(y=offy; y<ty1; y++)
+ {
+ i0 = (x*buf_my + y)*fft_nz;
+ i0t = ((x - ox)*nsy + (y - oy))*nsz - oz;
+
+ if (bClearBuf)
+ {
+ /* First access of commbuf, initialize it */
+ for(z=offz; z<tz1; z++)
+ {
+ commbuf[i0+z] = grid_th[i0t+z];
+ }
+ }
+ else
+ {
+ for(z=offz; z<tz1; z++)
+ {
+ commbuf[i0+z] += grid_th[i0t+z];
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+}
+
+
+static void sum_fftgrid_dd(gmx_pme_t pme,real *fftgrid)
+{
+ ivec local_fft_ndata,local_fft_offset,local_fft_size;
+ pme_overlap_t *overlap;
+ int send_nindex;
+ int recv_index0,recv_nindex;
+#ifdef GMX_MPI
+ MPI_Status stat;
+#endif
+ int ipulse,send_id,recv_id,datasize,gridsize,size_yx;
+ real *sendptr,*recvptr;
+ int x,y,z,indg,indb;
+
+ /* Note that this routine is only used for forward communication.
+ * Since the force gathering, unlike the charge spreading,
+ * can be trivially parallelized over the particles,
+ * the backwards process is much simpler and can use the "old"
+ * communication setup.
+ */
+
+ gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
+ local_fft_ndata,
+ local_fft_offset,
+ local_fft_size);
+
+ /* Currently supports only a single communication pulse */
+
+/* for(ipulse=0;ipulse<overlap->noverlap_nodes;ipulse++) */
+ if (pme->nnodes_minor > 1)
+ {
+        /* Minor dimension */
+ overlap = &pme->overlap[1];
+
+ if (pme->nnodes_major > 1)
+ {
+ size_yx = pme->overlap[0].comm_data[0].send_nindex;
+ }
+ else
+ {
+ size_yx = 0;
+ }
+ datasize = (local_fft_ndata[XX]+size_yx)*local_fft_ndata[ZZ];
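+        /* With 2D decomposition the buffers also hold the size_yx rows
+         * of the x-overlap region, which are reduced into the x send
+         * buffer further down.
+         */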
+
+ ipulse = 0;
+
+ send_id = overlap->send_id[ipulse];
+ recv_id = overlap->recv_id[ipulse];
+ send_nindex = overlap->comm_data[ipulse].send_nindex;
+ /* recv_index0 = overlap->comm_data[ipulse].recv_index0; */
+ recv_index0 = 0;
+ recv_nindex = overlap->comm_data[ipulse].recv_nindex;
+
+ sendptr = overlap->sendbuf;
+ recvptr = overlap->recvbuf;
+
+ /*
+ printf("node %d comm %2d x %2d x %2d\n",pme->nodeid,
+ local_fft_ndata[XX]+size_yx,send_nindex,local_fft_ndata[ZZ]);
+ printf("node %d send %f, %f\n",pme->nodeid,
+ sendptr[0],sendptr[send_nindex*datasize-1]);
+ */
+
+#ifdef GMX_MPI
+ MPI_Sendrecv(sendptr,send_nindex*datasize,GMX_MPI_REAL,
+ send_id,ipulse,
+ recvptr,recv_nindex*datasize,GMX_MPI_REAL,
+ recv_id,ipulse,
+ overlap->mpi_comm,&stat);
+#endif
+
+ for(x=0; x<local_fft_ndata[XX]; x++)
+ {
+ for(y=0; y<recv_nindex; y++)
+ {
+ indg = (x*local_fft_size[YY] + y)*local_fft_size[ZZ];
+ indb = (x*recv_nindex + y)*local_fft_ndata[ZZ];
+ for(z=0; z<local_fft_ndata[ZZ]; z++)
+ {
+ fftgrid[indg+z] += recvptr[indb+z];
+ }
+ }
+ }
+ if (pme->nnodes_major > 1)
+ {
+ sendptr = pme->overlap[0].sendbuf;
+ for(x=0; x<size_yx; x++)
+ {
+ for(y=0; y<recv_nindex; y++)
+ {
+ indg = (x*local_fft_ndata[YY] + y)*local_fft_ndata[ZZ];
+ indb = ((local_fft_ndata[XX] + x)*recv_nindex +y)*local_fft_ndata[ZZ];
+ for(z=0; z<local_fft_ndata[ZZ]; z++)
+ {
+ sendptr[indg+z] += recvptr[indb+z];
+ }
+ }
+ }
+ }
+ }
+
+ /* for(ipulse=0;ipulse<overlap->noverlap_nodes;ipulse++) */
+ if (pme->nnodes_major > 1)
{
- gmx_fatal(FARGS,"The pme grid dimensions need to be larger than pme_order (%d) and in parallel larger than 2*pme_order for x and/or y",pme->pme_order);
- }
+ /* Major dimension */
+ overlap = &pme->overlap[0];
- if (pme->nnodes > 1) {
- double imbal;
+ datasize = local_fft_ndata[YY]*local_fft_ndata[ZZ];
+ gridsize = local_fft_size[YY] *local_fft_size[ZZ];
+
+ ipulse = 0;
+
+ send_id = overlap->send_id[ipulse];
+ recv_id = overlap->recv_id[ipulse];
+ send_nindex = overlap->comm_data[ipulse].send_nindex;
+ /* recv_index0 = overlap->comm_data[ipulse].recv_index0; */
+ recv_index0 = 0;
+ recv_nindex = overlap->comm_data[ipulse].recv_nindex;
+
+ sendptr = overlap->sendbuf;
+ recvptr = overlap->recvbuf;
+
+ if (debug != NULL)
+ {
+ fprintf(debug,"PME fftgrid comm %2d x %2d x %2d\n",
+ send_nindex,local_fft_ndata[YY],local_fft_ndata[ZZ]);
+ }
#ifdef GMX_MPI
- MPI_Type_contiguous(DIM, mpi_type, &(pme->rvec_mpi));
- MPI_Type_commit(&(pme->rvec_mpi));
+ MPI_Sendrecv(sendptr,send_nindex*datasize,GMX_MPI_REAL,
+ send_id,ipulse,
+ recvptr,recv_nindex*datasize,GMX_MPI_REAL,
+ recv_id,ipulse,
+ overlap->mpi_comm,&stat);
#endif
-
- /* Note that the charge spreading and force gathering, which usually
- * takes about the same amount of time as FFT+solve_pme,
- * is always fully load balanced
- * (unless the charge distribution is inhomogeneous).
- */
-
- imbal = pme_load_imbalance(pme);
- if (imbal >= 1.2 && pme->nodeid_major == 0 && pme->nodeid_minor == 0)
+
+ for(x=0; x<recv_nindex; x++)
{
- fprintf(stderr,
- "\n"
- "NOTE: The load imbalance in PME FFT and solve is %d%%.\n"
- " For optimal PME load balancing\n"
- " PME grid_x (%d) and grid_y (%d) should be divisible by #PME_nodes_x (%d)\n"
- " and PME grid_y (%d) and grid_z (%d) should be divisible by #PME_nodes_y (%d)\n"
- "\n",
- (int)((imbal-1)*100 + 0.5),
- pme->nkx,pme->nky,pme->nnodes_major,
- pme->nky,pme->nkz,pme->nnodes_minor);
+ for(y=0; y<local_fft_ndata[YY]; y++)
+ {
+ indg = (x*local_fft_size[YY] + y)*local_fft_size[ZZ];
+ indb = (x*local_fft_ndata[YY] + y)*local_fft_ndata[ZZ];
+ for(z=0; z<local_fft_ndata[ZZ]; z++)
+ {
+ fftgrid[indg+z] += recvptr[indb+z];
+ }
+ }
}
}
+}
- init_overlap_comm(&pme->overlap[0],pme->pme_order,
-#ifdef GMX_MPI
- pme->mpi_comm_d[0],
+
+static void spread_on_grid(gmx_pme_t pme,
+ pme_atomcomm_t *atc,pmegrids_t *grids,
+ gmx_bool bCalcSplines,gmx_bool bSpread,
+ real *fftgrid)
+{
+ int nthread,thread;
+#ifdef PME_TIME_THREADS
+ gmx_cycles_t c1,c2,c3,ct1a,ct1b,ct1c;
+ static double cs1=0,cs2=0,cs3=0;
+ static double cs1a[6]={0,0,0,0,0,0};
+ static int cnt=0;
#endif
- pme->nnodes_major,pme->nodeid_major,pme->nkx);
-
- init_overlap_comm(&pme->overlap[1],pme->pme_order,
-#ifdef GMX_MPI
- pme->mpi_comm_d[1],
+
+ nthread = pme->nthread;
+
+#ifdef PME_TIME_THREADS
+ c1 = omp_cyc_start();
#endif
- pme->nnodes_minor,pme->nodeid_minor,pme->nky);
-
- snew(pme->bsp_mod[XX],pme->nkx);
- snew(pme->bsp_mod[YY],pme->nky);
- snew(pme->bsp_mod[ZZ],pme->nkz);
-
- /* Allocate data for the interpolation grid, including overlap */
- pme->pmegrid_nx = pme->overlap[0].s2g1[pme->nodeid_major] -
- pme->overlap[0].s2g0[pme->nodeid_major];
- pme->pmegrid_ny = pme->overlap[1].s2g1[pme->nodeid_minor] -
- pme->overlap[1].s2g0[pme->nodeid_minor];
- pme->pmegrid_nz = pme->nkz + pme->pme_order - 1;
-
- pme->pmegrid_start_ix = pme->overlap[0].s2g0[pme->nodeid_major];
- pme->pmegrid_start_iy = pme->overlap[1].s2g0[pme->nodeid_minor];
- pme->pmegrid_start_iz = 0;
-
- make_gridindex5_to_localindex(pme->nkx,
- pme->pmegrid_start_ix,
- pme->pmegrid_nx - (pme->pme_order-1),
- &pme->nnx,&pme->fshx);
- make_gridindex5_to_localindex(pme->nky,
- pme->pmegrid_start_iy,
- pme->pmegrid_ny - (pme->pme_order-1),
- &pme->nny,&pme->fshy);
- make_gridindex5_to_localindex(pme->nkz,
- pme->pmegrid_start_iz,
- pme->pmegrid_nz - (pme->pme_order-1),
- &pme->nnz,&pme->fshz);
-
- snew(pme->pmegridA,pme->pmegrid_nx*pme->pmegrid_ny*pme->pmegrid_nz);
-
- /* For non-divisible grid we need pme_order iso pme_order-1 */
- /* x overlap is copied in place: take padding into account.
- * y is always copied through a buffer: we don't need padding in z,
- * but we do need the overlap in x because of the communication order.
- */
- bufsizex = pme->pme_order*pme->pmegrid_ny*pme->pmegrid_nz;
- bufsizey = pme->pme_order*pme->pmegrid_nx*pme->nkz;
- bufsize = (bufsizex>bufsizey) ? bufsizex : bufsizey;
-
- snew(pme->pmegrid_sendbuf,bufsize);
- snew(pme->pmegrid_recvbuf,bufsize);
-
- ndata[0] = pme->nkx;
- ndata[1] = pme->nky;
- ndata[2] = pme->nkz;
-
- /* This routine will allocate the grid data to fit the FFTs */
- gmx_parallel_3dfft_init(&pme->pfft_setupA,ndata,
- &pme->fftgridA,&pme->cfftgridA,
- pme->mpi_comm_d,
- pme->overlap[0].s2g0,pme->overlap[1].s2g0,
- bReproducible);
-
- if (bFreeEnergy)
- {
- snew(pme->pmegridB,pme->pmegrid_nx*pme->pmegrid_ny*pme->pmegrid_nz);
- gmx_parallel_3dfft_init(&pme->pfft_setupB,ndata,
- &pme->fftgridB,&pme->cfftgridB,
- pme->mpi_comm_d,
- pme->overlap[0].s2g0,pme->overlap[1].s2g0,
- bReproducible);
- } else
+ if (bCalcSplines)
{
- pme->pmegridB = NULL;
- pme->fftgridB = NULL;
- pme->cfftgridB = NULL;
+#pragma omp parallel for num_threads(nthread) schedule(static)
+ for(thread=0; thread<nthread; thread++)
+ {
+ int start,end;
+
+ start = atc->n* thread /nthread;
+ end = atc->n*(thread+1)/nthread;
+
+ /* Compute fftgrid index for all atoms,
+ * with help of some extra variables.
+ */
+ calc_interpolation_idx(pme,atc,start,end,thread);
+ }
}
-
- make_bspline_moduli(pme->bsp_mod,pme->nkx,pme->nky,pme->nkz,pme->pme_order);
-
- /* Use atc[0] for spreading */
- init_atomcomm(pme,&pme->atc[0],cr,nnodes_major > 1 ? 0 : 1,TRUE);
- if (pme->ndecompdim >= 2)
+#ifdef PME_TIME_THREADS
+ c1 = omp_cyc_end(c1);
+ cs1 += (double)c1;
+#endif
+
+#ifdef PME_TIME_THREADS
+ c2 = omp_cyc_start();
+#endif
+#pragma omp parallel for num_threads(nthread) schedule(static)
+ for(thread=0; thread<nthread; thread++)
{
- init_atomcomm(pme,&pme->atc[1],cr,1,FALSE);
+ splinedata_t *spline;
+ pmegrid_t *grid;
+
+ /* make local bsplines */
+        if (grids == NULL || grids->nthread == 1)
+ {
+ spline = &atc->spline[0];
+
+ spline->n = atc->n;
+
+ grid = &grids->grid;
+ }
+ else
+ {
+ spline = &atc->spline[thread];
+
+ make_thread_local_ind(atc,thread,spline);
+
+ grid = &grids->grid_th[thread];
+ }
+
+ if (bCalcSplines)
+ {
+ make_bsplines(spline->theta,spline->dtheta,pme->pme_order,
+ atc->fractx,spline->n,spline->ind,atc->q,pme->bFEP);
+ }
+
+ if (bSpread)
+ {
+ /* put local atoms on grid. */
+#ifdef PME_TIME_SPREAD
+ ct1a = omp_cyc_start();
+#endif
+ spread_q_bsplines_thread(grid,atc,spline,&pme->spline_work);
+
+ if (grids->nthread > 1)
+ {
+ copy_local_grid(pme,grids,thread,fftgrid);
+ }
+#ifdef PME_TIME_SPREAD
+ ct1a = omp_cyc_end(ct1a);
+ cs1a[thread] += (double)ct1a;
+#endif
+ }
}
-
- if (pme->nnodes == 1) {
- pme->atc[0].n = homenr;
- pme_realloc_atomcomm_things(&pme->atc[0]);
+#ifdef PME_TIME_THREADS
+ c2 = omp_cyc_end(c2);
+ cs2 += (double)c2;
+#endif
+
+    if (bSpread && grids->nthread > 1)
+ {
+#ifdef PME_TIME_THREADS
+ c3 = omp_cyc_start();
+#endif
+#pragma omp parallel for num_threads(grids->nthread) schedule(static)
+ for(thread=0; thread<grids->nthread; thread++)
+ {
+ reduce_threadgrid_overlap(pme,grids,thread,
+ fftgrid,
+ pme->overlap[0].sendbuf,
+ pme->overlap[1].sendbuf);
+#ifdef PRINT_PME_SENDBUF
+ print_sendbuf(pme,pme->overlap[0].sendbuf);
+#endif
+ }
+#ifdef PME_TIME_THREADS
+ c3 = omp_cyc_end(c3);
+ cs3 += (double)c3;
+#endif
+
+ if (pme->nnodes > 1)
+ {
+ /* Communicate the overlapping part of the fftgrid */
+ sum_fftgrid_dd(pme,fftgrid);
+ }
}
-
- /* Use fft5d, order after FFT is y major, z, x minor */
- pme->work_nalloc = pme->nkx;
- snew(pme->work_mhx,pme->work_nalloc);
- snew(pme->work_mhy,pme->work_nalloc);
- snew(pme->work_mhz,pme->work_nalloc);
- snew(pme->work_m2,pme->work_nalloc);
- snew(pme->work_denom,pme->work_nalloc);
- /* Allocate an aligned pointer for SSE operations, including 3 extra
- * elements at the end since SSE operates on 4 elements at a time.
- */
- snew(pme->work_tmp1_alloc,pme->work_nalloc+8);
- pme->work_tmp1 = (real *) (((size_t) pme->work_tmp1_alloc + 16) & (~((size_t) 15)));
- snew(pme->work_m2inv,pme->work_nalloc);
- *pmedata = pme;
-
- return 0;
+#ifdef PME_TIME_THREADS
+ cnt++;
+ if (cnt % 20 == 0)
+ {
+ printf("idx %.2f spread %.2f red %.2f",
+ cs1*1e-9,cs2*1e-9,cs3*1e-9);
+#ifdef PME_TIME_SPREAD
+ for(thread=0; thread<nthread; thread++)
+ printf(" %.2f",cs1a[thread]*1e-9);
+#endif
+ printf("\n");
+ }
+#endif
}
-static void spread_on_grid(gmx_pme_t pme,
- pme_atomcomm_t *atc,real *grid,
- gmx_bool bCalcSplines,gmx_bool bSpread)
-{
- if (bCalcSplines)
- {
-
- /* Compute fftgrid index for all atoms,
- * with help of some extra variables.
- */
- calc_interpolation_idx(pme,atc);
-
- /* make local bsplines */
- make_bsplines(atc->theta,atc->dtheta,pme->pme_order,
- atc->fractx,atc->n,atc->q,pme->bFEP);
- }
-
- if (bSpread)
+
+static void dump_grid(FILE *fp,
+ int sx,int sy,int sz,int nx,int ny,int nz,
+ int my,int mz,const real *g)
+{
+ int x,y,z;
+
+ for(x=0; x<nx; x++)
{
- /* put local atoms on grid. */
- spread_q_bsplines(pme,atc,grid);
+ for(y=0; y<ny; y++)
+ {
+ for(z=0; z<nz; z++)
+ {
+ fprintf(fp,"%2d %2d %2d %6.3f\n",
+ sx+x,sy+y,sz+z,g[(x*my + y)*mz + z]);
+ }
+ }
}
}
+static void dump_local_fftgrid(gmx_pme_t pme,const real *fftgrid)
+{
+ ivec local_fft_ndata,local_fft_offset,local_fft_size;
+
+ gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
+ local_fft_ndata,
+ local_fft_offset,
+ local_fft_size);
+
+ dump_grid(stderr,
+ pme->pmegrid_start_ix,
+ pme->pmegrid_start_iy,
+ pme->pmegrid_start_iz,
+ pme->pmegrid_nx-pme->pme_order+1,
+ pme->pmegrid_ny-pme->pme_order+1,
+ pme->pmegrid_nz-pme->pme_order+1,
+ local_fft_size[YY],
+ local_fft_size[ZZ],
+ fftgrid);
+}
+
+
void gmx_pme_calc_energy(gmx_pme_t pme,int n,rvec *x,real *q,real *V)
{
pme_atomcomm_t *atc;
- real *grid;
+ pmegrids_t *grid;
if (pme->nnodes > 1)
{
pme_realloc_atomcomm_things(atc);
atc->x = x;
atc->q = q;
-
+
/* We only use the A-charges grid */
- grid = pme->pmegridA;
+ grid = &pme->pmegridA;
- spread_on_grid(pme,atc,NULL,TRUE,FALSE);
+ spread_on_grid(pme,atc,NULL,TRUE,FALSE,pme->fftgridA);
- *V = gather_energy_bsplines(pme,grid,atc);
+ *V = gather_energy_bsplines(pme,grid->grid.grid,atc);
}
int count;
gmx_bool bEnerVir;
gmx_large_int_t step,step_rel;
-
-
+
+
pme_pp = gmx_pme_pp_init(cr);
-
+
init_nrnb(nrnb);
-
+
count = 0;
do /****** this is a quasi-loop over time steps! */
{
&pme->bFEP,&lambda,
&bEnerVir,
&step);
-
+
if (natoms == -1) {
/* We should stop: break out of the loop */
break;
}
-
+
step_rel = step - ir->init_step;
-
+
if (count == 0)
wallcycle_start(wcycle,ewcRUN);
-
+
wallcycle_start(wcycle,ewcPMEMESH);
-
+
dvdlambda = 0;
clear_mat(vir);
gmx_pme_do(pme,0,natoms,x_pp,f_pp,chargeA,chargeB,box,
cr,maxshift_x,maxshift_y,nrnb,wcycle,vir,ewaldcoeff,
&energy,lambda,&dvdlambda,
GMX_PME_DO_ALL_F | (bEnerVir ? GMX_PME_CALC_ENER_VIR : 0));
-
+
cycles = wallcycle_stop(wcycle,ewcPMEMESH);
-
+
gmx_pme_send_force_vir_ener(pme_pp,
f_pp,vir,energy,dvdlambda,
cycles);
-
+
count++;
if (step_rel == wcycle_get_reset_counters(wcycle))
reset_pmeonly_counters(cr,wcycle,nrnb,ir,step_rel);
wcycle_set_reset_counters(wcycle, 0);
}
-
+
} /***** end of quasi-loop, we stop with the break above */
while (TRUE);
-
+
return 0;
}
int start, int homenr,
rvec x[], rvec f[],
real *chargeA, real *chargeB,
- matrix box, t_commrec *cr,
+ matrix box, t_commrec *cr,
int maxshift_x, int maxshift_y,
t_nrnb *nrnb, gmx_wallcycle_t wcycle,
matrix vir, real ewaldcoeff,
- real *energy, real lambda,
+ real *energy, real lambda,
real *dvdlambda, int flags)
{
int q,d,i,j,ntot,npme;
int nx,ny,nz;
int n_d,local_ny;
- int loop_count;
pme_atomcomm_t *atc=NULL;
- real * grid=NULL;
+ pmegrids_t *pmegrid=NULL;
+ real *grid=NULL;
real *ptr;
rvec *x_d,*f_d;
- real *charge=NULL,*q_d,vol;
+ real *charge=NULL,*q_d;
real energy_AB[2];
matrix vir_AB[2];
- gmx_bool bClearF;
+ gmx_bool bClearF;
gmx_parallel_3dfft_t pfft_setup;
real * fftgrid;
t_complex * cfftgrid;
+ int thread;
if (pme->nnodes > 1) {
atc = &pme->atc[0];
/* This could be necessary for TPI */
pme->atc[0].n = homenr;
}
-
+
for(q=0; q<(pme->bFEP ? 2 : 1); q++) {
if (q == 0) {
- grid = pme->pmegridA;
+ pmegrid = &pme->pmegridA;
fftgrid = pme->fftgridA;
cfftgrid = pme->cfftgridA;
pfft_setup = pme->pfft_setupA;
charge = chargeA+start;
} else {
- grid = pme->pmegridB;
+ pmegrid = &pme->pmegridB;
fftgrid = pme->fftgridB;
cfftgrid = pme->cfftgridB;
pfft_setup = pme->pfft_setupB;
charge = chargeB+start;
}
+ grid = pmegrid->grid.grid;
/* Unpack structure */
if (debug) {
fprintf(debug,"PME: nnodes = %d, nodeid = %d\n",
gmx_fatal(FARGS,"No grid!");
}
where();
-
- m_inv_ur0(box,pme->recipbox);
+
+ m_inv_ur0(box,pme->recipbox);
if (pme->nnodes == 1) {
atc = &pme->atc[0];
srenew(atc->pd,atc->pd_nalloc);
}
atc->maxshift = (atc->dimind==0 ? maxshift_x : maxshift_y);
- pme_calc_pidx(n_d,pme->recipbox,x_d,atc);
+ pme_calc_pidx_wrapper(n_d,pme->recipbox,x_d,atc);
where();
-
+
GMX_BARRIER(cr->mpi_comm_mygroup);
/* Redistribute x (only once) and qA or qB */
if (DOMAINDECOMP(cr)) {
wallcycle_stop(wcycle,ewcPME_REDISTXF);
}
-
+
if (debug)
fprintf(debug,"Node= %6d, pme local particles=%6d\n",
cr->nodeid,atc->n);
/* Spread the charges on a grid */
GMX_MPE_LOG(ev_spread_on_grid_start);
-
+
/* Spread the charges on a grid */
- spread_on_grid(pme,&pme->atc[0],grid,q==0,TRUE);
+ spread_on_grid(pme,&pme->atc[0],pmegrid,q==0,TRUE,fftgrid);
GMX_MPE_LOG(ev_spread_on_grid_finish);
if (q == 0)
inc_nrnb(nrnb,eNR_SPREADQBSP,
pme->pme_order*pme->pme_order*pme->pme_order*atc->n);
- wrap_periodic_pmegrid(pme,grid);
+ if (pme->nthread == 1)
+ {
+ wrap_periodic_pmegrid(pme,grid);
- /* sum contributions to local grid from other nodes */
+ /* sum contributions to local grid from other nodes */
#ifdef GMX_MPI
- if (pme->nnodes > 1) {
- GMX_BARRIER(cr->mpi_comm_mygroup);
- gmx_sum_qgrid_dd(pme,grid,GMX_SUM_QGRID_FORWARD);
- where();
- }
+ if (pme->nnodes > 1)
+ {
+ GMX_BARRIER(cr->mpi_comm_mygroup);
+ gmx_sum_qgrid_dd(pme,grid,GMX_SUM_QGRID_FORWARD);
+ where();
+ }
#endif
- where();
- copy_pmegrid_to_fftgrid(pme,grid,fftgrid);
+ copy_pmegrid_to_fftgrid(pme,grid,fftgrid);
+ }
wallcycle_stop(wcycle,ewcPME_SPREADGATHER);
- }
-
- if (flags & GMX_PME_SOLVE)
- {
- /* do 3d-fft */
- GMX_BARRIER(cr->mpi_comm_mygroup);
- GMX_MPE_LOG(ev_gmxfft3d_start);
- wallcycle_start(wcycle,ewcPME_FFT);
- gmx_parallel_3dfft_execute(pfft_setup,GMX_FFT_REAL_TO_COMPLEX,fftgrid,cfftgrid);
- wallcycle_stop(wcycle,ewcPME_FFT);
- GMX_MPE_LOG(ev_gmxfft3d_finish);
- where();
-
- /* solve in k-space for our local cells */
- vol = det(box);
- GMX_BARRIER(cr->mpi_comm_mygroup);
- GMX_MPE_LOG(ev_solve_pme_start);
- wallcycle_start(wcycle,ewcPME_SOLVE);
- loop_count =
- solve_pme_yzx(pme,cfftgrid,ewaldcoeff,vol,
- flags & GMX_PME_CALC_ENER_VIR,
- &energy_AB[q],vir_AB[q]);
- wallcycle_stop(wcycle,ewcPME_SOLVE);
- where();
- GMX_MPE_LOG(ev_solve_pme_finish);
- inc_nrnb(nrnb,eNR_SOLVEPME,loop_count);
+
+        /* For debugging only, normally disabled:
+ dump_local_fftgrid(pme,fftgrid);
+ exit(0);
+ */
}
- if ((flags & GMX_PME_CALC_F) ||
- (flags & GMX_PME_CALC_POT))
+    /* Here we start a large thread-parallel region */
+#pragma omp parallel for num_threads(pme->nthread) schedule(static)
+ for(thread=0; thread<pme->nthread; thread++)
{
-
- /* do 3d-invfft */
- GMX_BARRIER(cr->mpi_comm_mygroup);
- GMX_MPE_LOG(ev_gmxfft3d_start);
- where();
- wallcycle_start(wcycle,ewcPME_FFT);
- gmx_parallel_3dfft_execute(pfft_setup,GMX_FFT_COMPLEX_TO_REAL,cfftgrid,fftgrid);
- wallcycle_stop(wcycle,ewcPME_FFT);
+ if (flags & GMX_PME_SOLVE)
+ {
+ int loop_count;
- where();
- GMX_MPE_LOG(ev_gmxfft3d_finish);
+ /* do 3d-fft */
+ if (thread == 0)
+ {
+ GMX_BARRIER(cr->mpi_comm_mygroup);
+ GMX_MPE_LOG(ev_gmxfft3d_start);
+ wallcycle_start(wcycle,ewcPME_FFT);
+ }
+ gmx_parallel_3dfft_execute(pfft_setup,GMX_FFT_REAL_TO_COMPLEX,
+ fftgrid,cfftgrid,thread,wcycle);
+ if (thread == 0)
+ {
+ wallcycle_stop(wcycle,ewcPME_FFT);
+ GMX_MPE_LOG(ev_gmxfft3d_finish);
+ }
+ where();
- if (pme->nodeid == 0)
- {
- ntot = pme->nkx*pme->nky*pme->nkz;
- npme = ntot*log((real)ntot)/log(2.0);
- inc_nrnb(nrnb,eNR_FFT,2*npme);
+ /* solve in k-space for our local cells */
+ if (thread == 0)
+ {
+ GMX_BARRIER(cr->mpi_comm_mygroup);
+ GMX_MPE_LOG(ev_solve_pme_start);
+ wallcycle_start(wcycle,ewcPME_SOLVE);
+ }
+ loop_count =
+ solve_pme_yzx(pme,cfftgrid,ewaldcoeff,
+ box[XX][XX]*box[YY][YY]*box[ZZ][ZZ],
+ flags & GMX_PME_CALC_ENER_VIR,
+ pme->nthread,thread);
+ if (thread == 0)
+ {
+ wallcycle_stop(wcycle,ewcPME_SOLVE);
+ where();
+ GMX_MPE_LOG(ev_solve_pme_finish);
+ inc_nrnb(nrnb,eNR_SOLVEPME,loop_count);
+ }
}
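            /* Note: for GROMACS's triangular box representation the product
             * box[XX][XX]*box[YY][YY]*box[ZZ][ZZ] above equals det(box), i.e.
             * the box volume the old code computed explicitly. The energy and
             * virial are no longer returned by solve_pme_yzx directly; each
             * thread accumulates its share, which get_pme_ener_vir reduces
             * after the parallel region.
             */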
- wallcycle_start(wcycle,ewcPME_SPREADGATHER);
+ if (flags & GMX_PME_CALC_F)
+ {
+ /* do 3d-invfft */
+ if (thread == 0)
+ {
+ GMX_BARRIER(cr->mpi_comm_mygroup);
+ GMX_MPE_LOG(ev_gmxfft3d_start);
+ where();
+ wallcycle_start(wcycle,ewcPME_FFT);
+ }
+ gmx_parallel_3dfft_execute(pfft_setup,GMX_FFT_COMPLEX_TO_REAL,
+ cfftgrid,fftgrid,thread,wcycle);
+ if (thread == 0)
+ {
+ wallcycle_stop(wcycle,ewcPME_FFT);
+
+ where();
+ GMX_MPE_LOG(ev_gmxfft3d_finish);
+
+ if (pme->nodeid == 0)
+ {
+ ntot = pme->nkx*pme->nky*pme->nkz;
+ npme = ntot*log((real)ntot)/log(2.0);
+ inc_nrnb(nrnb,eNR_FFT,2*npme);
+ }
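                /* The flop count charged here scales as N*log2(N) for a grid
                 * of N = nkx*nky*nkz points, counted twice to cover the
                 * forward and the inverse transform.
                 */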
+
+ wallcycle_start(wcycle,ewcPME_SPREADGATHER);
+ }
- copy_fftgrid_to_pmegrid(pme,fftgrid,grid);
+ copy_fftgrid_to_pmegrid(pme,fftgrid,grid,pme->nthread,thread);
+ }
+ }
+ /* End of thread parallel section.
+ * With MPI we have to synchronize here before gmx_sum_qgrid_dd.
+ */
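    /* Design note: the region above is an explicit loop over thread indices
     * rather than a bare parallel section, so the identical code path also
     * runs serially when pme->nthread == 1 or when OpenMP is disabled (the
     * pragma is then simply ignored by the compiler). A standalone sketch of
     * the pattern follows the end of this function.
     */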
+ if (flags & GMX_PME_CALC_F)
+ {
/* distribute local grid to all nodes */
#ifdef GMX_MPI
if (pme->nnodes > 1) {
where();
unwrap_periodic_pmegrid(pme,grid);
- }
- if (flags & GMX_PME_CALC_F)
- {
/* interpolate forces for our local atoms */
GMX_BARRIER(cr->mpi_comm_mygroup);
GMX_MPE_LOG(ev_gather_f_bsplines_start);
where();
-
+
/* If we are running without parallelization,
* atc->f is the actual force array, not a buffer,
* therefore we should not clear it.
*/
bClearF = (q == 0 && PAR(cr));
- gather_f_bsplines(pme,grid,bClearF,&pme->atc[0],
- pme->bFEP ? (q==0 ? 1.0-lambda : lambda) : 1.0);
+#pragma omp parallel for num_threads(pme->nthread) schedule(static)
+ for(thread=0; thread<pme->nthread; thread++)
+ {
+ gather_f_bsplines(pme,grid,bClearF,atc,
+ &atc->spline[thread],
+ pme->bFEP ? (q==0 ? 1.0-lambda : lambda) : 1.0);
+ }
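            /* Each thread gathers with its own spline table
             * (atc->spline[thread]); the work is apparently partitioned over
             * atoms, so the threads write to disjoint force entries and no
             * reduction step is needed here.
             */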
+
where();
-
+
GMX_MPE_LOG(ev_gather_f_bsplines_finish);
-
+
inc_nrnb(nrnb,eNR_GATHERFBSP,
pme->pme_order*pme->pme_order*pme->pme_order*pme->atc[0].n);
wallcycle_stop(wcycle,ewcPME_SPREADGATHER);
- }
+ }
+
+ if (flags & GMX_PME_CALC_ENER_VIR)
+ {
+ /* This should only be called on the master thread
+ * and after the threads have synchronized.
+ */
+ get_pme_ener_vir(pme,pme->nthread,&energy_AB[q],vir_AB[q]);
+ }
} /* of q-loop */
-
+
if ((flags & GMX_PME_CALC_F) && pme->nnodes > 1) {
wallcycle_start(wcycle,ewcPME_REDISTXF);
for(d=0; d<pme->ndecompdim; d++)
wallcycle_stop(wcycle,ewcPME_REDISTXF);
}
where();
-
+
if (!pme->bFEP) {
*energy = energy_AB[0];
m_add(vir,vir_AB[0],vir);
}
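    /* In the FEP branch (not shown in this hunk) the A- and B-state results
     * are presumably combined linearly in lambda, along the lines of
     * E = (1-lambda)*E_A + lambda*E_B, with E_B - E_A contributing to
     * dvdlambda.
     */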
if (debug)
+ {
fprintf(debug,"PME mesh energy: %g\n",*energy);
-
+ }
+
return 0;
}
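/* A minimal, compilable sketch of the threading pattern gmx_pme_do uses
 * above, assuming only standard OpenMP; the names below are illustrative and
 * not part of the GROMACS API. One loop iteration is one thread's share of
 * work, timing is restricted to thread 0, and the implicit barrier at the
 * end of the loop synchronizes the threads before any MPI communication.
 */
#include <stdio.h>

static void thread_parallel_stage(int nthread)
{
    int thread;
#pragma omp parallel for num_threads(nthread) schedule(static)
    for (thread = 0; thread < nthread; thread++)
    {
        if (thread == 0)
        {
            printf("thread 0 brackets this stage with the cycle counters\n");
        }
        /* ... each thread would transform/solve its own grid slice here ... */
    }
    /* implicit barrier: all threads are done before we return */
}

int main(void)
{
    thread_parallel_stage(4);  /* e.g. four threads per MPI rank */
    return 0;
}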