}
#endif
+ s=0;
+
/*lin: x,y,z*/
- if (plan->flags&FFT5D_DEBUG) print_localdata(lin, "%d %d: copy in lin\n", s, plan);
- for (s=0;s<2;s++) {
- if (times!=0)
- time=MPI_Wtime();
-
- if ((plan->flags&FFT5D_REALCOMPLEX) && !(plan->flags&FFT5D_BACKWARD) && s==0) {
- gmx_fft_many_1d_real(p1d[s],(plan->flags&FFT5D_BACKWARD)?GMX_FFT_COMPLEX_TO_REAL:GMX_FFT_REAL_TO_COMPLEX,lin,lout);
- } else {
- gmx_fft_many_1d( p1d[s],(plan->flags&FFT5D_BACKWARD)?GMX_FFT_BACKWARD:GMX_FFT_FORWARD, lin,lout);
+ if (plan->flags&FFT5D_DEBUG && thread == 0)
+ {
+ print_localdata(lin, "%d %d: copy in lin\n", s, plan);
}
- if (times!=0)
- time_fft+=MPI_Wtime()-time;
-
- if (plan->flags&FFT5D_DEBUG) print_localdata(lout, "%d %d: FFT %d\n", s, plan);
-
+
+ for (s=0;s<2;s++) { /*loop over first two FFT steps (corner rotations)*/
+
#ifdef GMX_MPI
- if (GMX_PARALLEL_ENV_INITIALIZED && cart[s] !=0 && P[s]>1 )
+ if (GMX_PARALLEL_ENV_INITIALIZED && cart[s]!=MPI_COMM_NULL && P[s]>1)
{
- if (times!=0)
- time=MPI_Wtime();
- /*prepare for AllToAll
- 1. (most outer) axes (x) is split into P[s] parts of size N[s]
- for sending*/
- splitaxes(lin,lout,N[s],M[s],K[s], pN[s],pM[s],pK[s],P[s],C[s],iNout[s],oNout[s]);
+ bParallelDim = 1;
+ }
+ else
+#endif
+ {
+ bParallelDim = 0;
+ }
- if (times!=0)
+ /* ---------- START FFT ------------ */
+#ifdef NOGMX
+ if (times!=0 && thread == 0)
+ {
+ time=MPI_Wtime();
+ }
+#endif
+
+ if (bParallelDim) {
+ fftout = lout;
+ }
+ else
+ {
+ if (s==0)
+ {
+ fftout = lout3;
+ } else
+ {
+ fftout = lout2;
+ }
+ }
+
+ tstart = (thread*pM[s]*pK[s]/plan->nthreads)*C[s];
+ if ((plan->flags&FFT5D_REALCOMPLEX) && !(plan->flags&FFT5D_BACKWARD) && s==0)
+ {
+ gmx_fft_many_1d_real(p1d[s][thread],(plan->flags&FFT5D_BACKWARD)?GMX_FFT_COMPLEX_TO_REAL:GMX_FFT_REAL_TO_COMPLEX,lin+tstart,fftout+tstart);
+ } else
+ {
+ gmx_fft_many_1d( p1d[s][thread],(plan->flags&FFT5D_BACKWARD)?GMX_FFT_BACKWARD:GMX_FFT_FORWARD, lin+tstart,fftout+tstart);
+
+ }
+
+#ifdef NOGMX
+ if (times != NULL && thread == 0)
+ {
+ time_fft+=MPI_Wtime()-time;
+ }
+#endif
+ if (plan->flags&FFT5D_DEBUG && thread == 0)
+ {
+ print_localdata(lout, "%d %d: FFT %d\n", s, plan);
+ }
+ /* ---------- END FFT ------------ */
+
+ /* ---------- START SPLIT + TRANSPOSE------------ (if parallel in in this dimension)*/
+ if (bParallelDim) {
+#ifdef NOGMX
+ if (times != NULL && thread == 0)
{
- time_local+=MPI_Wtime()-time;
-
- /*send, recv*/
time=MPI_Wtime();
}
+#endif
+ /*prepare for A
+llToAll
+ 1. (most outer) axes (x) is split into P[s] parts of size N[s]
+ for sending*/
+ if (pM[s]>0)
+ {
+ tend = ((thread+1)*pM[s]*pK[s]/plan->nthreads);
+ tstart/=C[s];
+ splitaxes(lout2,lout,N[s],M[s],K[s], pN[s],pM[s],pK[s],P[s],C[s],iNout[s],oNout[s],tstart%pM[s],tstart/pM[s],tend%pM[s],tend/pM[s]);
+ }
+#pragma omp barrier /*barrier required before AllToAll (all input has to be their) - before timing to make timing more acurate*/
+#ifdef NOGMX
+ if (times != NULL && thread == 0)
+ {
+ time_local+=MPI_Wtime()-time;
+ }
+#endif
+ /* ---------- END SPLIT , START TRANSPOSE------------ */
+
+ if (thread == 0)
+ {
+#ifdef NOGMX
+ if (times!=0)
+ {
+ time=MPI_Wtime();
+ }
+#else
+ wallcycle_start(times,ewcPME_FFTCOMM);
+#endif
#ifdef FFT5D_MPI_TRANSPOSE
- FFTW(execute)(mpip[s]);
+ FFTW(execute)(mpip[s]);
#else
- if ((s==0 && !(plan->flags&FFT5D_ORDER_YZ)) || (s==1 && (plan->flags&FFT5D_ORDER_YZ)))
- MPI_Alltoall(lin,N[s]*pM[s]*K[s]*sizeof(t_complex)/sizeof(real),GMX_MPI_REAL,lout,N[s]*pM[s]*K[s]*sizeof(t_complex)/sizeof(real),GMX_MPI_REAL,cart[s]);
- else
- MPI_Alltoall(lin,N[s]*M[s]*pK[s]*sizeof(t_complex)/sizeof(real),GMX_MPI_REAL,lout,N[s]*M[s]*pK[s]*sizeof(t_complex)/sizeof(real),GMX_MPI_REAL,cart[s]);
-#endif /*FFT5D_MPI_TRANSPOSE*/
- if (times!=0)
- time_mpi[s]=MPI_Wtime()-time;
- }
+#ifdef GMX_MPI
+ if ((s==0 && !(plan->flags&FFT5D_ORDER_YZ)) || (s==1 && (plan->flags&FFT5D_ORDER_YZ)))
+ MPI_Alltoall(lout2,N[s]*pM[s]*K[s]*sizeof(t_complex)/sizeof(real),GMX_MPI_REAL,lout3,N[s]*pM[s]*K[s]*sizeof(t_complex)/sizeof(real),GMX_MPI_REAL,cart[s]);
+ else
+ MPI_Alltoall(lout2,N[s]*M[s]*pK[s]*sizeof(t_complex)/sizeof(real),GMX_MPI_REAL,lout3,N[s]*M[s]*pK[s]*sizeof(t_complex)/sizeof(real),GMX_MPI_REAL,cart[s]);
+#else
+ gmx_incons("fft5d MPI call without MPI configuration");
#endif /*GMX_MPI*/
+#endif /*FFT5D_MPI_TRANSPOSE*/
+#ifdef NOGMX
+ if (times!=0)
+ {
+ time_mpi[s]=MPI_Wtime()-time;
+ }
+#else
+ wallcycle_stop(times,ewcPME_FFTCOMM);
+#endif
+ } /*master*/
+ } /* bPrallelDim */
+#pragma omp barrier /*both needed for parallel and non-parallel dimension (either have to wait on data from AlltoAll or from last FFT*/
-
- if (times!=0)
+ /* ---------- END SPLIT + TRANSPOSE------------ */
+
+ /* ---------- START JOIN ------------ */
+#ifdef NOGMX
+ if (times != NULL && thread == 0)
+ {
time=MPI_Wtime();
+ }
+#endif
+
+ if (bParallelDim) {
+ joinin = lout3;
+ } else {
+ joinin = fftout;
+ }
/*bring back in matrix form
thus make new 1. axes contiguos
- also local transpose 1 and 2/3 */
- if ((s==0 && !(plan->flags&FFT5D_ORDER_YZ)) || (s==1 && (plan->flags&FFT5D_ORDER_YZ)))
- joinAxesTrans13(lin,lout,N[s],pM[s],K[s],pN[s],pM[s],pK[s],P[s],C[s+1],iNin[s+1],oNin[s+1]);
- else
- joinAxesTrans12(lin,lout,N[s],M[s],pK[s],pN[s],pM[s],pK[s],P[s],C[s+1],iNin[s+1],oNin[s+1]);
- if (times!=0)
+ also local transpose 1 and 2/3
+ runs on thread used for following FFT (thus needing a barrier before but not afterwards)
+ */
+ if ((s==0 && !(plan->flags&FFT5D_ORDER_YZ)) || (s==1 && (plan->flags&FFT5D_ORDER_YZ))) {
+ if (pM[s]>0)
+ {
+ tstart = ( thread *pM[s]*pN[s]/plan->nthreads);
+ tend = ((thread+1)*pM[s]*pN[s]/plan->nthreads);
+ joinAxesTrans13(lin,joinin,N[s],pM[s],K[s],pN[s],pM[s],pK[s],P[s],C[s+1],iNin[s+1],oNin[s+1],tstart%pM[s],tstart/pM[s],tend%pM[s],tend/pM[s]);
+ }
+ }
+ else {
+ if (pN[s]>0)
+ {
+ tstart = ( thread *pK[s]*pN[s]/plan->nthreads);
+ tend = ((thread+1)*pK[s]*pN[s]/plan->nthreads);
+ joinAxesTrans12(lin,joinin,N[s],M[s],pK[s],pN[s],pM[s],pK[s],P[s],C[s+1],iNin[s+1],oNin[s+1],tstart%pN[s],tstart/pN[s],tend%pN[s],tend/pN[s]);
+ }
+ }
+
+#ifdef NOGMX
+ if (times != NULL && thread == 0)
+ {
time_local+=MPI_Wtime()-time;
-
- if (plan->flags&FFT5D_DEBUG) print_localdata(lin, "%d %d: tranposed %d\n", s+1, plan);
-
+ }
+#endif
+ if (plan->flags&FFT5D_DEBUG && thread == 0)
+ {
+ print_localdata(lin, "%d %d: tranposed %d\n", s+1, plan);
+ }
+ /* ---------- END JOIN ------------ */
+
/*if (debug) print_localdata(lin, "%d %d: transposed x-z\n", N1, M0, K, ZYX, coor);*/
- }
-
- if (times!=0)
- time=MPI_Wtime();
- if (plan->flags&FFT5D_INPLACE) lout=lin;
+ } /* for(s=0;s<2;s++) */
+#ifdef NOGMX
+ if (times != NULL && thread == 0)
+ {
+ time=MPI_Wtime();
+ }
+#endif
+
+ if (plan->flags&FFT5D_INPLACE) lout=lin; /*in place currently not supported*/
+
+ /* ----------- FFT ----------- */
+ tstart = (thread*pM[s]*pK[s]/plan->nthreads)*C[s];
if ((plan->flags&FFT5D_REALCOMPLEX) && (plan->flags&FFT5D_BACKWARD)) {
- gmx_fft_many_1d_real(p1d[s],(plan->flags&FFT5D_BACKWARD)?GMX_FFT_COMPLEX_TO_REAL:GMX_FFT_REAL_TO_COMPLEX,lin,lout);
+ gmx_fft_many_1d_real(p1d[s][thread],(plan->flags&FFT5D_BACKWARD)?GMX_FFT_COMPLEX_TO_REAL:GMX_FFT_REAL_TO_COMPLEX,lin+tstart,lout+tstart);
} else {
- gmx_fft_many_1d( p1d[s],(plan->flags&FFT5D_BACKWARD)?GMX_FFT_BACKWARD:GMX_FFT_FORWARD, lin,lout);
+ gmx_fft_many_1d( p1d[s][thread],(plan->flags&FFT5D_BACKWARD)?GMX_FFT_BACKWARD:GMX_FFT_FORWARD, lin+tstart,lout+tstart);
}
+ /* ------------ END FFT ---------*/
- if (times!=0)
+#ifdef NOGMX
+ if (times != NULL && thread == 0)
+ {
time_fft+=MPI_Wtime()-time;
- if (plan->flags&FFT5D_DEBUG) print_localdata(lout, "%d %d: FFT %d\n", s, plan);
- /*if (debug) print_localdata(lout, "%d %d: FFT in y\n", N1, M, K0, YZX, coor);*/
-
- if (times!=0) {
+
times->fft+=time_fft;
times->local+=time_local;
times->mpi2+=time_mpi[1];