pN, pM, pK is local size specific to current processor (only different to max if not divisible)
NG, MG, KG is size of global data*/
static void splitaxes(t_complex* lout, const t_complex* lin,
- int maxN, int maxM, int maxK, int pN, int pM, int pK,
+ int maxN, int maxM, int maxK, int pM,
int P, int NG, int *N, int* oN, int starty, int startz, int endy, int endz)
{
int x, y, z, i;
N,M,K local dimensions
KG global size*/
static void joinAxesTrans13(t_complex* lout, const t_complex* lin,
- int maxN, int maxM, int maxK, int pN, int pM, int pK,
+ int maxN, int maxM, int maxK, int pM,
int P, int KG, int* K, int* oK, int starty, int startx, int endy, int endx)
{
int i, x, y, z;
the minor, middle, major order is only correct for x,y,z (N,M,K) for the input
N,M,K local size
MG, global size*/
-static void joinAxesTrans12(t_complex* lout, const t_complex* lin, int maxN, int maxM, int maxK, int pN, int pM, int pK,
+static void joinAxesTrans12(t_complex* lout, const t_complex* lin, int maxN, int maxM, int maxK, int pN,
int P, int MG, int* M, int* oM, int startx, int startz, int endx, int endz)
{
int i, z, y, x;
{
tend = ((thread+1)*pM[s]*pK[s]/plan->nthreads);
tstart /= C[s];
- splitaxes(lout2, lout, N[s], M[s], K[s], pN[s], pM[s], pK[s], P[s], C[s], iNout[s], oNout[s], tstart%pM[s], tstart/pM[s], tend%pM[s], tend/pM[s]);
+ splitaxes(lout2, lout, N[s], M[s], K[s], pM[s], P[s], C[s], iNout[s], oNout[s], tstart%pM[s], tstart/pM[s], tend%pM[s], tend/pM[s]);
}
#pragma omp barrier /*barrier required before AllToAll (all input has to be there) - before timing to make timing more accurate*/
#ifdef NOGMX
{
tstart = ( thread *pM[s]*pN[s]/plan->nthreads);
tend = ((thread+1)*pM[s]*pN[s]/plan->nthreads);
- joinAxesTrans13(lin, joinin, N[s], pM[s], K[s], pN[s], pM[s], pK[s], P[s], C[s+1], iNin[s+1], oNin[s+1], tstart%pM[s], tstart/pM[s], tend%pM[s], tend/pM[s]);
+ joinAxesTrans13(lin, joinin, N[s], pM[s], K[s], pM[s], P[s], C[s+1], iNin[s+1], oNin[s+1], tstart%pM[s], tstart/pM[s], tend%pM[s], tend/pM[s]);
}
}
else
{
tstart = ( thread *pK[s]*pN[s]/plan->nthreads);
tend = ((thread+1)*pK[s]*pN[s]/plan->nthreads);
- joinAxesTrans12(lin, joinin, N[s], M[s], pK[s], pN[s], pM[s], pK[s], P[s], C[s+1], iNin[s+1], oNin[s+1], tstart%pN[s], tstart/pN[s], tend%pN[s], tend/pN[s]);
+ joinAxesTrans12(lin, joinin, N[s], M[s], pK[s], pN[s], P[s], C[s+1], iNin[s+1], oNin[s+1], tstart%pN[s], tstart/pN[s], tend%pN[s], tend/pN[s]);
}
}
real ** real_data,
t_complex ** complex_data,
MPI_Comm comm[2],
- int * slab2index_major,
- int * slab2index_minor,
gmx_bool bReproducible,
int nthreads)
{
int
gmx_parallel_3dfft_execute(gmx_parallel_3dfft_t pfft_setup,
enum gmx_fft_direction dir,
- void * in_data,
- void * out_data,
int thread,
gmx_wallcycle_t wcycle)
{
* \param comm MPI communicator for both parallelization axis.
* Needs to be either initialized or MPI_NULL for
* no parallelization in that axis.
- * \param slab2index_major Not used
- * \param slab2index_minor Not used
* \param bReproducible Try to avoid FFT timing optimizations and other stuff
* that could make results differ for two runs with
* identical input (reproducibility for debugging).
real **real_data,
t_complex **complex_data,
MPI_Comm comm[2],
- int * slab2index_major,
- int * slab2index_minor,
gmx_bool bReproducible,
int nthreads);
int
gmx_parallel_3dfft_execute(gmx_parallel_3dfft_t pfft_setup,
enum gmx_fft_direction dir,
- void * in_data,
- void * out_data,
int thread,
gmx_wallcycle_t wcycle);
ivec local_ndata, offset, rsize, csize, complex_order;
gmx_parallel_3dfft_init(&fft_, ndata, &rdata, &cdata,
- comm, NULL, NULL, TRUE, 1);
+ comm, TRUE, 1);
gmx_parallel_3dfft_real_limits(fft_, local_ndata, offset, rsize);
gmx_parallel_3dfft_complex_limits(fft_, complex_order,
int size = csize[0]*csize[1]*csize[2];
memcpy(rdata, inputdata, size*sizeof(t_complex));
- gmx_parallel_3dfft_execute(fft_, GMX_FFT_REAL_TO_COMPLEX, rdata, cdata,
- 0, NULL);
+ gmx_parallel_3dfft_execute(fft_, GMX_FFT_REAL_TO_COMPLEX, 0, NULL);
//TODO use std::complex and add checkComplex for it
checker_.checkSequenceArray(size*2,
reinterpret_cast<real*>(cdata), "forward");
memcpy(cdata, inputdata, size*sizeof(t_complex));
- gmx_parallel_3dfft_execute(fft_, GMX_FFT_COMPLEX_TO_REAL, rdata, cdata,
- 0, NULL);
+ gmx_parallel_3dfft_execute(fft_, GMX_FFT_COMPLEX_TO_REAL, 0, NULL);
for (int i = 0; i < ndata[0]*ndata[1]; i++) //check sequence but skip unused data
{
checker_.checkSequenceArray(ndata[2], rdata+i*rsize[2],
static void
F77_FUNC(dgetv0, DGETV0) (int * ido,
const char * bmat,
- int * itry,
+ int gmx_unused * itry,
int * initv,
int * n,
int * j,
double * tol,
double * resid,
int * mode,
- int * iupd,
+ int gmx_unused * iupd,
int * ishift,
int * mxiter,
double * v,
static void
F77_FUNC(sgetv0, SGETV0) (int * ido,
const char * bmat,
- int * itry,
+ int gmx_unused * itry,
int * initv,
int * n,
int * j,
float * tol,
float * resid,
int * mode,
- int * iupd,
+ int gmx_unused * iupd,
int * ishift,
int * mxiter,
float * v,
#include <ctype.h>
#include "../gmx_lapack.h"
+#include <types/simple.h>
void
F77_FUNC(dorml2,DORML2)(const char *side,
double *c,
int *ldc,
double *work,
- int *info)
+ int gmx_unused *info)
{
const char xside=toupper(*side);
const char xtrans=toupper(*trans);
double * d,
double * e,
double * tau,
- int * info)
+ int gmx_unused * info)
{
double minusone,zero;
double taui,alpha,tmp;
#include <ctype.h>
+#include <types/simple.h>
#include "../gmx_lapack.h"
void
float *c,
int *ldc,
float *work,
- int *info)
+ int gmx_unused *info)
{
const char xside=toupper(*side);
const char xtrans=toupper(*trans);
float * d,
float * e,
float * tau,
- int * info)
+ int gmx_unused * info)
{
float minusone,zero;
float taui,alpha,tmp;
return ptr;
}
-void free_matrix(double **a, int n)
+void free_matrix(double **a)
{
int i;
id = alloc_matrix(n, n);
matrix_multiply(fp, n, n, test, a, id);
dump_matrix(fp, "And here is the product of A and Ainv", n, id);
- free_matrix(id, n);
- free_matrix(test, n);
+ free_matrix(id);
+ free_matrix(test);
}
#endif
sfree(ipiv);
}
sfree(atx);
- free_matrix(a, nrow);
- free_matrix(at, ncol);
- free_matrix(ata, ncol);
+ free_matrix(a);
+ free_matrix(at);
+ free_matrix(ata);
return chi2;
}
double **alloc_matrix(int n, int m);
-void free_matrix(double **a, int n);
+void free_matrix(double **a);
void matrix_multiply(FILE *fp, int n, int m, double **x, double **y, double **z);
pme->nthread = nthread;
- /* Check if any of the PME MPI ranks uses threads */
+ /* Check if any of the PME MPI ranks uses threads */
use_threads = (pme->nthread > 1 ? 1 : 0);
#ifdef GMX_MPI
if (pme->nnodes > 1)
gmx_parallel_3dfft_init(&pme->pfft_setupA, ndata,
&pme->fftgridA, &pme->cfftgridA,
pme->mpi_comm_d,
- pme->overlap[0].s2g0, pme->overlap[1].s2g0,
bReproducible, pme->nthread);
if (bFreeEnergy)
gmx_parallel_3dfft_init(&pme->pfft_setupB, ndata,
&pme->fftgridB, &pme->cfftgridB,
pme->mpi_comm_d,
- pme->overlap[0].s2g0, pme->overlap[1].s2g0,
bReproducible, pme->nthread);
}
else
wallcycle_start(wcycle, ewcPME_FFT);
}
gmx_parallel_3dfft_execute(pfft_setup, GMX_FFT_REAL_TO_COMPLEX,
- fftgrid, cfftgrid, thread, wcycle);
+ thread, wcycle);
if (thread == 0)
{
wallcycle_stop(wcycle, ewcPME_FFT);
wallcycle_start(wcycle, ewcPME_FFT);
}
gmx_parallel_3dfft_execute(pfft_setup, GMX_FFT_COMPLEX_TO_REAL,
- cfftgrid, fftgrid, thread, wcycle);
+ thread, wcycle);
if (thread == 0)
{
wallcycle_stop(wcycle, ewcPME_FFT);