1 /* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
4 * Gromacs 4.0 Copyright (c) 1991-2003
5 * David van der Spoel, Erik Lindahl, University of Groningen.
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version 2
10 * of the License, or (at your option) any later version.
12 * To help us fund GROMACS development, we humbly ask that you cite
13 * the research papers on the package. Check out http://www.gromacs.org
16 * Grim Ragnarok Overthrows Midgard Amidst Cursing Silence
34 #include "gmx_fatal.h"
36 /* Since c has no support of templates, we get around the type restrictions
37 with macros. Our custom macro names obscure the float vs double interfaces
40 #define ACML_FFT1DX zfft1dx
41 #define ACML_FFT2DX zfft2dx
42 #define ACML_FFT3DX zfft3dy
43 #define ACML_FFT1DMX zfft1mx
45 #define ACML_RCFFT1D dzfft
46 #define ACML_CRFFT1D zdfft
47 #define ACML_RCFFT1DM dzfftm
48 #define ACML_CRFFT1DM zdfftm
50 #define acmlComplex doublecomplex
53 #define ACML_FFT1DX cfft1dx
54 #define ACML_FFT2DX cfft2dx
55 #define ACML_FFT3DX cfft3dy
56 #define ACML_FFT1DMX cfft1mx
58 #define ACML_RCFFT1D scfft
59 #define ACML_CRFFT1D csfft
60 #define ACML_RCFFT1DM scfftm
61 #define ACML_CRFFT1DM csfftm
63 #define acmlComplex complex
66 void cPrintArray( t_complex* arr, int outer, int mid, int inner )
71 for( i = 0; i < outer; ++i )
73 for( j = 0; j < mid; ++j )
75 for( k = 0; k < inner; ++k )
77 printf("%f,%f ", arr[i*inner*mid+j*inner+k].re, arr[i*inner*mid+j*inner+k].im );
87 void rPrintArray( real* arr, int outer, int mid, int inner )
92 for( i = 0; i < outer; ++i )
94 for( j = 0; j < mid; ++j )
96 for( k = 0; k < inner; ++k )
98 printf("%f ", arr[i*inner*mid+j*inner+k] );
108 /* Contents of the AMD ACML FFT fft datatype.
110 * Note that this is one of several possible implementations of gmx_fft_t.
112 * The ACML _API_ supports 1D,2D, and 3D transforms, including real-to-complex.
113 * Unfortunately the actual library implementation does not support 2D&3D real
114 * transforms as of version 4.2 In addition, ACML outputs their hermitian data
115 * in such a form that it can't be fed straight back into their complex API's.
117 * To work around this we roll our own 2D and 3D real-to-complex transforms,
118 * using separate X/Y/Z handles defined to perform (ny*nz), (nx*nz), and
119 * (nx*ny) transforms at once when necessary. To perform strided multiple
120 * transforms out-of-place (i.e., without padding in the last dimension)
121 * on the fly we also need to separate the forward and backward
122 * handles for real-to-complex/complex-to-real data permutation.
124 * So, the handles are enumerated as follows:
126 * 1D FFT (real too): Index 0 is the handle for the entire FFT
127 * 2D complex FFT: Index 0 is the handle for the entire FFT
128 * 3D complex FFT: Index 0 is the handle for the entire FFT
129 * 2D, real FFT: 0=FFTx, 1=FFTrc, 2=FFTcr handle
130 * 3D, real FFT: 0=FFTx, 1=FFTy, 2=FFTrc, 3=FFTcr handle
132 * AMD people reading this: Learn from FFTW what a good interface looks like :-)
139 * r2c = index0, c2r = index1
144 int ndim; /**< Number of dimensions in FFT */
145 int nx; /**< Length of X transform */
146 int ny; /**< Length of Y transform */
147 int nz; /**< Length of Z transform */
148 int real_fft; /**< 1 if real FFT, otherwise 0 */
151 /* ACML's real FFT support leaves the data in a swizzled 'hermitian' format. The
152 problem with this is that you can't just feed this data back into ACML :( A
153 pre-processing step is required to transform the hermitian swizzled complex
154 values into unswizzled complex values
155 -This function assumes that the complex conjugates are not expanded; the
156 calling function should not assume that they exist.
157 -This function pads all real numbers with 0's for imaginary components.
159 void hermitianUnPacking( float scale, int row, int col, void* src, int srcPitch, void* dst )
162 int loopCount = (col-1)/2;
163 int hermLength = mid + 1;
167 /* Currently this function is not written to handle aliased input/output pointers */
168 assert( src != dst );
170 /* These two pointers could be aliased on top of each other; be careful */
171 real* realData = (real*)src;
172 t_complex* hermData = (t_complex*)dst;
174 /* We have to expand the data from real to complex, which will clobber data if you start
175 * from the beginning. For this reason, we start at the last array and work backwards,
176 * however, we skip the very first row (nFFT == 0) because we still have data collision
177 * on that row. We copy the first row into a temporary buffer.
179 for( nFFT = row-1; nFFT >= 0; --nFFT )
181 realData = (real*)src + (nFFT*srcPitch);
182 hermData = (t_complex*)dst + (nFFT*hermLength);
184 /* The first complex number is real valued */
185 t_complex tmp = { realData[0]*scale, 0 };
189 for(i = 1; i <= loopCount; ++i)
191 t_complex tmp = { realData[i]*scale, realData[col-i]*scale };
195 /* A little cleanup for even lengths */
196 if( loopCount != mid )
198 /* The n/2 number is real valued */
199 t_complex tmp = { realData[mid]*scale, 0 };
207 /* ACML's real FFT support requires the data in a swizzled 'hermitian' format.
208 This is a non-standard format, and requires us to manually swizzle the data :(
209 A pre-processing step is required to transform the complex values into the
210 swizzled hermitian format
212 void hermitianPacking( float scale, int row, int col, void* src, void* dst, int dstPitch )
215 int loopCount = (col-1)/2;
216 int hermLength = mid + 1;
223 /* Currently this function is not written to handle aliased input/output pointers */
224 assert( src != dst );
226 for( nFFT = 0; nFFT < row; ++nFFT )
228 realData = (real*)dst + (nFFT*dstPitch);
229 hermData = (t_complex*)src + (nFFT*hermLength);
231 /* The first complex number is real valued */
232 realData[0] = hermData[0].re * scale;
234 /* ACML's complex to real function is documented as only handling 'forward'
235 * transforms, which mean we have to manually conjugate the data before doing
236 * the backwards transform. */
238 for(i = 1; i <= loopCount; ++i)
240 realData[i] = hermData[i].re * scale;
241 realData[col-i] = -hermData[i].im * scale;
244 /* A little cleanup for even lengths */
245 if( loopCount != mid )
247 /* The n/2 number is real valued */
248 realData[mid] = hermData[mid].re * scale;
256 /* Gromacs expects results that are returned to be in a packed form; No
257 complex conjugate's. This routine takes a 2D array of complex data,
258 and compresses it to fit into 'real' form by removing all the complex
259 conjugates and any known imaginary 0's. Data is expected in row-major
260 form, with ny describing the length of a row.
262 void compressConj2D( int nX, int nY, void* src, void* dst )
264 /* These two pointers are aliased on top of each other; be careful */
265 real* realData = (real*)dst;
266 t_complex* complexData;
268 int halfRows = nX/2 + 1;
269 int halfCols = nY/2 + 1;
270 int rowOdd = nX & 0x1;
271 int colOdd = nY & 0x1;
275 for( nRow = 0; nRow < nX; ++nRow )
277 complexData = (t_complex*)src + (nRow*halfCols);
280 for( nCol = 0; nCol < halfCols; ++nCol )
282 if(( nRow == 0) && (nCol == 0 ))
284 /* The complex number is real valued */
285 realData[0] = complexData[nCol].re;
288 /* Last column is real if even column length */
289 else if( (nRow == 0) && ( !colOdd && ( nCol == (halfCols-1) ) ) )
291 /* The complex number is real valued */
292 realData[0] = complexData[nCol].re;
295 /* The middle value of the first column is real if we have an even number of rows and
296 * the last column of middle row is real if we have an even number of rows and columns */
297 else if( !rowOdd && ( ( nRow == (halfRows-1) ) && ( ( nCol == 0 ) || ( !colOdd && ( nCol == (halfCols-1) ) ) ) ) )
299 /* The complex number is real valued */
300 realData[0] = complexData[nCol].re;
303 else if( (nCol == 0) || ( !colOdd && ( nCol == (halfCols-1) ) ) )
305 /* Last half of real columns are conjugates */
306 if( nRow < halfRows )
308 /* Copy a complex number, which is two reals */
309 realData[0] = complexData[nCol].re;
310 realData[1] = complexData[nCol].im;
316 /* Copy a complex number, which is two reals */
317 realData[0] = complexData[nCol].re;
318 realData[1] = complexData[nCol].im;
327 /* Gromacs expects results that are returned to be in a packed form; No
328 complex conjugate's. This routine takes a 2D array of real data,
329 and uncompresses it to fit into 'complex' form by creating all the complex
330 conjugates and any known imaginary 0's. Data is expected in row-major
331 form, with ny describing the length of a row.
333 void unCompressConj2D( int nX, int nY, void* src, void* dst )
335 /* These two pointers are aliased on top of each other; be careful */
336 real* realData = (real*)src;
337 t_complex* complexData = (t_complex*)dst;
339 int halfRows = nX/2 + 1;
340 int halfCols = nY/2 + 1;
341 int rowOdd = nX & 0x1;
342 int colOdd = nY & 0x1;
345 for( nRow = 0; nRow < nX; ++nRow )
348 for( nCol = 0; nCol < halfCols; ++nCol )
350 int iIndex = nRow*halfCols + nCol;
352 if(( nRow == 0) && (nCol == 0 ))
354 /* The complex number is real valued */
355 complexData[iIndex].re = realData[0];
356 complexData[iIndex].im = 0;
359 /* Last column is real if even column length */
360 else if( (nRow == 0) && ( !colOdd && ( nCol == (halfCols-1) ) ) )
362 /* The complex number is real valued */
363 complexData[iIndex].re = realData[0];
364 complexData[iIndex].im = 0;
367 /* The middle value of the first column is real if we have an even number of rows and
368 * the last column of middle row is real if we have an even number of rows and columns */
369 else if( !rowOdd && ( ( nRow == (halfRows-1) ) && ( ( nCol == 0 ) || ( !colOdd && ( nCol == (halfCols-1) ) ) ) ) )
371 /* The complex number is real valued */
372 complexData[iIndex].re = realData[0];
373 complexData[iIndex].im = 0;
376 else if( (nCol == 0) || (!colOdd && ( nCol == (halfCols-1) ) ) )
378 /* Last half of real columns are conjugates */
379 if( nRow < halfRows )
381 /* Copy a complex number, which is two reals */
382 complexData[iIndex].re = realData[0];
383 complexData[iIndex].im = realData[1];
388 int oIndex = (nX-nRow)*halfCols + nCol;
389 complexData[iIndex].re = complexData[oIndex].re;
390 complexData[iIndex].im = -complexData[oIndex].im;
395 /* Copy a complex number, which is two reals */
396 complexData[iIndex].re = realData[0];
397 complexData[iIndex].im = realData[1];
406 /* Support routine for the 1D array tranforms. ACML does not support a MODE
407 * flag on the real/complex interface. It assumes a 'forward' transform both
408 * directions, so that requires a manual conjugation of the imaginary comps. */
409 void negateConj( int len, void* src )
411 real* imag = (real*)src;
412 int half = len/2 + 1;
415 for( i = half; i < len; ++i)
424 gmx_fft_init_1d(gmx_fft_t * pfft,
426 enum gmx_fft_flag flags)
430 acmlComplex* comm = NULL;
436 gmx_fatal(FARGS,"Invalid opaque FFT datatype pointer.");
441 if( (fft = malloc(sizeof(struct gmx_fft))) == NULL)
446 // Single precision requires 5*nx+100
447 // Double precision requires 3*nx+100
448 if( sizeof( acmlComplex ) == 16 )
449 commSize = (3*nx+100)*sizeof( acmlComplex );
451 commSize = (5*nx+100)*sizeof( acmlComplex );
453 // Allocate communication work array
454 if( (comm = (acmlComplex*)malloc( commSize ) ) == NULL )
459 // Initialize communication work array
460 ACML_FFT1DX( 100, 1.0f, TRUE, nx, NULL, 1, NULL, 1, comm, &info );
464 gmx_fatal(FARGS,"Error initializing ACML FFT; status=%d", info);
465 gmx_fft_destroy( fft );
479 gmx_fft_init_1d_real(gmx_fft_t * pfft,
481 enum gmx_fft_flag flags)
491 gmx_fatal(FARGS,"Invalid opaque FFT datatype pointer.");
496 if( (fft = malloc(sizeof(struct gmx_fft))) == NULL)
502 commSize = (3*nx+100)*sizeof( float );
504 // Allocate communication work array, r2c
505 if( (commRC = (real*)malloc( commSize ) ) == NULL )
510 // Allocate communication work array, c2r
511 if( (commCR = (real*)malloc( commSize ) ) == NULL )
516 // Initialize communication work array
517 ACML_RCFFT1D( 100, nx, NULL, (real*)commRC, &info );
521 gmx_fatal(FARGS,"Error initializing ACML FFT; status=%d", info);
522 gmx_fft_destroy( fft );
526 // Initialize communication work array
527 ACML_CRFFT1D( 100, nx, NULL, (real*)commCR, &info );
531 gmx_fatal(FARGS,"Error initializing ACML FFT; status=%d", info);
532 gmx_fft_destroy( fft );
536 /* Allocate scratch work array that ACML uses to splat from hermitian complex format to
537 * full complex format */
538 if( (fft->realScratch = (acmlComplex*)malloc( nx*sizeof( acmlComplex ) ) ) == NULL )
546 fft->comm[0] = commRC;
547 fft->comm[1] = commCR;
554 gmx_fft_init_2d(gmx_fft_t * pfft,
557 enum gmx_fft_flag flags)
561 acmlComplex* comm = NULL;
567 gmx_fatal(FARGS,"Invalid opaque FFT datatype pointer.");
572 if( (fft = malloc(sizeof(struct gmx_fft))) == NULL)
577 // Single precision requires nx*ny+5*(nx+ny)
578 // Double precision requires nx*ny+3*(nx+ny)
579 if( sizeof( acmlComplex ) == 16 )
580 commSize = (nx*ny+3*(nx+ny)+200)*sizeof( acmlComplex );
582 commSize = (nx*ny+5*(nx+ny)+200)*sizeof( acmlComplex );
584 // Allocate communication work array
585 if( (comm = (acmlComplex*)malloc( commSize ) ) == NULL )
590 // Initialize communication work array
591 ACML_FFT2DX( 100, 1.0f, TRUE, TRUE, nx, ny, NULL, 1, nx, NULL, 1, nx, (acmlComplex*)comm, &info );
595 gmx_fatal(FARGS,"Error initializing ACML FFT; status=%d", info);
596 gmx_fft_destroy( fft );
611 gmx_fft_init_2d_real(gmx_fft_t * pfft,
614 enum gmx_fft_flag flags)
618 acmlComplex* comm = NULL;
626 gmx_fatal(FARGS,"Invalid opaque FFT datatype pointer.");
631 if( (fft = malloc(sizeof(struct gmx_fft))) == NULL)
638 /* Roll our own 2D real transform using multiple transforms in ACML,
639 * since the current ACML versions does not support our storage format,
640 * and all but the most recent don't even have 2D real FFTs.
643 // Single precision requires 5*nx+100
644 // Double precision requires 3*nx+100
645 if( sizeof( acmlComplex ) == 16 )
646 commSize = (3*nx+100)*sizeof( acmlComplex );
648 commSize = (5*nx+100)*sizeof( acmlComplex );
650 // Allocate communication work array
651 if( (comm = (acmlComplex*)malloc( commSize ) ) == NULL )
656 // Initialize communication work array
657 ACML_FFT1DMX( 100, 1.0f, FALSE, nyc, nx, NULL, nyc, 1, NULL, nyc, 1, comm, &info );
661 gmx_fatal(FARGS,"Error initializing ACML FFT; status=%d", info);
662 gmx_fft_destroy( fft );
666 commSize = (3*ny+100)*sizeof( real );
667 // Allocate communication work array
668 if( (commRC = (real*)malloc( commSize ) ) == NULL )
673 // TODO: Is there no MODE or PLAN for multiple hermetian sequences?
674 // Initialize communication work array
675 ACML_RCFFT1D( 100, ny, NULL, commRC, &info );
679 gmx_fatal(FARGS,"Error initializing ACML FFT; status=%d", info);
680 gmx_fft_destroy( fft );
684 commSize = (3*ny+100)*sizeof( real );
685 // Allocate communication work array
686 if( (commCR = (real*)malloc( commSize ) ) == NULL )
691 // TODO: Is there no MODE or PLAN for multiple hermetian sequences?
692 // Initialize communication work array
693 ACML_CRFFT1D( 100, ny, NULL, commCR, &info );
697 gmx_fatal(FARGS,"Error initializing ACML FFT; status=%d", info);
698 gmx_fft_destroy( fft );
702 /* Allocate scratch work array that ACML uses to splat from hermitian complex format to
703 * full complex format */
704 if( (fft->realScratch = (acmlComplex*)malloc( (nx*ny)*sizeof( acmlComplex ) ) ) == NULL )
714 fft->comm[1] = commRC;
715 fft->comm[2] = commCR;
722 gmx_fft_init_3d(gmx_fft_t * pfft,
726 enum gmx_fft_flag flags)
730 acmlComplex* comm = NULL;
736 gmx_fatal(FARGS,"Invalid opaque FFT datatype pointer.");
741 if( (fft = malloc(sizeof(struct gmx_fft))) == NULL)
746 commSize = (nx*ny*nz+4*(nx+ny+nz)+300)*sizeof( acmlComplex );
748 // Allocate communication work array
749 if( (comm = (acmlComplex*)malloc( commSize ) ) == NULL )
754 ACML_FFT3DX( 100, 1.0f, TRUE, nx, ny, nz, NULL, 1, nx, nx*ny, NULL, 1, nx, nx*ny, comm, commSize, &info );
768 gmx_fft_init_3d_real(gmx_fft_t * pfft,
772 enum gmx_fft_flag flags)
776 acmlComplex* commX = NULL;
777 acmlComplex* commY = NULL;
784 gmx_fatal(FARGS,"Invalid opaque FFT datatype pointer.");
789 /* nzc = (nz/2 + 1); */
791 if( (fft = malloc(sizeof(struct gmx_fft))) == NULL)
796 /* Roll our own 3D real transform using multiple transforms in ACML,
797 * since the current ACML versions does not support 2D
798 * or 3D real transforms.
802 * ny*nz complex-to-complex transforms, length nx
803 * transform distance: 1
804 * element strides: ny*nz
807 /* Single precision requires 5*nx+100
808 Double precision requires 3*nx+100
810 if( sizeof( acmlComplex ) == 16 )
811 commSize = (3*nx+100)*sizeof( acmlComplex );
813 commSize = (5*nx+100)*sizeof( acmlComplex );
815 /* Allocate communication work array */
816 if( (commX = (acmlComplex*)malloc( commSize ) ) == NULL )
821 /* Initialize communication work array */
822 ACML_FFT1DMX( 100, 1.0f, TRUE, ny*nz, nx, NULL, ny*nz, 1, NULL, ny*nz, 1, commX, &info );
826 gmx_fatal(FARGS,"Error initializing ACML FFT; status=%d", info);
827 gmx_fft_destroy( fft );
832 * We cannot do all NX*NZ transforms at once, so define a handle to do
833 * NZ transforms, and then execute it NX times.
834 * nz complex-to-complex transforms, length ny
835 * transform distance: 1
836 * element strides: nz
838 /* Single precision requires 5*nx+100
839 Double precision requires 3*nx+100
841 if( sizeof( acmlComplex ) == 16 )
842 commSize = (3*ny+100)*sizeof( acmlComplex );
844 commSize = (5*ny+100)*sizeof( acmlComplex );
846 /* Allocate communication work array */
847 if( (commY = (acmlComplex*)malloc( commSize ) ) == NULL )
852 /* Initialize communication work array */
853 /* We want to do multiple 1D FFT's in z-y plane, so we have to loop over x
854 * dimension recalculating z-y plane for each slice.
856 ACML_FFT1DMX( 100, 1.0f, TRUE, nz, ny, NULL, nz, 1, NULL, nz, 1, commY, &info );
860 gmx_fatal(FARGS,"Error initializing ACML FFT; status=%d", info);
861 gmx_fft_destroy( fft );
866 * nx*ny real-to-complex transforms, length nz
867 * transform distance: nzc*2 -> nzc*2
871 commSize = (3*nz+100)*sizeof( real );
872 /* Allocate communication work array */
873 if( (commRC = (real*)malloc( commSize ) ) == NULL )
878 /* TODO: Is there no MODE or PLAN for multiple hermetian sequences? */
879 // Initialize communication work array
880 ACML_RCFFT1D( 100, nz, NULL, commRC, &info );
885 gmx_fatal(FARGS,"Error initializing ACML FFT; status=%d", info);
886 gmx_fft_destroy( fft );
890 /* Out-of-place complex-to-real (affects distance) Z FFT:
891 * nx*ny real-to-complex transforms, length nz
892 * transform distance: nzc*2 -> nz
895 commSize = (3*nz+100)*sizeof( real );
896 /* Allocate communication work array */
897 if( (commCR = (real*)malloc( commSize ) ) == NULL )
902 // Initialize communication work array
903 ACML_CRFFT1D( 100, nz, NULL, commCR, &info );
907 gmx_fatal(FARGS,"Error initializing ACML FFT; status=%d", info);
908 gmx_fft_destroy( fft );
912 /* Allocate scratch work array that ACML uses to splat from hermitian complex format to
913 * full complex format */
914 if( (fft->realScratch = (acmlComplex*)malloc( (nx*ny*nz)*sizeof( acmlComplex ) ) ) == NULL )
924 fft->comm[0] = commX;
925 fft->comm[1] = commY;
926 fft->comm[2] = commRC;
927 fft->comm[3] = commCR;
934 gmx_fft_1d(gmx_fft_t fft,
935 enum gmx_fft_direction dir,
939 int inpl = (in_data == out_data);
941 int mode = (dir == GMX_FFT_FORWARD) ? -1 : 1;
943 if( (fft->real_fft == 1) || (fft->ndim != 1) ||
944 ((dir != GMX_FFT_FORWARD) && (dir != GMX_FFT_BACKWARD)) )
946 gmx_fatal(FARGS,"FFT plan mismatch - bad plan or direction.");
950 ACML_FFT1DX( mode, 1.0f, inpl, fft->nx, in_data, 1, out_data, 1, fft->comm[0], &info );
954 gmx_fatal(FARGS,"Error executing AMD ACML FFT.");
962 gmx_fft_1d_real(gmx_fft_t fft,
963 enum gmx_fft_direction dir,
970 if( (fft->real_fft != 1) || (fft->ndim != 1) ||
971 ((dir != GMX_FFT_REAL_TO_COMPLEX) && (dir != GMX_FFT_COMPLEX_TO_REAL)) )
973 gmx_fatal(FARGS,"FFT plan mismatch - bad plan or direction.");
977 /* I apply a correction scale to the automatic scaling that was done in the real-complex step */
978 float recipCorrection = sqrtf( nx );
980 /* ACML needs to do complex2complex intermediate transforms, which will not fit in the amount
981 * of memory allocated by the gromacs program, which assumes real. The real2complex transforms
982 * are also only in-place, so we manually do a memcpy() first */
983 if(dir==GMX_FFT_REAL_TO_COMPLEX)
985 memcpy( fft->realScratch, inPtr, nx*sizeof( real ) );
986 ACML_RCFFT1D( 1, nx, fft->realScratch, fft->comm[0], &info );
988 hermitianUnPacking( recipCorrection, 1, nx, fft->realScratch, 0, outPtr );
992 memcpy( fft->realScratch, inPtr, nx*sizeof( t_complex ) );
993 hermitianPacking( recipCorrection, 1, nx, fft->realScratch, outPtr, 0 );
995 ACML_CRFFT1D( 1, nx, outPtr, fft->comm[1], &info );
1000 gmx_fatal(FARGS,"Error executing AMD ACML FFT.");
1008 gmx_fft_2d(gmx_fft_t fft,
1009 enum gmx_fft_direction dir,
1013 int inpl = (in_data == out_data);
1015 int mode = (dir == GMX_FFT_FORWARD) ? -1 : 1;
1017 if( (fft->real_fft == 1) || (fft->ndim != 2) ||
1018 ((dir != GMX_FFT_FORWARD) && (dir != GMX_FFT_BACKWARD)) )
1020 gmx_fatal(FARGS,"FFT plan mismatch - bad plan or direction.");
1024 ACML_FFT2DX( mode, 1.0f, TRUE, inpl, fft->nx, fft->ny,
1025 in_data, 1, fft->nx, out_data, 1, fft->nx, fft->comm[0], &info );
1029 gmx_fatal(FARGS,"Error executing AMD ACML FFT.");
1038 gmx_fft_2d_real(gmx_fft_t fft,
1039 enum gmx_fft_direction dir,
1043 int inpl = (inPtr == outPtr);
1049 int nyc = (fft->ny/2 + 1);
1052 /* Depending on whether we are calculating the FFT in place or not, the rows of the
1053 * 2D FFT will be packed or not.
1057 /* Not packed; 1 or 2 extra reals padding at end of each row */
1065 if( (fft->real_fft != 1) || (fft->ndim != 2) ||
1066 ((dir != GMX_FFT_REAL_TO_COMPLEX) && (dir != GMX_FFT_COMPLEX_TO_REAL)) )
1068 gmx_fatal(FARGS,"FFT plan mismatch - bad plan or direction.");
1072 /* I apply a correction scale to the automatic scaling that was done in the real-complex step */
1073 float recipCorrection = sqrtf( ny );
1075 if(dir==GMX_FFT_REAL_TO_COMPLEX)
1077 /* ACML needs to do complex2complex intermediate transforms, which will not fit in the amount
1078 * of memory allocated by the gromacs program, which assumes real. The real2complex transforms
1079 * are also only in-place, so we manually do a memcpy() first */
1082 memcpy( outPtr, inPtr, nx*ny*sizeof( real ) );
1085 /* real-to-complex in Y dimension, in-place
1086 * SCFFTM is not valid to call here. SCFFTM does not take any stride information, and assumes that
1087 * the rows are tightly packed. GROMACS pads rows with either 1 or 2 extra reals depending
1088 * on even or odd lengths.
1090 for( i = 0; i < nx; ++i )
1093 ACML_RCFFT1D( 1, ny, (real*)outPtr+i*nyLen, fft->comm[1], &info );
1096 hermitianUnPacking( 1.0f, nx, ny, outPtr, nyLen, fft->realScratch );
1098 /* complex-to-complex in X dimension */
1100 ACML_FFT1DMX( -1, recipCorrection, FALSE, nyc, nx, fft->realScratch, nyc, 1,
1101 outPtr, nyc, 1, fft->comm[0], &info );
1105 /* complex-to-complex in X dimension, in-place */
1106 ACML_FFT1DMX( 1, recipCorrection, FALSE, nyc, nx, inPtr, nyc, 1,
1107 fft->realScratch, nyc, 1, fft->comm[0], &info );
1109 hermitianPacking( 1.0f, nx, ny, fft->realScratch, outPtr, nyLen );
1111 /* complex-to-real in Y dimension
1112 * CSFFTM is not valid to call here. SCFFTM does not take any stride information, and assumes that
1113 * the rows are tightly packed. GROMACS pads rows with either 1 or 2 extra reals depending
1114 * on even or odd lengths.
1118 for( i = 0; i < nx; ++i )
1121 ACML_CRFFT1D( 1, ny, (real*)outPtr+i*nyLen, fft->comm[2], &info );
1128 gmx_fatal(FARGS,"Error executing AMD ACML FFT.");
1137 gmx_fft_3d(gmx_fft_t fft,
1138 enum gmx_fft_direction dir,
1142 int mode = (dir == GMX_FFT_FORWARD) ? -1 : 1;
1143 int inpl = ( in_data == out_data );
1151 if( (fft->real_fft == 1) || (fft->ndim != 3) ||
1152 ((dir != GMX_FFT_FORWARD) && (dir != GMX_FFT_BACKWARD)) )
1154 gmx_fatal(FARGS,"FFT plan mismatch - bad plan or direction.");
1158 commSize = (nx*ny*nz+4*(nx+ny+nz)+300)*sizeof( acmlComplex );
1160 ACML_FFT3DX( mode, 1.0f, inpl, nx, ny, nz, in_data, 1, nx, nx*ny,
1161 out_data, 1, nx, nx*ny, fft->comm[0], commSize, &info );
1165 gmx_fatal(FARGS,"Error executing AMD ACML FFT.");
1173 gmx_fft_3d_real(gmx_fft_t fft,
1174 enum gmx_fft_direction dir,
1178 int inpl = (inPtr == outPtr);
1181 int nx,ny,nz, nzLen = 0;
1186 int nzc = (nz/2 + 1);
1188 /* Depending on whether we are calculating the FFT in place or not, the rows of the
1189 * 3D FFT will be packed or not.
1193 /* Not packed; 1 or 2 extra reals padding at end of each row */
1201 if( (fft->real_fft != 1) || (fft->ndim != 3) ||
1202 ((dir != GMX_FFT_REAL_TO_COMPLEX) && (dir != GMX_FFT_COMPLEX_TO_REAL)) )
1204 gmx_fatal(FARGS,"FFT plan mismatch - bad plan or direction.");
1208 /* I apply a correction scale to the automatic scaling that was done in the real-complex step */
1209 float recipCorrection = sqrtf( nz );
1211 if(dir==GMX_FFT_REAL_TO_COMPLEX)
1213 /* ACML needs to do complex2complex intermediate transforms, which will not fit in the amount
1214 * of memory allocated by the gromacs program, which assumes real. The real2complex transforms
1215 * are also only in-place, so we manually do a memcpy() first */
1218 memcpy( outPtr, inPtr, nx*ny*nz*sizeof( real ) );
1221 /* real-to-complex in Z dimension, in-place
1222 * SCFFTM is not valid to call here. SCFFTM does not take any stride information, and assumes that
1223 * the rows are tightly packed. GROMACS pads rows with either 1 or 2 extra reals depending
1224 * on even or odd lengths.
1226 for( i = 0; i < nx*ny; ++i )
1229 ACML_RCFFT1D( 1, nz, (real*)outPtr+i*nzLen, fft->comm[2], &info );
1232 hermitianUnPacking( 1.0f, nx*ny, nz, outPtr, nzLen, fft->realScratch );
1234 /* complex-to-complex in Y dimension, in-place */
1238 ACML_FFT1DMX( -1, 1.0f, TRUE, nzc, ny, (acmlComplex*)fft->realScratch+i*ny*nzc, nzc, 1,
1239 (acmlComplex*)fft->realScratch+i*ny*nzc, nzc, 1, fft->comm[1], &info );
1242 /* complex-to-complex in X dimension, in-place */
1244 ACML_FFT1DMX( -1, recipCorrection, FALSE, ny*nzc, nx, fft->realScratch, ny*nzc, 1, outPtr, ny*nzc, 1, fft->comm[0], &info );
1248 /* complex-to-complex in X dimension, from inPtr to work */
1249 ACML_FFT1DMX( 1, recipCorrection, FALSE, ny*nzc, nx, inPtr, ny*nzc, 1, fft->realScratch, ny*nzc, 1, fft->comm[0], &info );
1251 /* complex-to-complex in Y dimension, in-place */
1252 for( i=0; i < nx; i++ )
1255 ACML_FFT1DMX( 1, 1.0f, TRUE, nzc, ny, (acmlComplex*)fft->realScratch+i*ny*nzc, nzc, 1,
1256 (acmlComplex*)fft->realScratch+i*ny*nzc, nzc, 1, fft->comm[1], &info );
1259 hermitianPacking( 1.0f, nx*ny, nz, fft->realScratch, outPtr, nzLen );
1261 /* complex-to-real in Z dimension, in-place
1262 * CSFFTM is not valid to call here. CSFFTM does not take any stride information, and assumes that
1263 * the rows are tightly packed. GROMACS pads rows with either 1 or 2 extra reals depending
1264 * on even or odd lengths.
1266 for( i = 0; i < nx*ny; ++i )
1269 ACML_CRFFT1D( 1, nz, (real*)outPtr+i*nzLen, fft->comm[3], &info );
1276 gmx_fatal(FARGS,"Error executing AMD ACML FFT.");
1284 gmx_fft_destroy(gmx_fft_t fft)
1292 if(fft->comm[d] != NULL)
1298 if( fft->realScratch != NULL)
1299 free( fft->realScratch );
1308 #endif /* GMX_FFT_ACML */