2 * This file is part of the GROMACS molecular simulation package.
4 * Gromacs 4.0 Copyright (c) 1991-2003
5 * David van der Spoel, Erik Lindahl, University of Groningen.
6 * Copyright (c) 2012, by the GROMACS development team, led by
7 * David van der Spoel, Berk Hess, Erik Lindahl, and including many
8 * others, as listed in the AUTHORS file in the top-level source
9 * directory and at http://www.gromacs.org.
11 * GROMACS is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public License
13 * as published by the Free Software Foundation; either version 2.1
14 * of the License, or (at your option) any later version.
16 * GROMACS is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 * Lesser General Public License for more details.
21 * You should have received a copy of the GNU Lesser General Public
22 * License along with GROMACS; if not, see
23 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
24 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
26 * If you want to redistribute modifications to GROMACS, please
27 * consider that scientific software is very special. Version
28 * control is crucial - bugs must be traceable. We will be happy to
29 * consider code for inclusion in the official distribution, but
30 * derived work must not be called official GROMACS. Details are found
31 * in the README & COPYING files - if they are missing, get the
32 * official version at http://www.gromacs.org.
34 * To help us fund GROMACS development, we humbly ask that you cite
35 * the research papers on the package. Check out http://www.gromacs.org.
53 #include "gmx_fatal.h"
55 /* Since c has no support of templates, we get around the type restrictions
56 with macros. Our custom macro names obscure the float vs double interfaces
59 #define ACML_FFT1DX zfft1dx
60 #define ACML_FFT2DX zfft2dx
61 #define ACML_FFT3DX zfft3dy
62 #define ACML_FFT1DMX zfft1mx
64 #define ACML_RCFFT1D dzfft
65 #define ACML_CRFFT1D zdfft
66 #define ACML_RCFFT1DM dzfftm
67 #define ACML_CRFFT1DM zdfftm
69 #define acmlComplex doublecomplex
72 #define ACML_FFT1DX cfft1dx
73 #define ACML_FFT2DX cfft2dx
74 #define ACML_FFT3DX cfft3dy
75 #define ACML_FFT1DMX cfft1mx
77 #define ACML_RCFFT1D scfft
78 #define ACML_CRFFT1D csfft
79 #define ACML_RCFFT1DM scfftm
80 #define ACML_CRFFT1DM csfftm
82 #define acmlComplex complex
85 void cPrintArray( t_complex* arr, int outer, int mid, int inner )
90 for( i = 0; i < outer; ++i )
92 for( j = 0; j < mid; ++j )
94 for( k = 0; k < inner; ++k )
96 printf("%f,%f ", arr[i*inner*mid+j*inner+k].re, arr[i*inner*mid+j*inner+k].im );
106 void rPrintArray( real* arr, int outer, int mid, int inner )
111 for( i = 0; i < outer; ++i )
113 for( j = 0; j < mid; ++j )
115 for( k = 0; k < inner; ++k )
117 printf("%f ", arr[i*inner*mid+j*inner+k] );
127 /* Contents of the AMD ACML FFT fft datatype.
129 * Note that this is one of several possible implementations of gmx_fft_t.
131 * The ACML _API_ supports 1D,2D, and 3D transforms, including real-to-complex.
132 * Unfortunately the actual library implementation does not support 2D&3D real
133 * transforms as of version 4.2 In addition, ACML outputs their hermitian data
134 * in such a form that it can't be fed straight back into their complex API's.
136 * To work around this we roll our own 2D and 3D real-to-complex transforms,
137 * using separate X/Y/Z handles defined to perform (ny*nz), (nx*nz), and
138 * (nx*ny) transforms at once when necessary. To perform strided multiple
139 * transforms out-of-place (i.e., without padding in the last dimension)
140 * on the fly we also need to separate the forward and backward
141 * handles for real-to-complex/complex-to-real data permutation.
143 * So, the handles are enumerated as follows:
145 * 1D FFT (real too): Index 0 is the handle for the entire FFT
146 * 2D complex FFT: Index 0 is the handle for the entire FFT
147 * 3D complex FFT: Index 0 is the handle for the entire FFT
148 * 2D, real FFT: 0=FFTx, 1=FFTrc, 2=FFTcr handle
149 * 3D, real FFT: 0=FFTx, 1=FFTy, 2=FFTrc, 3=FFTcr handle
151 * AMD people reading this: Learn from FFTW what a good interface looks like :-)
158 * r2c = index0, c2r = index1
163 int ndim; /**< Number of dimensions in FFT */
164 int nx; /**< Length of X transform */
165 int ny; /**< Length of Y transform */
166 int nz; /**< Length of Z transform */
167 int real_fft; /**< 1 if real FFT, otherwise 0 */
170 /* ACML's real FFT support leaves the data in a swizzled 'hermitian' format. The
171 problem with this is that you can't just feed this data back into ACML :( A
172 pre-processing step is required to transform the hermitian swizzled complex
173 values into unswizzled complex values
174 -This function assumes that the complex conjugates are not expanded; the
175 calling function should not assume that they exist.
176 -This function pads all real numbers with 0's for imaginary components.
178 void hermitianUnPacking( float scale, int row, int col, void* src, int srcPitch, void* dst )
181 int loopCount = (col-1)/2;
182 int hermLength = mid + 1;
186 /* Currently this function is not written to handle aliased input/output pointers */
187 assert( src != dst );
189 /* These two pointers could be aliased on top of each other; be careful */
190 real* realData = (real*)src;
191 t_complex* hermData = (t_complex*)dst;
193 /* We have to expand the data from real to complex, which will clobber data if you start
194 * from the beginning. For this reason, we start at the last array and work backwards,
195 * however, we skip the very first row (nFFT == 0) because we still have data collision
196 * on that row. We copy the first row into a temporary buffer.
198 for( nFFT = row-1; nFFT >= 0; --nFFT )
200 realData = (real*)src + (nFFT*srcPitch);
201 hermData = (t_complex*)dst + (nFFT*hermLength);
203 /* The first complex number is real valued */
204 t_complex tmp = { realData[0]*scale, 0 };
208 for(i = 1; i <= loopCount; ++i)
210 t_complex tmp = { realData[i]*scale, realData[col-i]*scale };
214 /* A little cleanup for even lengths */
215 if( loopCount != mid )
217 /* The n/2 number is real valued */
218 t_complex tmp = { realData[mid]*scale, 0 };
226 /* ACML's real FFT support requires the data in a swizzled 'hermitian' format.
227 This is a non-standard format, and requires us to manually swizzle the data :(
228 A pre-processing step is required to transform the complex values into the
229 swizzled hermitian format
231 void hermitianPacking( float scale, int row, int col, void* src, void* dst, int dstPitch )
234 int loopCount = (col-1)/2;
235 int hermLength = mid + 1;
242 /* Currently this function is not written to handle aliased input/output pointers */
243 assert( src != dst );
245 for( nFFT = 0; nFFT < row; ++nFFT )
247 realData = (real*)dst + (nFFT*dstPitch);
248 hermData = (t_complex*)src + (nFFT*hermLength);
250 /* The first complex number is real valued */
251 realData[0] = hermData[0].re * scale;
253 /* ACML's complex to real function is documented as only handling 'forward'
254 * transforms, which mean we have to manually conjugate the data before doing
255 * the backwards transform. */
257 for(i = 1; i <= loopCount; ++i)
259 realData[i] = hermData[i].re * scale;
260 realData[col-i] = -hermData[i].im * scale;
263 /* A little cleanup for even lengths */
264 if( loopCount != mid )
266 /* The n/2 number is real valued */
267 realData[mid] = hermData[mid].re * scale;
275 /* Gromacs expects results that are returned to be in a packed form; No
276 complex conjugate's. This routine takes a 2D array of complex data,
277 and compresses it to fit into 'real' form by removing all the complex
278 conjugates and any known imaginary 0's. Data is expected in row-major
279 form, with ny describing the length of a row.
281 void compressConj2D( int nX, int nY, void* src, void* dst )
283 /* These two pointers are aliased on top of each other; be careful */
284 real* realData = (real*)dst;
285 t_complex* complexData;
287 int halfRows = nX/2 + 1;
288 int halfCols = nY/2 + 1;
289 int rowOdd = nX & 0x1;
290 int colOdd = nY & 0x1;
294 for( nRow = 0; nRow < nX; ++nRow )
296 complexData = (t_complex*)src + (nRow*halfCols);
299 for( nCol = 0; nCol < halfCols; ++nCol )
301 if(( nRow == 0) && (nCol == 0 ))
303 /* The complex number is real valued */
304 realData[0] = complexData[nCol].re;
307 /* Last column is real if even column length */
308 else if( (nRow == 0) && ( !colOdd && ( nCol == (halfCols-1) ) ) )
310 /* The complex number is real valued */
311 realData[0] = complexData[nCol].re;
314 /* The middle value of the first column is real if we have an even number of rows and
315 * the last column of middle row is real if we have an even number of rows and columns */
316 else if( !rowOdd && ( ( nRow == (halfRows-1) ) && ( ( nCol == 0 ) || ( !colOdd && ( nCol == (halfCols-1) ) ) ) ) )
318 /* The complex number is real valued */
319 realData[0] = complexData[nCol].re;
322 else if( (nCol == 0) || ( !colOdd && ( nCol == (halfCols-1) ) ) )
324 /* Last half of real columns are conjugates */
325 if( nRow < halfRows )
327 /* Copy a complex number, which is two reals */
328 realData[0] = complexData[nCol].re;
329 realData[1] = complexData[nCol].im;
335 /* Copy a complex number, which is two reals */
336 realData[0] = complexData[nCol].re;
337 realData[1] = complexData[nCol].im;
346 /* Gromacs expects results that are returned to be in a packed form; No
347 complex conjugate's. This routine takes a 2D array of real data,
348 and uncompresses it to fit into 'complex' form by creating all the complex
349 conjugates and any known imaginary 0's. Data is expected in row-major
350 form, with ny describing the length of a row.
352 void unCompressConj2D( int nX, int nY, void* src, void* dst )
354 /* These two pointers are aliased on top of each other; be careful */
355 real* realData = (real*)src;
356 t_complex* complexData = (t_complex*)dst;
358 int halfRows = nX/2 + 1;
359 int halfCols = nY/2 + 1;
360 int rowOdd = nX & 0x1;
361 int colOdd = nY & 0x1;
364 for( nRow = 0; nRow < nX; ++nRow )
367 for( nCol = 0; nCol < halfCols; ++nCol )
369 int iIndex = nRow*halfCols + nCol;
371 if(( nRow == 0) && (nCol == 0 ))
373 /* The complex number is real valued */
374 complexData[iIndex].re = realData[0];
375 complexData[iIndex].im = 0;
378 /* Last column is real if even column length */
379 else if( (nRow == 0) && ( !colOdd && ( nCol == (halfCols-1) ) ) )
381 /* The complex number is real valued */
382 complexData[iIndex].re = realData[0];
383 complexData[iIndex].im = 0;
386 /* The middle value of the first column is real if we have an even number of rows and
387 * the last column of middle row is real if we have an even number of rows and columns */
388 else if( !rowOdd && ( ( nRow == (halfRows-1) ) && ( ( nCol == 0 ) || ( !colOdd && ( nCol == (halfCols-1) ) ) ) ) )
390 /* The complex number is real valued */
391 complexData[iIndex].re = realData[0];
392 complexData[iIndex].im = 0;
395 else if( (nCol == 0) || (!colOdd && ( nCol == (halfCols-1) ) ) )
397 /* Last half of real columns are conjugates */
398 if( nRow < halfRows )
400 /* Copy a complex number, which is two reals */
401 complexData[iIndex].re = realData[0];
402 complexData[iIndex].im = realData[1];
407 int oIndex = (nX-nRow)*halfCols + nCol;
408 complexData[iIndex].re = complexData[oIndex].re;
409 complexData[iIndex].im = -complexData[oIndex].im;
414 /* Copy a complex number, which is two reals */
415 complexData[iIndex].re = realData[0];
416 complexData[iIndex].im = realData[1];
425 /* Support routine for the 1D array tranforms. ACML does not support a MODE
426 * flag on the real/complex interface. It assumes a 'forward' transform both
427 * directions, so that requires a manual conjugation of the imaginary comps. */
428 void negateConj( int len, void* src )
430 real* imag = (real*)src;
431 int half = len/2 + 1;
434 for( i = half; i < len; ++i)
443 gmx_fft_init_1d(gmx_fft_t * pfft,
445 enum gmx_fft_flag flags)
449 acmlComplex* comm = NULL;
455 gmx_fatal(FARGS,"Invalid opaque FFT datatype pointer.");
460 if( (fft = malloc(sizeof(struct gmx_fft))) == NULL)
465 // Single precision requires 5*nx+100
466 // Double precision requires 3*nx+100
467 if( sizeof( acmlComplex ) == 16 )
468 commSize = (3*nx+100)*sizeof( acmlComplex );
470 commSize = (5*nx+100)*sizeof( acmlComplex );
472 // Allocate communication work array
473 if( (comm = (acmlComplex*)malloc( commSize ) ) == NULL )
478 // Initialize communication work array
479 ACML_FFT1DX( 100, 1.0f, TRUE, nx, NULL, 1, NULL, 1, comm, &info );
483 gmx_fatal(FARGS,"Error initializing ACML FFT; status=%d", info);
484 gmx_fft_destroy( fft );
498 gmx_fft_init_1d_real(gmx_fft_t * pfft,
500 enum gmx_fft_flag flags)
510 gmx_fatal(FARGS,"Invalid opaque FFT datatype pointer.");
515 if( (fft = malloc(sizeof(struct gmx_fft))) == NULL)
521 commSize = (3*nx+100)*sizeof( float );
523 // Allocate communication work array, r2c
524 if( (commRC = (real*)malloc( commSize ) ) == NULL )
529 // Allocate communication work array, c2r
530 if( (commCR = (real*)malloc( commSize ) ) == NULL )
535 // Initialize communication work array
536 ACML_RCFFT1D( 100, nx, NULL, (real*)commRC, &info );
540 gmx_fatal(FARGS,"Error initializing ACML FFT; status=%d", info);
541 gmx_fft_destroy( fft );
545 // Initialize communication work array
546 ACML_CRFFT1D( 100, nx, NULL, (real*)commCR, &info );
550 gmx_fatal(FARGS,"Error initializing ACML FFT; status=%d", info);
551 gmx_fft_destroy( fft );
555 /* Allocate scratch work array that ACML uses to splat from hermitian complex format to
556 * full complex format */
557 if( (fft->realScratch = (acmlComplex*)malloc( nx*sizeof( acmlComplex ) ) ) == NULL )
565 fft->comm[0] = commRC;
566 fft->comm[1] = commCR;
573 gmx_fft_init_2d(gmx_fft_t * pfft,
576 enum gmx_fft_flag flags)
580 acmlComplex* comm = NULL;
586 gmx_fatal(FARGS,"Invalid opaque FFT datatype pointer.");
591 if( (fft = malloc(sizeof(struct gmx_fft))) == NULL)
596 // Single precision requires nx*ny+5*(nx+ny)
597 // Double precision requires nx*ny+3*(nx+ny)
598 if( sizeof( acmlComplex ) == 16 )
599 commSize = (nx*ny+3*(nx+ny)+200)*sizeof( acmlComplex );
601 commSize = (nx*ny+5*(nx+ny)+200)*sizeof( acmlComplex );
603 // Allocate communication work array
604 if( (comm = (acmlComplex*)malloc( commSize ) ) == NULL )
609 // Initialize communication work array
610 ACML_FFT2DX( 100, 1.0f, TRUE, TRUE, nx, ny, NULL, 1, nx, NULL, 1, nx, (acmlComplex*)comm, &info );
614 gmx_fatal(FARGS,"Error initializing ACML FFT; status=%d", info);
615 gmx_fft_destroy( fft );
630 gmx_fft_init_2d_real(gmx_fft_t * pfft,
633 enum gmx_fft_flag flags)
637 acmlComplex* comm = NULL;
645 gmx_fatal(FARGS,"Invalid opaque FFT datatype pointer.");
650 if( (fft = malloc(sizeof(struct gmx_fft))) == NULL)
657 /* Roll our own 2D real transform using multiple transforms in ACML,
658 * since the current ACML versions does not support our storage format,
659 * and all but the most recent don't even have 2D real FFTs.
662 // Single precision requires 5*nx+100
663 // Double precision requires 3*nx+100
664 if( sizeof( acmlComplex ) == 16 )
665 commSize = (3*nx+100)*sizeof( acmlComplex );
667 commSize = (5*nx+100)*sizeof( acmlComplex );
669 // Allocate communication work array
670 if( (comm = (acmlComplex*)malloc( commSize ) ) == NULL )
675 // Initialize communication work array
676 ACML_FFT1DMX( 100, 1.0f, FALSE, nyc, nx, NULL, nyc, 1, NULL, nyc, 1, comm, &info );
680 gmx_fatal(FARGS,"Error initializing ACML FFT; status=%d", info);
681 gmx_fft_destroy( fft );
685 commSize = (3*ny+100)*sizeof( real );
686 // Allocate communication work array
687 if( (commRC = (real*)malloc( commSize ) ) == NULL )
692 // TODO: Is there no MODE or PLAN for multiple hermetian sequences?
693 // Initialize communication work array
694 ACML_RCFFT1D( 100, ny, NULL, commRC, &info );
698 gmx_fatal(FARGS,"Error initializing ACML FFT; status=%d", info);
699 gmx_fft_destroy( fft );
703 commSize = (3*ny+100)*sizeof( real );
704 // Allocate communication work array
705 if( (commCR = (real*)malloc( commSize ) ) == NULL )
710 // TODO: Is there no MODE or PLAN for multiple hermetian sequences?
711 // Initialize communication work array
712 ACML_CRFFT1D( 100, ny, NULL, commCR, &info );
716 gmx_fatal(FARGS,"Error initializing ACML FFT; status=%d", info);
717 gmx_fft_destroy( fft );
721 /* Allocate scratch work array that ACML uses to splat from hermitian complex format to
722 * full complex format */
723 if( (fft->realScratch = (acmlComplex*)malloc( (nx*ny)*sizeof( acmlComplex ) ) ) == NULL )
733 fft->comm[1] = commRC;
734 fft->comm[2] = commCR;
741 gmx_fft_init_3d(gmx_fft_t * pfft,
745 enum gmx_fft_flag flags)
749 acmlComplex* comm = NULL;
755 gmx_fatal(FARGS,"Invalid opaque FFT datatype pointer.");
760 if( (fft = malloc(sizeof(struct gmx_fft))) == NULL)
765 commSize = (nx*ny*nz+4*(nx+ny+nz)+300)*sizeof( acmlComplex );
767 // Allocate communication work array
768 if( (comm = (acmlComplex*)malloc( commSize ) ) == NULL )
773 ACML_FFT3DX( 100, 1.0f, TRUE, nx, ny, nz, NULL, 1, nx, nx*ny, NULL, 1, nx, nx*ny, comm, commSize, &info );
787 gmx_fft_init_3d_real(gmx_fft_t * pfft,
791 enum gmx_fft_flag flags)
795 acmlComplex* commX = NULL;
796 acmlComplex* commY = NULL;
803 gmx_fatal(FARGS,"Invalid opaque FFT datatype pointer.");
808 /* nzc = (nz/2 + 1); */
810 if( (fft = malloc(sizeof(struct gmx_fft))) == NULL)
815 /* Roll our own 3D real transform using multiple transforms in ACML,
816 * since the current ACML versions does not support 2D
817 * or 3D real transforms.
821 * ny*nz complex-to-complex transforms, length nx
822 * transform distance: 1
823 * element strides: ny*nz
826 /* Single precision requires 5*nx+100
827 Double precision requires 3*nx+100
829 if( sizeof( acmlComplex ) == 16 )
830 commSize = (3*nx+100)*sizeof( acmlComplex );
832 commSize = (5*nx+100)*sizeof( acmlComplex );
834 /* Allocate communication work array */
835 if( (commX = (acmlComplex*)malloc( commSize ) ) == NULL )
840 /* Initialize communication work array */
841 ACML_FFT1DMX( 100, 1.0f, TRUE, ny*nz, nx, NULL, ny*nz, 1, NULL, ny*nz, 1, commX, &info );
845 gmx_fatal(FARGS,"Error initializing ACML FFT; status=%d", info);
846 gmx_fft_destroy( fft );
851 * We cannot do all NX*NZ transforms at once, so define a handle to do
852 * NZ transforms, and then execute it NX times.
853 * nz complex-to-complex transforms, length ny
854 * transform distance: 1
855 * element strides: nz
857 /* Single precision requires 5*nx+100
858 Double precision requires 3*nx+100
860 if( sizeof( acmlComplex ) == 16 )
861 commSize = (3*ny+100)*sizeof( acmlComplex );
863 commSize = (5*ny+100)*sizeof( acmlComplex );
865 /* Allocate communication work array */
866 if( (commY = (acmlComplex*)malloc( commSize ) ) == NULL )
871 /* Initialize communication work array */
872 /* We want to do multiple 1D FFT's in z-y plane, so we have to loop over x
873 * dimension recalculating z-y plane for each slice.
875 ACML_FFT1DMX( 100, 1.0f, TRUE, nz, ny, NULL, nz, 1, NULL, nz, 1, commY, &info );
879 gmx_fatal(FARGS,"Error initializing ACML FFT; status=%d", info);
880 gmx_fft_destroy( fft );
885 * nx*ny real-to-complex transforms, length nz
886 * transform distance: nzc*2 -> nzc*2
890 commSize = (3*nz+100)*sizeof( real );
891 /* Allocate communication work array */
892 if( (commRC = (real*)malloc( commSize ) ) == NULL )
897 /* TODO: Is there no MODE or PLAN for multiple hermetian sequences? */
898 // Initialize communication work array
899 ACML_RCFFT1D( 100, nz, NULL, commRC, &info );
904 gmx_fatal(FARGS,"Error initializing ACML FFT; status=%d", info);
905 gmx_fft_destroy( fft );
909 /* Out-of-place complex-to-real (affects distance) Z FFT:
910 * nx*ny real-to-complex transforms, length nz
911 * transform distance: nzc*2 -> nz
914 commSize = (3*nz+100)*sizeof( real );
915 /* Allocate communication work array */
916 if( (commCR = (real*)malloc( commSize ) ) == NULL )
921 // Initialize communication work array
922 ACML_CRFFT1D( 100, nz, NULL, commCR, &info );
926 gmx_fatal(FARGS,"Error initializing ACML FFT; status=%d", info);
927 gmx_fft_destroy( fft );
931 /* Allocate scratch work array that ACML uses to splat from hermitian complex format to
932 * full complex format */
933 if( (fft->realScratch = (acmlComplex*)malloc( (nx*ny*nz)*sizeof( acmlComplex ) ) ) == NULL )
943 fft->comm[0] = commX;
944 fft->comm[1] = commY;
945 fft->comm[2] = commRC;
946 fft->comm[3] = commCR;
953 gmx_fft_1d(gmx_fft_t fft,
954 enum gmx_fft_direction dir,
958 int inpl = (in_data == out_data);
960 int mode = (dir == GMX_FFT_FORWARD) ? -1 : 1;
962 if( (fft->real_fft == 1) || (fft->ndim != 1) ||
963 ((dir != GMX_FFT_FORWARD) && (dir != GMX_FFT_BACKWARD)) )
965 gmx_fatal(FARGS,"FFT plan mismatch - bad plan or direction.");
969 ACML_FFT1DX( mode, 1.0f, inpl, fft->nx, in_data, 1, out_data, 1, fft->comm[0], &info );
973 gmx_fatal(FARGS,"Error executing AMD ACML FFT.");
981 gmx_fft_1d_real(gmx_fft_t fft,
982 enum gmx_fft_direction dir,
989 if( (fft->real_fft != 1) || (fft->ndim != 1) ||
990 ((dir != GMX_FFT_REAL_TO_COMPLEX) && (dir != GMX_FFT_COMPLEX_TO_REAL)) )
992 gmx_fatal(FARGS,"FFT plan mismatch - bad plan or direction.");
996 /* I apply a correction scale to the automatic scaling that was done in the real-complex step */
997 float recipCorrection = sqrtf( nx );
999 /* ACML needs to do complex2complex intermediate transforms, which will not fit in the amount
1000 * of memory allocated by the gromacs program, which assumes real. The real2complex transforms
1001 * are also only in-place, so we manually do a memcpy() first */
1002 if(dir==GMX_FFT_REAL_TO_COMPLEX)
1004 memcpy( fft->realScratch, inPtr, nx*sizeof( real ) );
1005 ACML_RCFFT1D( 1, nx, fft->realScratch, fft->comm[0], &info );
1007 hermitianUnPacking( recipCorrection, 1, nx, fft->realScratch, 0, outPtr );
1011 memcpy( fft->realScratch, inPtr, nx*sizeof( t_complex ) );
1012 hermitianPacking( recipCorrection, 1, nx, fft->realScratch, outPtr, 0 );
1014 ACML_CRFFT1D( 1, nx, outPtr, fft->comm[1], &info );
1019 gmx_fatal(FARGS,"Error executing AMD ACML FFT.");
1027 gmx_fft_2d(gmx_fft_t fft,
1028 enum gmx_fft_direction dir,
1032 int inpl = (in_data == out_data);
1034 int mode = (dir == GMX_FFT_FORWARD) ? -1 : 1;
1036 if( (fft->real_fft == 1) || (fft->ndim != 2) ||
1037 ((dir != GMX_FFT_FORWARD) && (dir != GMX_FFT_BACKWARD)) )
1039 gmx_fatal(FARGS,"FFT plan mismatch - bad plan or direction.");
1043 ACML_FFT2DX( mode, 1.0f, TRUE, inpl, fft->nx, fft->ny,
1044 in_data, 1, fft->nx, out_data, 1, fft->nx, fft->comm[0], &info );
1048 gmx_fatal(FARGS,"Error executing AMD ACML FFT.");
1057 gmx_fft_2d_real(gmx_fft_t fft,
1058 enum gmx_fft_direction dir,
1062 int inpl = (inPtr == outPtr);
1068 int nyc = (fft->ny/2 + 1);
1071 /* Depending on whether we are calculating the FFT in place or not, the rows of the
1072 * 2D FFT will be packed or not.
1076 /* Not packed; 1 or 2 extra reals padding at end of each row */
1084 if( (fft->real_fft != 1) || (fft->ndim != 2) ||
1085 ((dir != GMX_FFT_REAL_TO_COMPLEX) && (dir != GMX_FFT_COMPLEX_TO_REAL)) )
1087 gmx_fatal(FARGS,"FFT plan mismatch - bad plan or direction.");
1091 /* I apply a correction scale to the automatic scaling that was done in the real-complex step */
1092 float recipCorrection = sqrtf( ny );
1094 if(dir==GMX_FFT_REAL_TO_COMPLEX)
1096 /* ACML needs to do complex2complex intermediate transforms, which will not fit in the amount
1097 * of memory allocated by the gromacs program, which assumes real. The real2complex transforms
1098 * are also only in-place, so we manually do a memcpy() first */
1101 memcpy( outPtr, inPtr, nx*ny*sizeof( real ) );
1104 /* real-to-complex in Y dimension, in-place
1105 * SCFFTM is not valid to call here. SCFFTM does not take any stride information, and assumes that
1106 * the rows are tightly packed. GROMACS pads rows with either 1 or 2 extra reals depending
1107 * on even or odd lengths.
1109 for( i = 0; i < nx; ++i )
1112 ACML_RCFFT1D( 1, ny, (real*)outPtr+i*nyLen, fft->comm[1], &info );
1115 hermitianUnPacking( 1.0f, nx, ny, outPtr, nyLen, fft->realScratch );
1117 /* complex-to-complex in X dimension */
1119 ACML_FFT1DMX( -1, recipCorrection, FALSE, nyc, nx, fft->realScratch, nyc, 1,
1120 outPtr, nyc, 1, fft->comm[0], &info );
1124 /* complex-to-complex in X dimension, in-place */
1125 ACML_FFT1DMX( 1, recipCorrection, FALSE, nyc, nx, inPtr, nyc, 1,
1126 fft->realScratch, nyc, 1, fft->comm[0], &info );
1128 hermitianPacking( 1.0f, nx, ny, fft->realScratch, outPtr, nyLen );
1130 /* complex-to-real in Y dimension
1131 * CSFFTM is not valid to call here. SCFFTM does not take any stride information, and assumes that
1132 * the rows are tightly packed. GROMACS pads rows with either 1 or 2 extra reals depending
1133 * on even or odd lengths.
1137 for( i = 0; i < nx; ++i )
1140 ACML_CRFFT1D( 1, ny, (real*)outPtr+i*nyLen, fft->comm[2], &info );
1147 gmx_fatal(FARGS,"Error executing AMD ACML FFT.");
1156 gmx_fft_3d(gmx_fft_t fft,
1157 enum gmx_fft_direction dir,
1161 int mode = (dir == GMX_FFT_FORWARD) ? -1 : 1;
1162 int inpl = ( in_data == out_data );
1170 if( (fft->real_fft == 1) || (fft->ndim != 3) ||
1171 ((dir != GMX_FFT_FORWARD) && (dir != GMX_FFT_BACKWARD)) )
1173 gmx_fatal(FARGS,"FFT plan mismatch - bad plan or direction.");
1177 commSize = (nx*ny*nz+4*(nx+ny+nz)+300)*sizeof( acmlComplex );
1179 ACML_FFT3DX( mode, 1.0f, inpl, nx, ny, nz, in_data, 1, nx, nx*ny,
1180 out_data, 1, nx, nx*ny, fft->comm[0], commSize, &info );
1184 gmx_fatal(FARGS,"Error executing AMD ACML FFT.");
1192 gmx_fft_3d_real(gmx_fft_t fft,
1193 enum gmx_fft_direction dir,
1197 int inpl = (inPtr == outPtr);
1200 int nx,ny,nz, nzLen = 0;
1205 int nzc = (nz/2 + 1);
1207 /* Depending on whether we are calculating the FFT in place or not, the rows of the
1208 * 3D FFT will be packed or not.
1212 /* Not packed; 1 or 2 extra reals padding at end of each row */
1220 if( (fft->real_fft != 1) || (fft->ndim != 3) ||
1221 ((dir != GMX_FFT_REAL_TO_COMPLEX) && (dir != GMX_FFT_COMPLEX_TO_REAL)) )
1223 gmx_fatal(FARGS,"FFT plan mismatch - bad plan or direction.");
1227 /* I apply a correction scale to the automatic scaling that was done in the real-complex step */
1228 float recipCorrection = sqrtf( nz );
1230 if(dir==GMX_FFT_REAL_TO_COMPLEX)
1232 /* ACML needs to do complex2complex intermediate transforms, which will not fit in the amount
1233 * of memory allocated by the gromacs program, which assumes real. The real2complex transforms
1234 * are also only in-place, so we manually do a memcpy() first */
1237 memcpy( outPtr, inPtr, nx*ny*nz*sizeof( real ) );
1240 /* real-to-complex in Z dimension, in-place
1241 * SCFFTM is not valid to call here. SCFFTM does not take any stride information, and assumes that
1242 * the rows are tightly packed. GROMACS pads rows with either 1 or 2 extra reals depending
1243 * on even or odd lengths.
1245 for( i = 0; i < nx*ny; ++i )
1248 ACML_RCFFT1D( 1, nz, (real*)outPtr+i*nzLen, fft->comm[2], &info );
1251 hermitianUnPacking( 1.0f, nx*ny, nz, outPtr, nzLen, fft->realScratch );
1253 /* complex-to-complex in Y dimension, in-place */
1257 ACML_FFT1DMX( -1, 1.0f, TRUE, nzc, ny, (acmlComplex*)fft->realScratch+i*ny*nzc, nzc, 1,
1258 (acmlComplex*)fft->realScratch+i*ny*nzc, nzc, 1, fft->comm[1], &info );
1261 /* complex-to-complex in X dimension, in-place */
1263 ACML_FFT1DMX( -1, recipCorrection, FALSE, ny*nzc, nx, fft->realScratch, ny*nzc, 1, outPtr, ny*nzc, 1, fft->comm[0], &info );
1267 /* complex-to-complex in X dimension, from inPtr to work */
1268 ACML_FFT1DMX( 1, recipCorrection, FALSE, ny*nzc, nx, inPtr, ny*nzc, 1, fft->realScratch, ny*nzc, 1, fft->comm[0], &info );
1270 /* complex-to-complex in Y dimension, in-place */
1271 for( i=0; i < nx; i++ )
1274 ACML_FFT1DMX( 1, 1.0f, TRUE, nzc, ny, (acmlComplex*)fft->realScratch+i*ny*nzc, nzc, 1,
1275 (acmlComplex*)fft->realScratch+i*ny*nzc, nzc, 1, fft->comm[1], &info );
1278 hermitianPacking( 1.0f, nx*ny, nz, fft->realScratch, outPtr, nzLen );
1280 /* complex-to-real in Z dimension, in-place
1281 * CSFFTM is not valid to call here. CSFFTM does not take any stride information, and assumes that
1282 * the rows are tightly packed. GROMACS pads rows with either 1 or 2 extra reals depending
1283 * on even or odd lengths.
1285 for( i = 0; i < nx*ny; ++i )
1288 ACML_CRFFT1D( 1, nz, (real*)outPtr+i*nzLen, fft->comm[3], &info );
1295 gmx_fatal(FARGS,"Error executing AMD ACML FFT.");
1303 gmx_fft_destroy(gmx_fft_t fft)
1311 if(fft->comm[d] != NULL)
1317 if( fft->realScratch != NULL)
1318 free( fft->realScratch );
1327 #endif /* GMX_FFT_ACML */