Random cosmetic fixes
[alexxy/gromacs.git] / src / external / clFFT / src / library / plan.cpp
1 /* ************************************************************************
2  * Copyright 2013 Advanced Micro Devices, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  * ************************************************************************/
16
17 ////////////////////////////////////////////
18
19 // clfft.plan.cpp : Defines the entry point for the console application.
20 //
21
22 #include "stdafx.h"
23 #include <math.h>
24 #include "private.h"
25 #include "repo.h"
26 #include "plan.h"
27 #include "generator.stockham.h"
28 #include "../include/convenienceFunctions.h"
29 #include "action.h"
30 #include "fft_binary_lookup.h"
31
32 using std::vector;
33
34 const std::string beginning_of_binary( "<[�_beginning_of_binary_�]>" );
35 const std::string end_of_binary( "<[�_I_may_be_a_sorry_case,_but_I_don't_write_jokes_in_base_13_�]>" );
36 const std::string end_of_file( "<[�_You're_off_the_edge_of_the_map,_mate._Here_there_be_monsters_�]>" );
37
/*
 * Factor num into powers of 2, 3 and 5.
 *
 * Each time a factor is stripped from num the matching counter (pow2, pow3 or
 * pow5) is incremented.  The counters are NOT reset here, so the caller must
 * zero them before the call.
 *
 * Returns true  when num >= 2 and num == 2^a * 3^b * 5^c,
 *         false otherwise (this includes 0 and 1).
 * On a false return the counters may already have been advanced for the
 * factors removed before a foreign prime factor was met.
 */
static bool pow235(size_t num, size_t &pow2, size_t &pow3, size_t &pow5)
{
	// 0 would skip the loop below and be reported as factorable;
	// 1 carries none of the supported radices and is rejected as well.
	if (num < 2)
		return false;

	while (num > 1)
	{
		if (num % 5 == 0)
		{
			num /= 5;
			++pow5;
		}
		else if (num % 3 == 0)
		{
			num /= 3;
			++pow3;
		}
		else if (num % 2 == 0)
		{
			num /= 2;
			++pow2;
		}
		else
		{
			// what is left has a prime factor other than 2, 3 or 5
			return false;
		}
	}
	return true;
}
68
69 static bool split1D_for_inplace(size_t num, vector<vector<size_t> > &splitNums, clfftPrecision precision, size_t threshold)
70 {
71         /* a helper function to split big 1D to friendly 2D sizes for inplace transpose kernels
72            currently only radix 2, 3 and 5 are supported
73            the algorithm looks for ways to split up the 1D into 2D such that one of the dimensions is multiples of the other dimension.
74            And this mupliple is radix2, 3 or 5.
75            each splited dimentsion should be further splited until that it is smaller than 4096
76         */
77         if (num <= threshold)
78                 return true;
79         if (num % 2 != 0 && num % 3 != 0 && num % 5 != 0)
80                 return false;
81
82         //let's figure out pow2, pow3 and pow5 such that num = 2^pow2 * 3^pow3 * 5^pow5
83         size_t pow2, pow3, pow5;
84         pow2 = pow3 = pow5 = 0;
85         bool status = pow235(num, pow2, pow3, pow5);
86         if (!status)
87                 return status;
88
89         size_t divide_factor;
90         if (pow2 % 2 != 0)
91         {
92                 //pow2 is odd
93                 if (pow3 % 2 != 0)
94                 {
95                         //pow2 and pow3 are odd
96                         if (pow5 % 2 != 0)
97                         {
98                                 //pow2, pow3 and pow5 are odd
99                                 //one dimension is 2*3*5 = 30 times bigger than the other dimension
100                                 divide_factor = 2 * 3 * 5;
101                         }
102                         else
103                         {
104                                 //pow2 and pow3 are odd, pow 5 is even
105                                 //one dimension is 2*3 = 6 times bigger than the other dimension
106                                 divide_factor = 2 * 3;
107                         }
108                 }
109                 else
110                 {
111                         //pow2 is odd, pow3 is even
112                         if (pow5 % 2 != 0)
113                         {
114                                 //pow2, pow5 are odd pow3 is eve
115                                 divide_factor = 2 * 5;
116                         }
117                         else
118                         {
119                                 //pow2 is odd, pow3 and pow5 are even
120                                 divide_factor = 2;
121                         }
122
123                 }
124         }
125         else
126         {
127                 //pow2 is even
128                 if (pow3 % 2 != 0)
129                 {
130                         //pow3 is odd pow2 is even
131                         if (pow5 % 2 != 0)
132                         {
133                                 //pow2 is even, pow3 and pow5 are odd
134                                 divide_factor = 3 * 5;
135                         }
136                         else
137                         {
138                                 //pow2 and pow5 are even, pow3 is odd
139                                 divide_factor = 3;
140                         }
141                 }
142                 else
143                 {
144                         //pow2 and are even
145                         if (pow5 % 2 != 0)
146                         {
147                                 //pow5 is odd pow2 pow3 is eve
148                                 divide_factor = 5;
149                         }
150                         else
151                         {
152                                 //all even
153                                 divide_factor = 1;
154                         }
155
156                 }
157         }
158         //add some special cases
159         if (num == 2687385600)
160                 divide_factor = 2 * 2 * 3 * 3;
161         if (num == 2916000000)
162                 divide_factor = 2 * 2 * 3 * 3 * 5 * 5;
163         if (num == 3057647616)
164                 divide_factor = 2 * 2 * 3 * 3;
165
166         num = num / divide_factor;
167         //now the remaining num should have even number of pow2, pow3 and pow5 and we can do sqrt
168         size_t temp = (size_t)sqrt((double)num);
169         vector<size_t> splitVec;
170         splitVec.push_back(temp*divide_factor);
171         splitVec.push_back(temp);
172         splitNums.push_back(splitVec);
173
174         status = status && split1D_for_inplace(temp*divide_factor, splitNums, precision, threshold);
175         status = status && split1D_for_inplace(temp, splitNums, precision, threshold);
176         return status;
177
178 }
179
180 // Returns CLFFT_SUCCESS if the fp64 is present, CLFFT_DEVICE_NO_DOUBLE if it is not found.  
181 clfftStatus checkDevExt( std::string ext, const cl_device_id &device )
182 {
183         size_t deviceExtSize    = 0;
184         OPENCL_V( ::clGetDeviceInfo( device, CL_DEVICE_EXTENSIONS, 0, NULL, &deviceExtSize ),
185                 "Getting CL_DEVICE_EXTENSIONS Platform Info string size ( ::clGetDeviceInfo() )" );
186
187         std::vector< char > szDeviceExt( deviceExtSize );
188         OPENCL_V( ::clGetDeviceInfo( device, CL_DEVICE_EXTENSIONS, deviceExtSize, &szDeviceExt[ 0 ], NULL ),
189                 "Getting CL_DEVICE_EXTENSIONS Platform Info string ( ::clGetDeviceInfo() )" );
190
191         std::string strDeviceExt = &szDeviceExt[ 0 ];
192
193         if( strDeviceExt.find( ext.c_str( ), 0 ) == std::string::npos )
194                 return CLFFT_DEVICE_NO_DOUBLE;
195
196
197         return CLFFT_SUCCESS;
198 }
199
200 clfftStatus     clfftCreateDefaultPlanInternal( clfftPlanHandle* plHandle, cl_context context, const clfftDim dim,
201                                                 const size_t* clLengths )
202 {
203         if( clLengths == NULL )
204                 return CLFFT_INVALID_HOST_PTR;
205
206         size_t lenX = 1, lenY = 1, lenZ = 1;
207
208         switch( dim )
209         {
210                 case CLFFT_1D:
211                 {
212                         //      Minimum length size is 1
213                         if( clLengths[ DimX ] == 0 )
214                                 return CLFFT_INVALID_ARG_VALUE;
215
216                         if( !IsASupportedLength( clLengths[ DimX ] ) )
217                         {
218                                 return CLFFT_NOTIMPLEMENTED;
219                         }
220
221                         lenX = clLengths[ DimX ];
222                 }
223                         break;
224                 case CLFFT_2D:
225                 {
226                         //      Minimum length size is 1
227                         if( clLengths[ DimX ] == 0 || clLengths[ DimY ] == 0 )
228                                 return CLFFT_INVALID_ARG_VALUE;
229
230                         if( !IsASupportedLength( clLengths[ DimX ] ) || !IsASupportedLength( clLengths[ DimY ] ) )
231                         {
232                                 return CLFFT_NOTIMPLEMENTED;
233                         }
234
235                         lenX = clLengths[ DimX ];
236                         lenY = clLengths[ DimY ];
237                 }
238                         break;
239                 case CLFFT_3D:
240                 {
241                         //      Minimum length size is 1
242                         if( clLengths[ DimX ] == 0 || clLengths[ DimY ] == 0 || clLengths[ DimZ ] == 0 )
243                                 return CLFFT_INVALID_ARG_VALUE;
244
245                         if( !IsASupportedLength( clLengths[ DimX ] ) || !IsASupportedLength( clLengths[ DimY ] ) ||
246                                 !IsASupportedLength( clLengths[ DimZ ] ))
247                         {
248                                 return CLFFT_NOTIMPLEMENTED;
249                         }
250
251                         lenX = clLengths[ DimX ];
252                         lenY = clLengths[ DimY ];
253                         lenZ = clLengths[ DimZ ];
254                 }
255                         break;
256                 default:
257                         return CLFFT_NOTIMPLEMENTED;
258                         break;
259         }
260
261         FFTPlan *fftPlan = NULL;
262         FFTRepo& fftRepo        = FFTRepo::getInstance( );
263         OPENCL_V( fftRepo.createPlan( plHandle, fftPlan ), _T( "fftRepo.insertPlan failed" ) );
264
265         fftPlan->baked                  = false;
266         fftPlan->dim                    = dim;
267         fftPlan->placeness              = CLFFT_INPLACE;
268         fftPlan->inputLayout    = CLFFT_COMPLEX_INTERLEAVED;
269         fftPlan->outputLayout   = CLFFT_COMPLEX_INTERLEAVED;
270         fftPlan->precision              = CLFFT_SINGLE;
271         fftPlan->context                = context;
272         fftPlan->forwardScale   = 1.0;
273         fftPlan->backwardScale  = 1.0 / static_cast< double >( lenX * lenY * lenZ );
274         fftPlan->batchsize              = 1;
275         fftPlan->gen                    = Stockham; //default setting
276
277         OPENCL_V(fftPlan->SetEnvelope(), _T("SetEnvelope failed"));
278
279         clRetainContext( fftPlan->context );
280
281 #if 0
282         /////////////////////////////////////////////////////////////////
283         // Detect OpenCL devices
284         /////////////////////////////////////////////////////////////////
285         // First, get the size of device list data
286         size_t deviceListSize;
287         OPENCL_V( ::clGetContextInfo( context, CL_CONTEXT_DEVICES, 0, NULL, &deviceListSize ),
288                 "Getting device array size ( ::clGetContextInfo() )" );
289
290         //      Allocate memory for the devices
291         fftPlan->devices.resize( deviceListSize / sizeof( cl_device_id ) );
292
293         /* Now, get the device list data */
294         OPENCL_V( ::clGetContextInfo( context, CL_CONTEXT_DEVICES, deviceListSize, &fftPlan->devices[ 0 ], NULL ),
295                 "Getting device array ( ::clGetContextInfo() )" );
296 #endif
297
298         //      Need to devise a way to generate better names
299         tstringstream   tstream;
300         tstream << _T( "plan_" ) << *plHandle;
301
302         lockRAII* planLock      = NULL;
303         OPENCL_V( fftRepo.getPlan( *plHandle, fftPlan, planLock ), _T( "fftRepo.getPlan failed" ) );
304         planLock->setName( tstream.str( ) );
305
306         //      Set the lengths and default strides/pitches depending on the dim that the user passes to us
307         switch( dim )
308         {
309                 case CLFFT_1D:
310                 {
311                         fftPlan->length.push_back( lenX );
312                         fftPlan->inStride.push_back( 1 );
313                         fftPlan->outStride.push_back( 1 );
314                         fftPlan->iDist          = lenX;
315                         fftPlan->oDist          = lenX;
316                 }
317                         break;
318                 case CLFFT_2D:
319                 {
320                         fftPlan->length.push_back( lenX );
321                         fftPlan->length.push_back( lenY );
322                         fftPlan->inStride.push_back( 1 );
323                         fftPlan->inStride.push_back( lenX );
324                         fftPlan->outStride.push_back( 1 );
325                         fftPlan->outStride.push_back( lenX );
326                         fftPlan->iDist          = lenX*lenY;
327                         fftPlan->oDist          = lenX*lenY;
328                 }
329                         break;
330                 case CLFFT_3D:
331                 {
332                         fftPlan->length.push_back( lenX );
333                         fftPlan->length.push_back( lenY );
334                         fftPlan->length.push_back( lenZ );
335                         fftPlan->inStride.push_back( 1 );
336                         fftPlan->inStride.push_back( lenX );
337                         fftPlan->inStride.push_back( lenX*lenY );
338                         fftPlan->outStride.push_back( 1 );
339                         fftPlan->outStride.push_back( lenX );
340                         fftPlan->outStride.push_back( lenX*lenY );
341                         fftPlan->iDist          = lenX*lenY*lenZ;
342                         fftPlan->oDist          = lenX*lenY*lenZ;
343                 }
344                         break;
345         }
346
347         fftPlan->plHandle = *plHandle;
348
349         return  CLFFT_SUCCESS;
350 }
351
352 // This external entry-point should not be called from within the library. Use clfftCreateDefaultPlanInternal instead.
353 clfftStatus     clfftCreateDefaultPlan( clfftPlanHandle* plHandle, cl_context context, const clfftDim dim,
354                                                 const size_t* clLengths )
355 {
356         clfftStatus ret = clfftCreateDefaultPlanInternal(plHandle, context, dim, clLengths);
357
358         if(ret == CLFFT_SUCCESS)
359         {
360                 FFTRepo& fftRepo        = FFTRepo::getInstance( );
361                 FFTPlan *fftPlan = NULL;
362                 lockRAII* planLock      = NULL;
363                 OPENCL_V( fftRepo.getPlan( *plHandle, fftPlan, planLock ), _T( "fftRepo.getPlan failed" ) );
364
365                 fftPlan->userPlan = true;
366         }
367         
368         return ret;
369
370 }
371
372 std::string getKernelName(const clfftGenerators gen, const clfftPlanHandle plHandle, bool withPlHandle)
373 {
374     //  Logic to define a sensible filename
375     const std::string kernelPrefix( "clfft.kernel." );
376     std::string generatorName;
377     std::stringstream kernelPath;
378
379     switch( gen )
380     {
381
382     case Stockham:                          generatorName = "Stockham"; break;
383         case Transpose_GCN:                 generatorName = "Transpose"; break;
384         case Transpose_SQUARE:      generatorName = "Transpose"; break;
385     case Transpose_NONSQUARE:   generatorName = "TransposeNonSquare"; break;
386         case Copy:                                  generatorName = "Copy"; break;
387
388     }
389
390     kernelPath << kernelPrefix << generatorName ;
391
392     if (withPlHandle)
393         kernelPath << plHandle;
394
395     kernelPath << ".cl";
396
397     return kernelPath.str();
398 }
399
400
401 clfftStatus selectAction(FFTPlan * fftPlan, FFTAction *& action, cl_command_queue* commQueueFFT)
402 {
403     // set the action we are baking a leaf
404     clfftStatus err;
405     
406     switch (fftPlan->gen)
407     {
408     case Stockham:  
409                 {
410                         // Instantiate the default stockham generator
411                         action = new FFTGeneratedStockhamAction (fftPlan->plHandle, fftPlan, *commQueueFFT, err);
412                         OPENCL_V( err, "FFTGeneratedStockhamAction() failed");
413                 }
414                 break;
415
416     case Transpose_GCN: 
417                 {
418                         action = new FFTGeneratedTransposeGCNAction(fftPlan->plHandle, fftPlan, *commQueueFFT, err);
419                         OPENCL_V( err, "FFTGeneratedTransposeGCNAction() failed");
420                 }
421                 break;
422
423
424     case Copy:
425                 {
426                         action = new FFTGeneratedCopyAction     (fftPlan->plHandle, fftPlan, *commQueueFFT, err);
427                         OPENCL_V( err, "FFTGeneratedCopyAction() failed");
428                 }
429                 break;
430
431     default:
432                 {
433                         assert(false);
434                         OPENCL_V( CLFFT_NOTIMPLEMENTED, "selectAction() failed");
435                 }
436     }
437
438         return CLFFT_SUCCESS;
439 }
440
441
442 inline size_t PrecisionWidth(clfftPrecision pr)
443 {
444         switch(pr)
445         {
446         case CLFFT_SINGLE:      return 1;
447         case CLFFT_DOUBLE:      return 2;
448         default:                assert(false); return 1;
449         }
450 }
451
452
453
454 clfftStatus     clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_command_queue* commQueueFFT,
455                                                         void (CL_CALLBACK *pfn_notify)( clfftPlanHandle plHandle, void *user_data ), void* user_data )
456 {
457         //      We do not currently support multi-GPU transforms
458         if( numQueues > 1 )
459                 return CLFFT_NOTIMPLEMENTED;
460
461         //      Notification mechanism is not set up yet; BakePlan can be called recursively to decompose higher dimension FFT's into
462         //      arrays of 1d transforms, and this must be implemented to make only a single callback to the user.
463         if( pfn_notify != NULL )
464                 return CLFFT_NOTIMPLEMENTED;
465
466         if( user_data != NULL )
467                 return CLFFT_NOTIMPLEMENTED;
468
469         FFTRepo& fftRepo        = FFTRepo::getInstance( );
470         FFTPlan* fftPlan        = NULL;
471         lockRAII* planLock      = NULL;
472
473         OPENCL_V( fftRepo.getPlan( plHandle, fftPlan, planLock ), _T( "fftRepo.getPlan failed" ) );
474         scopedLock sLock( *planLock, _T( "clfftBakePlan" ) );
475
476         // if we have already baked the plan and nothing has changed since, we're done here
477         if( fftPlan->baked == true )
478         {
479                 return CLFFT_SUCCESS;
480         }
481
482         // Store the device for which we are baking
483         clGetCommandQueueInfo(*commQueueFFT, CL_QUEUE_DEVICE, sizeof(cl_device_id), &fftPlan->bakeDevice, NULL);
484
485         //find product of lengths
486         size_t maxLengthInAnyDim = 1;
487         switch(fftPlan->dim)
488         {
489                 case CLFFT_3D: maxLengthInAnyDim = maxLengthInAnyDim > fftPlan->length[DimZ] ? maxLengthInAnyDim : fftPlan->length[DimZ];
490                 case CLFFT_2D: maxLengthInAnyDim = maxLengthInAnyDim > fftPlan->length[DimY] ? maxLengthInAnyDim : fftPlan->length[DimY];
491                 case CLFFT_1D: maxLengthInAnyDim = maxLengthInAnyDim > fftPlan->length[DimX] ? maxLengthInAnyDim : fftPlan->length[DimX];
492         }
493
494         const bool rc = (fftPlan->inputLayout == CLFFT_REAL) || (fftPlan->outputLayout == CLFFT_REAL);
495
496         // upper bounds on transfrom lengths - address this in the next release
497         size_t SP_MAX_LEN = 1 << 24;
498         size_t DP_MAX_LEN = 1 << 22;
499         if((fftPlan->precision == CLFFT_SINGLE) && (maxLengthInAnyDim > SP_MAX_LEN) && rc) return CLFFT_NOTIMPLEMENTED;
500         if((fftPlan->precision == CLFFT_DOUBLE) && (maxLengthInAnyDim > DP_MAX_LEN) && rc) return CLFFT_NOTIMPLEMENTED;
501
502
503         // release buffers, as these will be created only in EnqueueTransform
504         if( NULL != fftPlan->intBuffer ) { OPENCL_V( clReleaseMemObject( fftPlan->intBuffer ), _T( "Failed to release internal temporary buffer" ) ); fftPlan->intBuffer = NULL; }
505         if( NULL != fftPlan->intBufferRC ) { OPENCL_V( clReleaseMemObject( fftPlan->intBufferRC ), _T( "Failed to release internal temporary buffer" ) ); fftPlan->intBufferRC = NULL; }
506         if( NULL != fftPlan->intBufferC2R ) { OPENCL_V( clReleaseMemObject( fftPlan->intBufferC2R ), _T( "Failed to release internal temporary buffer" ) ); fftPlan->intBufferC2R = NULL; }
507
508
509     if( fftPlan->userPlan ) // confirm it is top-level plan (user plan)
510         {
511                 if(fftPlan->placeness == CLFFT_INPLACE)
512                 {
513                         if( (fftPlan->inputLayout == CLFFT_HERMITIAN_PLANAR) || (fftPlan->outputLayout == CLFFT_HERMITIAN_PLANAR) )
514                                 return CLFFT_INVALID_PLAN;
515                 }
516
517                 // Make sure strides & distance are same for C-C transforms
518                 if(fftPlan->placeness == CLFFT_INPLACE)
519                 {
520                         if( (fftPlan->inputLayout != CLFFT_REAL) && (fftPlan->outputLayout != CLFFT_REAL) )
521                         {
522                                 // check strides
523                                 for(size_t i=0; i<fftPlan->dim; i++)
524                                         if(fftPlan->inStride[i] != fftPlan->outStride[i])
525                                                 return CLFFT_INVALID_PLAN;
526
527                                 // check distance
528                                 if(fftPlan->iDist != fftPlan->oDist)
529                                         return CLFFT_INVALID_PLAN;
530                         }
531                 }
532         }
533
534         if(fftPlan->gen == Copy)
535         {
536         clfftStatus err;
537         fftPlan->action = new FFTGeneratedCopyAction(plHandle, fftPlan, *commQueueFFT, err);
538         OPENCL_V( err, _T( "FFTGeneratedCopyAction() failed" ) );
539                 fftPlan->baked          = true;
540                 return  CLFFT_SUCCESS;
541         }
542
543
544         if( fftPlan->userPlan )
545         {
546                 //      If the user specifies double precision, check that the device supports double precision first
547                 if( fftPlan->precision == CLFFT_DOUBLE || fftPlan->precision == CLFFT_DOUBLE_FAST )
548                 {
549                         clfftStatus retAmdFp64 = checkDevExt( "cl_amd_fp64", fftPlan->bakeDevice );
550                         if( retAmdFp64 != CLFFT_SUCCESS )
551                         {
552                                 //      If AMD's extention is not supported, check for Khronos extention
553                                 clfftStatus retKhrFp64 = checkDevExt( "cl_khr_fp64", fftPlan->bakeDevice );
554                                 if( retKhrFp64 != CLFFT_SUCCESS )
555                                         return retKhrFp64;
556                         }
557                 }
558         }
559
560         // Compress the plan by discarding length '1' dimensions
561         // decision to pick generator
562         if( fftPlan->userPlan && !rc ) // confirm it is top-level plan (user plan)
563         {
564                 size_t dmnsn = fftPlan->dim;
565                 bool pow2flag = true;
566
567                 // switch case flows with no 'break' statements
568                 switch(fftPlan->dim)
569                 {
570                 case CLFFT_3D:
571
572                         if(fftPlan->length[DimZ] == 1)
573                         {
574                                 dmnsn -= 1;
575                                 fftPlan-> inStride.erase(fftPlan-> inStride.begin() + 2);
576                                 fftPlan->outStride.erase(fftPlan->outStride.begin() + 2);
577                                 fftPlan->   length.erase(fftPlan->   length.begin() + 2);
578                         }
579                         else
580                         {
581                                 if( !IsPo2(fftPlan->length[DimZ])) pow2flag=false;
582                         }
583                 case CLFFT_2D:
584
585                         if(fftPlan->length[DimY] == 1)
586                         {
587                                 dmnsn -= 1;
588                                 fftPlan-> inStride.erase(fftPlan-> inStride.begin() + 1);
589                                 fftPlan->outStride.erase(fftPlan->outStride.begin() + 1);
590                                 fftPlan->   length.erase(fftPlan->   length.begin() + 1);
591                         }
592                         else
593                         {
594                                 if( !IsPo2(fftPlan->length[DimY])) pow2flag=false;
595                         }
596
597                 case CLFFT_1D:
598
599                         if( (fftPlan->length[DimX] == 1) && (dmnsn > 1) )
600                         {
601                                 dmnsn -= 1;
602                                 fftPlan-> inStride.erase(fftPlan-> inStride.begin());
603                                 fftPlan->outStride.erase(fftPlan->outStride.begin());
604                                 fftPlan->   length.erase(fftPlan->   length.begin());
605                         }
606                         else
607                         {
608                                 if( !IsPo2(fftPlan->length[DimX])) pow2flag=false;
609                         }
610                 }
611
612                 fftPlan->dim = (clfftDim)dmnsn;
613         }
614
615         // first time check transposed
616         if (fftPlan->transposed != CLFFT_NOTRANSPOSE && fftPlan->dim != CLFFT_2D &&
617                 fftPlan->dim == fftPlan->length.size())
618                 return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
619
620         //      The largest vector we can transform in a single pass
621         //      depends on the GPU caps -- especially the amount of LDS
622         //      available
623         //
624         size_t Large1DThreshold = 0;
625
626
627         OPENCL_V(fftPlan->GetMax1DLength (&Large1DThreshold), _T("GetMax1DLength failed"));
628         BUG_CHECK (Large1DThreshold > 1);
629
630         //      Verify that the data passed to us is packed
631         switch( fftPlan->dim )
632         {
633         case CLFFT_1D:
634                 {
635                         if ( !Is1DPossible(fftPlan->length[0], Large1DThreshold) )
636                         {
637                                 size_t clLengths[] = { 1, 1, 0 };
638                                 size_t in_1d, in_x, count;
639
640                                 BUG_CHECK (IsPo2 (Large1DThreshold))
641
642
643                                 if( IsPo2(fftPlan->length[0]) )
644                                 {
645                                         // Enable block compute under these conditions
646                                         if( (fftPlan->inStride[0] == 1) && (fftPlan->outStride[0] == 1) && !rc
647                                                 && (fftPlan->length[0] <= 262144/PrecisionWidth(fftPlan->precision)) && (fftPlan->length.size() <= 1)
648                                                 && (!clfftGetRequestLibNoMemAlloc() || (fftPlan->placeness == CLFFT_OUTOFPLACE)) )
649                                         {
650                                                 fftPlan->blockCompute = true;
651
652                                                 if(1 == PrecisionWidth(fftPlan->precision))
653                                                 {
654                                                         switch(fftPlan->length[0])
655                                                         {
656                                                         case 8192:              clLengths[1] = 64;      break;
657                                                         case 16384:             clLengths[1] = 64;      break;
658                                                         case 32768:             clLengths[1] = 128;     break;
659                                                         case 65536:             clLengths[1] = 256;     break;
660                                                         case 131072:    clLengths[1] = 64;      break;
661                                                         case 262144:    clLengths[1] = 64;      break;
662                                                         case 524288:    clLengths[1] = 256; break;
663                                                         case 1048576:   clLengths[1] = 256; break;
664                                                         default:                assert(false);
665                                                         }
666                                                 }
667                                                 else
668                                                 {
669                                                         switch(fftPlan->length[0])
670                                                         {
671                                                         case 4096:              clLengths[1] = 64;      break;
672                                                         case 8192:              clLengths[1] = 64;      break;
673                                                         case 16384:             clLengths[1] = 64;      break;
674                                                         case 32768:             clLengths[1] = 128;     break;
675                                                         case 65536:             clLengths[1] = 64;      break;
676                                                         case 131072:    clLengths[1] = 64;      break;
677                                                         case 262144:    clLengths[1] = 128;     break;
678                                                         case 524288:    clLengths[1] = 256; break;
679                                                         default:                assert(false);
680                                                         }
681                                                 }
682                                         }
683                                         else
684                                         {
685                                                 if( clfftGetRequestLibNoMemAlloc() && !rc && (fftPlan->placeness == CLFFT_INPLACE) )
686                                                 {
687                                                         in_x = BitScanF(fftPlan->length[0]);
688                                                         in_x /= 2;
689                                                         clLengths[1] = (size_t)1 << in_x;
690                                                 }
691                                                 else if( fftPlan->length[0] > (Large1DThreshold * Large1DThreshold) )
692                                                 {
693                                                         clLengths[1] = fftPlan->length[0] / Large1DThreshold;
694                                                 }
695                                                 else
696                                                 {
697                                                         in_1d = BitScanF (Large1DThreshold);    // this is log2(LARGE1D_THRESHOLD)
698                                                         in_x  = BitScanF (fftPlan->length[0]);  // this is log2(length)
699                                                         BUG_CHECK (in_1d > 0)
700                                                         count = in_x/in_1d;
701                                                         if (count*in_1d < in_x)
702                                                         {
703                                                                 count++;
704                                                                 in_1d = in_x / count;
705                                                                 if (in_1d * count < in_x) in_1d++;
706                                                         }
707                                                         clLengths[1] = (size_t)1 << in_1d;
708                                                 }
709                                         }
710                                 }
711                                 else
712                                 {
713                                         // This array must be kept sorted in ascending order
714
715                                         size_t supported[] = {  1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 18, 20, 21, 22, 24,
716                                                                                         25, 26, 27, 28, 30, 32, 33, 35, 36, 39, 40, 42, 44, 45, 48, 49, 50, 52, 54,
717                                                                                         55, 56, 60, 63, 64, 65, 66, 70, 72, 75, 77, 78, 80, 81, 84, 88, 90, 91, 96,
718                                                                                         98, 99, 100, 104, 105, 108, 110, 112, 117, 120, 121, 125, 126, 128, 130, 132,
719                                                                                         135, 140, 143, 144, 147, 150, 154, 156, 160, 162, 165, 168, 169, 175, 176,
720                                                                                         180, 182, 189, 192, 195, 196, 198, 200, 208, 210, 216, 220, 224, 225, 231,
721                                                                                         234, 240, 242, 243, 245, 250, 252, 256, 260, 264, 270, 273, 275, 280, 286,
722                                                                                         288, 294, 297, 300, 308, 312, 315, 320, 324, 325, 330, 336, 338, 343, 350,
723                                                                                         351, 352, 360, 363, 364, 375, 378, 384, 385, 390, 392, 396, 400, 405, 416,
724                                                                                         420, 429, 432, 440, 441, 448, 450, 455, 462, 468, 480, 484, 486, 490, 495,
725                                                                                         500, 504, 507, 512, 520, 525, 528, 539, 540, 546, 550, 560, 567, 572, 576,
726                                                                                         585, 588, 594, 600, 605, 616, 624, 625, 630, 637, 640, 648, 650, 660, 672,
727                                                                                         675, 676, 686, 693, 700, 702, 704, 715, 720, 726, 728, 729, 735, 750, 756,
728                                                                                         768, 770, 780, 784, 792, 800, 810, 819, 825, 832, 840, 845, 847, 858, 864,
729                                                                                         875, 880, 882, 891, 896, 900, 910, 924, 936, 945, 960, 968, 972, 975, 980,
730                                                                                         990, 1000, 1001, 1008, 1014, 1024, 1029, 1040, 1050, 1053, 1056, 1078, 1080,
731                                                                                         1089, 1092, 1100, 1120, 1125, 1134, 1144, 1152, 1155, 1170, 1176, 1183, 1188,
732                                                                                         1200, 1210, 1215, 1225, 1232, 1248, 1250, 1260, 1274, 1280, 1287, 1296, 1300,
733                                                                                         1320, 1323, 1331, 1344, 1350, 1352, 1365, 1372, 1375, 1386, 1400, 1404, 1408,
734                                                                                         1430, 1440, 1452, 1456, 1458, 1470, 1485, 1500, 1512, 1521, 1536, 1540, 1560,
735                                                                                         1568, 1573, 1575, 1584, 1600, 1617, 1620, 1625, 1638, 1650, 1664, 1680, 1690,
736                                                                                         1694, 1701, 1715, 1716, 1728, 1750, 1755, 1760, 1764, 1782, 1792, 1800, 1815,
737                                                                                         1820, 1848, 1859, 1872, 1875, 1890, 1911, 1920, 1925, 1936, 1944, 1950, 1960,
738                                                                                         1980, 2000, 2002, 2016, 2025, 2028, 2048, 2058, 2079, 2080, 2100, 2106, 2112,
739                                                                                         2145, 2156, 2160, 2178, 2184, 2187, 2197, 2200, 2205, 2240, 2250, 2268, 2275,
740                                                                                         2288, 2304, 2310, 2340, 2352, 2366, 2376, 2400, 2401, 2420, 2430, 2450, 2457,
741                                                                                         2464, 2475, 2496, 2500, 2520, 2535, 2541, 2548, 2560, 2574, 2592, 2600, 2625,
742                                                                                         2640, 2646, 2662, 2673, 2688, 2695, 2700, 2704, 2730, 2744, 2750, 2772, 2800,
743                                                                                         2808, 2816, 2835, 2860, 2880, 2904, 2912, 2916, 2925, 2940, 2970, 3000, 3003,
744                                                                                         3024, 3025, 3042, 3072, 3080, 3087, 3120, 3125, 3136, 3146, 3150, 3159, 3168,
745                                                                                         3185, 3200, 3234, 3240, 3250, 3267, 3276, 3300, 3328, 3360, 3375, 3380, 3388,
746                                                                                         3402, 3430, 3432, 3456, 3465, 3500, 3510, 3520, 3528, 3549, 3564, 3575, 3584,
747                                                                                         3600, 3630, 3640, 3645, 3675, 3696, 3718, 3744, 3750, 3773, 3780, 3822, 3840,
748                                                                                         3850, 3861, 3872, 3888, 3900, 3920, 3960, 3969, 3993, 4000, 4004, 4032, 4050,
749                                                                                         4056, 4095, 4096 };
750
751                                         size_t lenSupported = sizeof(supported)/sizeof(supported[0]);
752                                         size_t maxFactoredLength = (supported[lenSupported-1] < Large1DThreshold) ? supported[lenSupported-1] : Large1DThreshold;
753
754                                         size_t halfPowerLength = (size_t)1 << ( (StockhamGenerator::CeilPo2(fftPlan->length[0]) + 1) / 2 );
755                                         size_t factoredLengthStart =  (halfPowerLength < maxFactoredLength) ? halfPowerLength : maxFactoredLength;
756
757                                         size_t indexStart = 0;
758                                         while(supported[indexStart] < factoredLengthStart) indexStart++;
759
760                                         for(size_t i = indexStart; i >= 1; i--)
761                                         {
762                                                 if( fftPlan->length[0] % supported[i] == 0 )
763                                                 {
764                                                         if (Is1DPossible(supported[i], Large1DThreshold))
765                                                         {
766                                                                 clLengths[1] = supported[i];
767                                                                 break;
768                                                         }
769                                                 }
770                                         }
771                                 }
772                                 // add some special cases
773                                 /*
774                                 if (fftPlan->length[0] == 10000)
775                                         clLengths[1] = 100;//100 x 100
776                                 if (fftPlan->length[0] == 100000)
777                                         clLengths[1] = 100;//100 x 1,000
778                                 if (fftPlan->length[0] == 10000000)
779                                         clLengths[1] = 1000;//1,000 x 10,000
780                                 if (fftPlan->length[0] == 100000000)
781                                         clLengths[1] = 10000;//10,000 x 10,000
782                                 if (fftPlan->length[0] == 1000000000)
783                                         clLengths[1] = 10000;//10,000 x 100,000
784                                 
785                                 if (fftPlan->length[0] == 3099363912)
786                                         clLengths[1] = 78732;//39366 x 78732
787                                 if (fftPlan->length[0] == 39366)
788                                         clLengths[1] = 81;//81*486
789                                 if (fftPlan->length[0] == 78732)
790                                         clLengths[1] = 162;//162*486
791                                 if (fftPlan->length[0] == 354294)
792                                         clLengths[1] = 243;
793                                 */
794                                 size_t threshold = 4096;
795                                 if (fftPlan->precision == CLFFT_DOUBLE)
796                                         threshold = 2048;
797                                 if (clfftGetRequestLibNoMemAlloc() &&
798                                         fftPlan->placeness == CLFFT_INPLACE &&
799                                         (fftPlan->inputLayout == fftPlan->outputLayout)
800                                         && fftPlan->length[0] > threshold)
801                                 {
802                                         //for inplace fft with inplace transpose, the split logic is different
803                                         vector<vector<size_t> > splitNums;
804                                         bool implemented = split1D_for_inplace(fftPlan->length[0], splitNums, fftPlan->precision, threshold);
805                                         if (implemented)
806                                                 clLengths[1] = splitNums[0][0];
807                                 }
808
809                                 clLengths[0] = fftPlan->length[0]/clLengths[1];
810
811                 // Start of block where transposes are generated; 1D FFT
812                                 while (1 && (fftPlan->inputLayout != CLFFT_REAL) && (fftPlan->outputLayout != CLFFT_REAL))
813                                 {
814                                         if (fftPlan->length[0] <= Large1DThreshold) break;
815
816                                         if (fftPlan->inStride[0] != 1 || fftPlan->outStride[0] != 1) break;
817
818                                         if ( IsPo2(fftPlan->length[0]) &&
819                                                  (fftPlan->length[0] <= 262144/PrecisionWidth(fftPlan->precision)) && (fftPlan->length.size() <= 1) &&
820                                                  (!clfftGetRequestLibNoMemAlloc() || (fftPlan->placeness == CLFFT_OUTOFPLACE)) ) break;
821
822                                         if ( clLengths[0]<=32 && clLengths[1]<=32) break;
823
824
825                                         size_t biggerDim = clLengths[0] > clLengths[1] ? clLengths[0] : clLengths[1];
826                                         size_t smallerDim = biggerDim == clLengths[0] ? clLengths[1] : clLengths[0];
827                                         size_t padding = 0;
828                                         if( (smallerDim % 64 == 0) || (biggerDim % 64 == 0) )
829                                                 padding = 64;
830
831                                         clfftGenerators transGen = Transpose_GCN;
832                                         
833                                         size_t dim_ratio = biggerDim / smallerDim;
834                                         size_t dim_residue = biggerDim % smallerDim;
835                                         //    If this is an in-place transform the
836                                         //    input and output layout, dimensions and strides
837                                         //    *MUST* be the same.
838                                         //
839                                         bool inStrideEqualsOutStride = true;
840                                         for (size_t u = fftPlan->inStride.size(); u-- > 0; ) {
841                                                 if (fftPlan->inStride[u] != fftPlan->outStride[u])
842                                                 {
843                                                         inStrideEqualsOutStride = false;
844                                                         break;
845                                                 }
846                                         }
847                                         //packed data is required for inplace transpose
848                                         bool isDataPacked = true;
849                                         for (size_t u = 0; u < fftPlan->inStride.size(); u++)
850                                         {
851                                                 if (u == 0)
852                                                 {
853                                                         if (fftPlan->inStride[0] == 1)
854                                                                 continue;
855                                                         else
856                                                         {
857                                                                 isDataPacked = false;
858                                                                 break;
859                                                         }
860                                                 }
861                                                 else
862                                                 {
863                                                         size_t packDataSize = 1;
864                                                         for (size_t i = 0; i < u; i++)
865                                                                 packDataSize *= fftPlan->length[i];
866                                                         if (fftPlan->inStride[u] == packDataSize)
867                                                                 continue;
868                                                         else
869                                                         {
870                                                                 isDataPacked = false;
871                                                                 break;
872                                                         }
873                                                 }
874                                         }
875                                         if (clfftGetRequestLibNoMemAlloc() &&
876                                                 dim_residue == 0 &&
877                                                 ((dim_ratio % 2 == 0) ||
878                                                  (dim_ratio % 3 == 0) ||
879                                                  (dim_ratio % 5 == 0) ||
880                                                  (dim_ratio % 10 == 0)) &&
881                                                  fftPlan->placeness == CLFFT_INPLACE &&
882                                                  (fftPlan->inputLayout == fftPlan->outputLayout) &&
883                                                  (inStrideEqualsOutStride) && (isDataPacked))
884                                         {
885                                                 padding = 0;
886                                                 fftPlan->allOpsInplace = true;
887                                                 transGen = Transpose_NONSQUARE;
888                                                 //std::cout << "Transpose_NONSQUARE" << std::endl;
889                                         }
890
891                                         if( clfftGetRequestLibNoMemAlloc() &&
892                                                 (clLengths[0] == clLengths[1]) &&
893                                                 fftPlan->placeness == CLFFT_INPLACE )
894                                         {
895                                                 padding = 0;
896                                                 fftPlan->allOpsInplace = true;
897                                                 transGen = Transpose_SQUARE;
898                                         }
899
900                                         if (fftPlan->tmpBufSize != 0)
901                                                 padding = 0;
902
903                                         if ( (fftPlan->tmpBufSize==0 ) && !fftPlan->allOpsInplace)
904                                         {
905                                                 fftPlan->tmpBufSize = (smallerDim + padding) * biggerDim *
906                                                         fftPlan->batchsize * fftPlan->ElementSize();
907
908                                                 for (size_t index = 1; index < fftPlan->length.size(); index++)
909                                                 {
910                                                         fftPlan->tmpBufSize *= fftPlan->length[index];
911                                                 }
912                                         }
913
914                                         //Transpose
915                                         //Input --> tmp buffer
916                                         OPENCL_V(clfftCreateDefaultPlanInternal( &fftPlan->planTX, fftPlan->context, CLFFT_2D, clLengths ),
917                                                 _T( "CreateDefaultPlan Large1d transpose 1 failed" ) );
918
919                                         FFTPlan* trans1Plan     = NULL;
920                                         lockRAII* trans1Lock    = NULL;
921                                         OPENCL_V( fftRepo.getPlan( fftPlan->planTX, trans1Plan, trans1Lock ), _T( "fftRepo.getPlan failed" ) );
922
923                                         trans1Plan->placeness     = fftPlan->allOpsInplace ? CLFFT_INPLACE : CLFFT_OUTOFPLACE;
924                                         trans1Plan->precision     = fftPlan->precision;
925                                         trans1Plan->tmpBufSize    = 0;
926                                         trans1Plan->batchsize     = fftPlan->batchsize;
927                                         trans1Plan->envelope      = fftPlan->envelope;
928                                         trans1Plan->inputLayout   = fftPlan->inputLayout;
929                                         trans1Plan->outputLayout  = fftPlan->allOpsInplace ? fftPlan->inputLayout : CLFFT_COMPLEX_INTERLEAVED;
930                                         trans1Plan->inStride[0]   = fftPlan->inStride[0];
931                                         trans1Plan->inStride[1]   = clLengths[0];
932                                         trans1Plan->outStride[0]  = 1;
933                                         trans1Plan->outStride[1]  = clLengths[1] + padding;
934                                         trans1Plan->iDist         = fftPlan->iDist;
935                                         trans1Plan->oDist         = clLengths[0] * trans1Plan->outStride[1];
936                                         trans1Plan->gen           = transGen;
937                                         trans1Plan->transflag     = true;
938
939                                         if (trans1Plan->gen == Transpose_NONSQUARE || trans1Plan->gen == Transpose_SQUARE)// inplace transpose
940                                         {
941                                                 for (size_t index = 1; index < fftPlan->length.size(); index++)
942                                                 {
943                                                         //trans1Plan->length.push_back(fftPlan->length[index]);
944                                                         /*
945                                                         replacing the line above with the two lines below since:
946                                                         fftPlan is still 1D, thus the broken down transpose should be 2D not 3D
947                                                         the batchSize for the transpose should increase accordingly.
948                                                         the iDist should decrease accordingly. Push back to length will cause a 3D transpose
949                                                         */
950                                                         trans1Plan->batchsize = trans1Plan->batchsize * fftPlan->length[index];
951                                                         trans1Plan->iDist = trans1Plan->iDist / fftPlan->length[index];
952
953                                                         trans1Plan->inStride.push_back(fftPlan->inStride[index]);
954                                                         trans1Plan->outStride.push_back(trans1Plan->oDist);
955                                                         trans1Plan->oDist *= fftPlan->length[index];
956                                                 }
957                                         }
958                                         else
959                                         {
960                                                 for (size_t index = 1; index < fftPlan->length.size(); index++)
961                                                 {
962                                                         trans1Plan->length.push_back(fftPlan->length[index]);
963
964                                                         trans1Plan->inStride.push_back(fftPlan->inStride[index]);
965                                                         trans1Plan->outStride.push_back(trans1Plan->oDist);
966                                                         trans1Plan->oDist *= fftPlan->length[index];
967                                                 }
968                                         }
969
970                                         //Set callback data if set on top level plan
971                                         if (fftPlan->hasPreCallback)
972                                         {
973                                                 trans1Plan->hasPreCallback = true;
974                                                 trans1Plan->preCallback = fftPlan->preCallback;
975                                                 trans1Plan->precallUserData = fftPlan->precallUserData;
976                                         }
977
978                                         OPENCL_V(clfftBakePlan(fftPlan->planTX, numQueues, commQueueFFT, NULL, NULL ),
979                                                 _T( "BakePlan large1d trans1 plan failed" ) );
980
981                                         //Row transform
982                                         //tmp->output
983                                         //size clLengths[1], batch clLengths[0], with length[0] twiddle factor multiplication
984                                         OPENCL_V(clfftCreateDefaultPlanInternal( &fftPlan->planX, fftPlan->context, CLFFT_1D, &clLengths[1] ),
985                                                 _T( "CreateDefaultPlan Large1d column failed" ) );
986
987                                         FFTPlan* row1Plan       = NULL;
988                                         lockRAII* row1Lock      = NULL;
989                                         OPENCL_V( fftRepo.getPlan( fftPlan->planX, row1Plan, row1Lock ), _T( "fftRepo.getPlan failed" ) );
990
991                                         row1Plan->placeness     = fftPlan->allOpsInplace ? CLFFT_INPLACE : CLFFT_OUTOFPLACE;
992                                         row1Plan->precision     = fftPlan->precision;
993                                         row1Plan->forwardScale  = 1.0f;
994                                         row1Plan->backwardScale = 1.0f;
995                                         row1Plan->tmpBufSize    = 0;
996                                         row1Plan->batchsize     = fftPlan->batchsize;
997
998                                         row1Plan->gen                   = fftPlan->gen;
999                                         row1Plan->envelope              = fftPlan->envelope;
1000
1001                                         // twiddling is done in row2
1002                                         row1Plan->large1D               = 0;
1003
1004                                         row1Plan->length.push_back(clLengths[0]);
1005                                         row1Plan->inputLayout   = fftPlan->allOpsInplace ? fftPlan->inputLayout : CLFFT_COMPLEX_INTERLEAVED;
1006                                         row1Plan->outputLayout  = fftPlan->outputLayout;
1007                                         row1Plan->inStride[0]   = 1;
1008                                         row1Plan->outStride[0]  = fftPlan->outStride[0];
1009                                         row1Plan->inStride.push_back(clLengths[1]+padding);
1010                                         row1Plan->outStride.push_back(clLengths[1]);
1011                                         row1Plan->iDist         = clLengths[0] * row1Plan->inStride[1];
1012                                         row1Plan->oDist         = fftPlan->oDist;
1013
1014                                         for (size_t index = 1; index < fftPlan->length.size(); index++)
1015                                         {
1016                                                 row1Plan->length.push_back(fftPlan->length[index]);
1017                                                 row1Plan->inStride.push_back(row1Plan->iDist);
1018                                                 row1Plan->iDist *= fftPlan->length[index];
1019                                                 row1Plan->outStride.push_back(fftPlan->outStride[index]);
1020                                         }
1021
1022                                         OPENCL_V(clfftBakePlan(fftPlan->planX, numQueues, commQueueFFT, NULL, NULL ),
1023                                                 _T( "BakePlan large1d first row plan failed" ) );
1024
1025                                         //Transpose 2
1026                                         //Output --> tmp buffer
1027                                         clLengths[2] = clLengths[0];
1028                                         OPENCL_V(clfftCreateDefaultPlanInternal( &fftPlan->planTY, fftPlan->context, CLFFT_2D, &clLengths[1] ),
1029                                                 _T( "CreateDefaultPlan Large1d transpose 2 failed" ) );
1030
1031                                         FFTPlan* trans2Plan     = NULL;
1032                                         lockRAII* trans2Lock    = NULL;
1033                                         OPENCL_V( fftRepo.getPlan( fftPlan->planTY, trans2Plan, trans2Lock ), _T( "fftRepo.getPlan failed" ) );
1034
1035                                         trans2Plan->placeness     = fftPlan->allOpsInplace ? CLFFT_INPLACE : CLFFT_OUTOFPLACE;
1036                                         trans2Plan->precision     = fftPlan->precision;
1037                                         trans2Plan->tmpBufSize    = 0;
1038                                         trans2Plan->batchsize     = fftPlan->batchsize;
1039                                         trans2Plan->envelope      = fftPlan->envelope;
1040                                         trans2Plan->inputLayout   = fftPlan->outputLayout;
1041                                         trans2Plan->outputLayout  = fftPlan->allOpsInplace ? fftPlan->inputLayout : CLFFT_COMPLEX_INTERLEAVED;
1042                                         trans2Plan->inStride[0]   = fftPlan->outStride[0];
1043                                         trans2Plan->inStride[1]   = clLengths[1];
1044                                         trans2Plan->outStride[0]  = 1;
1045                                         trans2Plan->outStride[1]  = clLengths[0] + padding;
1046                                         trans2Plan->iDist         = fftPlan->oDist;
1047                                         trans2Plan->oDist         = clLengths[1] * trans2Plan->outStride[1];
1048                     trans2Plan->gen           = transGen;
1049
1050                                         //if (transGen != Transpose_NONSQUARE)//twiddle
1051                                                 trans2Plan->large1D               = fftPlan->length[0];
1052
1053                                         trans2Plan->transflag     = true;
1054
1055                                         if (trans2Plan->gen == Transpose_NONSQUARE || trans2Plan->gen == Transpose_SQUARE)// inplace transpose
1056                                         {
1057                                                 for (size_t index = 1; index < fftPlan->length.size(); index++)
1058                                                 {
1059                                                         //trans2Plan->length.push_back(fftPlan->length[index]);
1060                                                         /*
1061                                                         replacing the line above with the two lines below since:
1062                                                         fftPlan is still 1D, thus the broken down transpose should be 2D not 3D
1063                                                         the batchSize for the transpose should increase accordingly.
1064                                                         the iDist should decrease accordingly. Push back to length will cause a 3D transpose
1065                                                         */
1066                                                         trans2Plan->batchsize = trans2Plan->batchsize * fftPlan->length[index];
1067                                                         trans2Plan->iDist = trans2Plan->iDist / fftPlan->length[index];
1068                                                         trans2Plan->inStride.push_back(fftPlan->outStride[index]);
1069                                                         trans2Plan->outStride.push_back(trans2Plan->oDist);
1070                                                         trans2Plan->oDist *= fftPlan->length[index];
1071                                                 }
1072                                         }
1073                                         else
1074                                         {
1075                                                 for (size_t index = 1; index < fftPlan->length.size(); index++)
1076                                                 {
1077                                                         trans2Plan->length.push_back(fftPlan->length[index]);
1078
1079                                                         trans2Plan->inStride.push_back(fftPlan->outStride[index]);
1080                                                         trans2Plan->outStride.push_back(trans2Plan->oDist);
1081                                                         trans2Plan->oDist *= fftPlan->length[index];
1082                                                 }
1083                                         }
1084
1085                                         OPENCL_V(clfftBakePlan(fftPlan->planTY, numQueues, commQueueFFT, NULL, NULL ),
1086                                                 _T( "BakePlan large1d trans2 plan failed" ) );
1087
1088                                         //Row transform 2
1089                                         //tmp->tmp
1090                                         //size clLengths[0], batch clLengths[1]
1091                                         OPENCL_V(clfftCreateDefaultPlanInternal( &fftPlan->planY, fftPlan->context, CLFFT_1D, &clLengths[0] ),
1092                                                 _T( "CreateDefaultPlan Large1d second row plan failed" ) );
1093
1094                                         FFTPlan* row2Plan       = NULL;
1095                                         lockRAII* row2Lock      = NULL;
1096                                         OPENCL_V( fftRepo.getPlan( fftPlan->planY, row2Plan, row2Lock ), _T( "fftRepo.getPlan failed" ) );
1097
1098                                         row2Plan->placeness     = CLFFT_INPLACE;
1099                                         row2Plan->precision     = fftPlan->precision;
1100                                         row2Plan->forwardScale  = fftPlan->forwardScale;
1101                                         row2Plan->backwardScale = fftPlan->backwardScale;
1102                                         row2Plan->tmpBufSize    = 0;
1103                                         row2Plan->batchsize     = fftPlan->batchsize;
1104
1105                                         row2Plan->gen                   = fftPlan->gen;
1106                                         row2Plan->envelope              = fftPlan->envelope;
1107
1108
1109                                         row2Plan->length.push_back(clLengths[1]);
1110                                         row2Plan->inputLayout   = fftPlan->allOpsInplace ? fftPlan->inputLayout : CLFFT_COMPLEX_INTERLEAVED;
1111                                         row2Plan->outputLayout  = fftPlan->allOpsInplace ? fftPlan->inputLayout : CLFFT_COMPLEX_INTERLEAVED;
1112                                         row2Plan->inStride[0]   = 1;
1113                                         row2Plan->outStride[0]  = 1;
1114                                         row2Plan->inStride.push_back(clLengths[0] + padding);
1115                                         row2Plan->outStride.push_back(clLengths[0] + padding);
1116                                         row2Plan->iDist         = clLengths[1] * row2Plan->inStride[1];
1117                                         row2Plan->oDist         = clLengths[1] * row2Plan->outStride[1];
1118
1119                                         for (size_t index = 1; index < fftPlan->length.size(); index++)
1120                                         {
1121                                                 row2Plan->length.push_back(fftPlan->length[index]);
1122                                                 row2Plan->inStride.push_back(row2Plan->iDist);
1123                                                 row2Plan->outStride.push_back(row2Plan->oDist);
1124                                                 row2Plan->iDist *= fftPlan->length[index];
1125                                                 row2Plan->oDist *= fftPlan->length[index];
1126                                         }
1127                                         
1128                                         //if (transGen != Transpose_NONSQUARE)//twiddle in transform
1129                                         //{
1130                                         //      row2Plan->large1D = fftPlan->length[0];
1131                                         //      row2Plan->twiddleFront = true;
1132                                         //}
1133
1134                                         OPENCL_V(clfftBakePlan(fftPlan->planY, numQueues, commQueueFFT, NULL, NULL ),
1135                                                 _T( "BakePlan large1d second row plan failed" ) );
1136
1137                                         //Transpose 3
1138                                         //tmp --> output
1139                                         OPENCL_V(clfftCreateDefaultPlanInternal( &fftPlan->planTZ, fftPlan->context, CLFFT_2D, clLengths ),
1140                                                 _T( "CreateDefaultPlan Large1d transpose 3 failed" ) );
1141
1142                                         FFTPlan* trans3Plan     = NULL;
1143                                         lockRAII* trans3Lock    = NULL;
1144                                         OPENCL_V( fftRepo.getPlan( fftPlan->planTZ, trans3Plan, trans3Lock ), _T( "fftRepo.getPlan failed" ) );
1145
1146                                         trans3Plan->placeness     = fftPlan->allOpsInplace ? CLFFT_INPLACE : CLFFT_OUTOFPLACE;
1147                                         trans3Plan->precision     = fftPlan->precision;
1148                                         trans3Plan->tmpBufSize    = 0;
1149                                         trans3Plan->batchsize     = fftPlan->batchsize;
1150                                         trans3Plan->envelope      = fftPlan->envelope;
1151                                         trans3Plan->inputLayout   = fftPlan->allOpsInplace ? fftPlan->inputLayout : CLFFT_COMPLEX_INTERLEAVED;
1152                                         trans3Plan->outputLayout  = fftPlan->outputLayout;
1153                                         trans3Plan->inStride[0]   = 1;
1154                                         trans3Plan->inStride[1]   = clLengths[0] + padding;
1155                                         trans3Plan->outStride[0]  = fftPlan->outStride[0];
1156                                         trans3Plan->outStride[1]  = clLengths[1];
1157                                         trans3Plan->iDist         = clLengths[1] * trans3Plan->inStride[1];
1158                                         trans3Plan->oDist         = fftPlan->oDist;
1159                     trans3Plan->gen           = transGen;
1160                                         trans3Plan->transflag     = true;
1161                                         trans3Plan->transOutHorizontal = true;
1162
1163
1164                                         if (trans3Plan->gen == Transpose_NONSQUARE)// inplace transpose
1165                                         {
1166                                                 for (size_t index = 1; index < fftPlan->length.size(); index++)
1167                                                 {
1168                                                         //trans3Plan->length.push_back(fftPlan->length[index]);
1169                                                         /*
1170                                                         replacing the line above with the two lines below since:
1171                                                         fftPlan is still 1D, thus the broken down transpose should be 2D not 3D
1172                                                         the batchSize for the transpose should increase accordingly.
1173                                                         the iDist should decrease accordingly. Push back to length will cause a 3D transpose
1174                                                         */
1175                                                         trans3Plan->batchsize = trans3Plan->batchsize * fftPlan->length[index];
1176                                                         //trans3Plan->iDist = trans3Plan->iDist / fftPlan->length[index];
1177                                                         //trans3Plan->inStride.push_back(trans3Plan->iDist);
1178                                                         trans3Plan->inStride.push_back(fftPlan->inStride[index]);
1179                                                         //trans3Plan->iDist *= fftPlan->length[index];
1180                                                         trans3Plan->outStride.push_back(fftPlan->outStride[index]);
1181                                                 }
1182                                         }
1183                                         else if (trans3Plan->gen == Transpose_SQUARE)
1184                                         {
1185                                                 for (size_t index = 1; index < fftPlan->length.size(); index++)
1186                                                 {
1187                                                         trans3Plan->batchsize = trans3Plan->batchsize * fftPlan->length[index];
1188                                                         //trans3Plan->iDist = trans3Plan->iDist / fftPlan->length[index];
1189                                                         //trans3Plan->inStride.push_back(trans3Plan->iDist);
1190                                                         trans3Plan->inStride.push_back(fftPlan->inStride[index]);
1191                                                         //trans3Plan->iDist *= fftPlan->length[index];
1192                                                         trans3Plan->outStride.push_back(fftPlan->outStride[index]);
1193                                                 }
1194                                         }
1195                                         else
1196                                         {
1197                                                 for (size_t index = 1; index < fftPlan->length.size(); index++)
1198                                                 {
1199                                                         trans3Plan->length.push_back(fftPlan->length[index]);
1200
1201                                                         trans3Plan->inStride.push_back(trans3Plan->iDist);
1202                                                         trans3Plan->iDist *= fftPlan->length[index];
1203                                                         trans3Plan->outStride.push_back(fftPlan->outStride[index]);
1204                                                 }
1205                                         }
1206
1207                                         //Set callback data if set on top level plan
1208                                         if (fftPlan->hasPostCallback)
1209                                         {
1210                                                 trans3Plan->hasPostCallback = true;
1211                                                 trans3Plan->postCallbackParam = fftPlan->postCallbackParam;
1212                                                 trans3Plan->postcallUserData = fftPlan->postcallUserData;
1213                                         }
1214
1215                                         OPENCL_V(clfftBakePlan(fftPlan->planTZ, numQueues, commQueueFFT, NULL, NULL ),
1216                                                 _T( "BakePlan large1d trans3 plan failed" ) );
1217
1218                                         fftPlan->transflag = true;
1219                                         fftPlan->baked = true;
1220                                         return  CLFFT_SUCCESS;
1221                                 }
1222
1223                                 size_t length0 = clLengths[0];
1224                                 size_t length1 = clLengths[1];
1225
1226
1227                                 // For real transforms
1228                                 // Special case optimization with 5-step algorithm
1229                                 if( (fftPlan->inputLayout == CLFFT_REAL) && IsPo2(fftPlan->length[0])
1230                                         && (fftPlan->length.size() == 1)
1231                                         && (fftPlan->inStride[0] == 1) && (fftPlan->outStride[0] == 1)
1232                                         && (fftPlan->length[0] > 4096) && (fftPlan->length.size() == 1) )
1233                                 {
1234
1235                                         ARG_CHECK(clLengths[0] <= Large1DThreshold);
1236
1237
1238                                         size_t biggerDim = clLengths[0] > clLengths[1] ? clLengths[0] : clLengths[1];
1239                                         size_t smallerDim = biggerDim == clLengths[0] ? clLengths[1] : clLengths[0];
1240                                         size_t padding = 0;
1241                                         if( (smallerDim % 64 == 0) || (biggerDim % 64 == 0) )
1242                                                 padding = 64;
1243
1244
1245                                         if (fftPlan->tmpBufSize==0 )
1246                                         {
1247                                                 size_t Nf = (1 + smallerDim/2) * biggerDim;
1248                                                 fftPlan->tmpBufSize = (smallerDim + padding) * biggerDim / 2;
1249
1250                                                 if(fftPlan->tmpBufSize < Nf) 
1251                                                         fftPlan->tmpBufSize = Nf;
1252
1253                                                 fftPlan->tmpBufSize *= ( fftPlan->batchsize * fftPlan->ElementSize() );
1254
1255                                                 for (size_t index=1; index < fftPlan->length.size(); index++)
1256                                                 {
1257                                                         fftPlan->tmpBufSize *= fftPlan->length[index];
1258                                                 }
1259                                         }
1260
1261                                         if (fftPlan->tmpBufSizeRC==0 )
1262                                         {
1263                                                 fftPlan->tmpBufSizeRC = fftPlan->tmpBufSize;
1264                                         }
1265
1266                                         //Transpose
1267                                         //Input --> tmp buffer
1268                                         OPENCL_V(clfftCreateDefaultPlanInternal( &fftPlan->planTX, fftPlan->context, CLFFT_2D, clLengths ),
1269                                                 _T( "CreateDefaultPlan Large1d transpose 1 failed" ) );
1270
1271                                         FFTPlan* trans1Plan     = NULL;
1272                                         lockRAII* trans1Lock    = NULL;
1273                                         OPENCL_V( fftRepo.getPlan( fftPlan->planTX, trans1Plan, trans1Lock ), _T( "fftRepo.getPlan failed" ) );
1274
1275                                         trans1Plan->placeness     = CLFFT_OUTOFPLACE;
1276                                         trans1Plan->precision     = fftPlan->precision;
1277                                         trans1Plan->tmpBufSize    = 0;
1278                                         trans1Plan->batchsize     = fftPlan->batchsize;
1279                                         trans1Plan->envelope      = fftPlan->envelope;
1280                                         trans1Plan->inputLayout   = fftPlan->inputLayout;
1281                                         trans1Plan->outputLayout  = CLFFT_REAL;
1282                                         trans1Plan->inStride[0]   = fftPlan->inStride[0];
1283                                         trans1Plan->inStride[1]   = clLengths[0];
1284                                         trans1Plan->outStride[0]  = 1;
1285                                         trans1Plan->outStride[1]  = clLengths[1] + padding;
1286                                         trans1Plan->iDist         = fftPlan->iDist;
1287                                         trans1Plan->oDist         = clLengths[0] * trans1Plan->outStride[1];
1288                                         trans1Plan->gen           = Transpose_GCN;
1289                                         trans1Plan->transflag     = true;
1290
1291                                         //Set callback data if set on top level plan
1292                                         if (fftPlan->hasPreCallback)
1293                                         {
1294                                                 trans1Plan->hasPreCallback = true;
1295                                                 trans1Plan->preCallback = fftPlan->preCallback;
1296                                                 trans1Plan->precallUserData = fftPlan->precallUserData;
1297                                         }
1298
1299                                         OPENCL_V(clfftBakePlan(fftPlan->planTX, numQueues, commQueueFFT, NULL, NULL ),
1300                                                 _T( "BakePlan large1d trans1 plan failed" ) );
1301
1302                                         //Row transform
1303                                         //tmp->output
1304                                         //size clLengths[1], batch clLengths[0]; no twiddle factor multiplication in this pass (large1D is set to 0 below, twiddling is done in row2)
1305                                         OPENCL_V(clfftCreateDefaultPlanInternal( &fftPlan->planX, fftPlan->context, CLFFT_1D, &clLengths[1] ),
1306                                                 _T( "CreateDefaultPlan Large1d column failed" ) );
1307
1308                                         FFTPlan* row1Plan       = NULL;
1309                                         lockRAII* row1Lock      = NULL;
1310                                         OPENCL_V( fftRepo.getPlan( fftPlan->planX, row1Plan, row1Lock ), _T( "fftRepo.getPlan failed" ) );
1311
1312                                         row1Plan->placeness     = CLFFT_OUTOFPLACE;
1313                                         row1Plan->precision     = fftPlan->precision;
1314                                         row1Plan->forwardScale  = 1.0f;
1315                                         row1Plan->backwardScale = 1.0f;
1316                                         row1Plan->tmpBufSize    = 0;
1317                                         row1Plan->batchsize     = fftPlan->batchsize;
1318
1319                                         row1Plan->gen                   = fftPlan->gen;
1320                                         row1Plan->envelope              = fftPlan->envelope;
1321
1322                                         // twiddling is done in row2
1323                                         row1Plan->large1D               = 0;
1324
1325                                         row1Plan->length.push_back(clLengths[0]);
1326                                         row1Plan->inputLayout   = CLFFT_REAL;
1327                                         row1Plan->outputLayout  = CLFFT_HERMITIAN_INTERLEAVED;
1328                                         row1Plan->inStride[0]   = 1;
1329                                         row1Plan->outStride[0]  = 1;
1330                                         row1Plan->inStride.push_back(clLengths[1]+padding);
1331                                         row1Plan->outStride.push_back(1 + clLengths[1]/2);
1332                                         row1Plan->iDist         = clLengths[0] * row1Plan->inStride[1];
1333                                         row1Plan->oDist         = clLengths[0] * row1Plan->outStride[1]; 
1334
1335
1336                                         OPENCL_V(clfftBakePlan(fftPlan->planX, numQueues, commQueueFFT, NULL, NULL ),
1337                                                 _T( "BakePlan large1d first row plan failed" ) );
1338
1339                                         //Transpose 2
1340                                         //Output --> tmp buffer
1341                                         clLengths[2] = clLengths[0];
1342                                         OPENCL_V(clfftCreateDefaultPlanInternal( &fftPlan->planTY, fftPlan->context, CLFFT_2D, &clLengths[1] ),
1343                                                 _T( "CreateDefaultPlan Large1d transpose 2 failed" ) );
1344
1345                                         FFTPlan* trans2Plan     = NULL;
1346                                         lockRAII* trans2Lock    = NULL;
1347                                         OPENCL_V( fftRepo.getPlan( fftPlan->planTY, trans2Plan, trans2Lock ), _T( "fftRepo.getPlan failed" ) );
1348
1349                                         trans2Plan->transflag = true;
1350
1351                                         size_t transLengths[2];
1352                                         transLengths[0] = 1 + clLengths[1]/2;
1353                                         transLengths[1] = clLengths[0];
1354                                         OPENCL_V(clfftSetPlanLength( fftPlan->planTY, CLFFT_2D, transLengths ),
1355                                                 _T( "clfftSetPlanLength for planTY transpose failed" ) );
1356
1357
1358
1359                                         trans2Plan->placeness     = CLFFT_OUTOFPLACE;
1360                                         trans2Plan->precision     = fftPlan->precision;
1361                                         trans2Plan->tmpBufSize    = 0;
1362                                         trans2Plan->batchsize     = fftPlan->batchsize;
1363                                         trans2Plan->envelope      = fftPlan->envelope;
1364                                         trans2Plan->inputLayout   = CLFFT_COMPLEX_INTERLEAVED;
1365                                         trans2Plan->outputLayout  = CLFFT_COMPLEX_INTERLEAVED;
1366                                         trans2Plan->inStride[0]   = 1;
1367                                         trans2Plan->inStride[1]   = 1 + clLengths[1]/2;
1368                                         trans2Plan->outStride[0]  = 1;
1369                                         trans2Plan->outStride[1]  = clLengths[0];
1370                                         trans2Plan->iDist         = clLengths[0] * trans2Plan->inStride[1];
1371                                         trans2Plan->oDist         = (1 + clLengths[1]/2) * trans2Plan->outStride[1];
1372                     trans2Plan->gen           = Transpose_GCN;
1373                                         trans2Plan->transflag     = true;
1374                                         trans2Plan->transOutHorizontal = true;
1375
1376                                         OPENCL_V(clfftBakePlan(fftPlan->planTY, numQueues, commQueueFFT, NULL, NULL ),
1377                                                 _T( "BakePlan large1d trans2 plan failed" ) );
1378
1379                                         //Row transform 2
1380                                         //tmp->tmp
1381                                         //size clLengths[0], batch clLengths[1]
1382                                         OPENCL_V(clfftCreateDefaultPlanInternal( &fftPlan->planY, fftPlan->context, CLFFT_1D, &clLengths[0] ),
1383                                                 _T( "CreateDefaultPlan Large1d second row plan failed" ) );
1384
1385                                         FFTPlan* row2Plan       = NULL;
1386                                         lockRAII* row2Lock      = NULL;
1387                                         OPENCL_V( fftRepo.getPlan( fftPlan->planY, row2Plan, row2Lock ), _T( "fftRepo.getPlan failed" ) );
1388
1389                                         row2Plan->placeness     = CLFFT_OUTOFPLACE;
1390                                         row2Plan->precision     = fftPlan->precision;
1391                                         row2Plan->forwardScale  = fftPlan->forwardScale;
1392                                         row2Plan->backwardScale = fftPlan->backwardScale;
1393                                         row2Plan->tmpBufSize    = 0;
1394                                         row2Plan->batchsize     = fftPlan->batchsize;
1395
1396                                         row2Plan->gen                   = fftPlan->gen;
1397                                         row2Plan->envelope              = fftPlan->envelope;
1398
1399
1400                                         row2Plan->length.push_back(1+clLengths[1]/2);
1401                                         row2Plan->inputLayout   = CLFFT_COMPLEX_INTERLEAVED;
1402                                         row2Plan->outputLayout  = CLFFT_COMPLEX_INTERLEAVED;
1403                                         row2Plan->inStride[0]   = 1;
1404                                         row2Plan->outStride[0]  = 1;
1405                                         row2Plan->inStride.push_back(clLengths[0]);
1406                                         row2Plan->outStride.push_back(1 + clLengths[0]/2);
1407                                         row2Plan->iDist         = (1 + clLengths[1]/2) * row2Plan->inStride[1];
1408                                         row2Plan->oDist         = clLengths[1] * row2Plan->outStride[1];
1409
1410                                         row2Plan->large1D               = fftPlan->length[0];
1411                                         row2Plan->twiddleFront  = true;
1412
1413                                         row2Plan->realSpecial = true;
1414                                         row2Plan->realSpecial_Nr = clLengths[1];
1415
1416                                         OPENCL_V(clfftBakePlan(fftPlan->planY, numQueues, commQueueFFT, NULL, NULL ),
1417                                                 _T( "BakePlan large1d second row plan failed" ) );
1418
1419                                         //Transpose 3
1420                                         //tmp --> output
1421                                         OPENCL_V(clfftCreateDefaultPlanInternal( &fftPlan->planTZ, fftPlan->context, CLFFT_2D, clLengths ),
1422                                                 _T( "CreateDefaultPlan Large1d transpose 3 failed" ) );
1423
1424                                         FFTPlan* trans3Plan     = NULL;
1425                                         lockRAII* trans3Lock    = NULL;
1426                                         OPENCL_V( fftRepo.getPlan( fftPlan->planTZ, trans3Plan, trans3Lock ), _T( "fftRepo.getPlan failed" ) );
1427
1428                                         trans3Plan->transflag = true;
1429
1430                                         transLengths[0] = 1 + clLengths[0]/2;
1431                                         transLengths[1] = clLengths[1];
1432                                         OPENCL_V(clfftSetPlanLength( fftPlan->planTZ, CLFFT_2D, transLengths ),
1433                                                 _T( "clfftSetPlanLength for planTZ transpose failed" ) );
1434
1435                                         trans3Plan->placeness     = CLFFT_OUTOFPLACE;
1436                                         trans3Plan->precision     = fftPlan->precision;
1437                                         trans3Plan->tmpBufSize    = 0;
1438                                         trans3Plan->batchsize     = fftPlan->batchsize;
1439                                         trans3Plan->envelope      = fftPlan->envelope;
1440                                         trans3Plan->inputLayout   = CLFFT_COMPLEX_INTERLEAVED;
1441                                         if(fftPlan->outputLayout == CLFFT_HERMITIAN_PLANAR)
1442                                                 trans3Plan->outputLayout  = CLFFT_COMPLEX_PLANAR;
1443                                         else
1444                                                 trans3Plan->outputLayout  = CLFFT_COMPLEX_INTERLEAVED;
1445                                         trans3Plan->inStride[0]   = 1;
1446                                         trans3Plan->inStride[1]   = 1 + clLengths[0]/2;
1447                                         trans3Plan->outStride[0]  = 1;
1448                                         trans3Plan->outStride[1]  = clLengths[1];
1449                                         trans3Plan->iDist         = clLengths[1] * trans3Plan->inStride[1];
1450                                         trans3Plan->oDist         = fftPlan->oDist;
1451                     trans3Plan->gen           = Transpose_GCN;
1452                                         trans3Plan->transflag     = true;
1453                                         trans3Plan->realSpecial   = true;
1454                                         trans3Plan->transOutHorizontal = true;
1455
1456                                         //Set callback data if set on top level plan
1457                                         if (fftPlan->hasPostCallback)
1458                                         {
1459                                                 trans3Plan->hasPostCallback = true;
1460                                                 trans3Plan->postCallbackParam = fftPlan->postCallbackParam;
1461                                                 trans3Plan->postcallUserData = fftPlan->postcallUserData;
1462                                         }
1463
1464                                         OPENCL_V(clfftBakePlan(fftPlan->planTZ, numQueues, commQueueFFT, NULL, NULL ),
1465                                                 _T( "BakePlan large1d trans3 plan failed" ) );
1466
1467                                         fftPlan->transflag = true;
1468                                         fftPlan->baked = true;
1469                                         return  CLFFT_SUCCESS;
1470                                 }
1471                                 else if (fftPlan->inputLayout == CLFFT_REAL)
1472                                 {
1473                                         if (fftPlan->tmpBufSizeRC == 0)
1474                                         {
1475                                                 fftPlan->tmpBufSizeRC = length0 * length1 *
1476                                                         fftPlan->batchsize * fftPlan->ElementSize();
1477                                                 for (size_t index = 1; index < fftPlan->length.size(); index++)
1478                                                 {
1479                                                         fftPlan->tmpBufSizeRC *= fftPlan->length[index];
1480                                                 }
1481                                         }
1482
1483                                         // column FFT, size clLengths[1], batch clLengths[0], with length[0] twiddle factor multiplication
1484                                         // transposed output
1485                                         OPENCL_V(clfftCreateDefaultPlanInternal(&fftPlan->planX, fftPlan->context, CLFFT_1D, &clLengths[1]),
1486                                                 _T("CreateDefaultPlan Large1d column failed"));
1487
1488                                         FFTPlan* colTPlan = NULL;
1489                                         lockRAII* colLock = NULL;
1490                                         OPENCL_V(fftRepo.getPlan(fftPlan->planX, colTPlan, colLock), _T("fftRepo.getPlan failed"));
1491
1492                                         // current plan is to create intermediate buffer, packed and interleave
1493                                         // This is a column FFT, the first elements distance between each FFT is the distance of the first two
1494                                         // elements in the original buffer. Like a transpose of the matrix
1495                                         // we need to pass clLengths[0] and instride size to kernel, so kernel can tell the difference
1496
1497                                         // this part is common to both passes
1498                                         colTPlan->placeness = CLFFT_OUTOFPLACE;
1499                                         colTPlan->precision = fftPlan->precision;
1500                                         colTPlan->forwardScale = 1.0f;
1501                                         colTPlan->backwardScale = 1.0f;
1502                                         colTPlan->tmpBufSize = 0;
1503                                         colTPlan->batchsize = fftPlan->batchsize;
1504
1505                                         colTPlan->gen = fftPlan->gen;
1506                                         colTPlan->envelope = fftPlan->envelope;
1507
1508                                         //Pass large1D flag to confirm we need multiply twiddle factor
1509                                         colTPlan->large1D = fftPlan->length[0];
1510                                         colTPlan->RCsimple = true;
1511
1512                                         colTPlan->length.push_back(clLengths[0]);
1513
1514                                         // first Pass
1515                                         colTPlan->inputLayout = fftPlan->inputLayout;
1516                                         colTPlan->outputLayout = CLFFT_COMPLEX_INTERLEAVED;
1517                                         colTPlan->inStride[0] = fftPlan->inStride[0] * clLengths[0];
1518                                         colTPlan->outStride[0] = 1;
1519                                         colTPlan->iDist = fftPlan->iDist;
1520                                         colTPlan->oDist = length0 * length1;//fftPlan->length[0];
1521                                         colTPlan->inStride.push_back(fftPlan->inStride[0]);
1522                                         colTPlan->outStride.push_back(length1);//clLengths[1]);
1523
1524                                         for (size_t index = 1; index < fftPlan->length.size(); index++)
1525                                         {
1526                                                 colTPlan->length.push_back(fftPlan->length[index]);
1527                                                 colTPlan->inStride.push_back(fftPlan->inStride[index]);
1528                                                 // tmp buffer is tightly packed
1529                                                 colTPlan->outStride.push_back(colTPlan->oDist);
1530                                                 colTPlan->oDist *= fftPlan->length[index];
1531                                         }
1532
1533                                         //Set callback data if set on top level plan
1534                                         if (fftPlan->hasPreCallback)
1535                                         {
1536                                                 colTPlan->hasPreCallback = true;
1537                                                 colTPlan->preCallback = fftPlan->preCallback;
1538                                                 colTPlan->precallUserData = fftPlan->precallUserData;
1539                                         }
1540
1541                                         OPENCL_V(clfftBakePlan(fftPlan->planX, numQueues, commQueueFFT, NULL, NULL), _T("BakePlan large1d first column plan failed"));
1542
1543                                         //another column FFT, size clLengths[0], batch clLengths[1], output without transpose
1544                                         OPENCL_V(clfftCreateDefaultPlanInternal(&fftPlan->planY, fftPlan->context, CLFFT_1D, &clLengths[0]),
1545                                                 _T("CreateDefaultPlan large1D row failed"));
1546
1547                                         FFTPlan* col2Plan = NULL;
1548                                         lockRAII* rowLock = NULL;
1549                                         OPENCL_V(fftRepo.getPlan(fftPlan->planY, col2Plan, rowLock), _T("fftRepo.getPlan failed"));
1550
1551                                         // This is second column fft, intermediate buffer is packed and interleaved
1552                                         // we need to pass clLengths[1] and instride size to kernel, so kernel can tell the difference
1553
1554                                         col2Plan->precision = fftPlan->precision;
1555                                         col2Plan->forwardScale = fftPlan->forwardScale;
1556                                         col2Plan->backwardScale = fftPlan->backwardScale;
1557                                         col2Plan->tmpBufSize = 0;
1558                                         col2Plan->batchsize = fftPlan->batchsize;
1559
1560                                         col2Plan->gen = fftPlan->gen;
1561                                         col2Plan->envelope = fftPlan->envelope;
1562
1563                                         col2Plan->length.push_back(length1);
1564
1565                                         col2Plan->inStride[0] = length1;
1566                                         col2Plan->inStride.push_back(1);
1567                                         col2Plan->iDist = length0 * length1;
1568
1569                                         // make sure colTPlan (first column plan) does not recurse, otherwise large twiddle mul
1570                                         // cannot be done with this algorithm sequence
1571                                         assert(colTPlan->planX == 0);
1572
1573
1574                                         col2Plan->placeness = CLFFT_INPLACE;
1575                                         col2Plan->inputLayout = CLFFT_COMPLEX_INTERLEAVED;
1576                                         col2Plan->outputLayout = CLFFT_COMPLEX_INTERLEAVED;
1577
1578                                         col2Plan->outStride[0] = length1;
1579                                         col2Plan->outStride.push_back(1);
1580                                         col2Plan->oDist = length0 * length1;
1581
1582                                         for (size_t index = 1; index < fftPlan->length.size(); index++)
1583                                         {
1584                                                 col2Plan->length.push_back(fftPlan->length[index]);
1585                                                 col2Plan->inStride.push_back(col2Plan->iDist);
1586                                                 col2Plan->outStride.push_back(col2Plan->oDist);
1587                                                 col2Plan->iDist *= fftPlan->length[index];
1588                                                 col2Plan->oDist *= fftPlan->length[index];
1589                                         }
1590
1591
1592                                         OPENCL_V(clfftBakePlan(fftPlan->planY, numQueues, commQueueFFT, NULL, NULL ), _T( "BakePlan large1d second column plan failed" ) );
1593
1594                                         if ( (fftPlan->outputLayout == CLFFT_HERMITIAN_INTERLEAVED) ||
1595                                                  (fftPlan->outputLayout == CLFFT_HERMITIAN_PLANAR) )
1596                                         {
1597                                                 // copy plan to get back to hermitian
1598                                                 OPENCL_V(clfftCreateDefaultPlanInternal(&fftPlan->planRCcopy, fftPlan->context, CLFFT_1D, &fftPlan->length[0]),
1599                                                         _T("CreateDefaultPlan RC copy failed"));
1600
1601                                                 FFTPlan* copyPlan = NULL;
1602                                                 lockRAII* copyLock = NULL;
1603                                                 OPENCL_V(fftRepo.getPlan(fftPlan->planRCcopy, copyPlan, copyLock), _T("fftRepo.getPlan failed"));
1604
1605                                                 // This is the copy pass: it reads full complex interleaved data (unit stride, packed)
1606                                                 // and writes it using the hermitian output layout, strides and distance of the top level plan
1607
1608                                                 // common part for both passes
1609                                                 copyPlan->placeness = CLFFT_OUTOFPLACE;
1610                                                 copyPlan->inputLayout = CLFFT_COMPLEX_INTERLEAVED;
1611                                                 copyPlan->outputLayout = fftPlan->outputLayout;
1612
1613                                                 copyPlan->precision = fftPlan->precision;
1614                                                 copyPlan->forwardScale = 1.0f;
1615                                                 copyPlan->backwardScale = 1.0f;
1616                                                 copyPlan->tmpBufSize = 0;
1617                                                 copyPlan->batchsize = fftPlan->batchsize;
1618
1619                                                 copyPlan->gen = Copy;
1620                                                 copyPlan->envelope = fftPlan->envelope;
1621
1622
1623                                                 copyPlan->inStride[0] = 1;
1624                                                 copyPlan->iDist = fftPlan->length[0];
1625
1626                                                 copyPlan->outStride[0] = fftPlan->outStride[0];
1627                                                 copyPlan->oDist = fftPlan->oDist;
1628
1629                                                 for (size_t index = 1; index < fftPlan->length.size(); index++)
1630                                                 {
1631                                                         copyPlan->length.push_back(fftPlan->length[index]);
1632                                                         copyPlan->inStride.push_back(copyPlan->inStride[index - 1] * fftPlan->length[index - 1]);
1633                                                         copyPlan->iDist *= fftPlan->length[index];
1634                                                         copyPlan->outStride.push_back(fftPlan->outStride[index]);
1635                                                 }
1636
1637                                                 //Set callback data if set on top level plan
1638                                                 if (fftPlan->hasPostCallback)
1639                                                 {
1640                                                         copyPlan->hasPostCallback = true;
1641                                                         copyPlan->postCallbackParam = fftPlan->postCallbackParam;
1642                                                         copyPlan->postcallUserData = fftPlan->postcallUserData;
1643                                                 }
1644
1645                                                 OPENCL_V(clfftBakePlan(fftPlan->planRCcopy, numQueues, commQueueFFT, NULL, NULL), _T("BakePlan large1d RC copy plan failed"));
1646                                         }
1647
1648                                 }
1649                                 else if(fftPlan->outputLayout == CLFFT_REAL)
1650                                 {
1651                                         if (fftPlan->tmpBufSizeRC==0 )
1652                                         {
1653                                                 fftPlan->tmpBufSizeRC = length0 * length1 *
1654                                                         fftPlan->batchsize * fftPlan->ElementSize();
1655                                                 for (size_t index=1; index < fftPlan->length.size(); index++)
1656                                                 {
1657                                                         fftPlan->tmpBufSizeRC *= fftPlan->length[index];
1658                                                 }
1659                                         }
1660
1661                                         if ((fftPlan->inputLayout == CLFFT_HERMITIAN_INTERLEAVED) ||
1662                                                 (fftPlan->inputLayout == CLFFT_HERMITIAN_PLANAR))
1663                                         {
1664                                                 // copy plan to convert from hermitian to full complex
1665                                                 OPENCL_V(clfftCreateDefaultPlanInternal(&fftPlan->planRCcopy, fftPlan->context, CLFFT_1D, &fftPlan->length[0]),
1666                                                         _T("CreateDefaultPlan RC copy failed"));
1667
1668                                                 FFTPlan* copyPlan = NULL;
1669                                                 lockRAII* copyLock = NULL;
1670                                                 OPENCL_V(fftRepo.getPlan(fftPlan->planRCcopy, copyPlan, copyLock), _T("fftRepo.getPlan failed"));
1671
1672                                                 // This is the copy pass: it reads the hermitian input using the layout, strides and distance
1673                                                 // of the top level plan and writes full complex interleaved data (unit stride, packed)
1674
1675                                                 // common part for both passes
1676                                                 copyPlan->placeness = CLFFT_OUTOFPLACE;
1677                                                 copyPlan->inputLayout = fftPlan->inputLayout;
1678                                                 copyPlan->outputLayout = CLFFT_COMPLEX_INTERLEAVED;
1679
1680                                                 copyPlan->precision = fftPlan->precision;
1681                                                 copyPlan->forwardScale = 1.0f;
1682                                                 copyPlan->backwardScale = 1.0f;
1683                                                 copyPlan->tmpBufSize = 0;
1684                                                 copyPlan->batchsize = fftPlan->batchsize;
1685
1686                                                 copyPlan->gen = Copy;
1687                                                 copyPlan->envelope = fftPlan->envelope;
1688
1689                                                 copyPlan->inStride[0] = fftPlan->inStride[0];
1690                                                 copyPlan->iDist = fftPlan->iDist;
1691
1692                                                 copyPlan->outStride[0] = 1;
1693                                                 copyPlan->oDist = fftPlan->length[0];
1694
1695                                                 for (size_t index = 1; index < fftPlan->length.size(); index++)
1696                                                 {
1697                                                         copyPlan->length.push_back(fftPlan->length[index]);
1698                                                         copyPlan->outStride.push_back(copyPlan->outStride[index - 1] * fftPlan->length[index - 1]);
1699                                                         copyPlan->oDist *= fftPlan->length[index];
1700                                                         copyPlan->inStride.push_back(fftPlan->inStride[index]);
1701                                                 }
1702
1703                                                 //Set callback data if set on top level plan
1704                                                 if (fftPlan->hasPreCallback)
1705                                                 {
1706                                                         copyPlan->hasPreCallback = true;
1707                                                         copyPlan->preCallback = fftPlan->preCallback;
1708                                                         copyPlan->precallUserData = fftPlan->precallUserData;
1709                                                 }
1710
1711                                                 OPENCL_V(clfftBakePlan(fftPlan->planRCcopy, numQueues, commQueueFFT, NULL, NULL), _T("BakePlan large1d RC copy plan failed"));
1712                                         }
1713
1714                                         // column FFT, size clLengths[1], batch clLengths[0], with length[0] twiddle factor multiplication
1715                                         // transposed output
1716                                         OPENCL_V(clfftCreateDefaultPlanInternal( &fftPlan->planX, fftPlan->context, CLFFT_1D, &clLengths[1] ),
1717                                                 _T( "CreateDefaultPlan Large1d column failed" ) );
1718
1719                                         FFTPlan* colTPlan       = NULL;
1720                                         lockRAII* colLock       = NULL;
1721                                         OPENCL_V( fftRepo.getPlan( fftPlan->planX, colTPlan, colLock ), _T( "fftRepo.getPlan failed" ) );
1722
1723                                         // current plan is to create intermediate buffer, packed and interleave
1724                                         // This is a column FFT, the first elements distance between each FFT is the distance of the first two
1725                                         // elements in the original buffer. Like a transpose of the matrix
1726                                         // we need to pass clLengths[0] and instride size to kernel, so kernel can tell the difference
1727
1728                                         // this part is common to both passes
1729                                         colTPlan->precision     = fftPlan->precision;
1730                                         colTPlan->forwardScale  = 1.0f;
1731                                         colTPlan->backwardScale = 1.0f;
1732                                         colTPlan->tmpBufSize    = 0;
1733                                         colTPlan->batchsize     = fftPlan->batchsize;
1734
1735                                         colTPlan->gen                   = fftPlan->gen;
1736                                         colTPlan->envelope              = fftPlan->envelope;
1737
1738                                         //Pass large1D flag to confirm we need multiply twiddle factor
1739                                         colTPlan->large1D       = fftPlan->length[0];
1740
1741                                         colTPlan->length.push_back(clLengths[0]);
1742
1743                                         colTPlan->inputLayout   = CLFFT_COMPLEX_INTERLEAVED;
1744                                         colTPlan->outputLayout  = CLFFT_COMPLEX_INTERLEAVED;
1745                                         
1746                                         colTPlan->inStride[0]  = length0;
1747                                         colTPlan->inStride.push_back(1);
1748                                         colTPlan->iDist        = length0 * length1;
1749
1750                                         colTPlan->outStride[0] = length0;
1751                                         colTPlan->outStride.push_back(1);
1752                                         colTPlan->oDist         = length0 * length1;
1753
1754                                         for (size_t index=1; index < fftPlan->length.size(); index++)
1755                                         {
1756                                                 colTPlan->length.push_back(fftPlan->length[index]);
1757                                                 colTPlan->inStride.push_back(colTPlan->iDist);
1758                                                 colTPlan->outStride.push_back(colTPlan->oDist);
1759                                                 colTPlan->iDist   *= fftPlan->length[index];
1760                                                 colTPlan->oDist   *= fftPlan->length[index];
1761                                         }
1762
1763                                         if ((fftPlan->inputLayout == CLFFT_HERMITIAN_INTERLEAVED) ||
1764                                                 (fftPlan->inputLayout == CLFFT_HERMITIAN_PLANAR))
1765                                         {
1766                                                 colTPlan->placeness = CLFFT_INPLACE;
1767                                         }
1768                                         else
1769                                         {
1770                                                 colTPlan->placeness = CLFFT_OUTOFPLACE;
1771                                         }
1772
1773                                         OPENCL_V(clfftBakePlan(fftPlan->planX, numQueues, commQueueFFT, NULL, NULL ), _T( "BakePlan large1d first column plan failed" ) );
1774
1775                                         //another column FFT, size clLengths[0], batch clLengths[1], output without transpose
1776                                         OPENCL_V(clfftCreateDefaultPlanInternal( &fftPlan->planY, fftPlan->context, CLFFT_1D,  &clLengths[0] ),
1777                                                 _T( "CreateDefaultPlan large1D row failed" ) );
1778
1779                                         FFTPlan* col2Plan       = NULL;
1780                                         lockRAII* rowLock       = NULL;
1781                                         OPENCL_V( fftRepo.getPlan( fftPlan->planY, col2Plan, rowLock ), _T( "fftRepo.getPlan failed" ) );
1782
1783                                         // This is second column fft, intermediate buffer is packed and interleaved
1784                                         // we need to pass clLengths[1] and instride size to kernel, so kernel can tell the difference
1785
1786                                         // common part for both passes
1787                                         col2Plan->placeness     = CLFFT_OUTOFPLACE;
1788                                         col2Plan->inputLayout   = CLFFT_COMPLEX_INTERLEAVED;
1789                                         col2Plan->outputLayout  = fftPlan->outputLayout;
1790
1791                                         col2Plan->precision     = fftPlan->precision;
1792                                         col2Plan->forwardScale  = fftPlan->forwardScale;
1793                                         col2Plan->backwardScale = fftPlan->backwardScale;
1794                                         col2Plan->tmpBufSize    = 0;
1795                                         col2Plan->batchsize     = fftPlan->batchsize;
1796
1797                                         col2Plan->gen                   = fftPlan->gen;
1798                                         col2Plan->envelope                      = fftPlan->envelope;
1799
1800                                         col2Plan->RCsimple = true;
1801                                         col2Plan->length.push_back(length1);
1802
1803                                         col2Plan->inStride[0]  = 1;
1804                                         col2Plan->inStride.push_back(length0);
1805                                         col2Plan->iDist        = length0 * length1;
1806
1807                                         col2Plan->outStride[0] = length1 * fftPlan->outStride[0];
1808                                         col2Plan->outStride.push_back(fftPlan->outStride[0]);
1809                                         col2Plan->oDist         = fftPlan->oDist;
1810
1811                                         for (size_t index=1; index < fftPlan->length.size(); index++)
1812                                         {
1813                                                 col2Plan->length.push_back(fftPlan->length[index]);
1814                                                 col2Plan->inStride.push_back(col2Plan->iDist);
1815                                                 col2Plan->iDist   *= fftPlan->length[index];
1816                                                 col2Plan->outStride.push_back(fftPlan->outStride[index]);
1817                                         }
1818
1819                                         //Set callback data if set on top level plan
1820                                         if (fftPlan->hasPostCallback)
1821                                         {
1822                                                 col2Plan->hasPostCallback = true;
1823                                                 col2Plan->postCallbackParam = fftPlan->postCallbackParam;
1824                                                 col2Plan->postcallUserData = fftPlan->postcallUserData;
1825                                         }
1826
1827                                         OPENCL_V(clfftBakePlan(fftPlan->planY, numQueues, commQueueFFT, NULL, NULL ), _T( "BakePlan large1d second column plan failed" ) );
1828                                 }
1829                                 else
1830                                 {
1831
1832                                         if( (fftPlan->length[0] > 262144/PrecisionWidth(fftPlan->precision)) && fftPlan->blockCompute )
1833                                         {
1834                                                 assert(fftPlan->length[0] <= 1048576);
1835
1836
1837                                                 size_t padding = 64;    
1838                                                 if (fftPlan->tmpBufSize==0 )
1839                                                 {
1840                                                         fftPlan->tmpBufSize = (length1 + padding) * length0 *
1841                                                                         fftPlan->batchsize * fftPlan->ElementSize();
1842                                                         for (size_t index=1; index < fftPlan->length.size(); index++)
1843                                                         {
1844                                                                 fftPlan->tmpBufSize *= fftPlan->length[index];
1845                                                         }
1846                                                 }
1847
1848                                                 // Algorithm in this case is 
1849                                                 // T(with pad, out_of_place), R (in_place), C(in_place), Unpad(out_of_place)
1850
1851                                                 size_t len[3] = { clLengths[1], clLengths[0], 1 };
1852
1853                                                 OPENCL_V(clfftCreateDefaultPlanInternal( &fftPlan->planTX, fftPlan->context, CLFFT_2D, len ),
1854                                                 _T( "CreateDefaultPlan Large1d trans1 failed" ) );
1855
1856                                                 FFTPlan* trans1Plan     = NULL;
1857                                                 lockRAII* trans1Lock    = NULL;
1858                                                 OPENCL_V( fftRepo.getPlan( fftPlan->planTX, trans1Plan, trans1Lock ), _T( "fftRepo.getPlan failed" ) );
1859
1860                                                 trans1Plan->placeness     = CLFFT_OUTOFPLACE;
1861                                                 trans1Plan->precision     = fftPlan->precision;
1862                                                 trans1Plan->tmpBufSize    = 0;
1863                                                 trans1Plan->batchsize     = fftPlan->batchsize;
1864                                                 trans1Plan->envelope      = fftPlan->envelope;
1865                                                 trans1Plan->inputLayout   = fftPlan->inputLayout;
1866                                                 trans1Plan->outputLayout  = CLFFT_COMPLEX_INTERLEAVED;
1867                                                 trans1Plan->inStride[0]   = fftPlan->inStride[0];
1868                                                 trans1Plan->inStride[1]   = length1;
1869                                                 trans1Plan->outStride[0]  = 1;
1870                                                 trans1Plan->outStride[1]  = length0 + padding;
1871                                                 trans1Plan->iDist         = fftPlan->iDist;
1872                                                 trans1Plan->oDist         = length1 * trans1Plan->outStride[1];
1873                                                 trans1Plan->gen           = Transpose_GCN;
1874                                                 trans1Plan->transflag     = true;
1875
1876                                                 for (size_t index=1; index < fftPlan->length.size(); index++)
1877                                                 {
1878                                                         trans1Plan->length.push_back(fftPlan->length[index]);
1879                                                         trans1Plan->inStride.push_back(fftPlan->inStride[index]);
1880                                                         trans1Plan->outStride.push_back(trans1Plan->oDist);
1881                                                         trans1Plan->oDist *= fftPlan->length[index];
1882                                                 }
1883
1884                                                 //Set callback data if set on top level plan
1885                                                 if (fftPlan->hasPreCallback)
1886                                                 {
1887                                                         trans1Plan->hasPreCallback = true;
1888                                                         trans1Plan->preCallback = fftPlan->preCallback;
1889                                                         trans1Plan->precallUserData = fftPlan->precallUserData;
1890                                                 }
1891
1892                                                 OPENCL_V(clfftBakePlan(fftPlan->planTX, numQueues, commQueueFFT, NULL, NULL ),
1893                                                         _T( "BakePlan large1d trans1 plan failed" ) );
1894
1895
1896                                                 // row FFT
1897                                                 OPENCL_V(clfftCreateDefaultPlanInternal( &fftPlan->planX, fftPlan->context, CLFFT_1D, &clLengths[0] ),
1898                                                         _T( "CreateDefaultPlan Large1d column failed" ) );
1899
1900                                                 FFTPlan* rowPlan        = NULL;
1901                                                 lockRAII* rowLock       = NULL;
1902                                                 OPENCL_V( fftRepo.getPlan( fftPlan->planX, rowPlan, rowLock ), _T( "fftRepo.getPlan failed" ) );
1903
1904                                                 assert(fftPlan->large1D == 0);
1905
1906                                                 rowPlan->placeness     = CLFFT_INPLACE;
1907                                                 rowPlan->precision     = fftPlan->precision;
1908                                                 rowPlan->forwardScale  = 1.0f;
1909                                                 rowPlan->backwardScale = 1.0f;
1910                                                 rowPlan->tmpBufSize    = 0;
1911                                                 rowPlan->batchsize     = fftPlan->batchsize;
1912
1913                                                 rowPlan->gen                    = fftPlan->gen;
1914                                                 rowPlan->envelope               = fftPlan->envelope;
1915
1916                                                 rowPlan->length.push_back(length1);
1917
1918
1919                                                 rowPlan->inputLayout   = CLFFT_COMPLEX_INTERLEAVED;
1920                                                 rowPlan->outputLayout  = CLFFT_COMPLEX_INTERLEAVED;
1921                                                 rowPlan->inStride[0]   = 1;
1922                                                 rowPlan->outStride[0]  = 1;
1923                                                 rowPlan->inStride.push_back(length0+padding);
1924                                                 rowPlan->outStride.push_back(length0+padding);
1925                                                 rowPlan->iDist         = (length0+padding)*length1;
1926                                                 rowPlan->oDist         = (length0+padding)*length1;
1927
1928                                                 for (size_t index=1; index < fftPlan->length.size(); index++)
1929                                                 {
1930                                                         rowPlan->length.push_back(fftPlan->length[index]);
1931                                                         rowPlan->inStride.push_back(rowPlan->iDist);
1932                                                         rowPlan->iDist *= fftPlan->length[index];
1933                                                         rowPlan->outStride.push_back(rowPlan->oDist);
1934                                                         rowPlan->oDist *= fftPlan->length[index];
1935                                                 }
1936
1937
1938                                                 OPENCL_V(clfftBakePlan(fftPlan->planX, numQueues, commQueueFFT, NULL, NULL ), _T( "BakePlan large1d first row plan failed" ) );
1939
1940                                                 //column FFT
1941                                                 OPENCL_V(clfftCreateDefaultPlanInternal( &fftPlan->planY, fftPlan->context, CLFFT_1D,  &clLengths[1] ),
1942                                                         _T( "CreateDefaultPlan large1D column failed" ) );
1943
1944                                                 FFTPlan* col2Plan       = NULL;
1945                                                 lockRAII* colLock       = NULL;
1946                                                 OPENCL_V( fftRepo.getPlan( fftPlan->planY, col2Plan, colLock ), _T( "fftRepo.getPlan failed" ) );
1947
1948                                                 col2Plan->placeness     = CLFFT_INPLACE;
1949                                                 col2Plan->inputLayout   = CLFFT_COMPLEX_INTERLEAVED;
1950                                                 col2Plan->outputLayout  = CLFFT_COMPLEX_INTERLEAVED;
1951                                                 col2Plan->precision     = fftPlan->precision;
1952                                                 col2Plan->forwardScale  = fftPlan->forwardScale;
1953                                                 col2Plan->backwardScale = fftPlan->backwardScale;
1954                                                 col2Plan->tmpBufSize    = 0;
1955                                                 col2Plan->batchsize     = fftPlan->batchsize;
1956
1957                                                 col2Plan->gen                   = fftPlan->gen;
1958                                                 col2Plan->envelope              = fftPlan->envelope;
1959
1960                                                 col2Plan->large1D       = fftPlan->length[0];
1961                                                 col2Plan->twiddleFront  = true;
1962
1963                                                 col2Plan->length.push_back(clLengths[0]);
1964
1965
1966
1967                                                 col2Plan->blockCompute = true;
1968                                                 col2Plan->blockComputeType = BCT_C2C;
1969
1970                                                 col2Plan->inStride[0]  = length0+padding;
1971                                                 col2Plan->outStride[0] = length0+padding;
1972                                                 col2Plan->iDist        = (length0+padding) * length1;
1973                                                 col2Plan->oDist        = (length0+padding) * length1;
1974                                                 col2Plan->inStride.push_back(1);
1975                                                 col2Plan->outStride.push_back(1);
1976
1977
1978                                                 for (size_t index=1; index < fftPlan->length.size(); index++)
1979                                                 {
1980                                                         col2Plan->length.push_back(fftPlan->length[index]);
1981                                                         col2Plan->inStride.push_back(col2Plan->iDist);
1982                                                         col2Plan->outStride.push_back(col2Plan->oDist);
1983                                                         col2Plan->iDist   *= fftPlan->length[index];
1984                                                         col2Plan->oDist   *= fftPlan->length[index];
1985                                                 }
1986
1987
1988                                                 OPENCL_V(clfftBakePlan(fftPlan->planY, numQueues, commQueueFFT, NULL, NULL ), _T( "BakePlan large1d second column plan failed" ) );
1989
1990
1991                                                 // copy plan to get results back to packed output
1992                                                 OPENCL_V(clfftCreateDefaultPlanInternal( &fftPlan->planCopy, fftPlan->context, CLFFT_1D,  &clLengths[0] ),
1993                                                         _T( "CreateDefaultPlan Copy failed" ) );
1994
1995                                                 FFTPlan* copyPlan       = NULL;
1996                                                 lockRAII* copyLock      = NULL;
1997                                                 OPENCL_V( fftRepo.getPlan( fftPlan->planCopy, copyPlan, copyLock ), _T( "fftRepo.getPlan failed" ) );
1998
1999
2000                                                 copyPlan->placeness     = CLFFT_OUTOFPLACE;
2001                                                 copyPlan->inputLayout   = CLFFT_COMPLEX_INTERLEAVED;
2002                                                 copyPlan->outputLayout  = fftPlan->outputLayout;
2003
2004                                                 copyPlan->precision     = fftPlan->precision;
2005                                                 copyPlan->forwardScale  = 1.0f;
2006                                                 copyPlan->backwardScale = 1.0f;
2007                                                 copyPlan->tmpBufSize    = 0;
2008                                                 copyPlan->batchsize     = fftPlan->batchsize;
2009
2010                                                 copyPlan->gen                   = Copy;
2011                                                 copyPlan->envelope              = fftPlan->envelope;
2012
2013                                                 copyPlan->length.push_back(length1);
2014
2015                                                 copyPlan->inStride[0]  = 1;
2016                                                 copyPlan->inStride.push_back(length0+padding);
2017                                                 copyPlan->iDist        = length1*(length0+padding);
2018
2019                                                 copyPlan->outStride[0] = fftPlan->outStride[0];
2020                                                 copyPlan->outStride.push_back(length0);
2021                                                 copyPlan->oDist         = fftPlan->oDist;
2022
2023                                                 for (size_t index=1; index < fftPlan->length.size(); index++)
2024                                                 {
2025                                                         copyPlan->length.push_back(fftPlan->length[index]);
2026                                                         copyPlan->inStride.push_back(copyPlan->inStride[index] * copyPlan->length[index]);
2027                                                         copyPlan->iDist   *= fftPlan->length[index];
2028                                                         copyPlan->outStride.push_back(fftPlan->outStride[index]);
2029                                                 }
2030
2031                                                 OPENCL_V(clfftBakePlan(fftPlan->planCopy, numQueues, commQueueFFT, NULL, NULL ), _T( "BakePlan large1d copy plan failed" ) );
2032                                         }
2033                                         else
2034                                         {
2035
2036                                                 if (fftPlan->tmpBufSize==0 )
2037                                                 {
2038                                                         fftPlan->tmpBufSize = length0 * length1 *
2039                                                                 fftPlan->batchsize * fftPlan->ElementSize();
2040                                                         for (size_t index=1; index < fftPlan->length.size(); index++)
2041                                                         {
2042                                                                 fftPlan->tmpBufSize *= fftPlan->length[index];
2043                                                         }
2044                                                 }
2045
2046                                                 // column FFT, size clLengths[1], batch clLengths[0], with length[0] twiddle factor multiplication
2047                                                 OPENCL_V(clfftCreateDefaultPlanInternal( &fftPlan->planX, fftPlan->context, CLFFT_1D, &clLengths[1] ),
2048                                                         _T( "CreateDefaultPlan Large1d column failed" ) );
2049
2050                                                 FFTPlan* colTPlan       = NULL;
2051                                                 lockRAII* colLock       = NULL;
2052                                                 OPENCL_V( fftRepo.getPlan( fftPlan->planX, colTPlan, colLock ), _T( "fftRepo.getPlan failed" ) );
2053
2054                                                 assert(fftPlan->large1D == 0);
2055
2056                                                 // current plan is to create intermediate buffer, packed and interleaved
2057                                                 // This is a column FFT, the first elements distance between each FFT is the distance of the first two
2058                                                 // elements in the original buffer. Like a transpose of the matrix
2059                                                 // we need to pass clLengths[0] and instride size to kernel, so kernel can tell the difference
2060
2061                                                 //this part is common for both passes
2062                                                 colTPlan->placeness     = CLFFT_OUTOFPLACE;
2063                                                 colTPlan->precision     = fftPlan->precision;
2064                                                 colTPlan->forwardScale  = 1.0f;
2065                                                 colTPlan->backwardScale = 1.0f;
2066                                                 colTPlan->tmpBufSize    = 0;
2067                                                 colTPlan->batchsize     = fftPlan->batchsize;
2068
2069                                                 colTPlan->gen                   = fftPlan->gen;
2070                                                 colTPlan->envelope                      = fftPlan->envelope;
2071
2072                                                 //Pass large1D flag to confirm we need multiply twiddle factor
2073                                                 colTPlan->large1D       = fftPlan->length[0];
2074
2075                                                 colTPlan->length.push_back(length0);
2076
2077
2078                                                 colTPlan->inputLayout   = fftPlan->inputLayout;
2079                                                 colTPlan->outputLayout  = CLFFT_COMPLEX_INTERLEAVED;
2080                                                 colTPlan->inStride[0]   = fftPlan->inStride[0] * length0;
2081                                                 colTPlan->outStride[0]  = length0;
2082                                                 colTPlan->iDist         = fftPlan->iDist;
2083                                                 colTPlan->oDist         = length0 * length1;
2084                                                 colTPlan->inStride.push_back(fftPlan->inStride[0]);
2085                                                 colTPlan->outStride.push_back(1);
2086
2087                                                 //Set callback data if set on top level plan
2088                                                 if (fftPlan->hasPreCallback)
2089                                                 {
2090                                                         colTPlan->hasPreCallback = true;
2091                                                         colTPlan->preCallback = fftPlan->preCallback;
2092                                                         colTPlan->precallUserData = fftPlan->precallUserData;
2093                                                 }
2094
2095                                                 // Enabling block column compute
2096                                                 if( (colTPlan->inStride[0] == length0) && IsPo2(fftPlan->length[0]) && (fftPlan->length[0] < 524288) )
2097                                                 {
2098                                                         colTPlan->blockCompute = true;
2099                                                         colTPlan->blockComputeType = BCT_C2C;
2100                                                 }
2101
2102                                                 for (size_t index=1; index < fftPlan->length.size(); index++)
2103                                                 {
2104                                                         colTPlan->length.push_back(fftPlan->length[index]);
2105                                                         colTPlan->inStride.push_back(fftPlan->inStride[index]);
2106                                                         // tmp buffer is tightly packed
2107                                                         colTPlan->outStride.push_back(colTPlan->oDist);
2108                                                         colTPlan->oDist        *= fftPlan->length[index];
2109                                                 }
2110
2111
2112                                                 OPENCL_V(clfftBakePlan(fftPlan->planX, numQueues, commQueueFFT, NULL, NULL ), _T( "BakePlan large1d first column plan failed" ) );
2113
2114                                                 //another column FFT, size clLengths[0], batch clLengths[1], output with transpose
2115                                                 OPENCL_V(clfftCreateDefaultPlanInternal( &fftPlan->planY, fftPlan->context, CLFFT_1D,  &clLengths[0] ),
2116                                                         _T( "CreateDefaultPlan large1D row failed" ) );
2117
2118                                                 FFTPlan* col2Plan       = NULL;
2119                                                 lockRAII* rowLock       = NULL;
2120                                                 OPENCL_V( fftRepo.getPlan( fftPlan->planY, col2Plan, rowLock ), _T( "fftRepo.getPlan failed" ) );
2121
2122                                                 // This is second column fft, intermediate buffer is packed and interleaved
2123                                                 // we need to pass clLengths[1] and instride size to kernel, so kernel can tell the difference
2124
2125                                                 // common part for both passes
2126                                                 col2Plan->outputLayout  = fftPlan->outputLayout;
2127                                                 col2Plan->precision     = fftPlan->precision;
2128                                                 col2Plan->forwardScale  = fftPlan->forwardScale;
2129                                                 col2Plan->backwardScale = fftPlan->backwardScale;
2130                                                 col2Plan->tmpBufSize    = 0;
2131                                                 col2Plan->batchsize     = fftPlan->batchsize;
2132                                                 col2Plan->oDist         = fftPlan->oDist;
2133
2134                                                 col2Plan->gen                   = fftPlan->gen;
2135                                                 col2Plan->envelope              = fftPlan->envelope;
2136
2137
2138                                                 col2Plan->length.push_back(clLengths[1]);
2139
2140                                                 bool integratedTranposes = true;
2141
2142
2143                                                 if( colTPlan->blockCompute && (fftPlan->outStride[0] == 1) && clLengths[0] <= 256)
2144                                                 {
2145                                                         col2Plan->blockCompute = true;
2146                                                         col2Plan->blockComputeType = BCT_R2C;
2147
2148                                                         col2Plan->placeness    = CLFFT_OUTOFPLACE;
2149                                                         col2Plan->inputLayout  = CLFFT_COMPLEX_INTERLEAVED;
2150                                                         col2Plan->inStride[0]  = 1;
2151                                                         col2Plan->outStride[0] = length1;
2152                                                         col2Plan->iDist        = length0 * length1;
2153                                                         col2Plan->inStride.push_back(length0);
2154                                                         col2Plan->outStride.push_back(1);
2155                                                 }
2156                                                 else if( colTPlan->blockCompute && (fftPlan->outStride[0] == 1) )
2157                                                 {
2158                                                         integratedTranposes = false;
2159
2160                                                         col2Plan->placeness    = CLFFT_INPLACE;
2161                                                         col2Plan->inputLayout  = CLFFT_COMPLEX_INTERLEAVED;
2162                                                         col2Plan->outputLayout = CLFFT_COMPLEX_INTERLEAVED;
2163                                                         col2Plan->inStride[0]  = 1;
2164                                                         col2Plan->outStride[0] = 1;
2165                                                         col2Plan->iDist        = length0 * length1;
2166                                                         col2Plan->oDist        = length0 * length1;
2167                                                         col2Plan->inStride.push_back(length0);
2168                                                         col2Plan->outStride.push_back(length0);
2169                                                 }
2170                                                 else
2171                                                 {
2172                                                         //first layer, large 1D from tmp buffer to output buffer
2173                                                         col2Plan->placeness    = CLFFT_OUTOFPLACE;
2174                                                         col2Plan->inputLayout  = CLFFT_COMPLEX_INTERLEAVED;
2175                                                         col2Plan->inStride[0]  = 1;
2176                                                         col2Plan->outStride[0] = fftPlan->outStride[0] * clLengths[1];
2177                                                         col2Plan->iDist        = length0 * length1; //fftPlan->length[0];
2178                                                         col2Plan->inStride.push_back(length0);
2179                                                         col2Plan->outStride.push_back(fftPlan->outStride[0]);
2180                                                 }
2181
2182                                                 if(!integratedTranposes)
2183                                                 {
2184                                                         for (size_t index=1; index < fftPlan->length.size(); index++)
2185                                                         {
2186                                                                 col2Plan->length.push_back(fftPlan->length[index]);
2187                                                                 col2Plan->inStride.push_back(col2Plan->iDist);
2188                                                                 col2Plan->outStride.push_back(col2Plan->oDist);
2189                                                                 col2Plan->iDist        *= fftPlan->length[index];
2190                                                                 col2Plan->oDist        *= fftPlan->length[index];
2191                                                         }
2192                                                 }
2193                                                 else
2194                                                 {
2195                                                         for (size_t index=1; index < fftPlan->length.size(); index++)
2196                                                         {
2197                                                                 col2Plan->length.push_back(fftPlan->length[index]);
2198                                                                 col2Plan->inStride.push_back(col2Plan->iDist);
2199                                                                 col2Plan->outStride.push_back(fftPlan->outStride[index]);
2200                                                                 col2Plan->iDist   *= fftPlan->length[index];
2201                                                         }
2202                                                 }
2203
2204                                                 //Set callback data if set on top level plan
2205                                                 if (fftPlan->hasPostCallback && integratedTranposes)
2206                                                 {
2207                                                         col2Plan->hasPostCallback = true;
2208                                                         col2Plan->postCallbackParam = fftPlan->postCallbackParam;
2209                                                         col2Plan->postcallUserData = fftPlan->postcallUserData;
2210                                                 }
2211
2212                                                 OPENCL_V(clfftBakePlan(fftPlan->planY, numQueues, commQueueFFT, NULL, NULL ), _T( "BakePlan large1d second column plan failed" ) );
2213
2214                                                 if(!integratedTranposes)
2215                                                 {
2216                                                         //Transpose 
2217                                                         //tmp --> output
2218                                                         OPENCL_V(clfftCreateDefaultPlanInternal( &fftPlan->planTZ, fftPlan->context, CLFFT_2D, clLengths ),
2219                                                                 _T( "CreateDefaultPlan Large1d transpose failed" ) );
2220
2221                                                         FFTPlan* trans3Plan     = NULL;
2222                                                         lockRAII* trans3Lock    = NULL;
2223                                                         OPENCL_V( fftRepo.getPlan( fftPlan->planTZ, trans3Plan, trans3Lock ), _T( "fftRepo.getPlan failed" ) );
2224
2225                                                         trans3Plan->placeness     = CLFFT_OUTOFPLACE;
2226                                                         trans3Plan->precision     = fftPlan->precision;
2227                                                         trans3Plan->tmpBufSize    = 0;
2228                                                         trans3Plan->batchsize     = fftPlan->batchsize;
2229                                                         trans3Plan->envelope      = fftPlan->envelope;
2230                                                         trans3Plan->inputLayout   = CLFFT_COMPLEX_INTERLEAVED;
2231                                                         trans3Plan->outputLayout  = fftPlan->outputLayout;
2232                                                         trans3Plan->inStride[0]   = 1;
2233                                                         trans3Plan->inStride[1]   = clLengths[0];
2234                                                         trans3Plan->outStride[0]  = fftPlan->outStride[0];
2235                                                         trans3Plan->outStride[1]  = clLengths[1] * fftPlan->outStride[0];
2236                                                         trans3Plan->iDist         = fftPlan->length[0];
2237                                                         trans3Plan->oDist         = fftPlan->oDist;
2238                                                         trans3Plan->gen           = Transpose_GCN;
2239                                                         trans3Plan->transflag     = true;
2240
2241                                                         for (size_t index=1; index < fftPlan->length.size(); index++)
2242                                                         {
2243                                                                 trans3Plan->length.push_back(fftPlan->length[index]);
2244                                                                 trans3Plan->inStride.push_back(trans3Plan->iDist);
2245                                                                 trans3Plan->iDist *= fftPlan->length[index];
2246                                                                 trans3Plan->outStride.push_back(fftPlan->outStride[index]);
2247                                                         }
2248
2249                                                         //Set callback data if set on top level plan
2250                                                         if (fftPlan->hasPostCallback)
2251                                                         {
2252                                                                 trans3Plan->hasPostCallback = true;
2253                                                                 trans3Plan->postCallbackParam = fftPlan->postCallbackParam;
2254                                                                 trans3Plan->postcallUserData = fftPlan->postcallUserData;
2255                                                         }
2256
2257                                                         OPENCL_V(clfftBakePlan(fftPlan->planTZ, numQueues, commQueueFFT, NULL, NULL ),
2258                                                                 _T( "BakePlan large1d trans plan failed" ) );
2259                                                 }
2260                                         }
2261                                 }
2262
2263                                 fftPlan->baked = true;
2264                                 return  CLFFT_SUCCESS;
2265                         }
2266                 }
2267                 break;
2268         case CLFFT_2D:
2269                 {
2270
2271                         if (fftPlan->transflag) //Transpose for 2D
2272                         {
2273                 clfftStatus err = CLFFT_SUCCESS;
2274                                 if(fftPlan->gen == Transpose_GCN)
2275                                         fftPlan->action = new FFTGeneratedTransposeGCNAction(plHandle, fftPlan, *commQueueFFT, err);
2276                                 else if (fftPlan->gen == Transpose_SQUARE)
2277                                         fftPlan->action = new FFTGeneratedTransposeSquareAction(plHandle, fftPlan, *commQueueFFT, err);
2278                 else if (fftPlan->gen == Transpose_NONSQUARE)
2279                 {
2280                                         if(fftPlan->nonSquareKernelType != NON_SQUARE_TRANS_PARENT)
2281                                                 fftPlan->action = new FFTGeneratedTransposeNonSquareAction(plHandle, fftPlan, *commQueueFFT, err);
2282                                         else
2283                                         {
2284                                                 size_t clLengths[] = { 1, 1, 0 };
2285                                                 clLengths[0] = fftPlan->length[0];
2286                                                 clLengths[1] = fftPlan->length[1];
2287
2288                                                 //NON_SQUARE_KERNEL_ORDER currKernelOrder;
2289                                                 // controlling the transpose and swap kernel order
2290                                                 // if leading dim is larger than the other dim it makes sense to swap and transpose
2291                                                 if (clLengths[0] > clLengths[1])
2292                                                 {
2293                                                         //Twiddling will be done in swap kernel, regardless of the order
2294                                                         fftPlan->nonSquareKernelOrder = SWAP_AND_TRANSPOSE;
2295                                                 }
2296                                                 else
2297                                                 {
2298                                                         if (fftPlan->large1D != 0 && 0)
2299                                                         {
2300                                 //this is not going to happen anymore
2301                                                                 fftPlan->nonSquareKernelOrder = TRANSPOSE_LEADING_AND_SWAP;
2302                                                         }
2303                                                         else
2304                                                         {
2305                                 //twiddling can be done in swap
2306                                                                 fftPlan->nonSquareKernelOrder = TRANSPOSE_AND_SWAP;
2307                                                         }
2308                                                 }
2309
2310                                                 //std::cout << "currKernelOrder = " << fftPlan->nonSquareKernelOrder << std::endl;
2311                                                 //ends transpose kernel order
2312
2313                                                 //Transpose stage 1 
2314                                                 OPENCL_V(clfftCreateDefaultPlanInternal(&fftPlan->planTX, fftPlan->context, CLFFT_2D, clLengths),
2315                                                         _T("CreateDefaultPlan transpose_nsq_stage1 plan failed"));
2316
2317                                                 FFTPlan* trans1Plan = NULL;
2318                                                 lockRAII* trans1Lock = NULL;
2319                                                 OPENCL_V(fftRepo.getPlan(fftPlan->planTX, trans1Plan, trans1Lock), _T("fftRepo.getPlan failed"));
2320
2321                                                 trans1Plan->placeness = CLFFT_INPLACE;
2322                                                 trans1Plan->precision = fftPlan->precision;
2323                                                 trans1Plan->tmpBufSize = 0;
2324                                                 trans1Plan->batchsize = fftPlan->batchsize;
2325                                                 trans1Plan->envelope = fftPlan->envelope;
2326                                                 trans1Plan->inputLayout = fftPlan->inputLayout;
2327                                                 trans1Plan->outputLayout = fftPlan->outputLayout;
2328                                                 trans1Plan->inStride[0] = fftPlan->inStride[0];
2329                                                 trans1Plan->outStride[0] = fftPlan->outStride[0];
2330                                                 trans1Plan->inStride[1] = fftPlan->inStride[1];
2331                                                 trans1Plan->outStride[1] = fftPlan->outStride[1];
2332                                                 trans1Plan->iDist = fftPlan->iDist;
2333                                                 trans1Plan->oDist = fftPlan->oDist;
2334                                                 trans1Plan->gen = Transpose_NONSQUARE;
2335                                                 trans1Plan->nonSquareKernelOrder = fftPlan->nonSquareKernelOrder;
2336                                                 if(fftPlan->nonSquareKernelOrder == SWAP_AND_TRANSPOSE)
2337                                                         trans1Plan->nonSquareKernelType = NON_SQUARE_TRANS_SWAP;
2338                                                 else if (fftPlan->nonSquareKernelOrder == TRANSPOSE_AND_SWAP)
2339                                                         trans1Plan->nonSquareKernelType = NON_SQUARE_TRANS_TRANSPOSE_BATCHED;
2340                                                 else if(fftPlan->nonSquareKernelOrder == TRANSPOSE_LEADING_AND_SWAP)
2341                                                         trans1Plan->nonSquareKernelType = NON_SQUARE_TRANS_TRANSPOSE_BATCHED_LEADING;
2342                                                 trans1Plan->transflag = true;
2343                         trans1Plan->large1D = fftPlan->large1D;//twiddling may happen in this kernel
2344
2345                                                 if (trans1Plan->nonSquareKernelType == NON_SQUARE_TRANS_TRANSPOSE_BATCHED)
2346                                                 {
2347                                                         //TODO: this should be in a function to avoid duplicate code
2348                                                         //need to treat a non-square matrix as a square matrix with a bigger batch size
2349                                                         size_t lengthX = trans1Plan->length[0];
2350                                                         size_t lengthY = trans1Plan->length[1];
2351
2352                                                         size_t BatchFactor = (lengthX > lengthY) ? (lengthX / lengthY) : (lengthY / lengthX);
2353                                                         trans1Plan->transposeMiniBatchSize = BatchFactor;
2354                                                         trans1Plan->batchsize *= BatchFactor;
2355                                                         trans1Plan->iDist = trans1Plan->iDist / BatchFactor;
2356                                                         if (lengthX > lengthY)
2357                                                         {
2358                                                                 trans1Plan->length[0] = lengthX / BatchFactor;
2359                                                                 trans1Plan->inStride[1] = lengthX / BatchFactor;
2360                                                         }
2361                                                         else if (lengthX < lengthY)
2362                                                         {
2363                                                                 trans1Plan->length[1] = lengthY / BatchFactor;
2364                                                                 trans1Plan->inStride[1] = lengthX;
2365                                                         }
2366                                                 }
2367
2368                                                 for (size_t index = 2; index < fftPlan->length.size(); index++)
2369                                                 {
2370                                                         trans1Plan->length.push_back(fftPlan->length[index]);
2371                                                         trans1Plan->inStride.push_back(fftPlan->inStride[index]);
2372                                                         trans1Plan->outStride.push_back(fftPlan->outStride[index]);
2373                                                 }
2374
2375                                                 if (fftPlan->hasPreCallback)
2376                                                 {
2377                                                         trans1Plan->hasPreCallback = true;
2378                                                         trans1Plan->preCallback = fftPlan->preCallback;
2379                                                         trans1Plan->precallUserData = fftPlan->precallUserData;
2380                                                 }
2381
2382
2383                                                 OPENCL_V(clfftBakePlan(fftPlan->planTX, numQueues, commQueueFFT, NULL, NULL),
2384                                                         _T("BakePlan transpose_nsq_stage1 plan failed"));
2385
2386
2387                                                 //Transpose stage 2 
2388                                                 OPENCL_V(clfftCreateDefaultPlanInternal(&fftPlan->planTY, fftPlan->context, CLFFT_2D, clLengths),
2389                                                         _T("CreateDefaultPlan transpose_nsq_stage2 plan failed"));
2390
2391                                                 FFTPlan* trans2Plan = NULL;
2392                                                 lockRAII* trans2Lock = NULL;
2393                                                 OPENCL_V(fftRepo.getPlan(fftPlan->planTY, trans2Plan, trans2Lock), _T("fftRepo.getPlan failed"));
2394
2395                                                 trans2Plan->placeness = CLFFT_INPLACE;
2396                                                 trans2Plan->precision = fftPlan->precision;
2397                                                 trans2Plan->tmpBufSize = 0;
2398                                                 trans2Plan->batchsize = fftPlan->batchsize;
2399                                                 trans2Plan->envelope = fftPlan->envelope;
2400                                                 trans2Plan->inputLayout = fftPlan->inputLayout;
2401                                                 trans2Plan->outputLayout = fftPlan->outputLayout;
2402                                                 trans2Plan->inStride[0] = fftPlan->inStride[0];
2403                                                 trans2Plan->outStride[0] = fftPlan->outStride[0];
2404                                                 trans2Plan->inStride[1] = fftPlan->inStride[1];
2405                                                 trans2Plan->outStride[1] = fftPlan->outStride[1];
2406                                                 trans2Plan->iDist = fftPlan->iDist;
2407                                                 trans2Plan->oDist = fftPlan->oDist;
2408                                                 trans2Plan->gen = Transpose_NONSQUARE;
2409                                                 trans2Plan->nonSquareKernelOrder = fftPlan->nonSquareKernelOrder;
2410                                                 if (fftPlan->nonSquareKernelOrder == SWAP_AND_TRANSPOSE)
2411                                                         trans2Plan->nonSquareKernelType = NON_SQUARE_TRANS_TRANSPOSE_BATCHED;
2412                                                 else if(fftPlan->nonSquareKernelOrder == TRANSPOSE_AND_SWAP)
2413                                                         trans2Plan->nonSquareKernelType = NON_SQUARE_TRANS_SWAP;
2414                                                 else if(fftPlan->nonSquareKernelOrder == TRANSPOSE_LEADING_AND_SWAP)
2415                                                         trans2Plan->nonSquareKernelType = NON_SQUARE_TRANS_SWAP;
2416                                                 trans2Plan->transflag = true;
2417                                                 trans2Plan->large1D = fftPlan->large1D;//twiddling may happen in this kernel
2418
2419                                                 if (trans2Plan->nonSquareKernelType == NON_SQUARE_TRANS_TRANSPOSE_BATCHED)
2420                                                 {
2421                                                         //need to treat a non-square matrix as a square matrix with a bigger batch size
2422                                                         size_t lengthX = trans2Plan->length[0];
2423                                                         size_t lengthY = trans2Plan->length[1];
2424
2425                                                         size_t BatchFactor = (lengthX > lengthY) ? (lengthX/lengthY) : (lengthY/lengthX);
2426                                                         trans2Plan->transposeMiniBatchSize = BatchFactor;
2427                                                         trans2Plan->batchsize *= BatchFactor;
2428                                                         trans2Plan->iDist = trans2Plan->iDist / BatchFactor;
2429                                                         if (lengthX > lengthY)
2430                                                         {
2431                                                                 trans2Plan->length[0] = lengthX / BatchFactor;
2432                                                                 trans2Plan->inStride[1] = lengthX / BatchFactor;
2433                                                         }
2434                                                         else if(lengthX < lengthY)
2435                                                         {
2436                                                                 trans2Plan->length[1] = lengthY / BatchFactor;
2437                                                                 trans2Plan->inStride[1] = lengthX;
2438                                                         }
2439                                                 }
2440
2441                                                 for (size_t index = 2; index < fftPlan->length.size(); index++)
2442                                                 {
2443                                                         trans2Plan->length.push_back(fftPlan->length[index]);
2444                                                         trans2Plan->inStride.push_back(fftPlan->inStride[index]);
2445                                                         trans2Plan->outStride.push_back(fftPlan->outStride[index]);
2446                                                 }
2447
2448                                                 if (fftPlan->hasPostCallback)
2449                                                 {
2450                                                         trans2Plan->hasPostCallback = true;
2451                                                         trans2Plan->postCallbackParam = fftPlan->postCallbackParam;
2452                                                         trans2Plan->postcallUserData = fftPlan->postcallUserData;
2453                                                 }
2454
2455                                                 OPENCL_V(clfftBakePlan(fftPlan->planTY, numQueues, commQueueFFT, NULL, NULL),
2456                                                         _T("BakePlan transpose_nsq_stage2 plan failed"));
2457                                         }
2458                 }
2459                                 else
2460                                         fftPlan->action = new FFTGeneratedTransposeGCNAction(plHandle, fftPlan, *commQueueFFT, err);
2461
2462                 OPENCL_V( err, "FFTGeneratedTransposeXXXAction failed");
2463
2464                                 fftPlan->baked          = true;
2465                                 return  CLFFT_SUCCESS;
2466                         }
2467
2468                         size_t length0 = fftPlan->length[0];
2469                         size_t length1 = fftPlan->length[1];
2470
2471
2472                         if (fftPlan->length[0] > Large1DThreshold ||
2473                                 fftPlan->length[1] > Large1DThreshold)
2474                                 fftPlan->large2D = true;
2475
2476                         while (1 && (fftPlan->inputLayout != CLFFT_REAL) && (fftPlan->outputLayout != CLFFT_REAL))
2477                         {
2478                                 //break;
2479
2480
2481                 // TODO : Check for a better way to do this.
2482                 bool isnvidia = false;
2483                 for (size_t Idx = 0; !isnvidia && Idx < numQueues; Idx++)
2484                 {
2485                     cl_command_queue QIdx = commQueueFFT[Idx];
2486                     cl_device_id Device;
2487                     clGetCommandQueueInfo(QIdx, CL_QUEUE_DEVICE, sizeof(Device), &Device, NULL);
2488                     char Vendor[256];
2489                     clGetDeviceInfo(Device, CL_DEVICE_VENDOR, sizeof(Vendor), &Vendor, NULL);
2490                     isnvidia |= (strncmp(Vendor, "NVIDIA", 6) == 0);
2491                 }
2492                 // nvidia gpus are failing when doing transpose for 2D FFTs
2493                 if (isnvidia) break;
2494
2495                                 if (fftPlan->length.size() != 2) break;
2496                                 if (!(IsPo2(fftPlan->length[0])) || !(IsPo2(fftPlan->length[1])))
2497                                         break;
2498                                 if (fftPlan->length[1] < 32) break;
2499                                 //TBD: restrict the use of large2D in the x!=y case because we will need two temp buffers
2500                                 //     (1) for 2D usage (2) for 1D large usage
2501                                 //if (fftPlan->large2D) break;
2502                                 //Performance shows 512 is a good case with transpose
2503                                 //if the user wants the result to be transposed, then we will.
2504
2505                                 if (fftPlan->length[0] < 64) break;
2506                                 //x!=y case, we need tmp buffer, currently temp buffer only support interleaved format
2507                                 //if (fftPlan->length[0] != fftPlan->length[1] && fftPlan->outputLayout == CLFFT_COMPLEX_PLANAR) break;
2508                                 if (fftPlan->inStride[0] != 1 || fftPlan->outStride[0] != 1 ||
2509                                         fftPlan->inStride[1] != fftPlan->length[0] || fftPlan->outStride[1] != fftPlan->length[0])
2510                                         break;
2511                                 //if (fftPlan->placeness != CLFFT_INPLACE || fftPlan->inputLayout != CLFFT_COMPLEX_PLANAR)
2512                                 //      break;
2513                                 //if (fftPlan->batchsize != 1) break;
2514                                 //if (fftPlan->precision != CLFFT_SINGLE) break;
2515
2516                                 fftPlan->transflag = true;
2517
2518                                 //create row plan,
2519                                 // x=y & x!=y, In->In for inplace, In->out for outofplace
2520                                 OPENCL_V(clfftCreateDefaultPlanInternal( &fftPlan->planX, fftPlan->context, CLFFT_1D, &fftPlan->length[ DimX ] ),
2521                                         _T( "CreateDefaultPlan for planX failed" ) );
2522
2523                                 FFTPlan* rowPlan        = NULL;
2524                                 lockRAII* rowLock       = NULL;
2525                                 OPENCL_V( fftRepo.getPlan( fftPlan->planX, rowPlan, rowLock ), _T( "fftRepo.getPlan failed" ) );
2526
2527                                 rowPlan->inputLayout     = fftPlan->inputLayout;
2528                                 rowPlan->outputLayout    = fftPlan->outputLayout;
2529                                 rowPlan->placeness       = fftPlan->placeness;
2530                                 rowPlan->outStride[0]    = fftPlan->outStride[0];
2531                                 rowPlan->outStride.push_back(fftPlan->outStride[1]);
2532                                 rowPlan->oDist           = fftPlan->oDist;
2533                                 rowPlan->precision       = fftPlan->precision;
2534                                 rowPlan->forwardScale    = 1.0f;
2535                                 rowPlan->backwardScale   = 1.0f;
2536                                 rowPlan->tmpBufSize      = 0;
2537
2538                                 rowPlan->gen                     = fftPlan->gen;
2539                                 rowPlan->envelope                = fftPlan->envelope;
2540                                 rowPlan->batchsize       = fftPlan->batchsize;
2541                                 rowPlan->inStride[0]     = fftPlan->inStride[0];
2542                                 rowPlan->length.push_back(fftPlan->length[1]);
2543                                 rowPlan->inStride.push_back(fftPlan->inStride[1]);
2544                                 rowPlan->iDist           = fftPlan->iDist;
2545                                 
2546                                 //Set callback data if set on top level plan
2547                                 if (fftPlan->hasPreCallback)
2548                                 {
2549                                         rowPlan->hasPreCallback = true;
2550                                         rowPlan->preCallback = fftPlan->preCallback;
2551                                         rowPlan->precallUserData = fftPlan->precallUserData;
2552                                 }
2553
2554                                 OPENCL_V(clfftBakePlan(fftPlan->planX, numQueues, commQueueFFT, NULL, NULL ),
2555                                         _T( "BakePlan for planX failed" ) );
2556
2557                                 //Create transpose plan for first transpose
2558                                 //x=y: inplace. x!=y inplace: in->tmp, outofplace out->tmp
2559                                 size_t clLengths[] = { 1, 1, 0 };
2560                                 clLengths[0] = fftPlan->length[0];
2561                                 clLengths[1] = fftPlan->length[1];
2562
2563                                 size_t biggerDim = clLengths[0] > clLengths[1] ? clLengths[0] : clLengths[1];
2564                                 size_t smallerDim = biggerDim == clLengths[0] ? clLengths[1] : clLengths[0];
2565                                 size_t padding = 0;
2566
2567                                 fftPlan->transpose_in_2d_inplace = (clLengths[0]==clLengths[1]) ? true : false;
2568                                 if ( (!fftPlan->transpose_in_2d_inplace) && fftPlan->tmpBufSize==0 && fftPlan->length.size()<=2 )
2569                                 {
2570                                         if ((smallerDim % 64 == 0) || (biggerDim % 64 == 0))
2571                                                 if(biggerDim > 512)
2572                                                         padding = 64;
2573
2574                                         // we need tmp buffer for x!=y case
2575                                         // we assume the tmp buffer is packed interleaved
2576                                         fftPlan->tmpBufSize = (smallerDim + padding) * biggerDim *
2577                                                 fftPlan->batchsize * fftPlan->ElementSize();
2578                                 }
2579
2580                                 OPENCL_V(clfftCreateDefaultPlanInternal( &fftPlan->planTX, fftPlan->context, CLFFT_2D, clLengths ),
2581                                         _T( "CreateDefaultPlan for planT failed" ) );
2582
2583                                 FFTPlan* transPlanX     = NULL;
2584                                 lockRAII* transLockX    = NULL;
2585                                 OPENCL_V( fftRepo.getPlan( fftPlan->planTX, transPlanX, transLockX ), _T( "fftRepo.getPlan failed" ) );
2586
2587                                 transPlanX->inputLayout     = fftPlan->outputLayout;
2588                                 transPlanX->precision       = fftPlan->precision;
2589                                 transPlanX->tmpBufSize      = 0;
2590
2591                                 transPlanX->envelope            = fftPlan->envelope;
2592                                 transPlanX->batchsize       = fftPlan->batchsize;
2593                                 transPlanX->inStride[0]     = fftPlan->outStride[0];
2594                                 transPlanX->inStride[1]     = fftPlan->outStride[1];
2595                                 transPlanX->iDist           = fftPlan->oDist;
2596                                 transPlanX->transflag       = true;
2597
2598                                 if (!fftPlan->transpose_in_2d_inplace)
2599                                 {
2600                                         transPlanX->gen = Transpose_GCN;
2601                                         transPlanX->outputLayout    = CLFFT_COMPLEX_INTERLEAVED;
2602                                         transPlanX->placeness       = CLFFT_OUTOFPLACE;
2603                                         transPlanX->outStride[0]    = 1;
2604                                         transPlanX->outStride[1]    = clLengths[1] + padding;
2605                                         transPlanX->oDist           = clLengths[0] * transPlanX->outStride[1];
2606                                 }
2607                                 else
2608                                 {
2609                                         transPlanX->gen = Transpose_SQUARE;
2610                                         transPlanX->outputLayout    = fftPlan->outputLayout;
2611                                         transPlanX->placeness       = CLFFT_INPLACE;
2612                                         transPlanX->outStride[0]    = fftPlan->outStride[0];
2613                                         transPlanX->outStride[1]    = fftPlan->outStride[1];
2614                                         transPlanX->oDist           = fftPlan->oDist;
2615                                 }
2616
2617                                 OPENCL_V(clfftBakePlan(fftPlan->planTX, numQueues, commQueueFFT, NULL, NULL ),
2618                                         _T( "BakePlan for planTX failed" ) );
2619
2620                                 //create second row plan
2621                                 //x!=y: tmp->tmp, x=y case: In->In or Out->Out
2622                                 //if Transposed result is a choice x!=y: tmp->In or out
2623                                 OPENCL_V(clfftCreateDefaultPlanInternal( &fftPlan->planY, fftPlan->context, CLFFT_1D, &fftPlan->length[ DimY ] ),
2624                                         _T( "CreateDefaultPlan for planY failed" ) );
2625
2626                                 FFTPlan* colPlan        = NULL;
2627                                 lockRAII* colLock       = NULL;
2628                                 OPENCL_V( fftRepo.getPlan( fftPlan->planY, colPlan, colLock ), _T( "fftRepo.getPlan failed" ) );
2629
2630                                 if (!fftPlan->transpose_in_2d_inplace)
2631                                 {
2632                                         colPlan->inputLayout     = CLFFT_COMPLEX_INTERLEAVED;
2633                                         colPlan->inStride[0]     = 1;
2634                                         colPlan->inStride.push_back(clLengths[1] + padding);
2635                                         colPlan->iDist           = clLengths[0] * colPlan->inStride[1];
2636
2637                                         if (fftPlan->transposed == CLFFT_NOTRANSPOSE)
2638                                         {
2639                                                 colPlan->outputLayout    = CLFFT_COMPLEX_INTERLEAVED;
2640                                                 colPlan->outStride[0]    = 1;
2641                                                 colPlan->outStride.push_back(clLengths[1] + padding);
2642                                                 colPlan->oDist           = clLengths[0] * colPlan->outStride[1];
2643                                                 colPlan->placeness       = CLFFT_INPLACE;
2644                                         }
2645                                         else
2646                                         {
2647                                                 colPlan->outputLayout    = fftPlan->outputLayout;
2648                                                 colPlan->outStride[0]    = fftPlan->outStride[0];
2649                                                 colPlan->outStride.push_back(clLengths[1] * fftPlan->outStride[0]);
2650                                                 colPlan->oDist           = fftPlan->oDist;
2651                                                 colPlan->placeness       = CLFFT_OUTOFPLACE;
2652                                         }
2653                                 }
2654                                 else
2655                                 {
2656                                         colPlan->inputLayout     = fftPlan->outputLayout;
2657                                         colPlan->outputLayout    = fftPlan->outputLayout;
2658                                         colPlan->outStride[0]    = fftPlan->outStride[0];
2659                                         colPlan->outStride.push_back(fftPlan->outStride[1]);
2660                                         colPlan->oDist           = fftPlan->oDist;
2661                                         colPlan->inStride[0]     = fftPlan->outStride[0];
2662                                         colPlan->inStride.push_back(fftPlan->outStride[1]);
2663                                         colPlan->iDist           = fftPlan->oDist;
2664                                         colPlan->placeness       = CLFFT_INPLACE;
2665                                 }
2666
2667                                 colPlan->precision       = fftPlan->precision;
2668                                 colPlan->forwardScale    = fftPlan->forwardScale;
2669                                 colPlan->backwardScale   = fftPlan->backwardScale;
2670                                 colPlan->tmpBufSize      = 0;
2671
2672                                 colPlan->gen                     = fftPlan->gen;
2673                                 colPlan->envelope                = fftPlan->envelope;
2674                                 colPlan->batchsize       = fftPlan->batchsize;
2675                                 colPlan->length.push_back(fftPlan->length[0]);
2676
2677                                 OPENCL_V(clfftBakePlan(fftPlan->planY, numQueues, commQueueFFT, NULL, NULL ),
2678                                         _T( "BakePlan for planY failed" ) );
2679
2680                                 if (fftPlan->transposed == CLFFT_TRANSPOSED)
2681                                 {
2682                                         fftPlan->baked = true;
2683                                         return  CLFFT_SUCCESS;
2684                                 }
2685
2686                                 //Create transpose plan for second transpose
2687                                 //x!=y case tmp->In or Out, x=y case In->In or Out->out
2688                                 size_t clLengthsY[2] = { clLengths[1], clLengths[0] };
2689                                 OPENCL_V(clfftCreateDefaultPlanInternal( &fftPlan->planTY, fftPlan->context, CLFFT_2D, clLengthsY ),
2690                                         _T( "CreateDefaultPlan for planTY failed" ) );
2691
2692                                 FFTPlan* transPlanY     = NULL;
2693                                 lockRAII* transLockY    = NULL;
2694                                 OPENCL_V( fftRepo.getPlan( fftPlan->planTY, transPlanY, transLockY ), _T( "fftRepo.getPlan failed" ) );
2695
2696                                 if (!fftPlan->transpose_in_2d_inplace)
2697                                 {
2698                                         transPlanY->gen = Transpose_GCN;
2699                                         transPlanY->inputLayout     = CLFFT_COMPLEX_INTERLEAVED;
2700                                         transPlanY->placeness       = CLFFT_OUTOFPLACE;
2701                                         transPlanY->inStride[0]     = 1;
2702                                         transPlanY->inStride[1]     = clLengths[1] + padding;
2703                                         transPlanY->iDist           = clLengths[0] * transPlanY->inStride[1];
2704                                         transPlanY->transOutHorizontal = true;
2705                                 }
2706                                 else
2707                                 {
2708                                         transPlanY->gen = Transpose_SQUARE;
2709                                         transPlanY->inputLayout     = fftPlan->outputLayout;
2710                                         transPlanY->placeness       = CLFFT_INPLACE;
2711                                         transPlanY->inStride[0]     = fftPlan->outStride[0];
2712                                         transPlanY->inStride[1]     = fftPlan->outStride[1];
2713                                         transPlanY->iDist           = fftPlan->oDist;
2714                                 }
2715                                 transPlanY->outputLayout    = fftPlan->outputLayout;
2716                                 transPlanY->outStride[0]    = fftPlan->outStride[0];
2717                                 transPlanY->outStride[1]    = fftPlan->outStride[1];
2718                                 transPlanY->oDist           = fftPlan->oDist;
2719                                 transPlanY->precision       = fftPlan->precision;
2720                                 transPlanY->tmpBufSize      = 0;
2721
2722                                 transPlanY->envelope            = fftPlan->envelope;
2723                                 transPlanY->batchsize       = fftPlan->batchsize;
2724                                 transPlanY->transflag       = true;
2725
2726                                 //Set callback data if set on top level plan
2727                                 if (fftPlan->hasPostCallback)
2728                                 {
2729                                         transPlanY->hasPostCallback = true;
2730                                         transPlanY->postCallbackParam = fftPlan->postCallbackParam;
2731                                         transPlanY->postcallUserData = fftPlan->postcallUserData;
2732                                 }
2733
2734                                 OPENCL_V(clfftBakePlan(fftPlan->planTY, numQueues, commQueueFFT, NULL, NULL ),
2735                                         _T( "BakePlan for planTY failed" ) );
2736
2737                                 fftPlan->baked = true;
2738                                 return  CLFFT_SUCCESS;
2739                         }
2740
2741                         //check transposed
2742                         if (fftPlan->transposed != CLFFT_NOTRANSPOSE)
2743                                 return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
2744
2745
2746                         if(fftPlan->inputLayout == CLFFT_REAL)
2747                         {
2748                                 length0 = fftPlan->length[0];
2749                                 length1 = fftPlan->length[1];
2750
2751                                 size_t Nt = (1 + length0/2);
2752
2753
2754                                 // create row plan
2755                                 // real to hermitian
2756
2757                                 //create row plan
2758                                 OPENCL_V(clfftCreateDefaultPlanInternal( &fftPlan->planX, fftPlan->context, CLFFT_1D, &fftPlan->length[ DimX ] ),
2759                                         _T( "CreateDefaultPlan for planX failed" ) );
2760
2761                                 FFTPlan* rowPlan        = NULL;
2762                                 lockRAII* rowLock       = NULL;
2763                                 OPENCL_V( fftRepo.getPlan( fftPlan->planX, rowPlan, rowLock ), _T( "fftRepo.getPlan failed" ) );
2764
2765
2766                                 rowPlan->outputLayout  = fftPlan->outputLayout;
2767                                 rowPlan->inputLayout  = fftPlan->inputLayout;
2768                                 rowPlan->placeness     = fftPlan->placeness;
2769                                 rowPlan->length.push_back(length1);
2770
2771                                 rowPlan->inStride[0]  = fftPlan->inStride[0];
2772                                 rowPlan->inStride.push_back(fftPlan->inStride[1]);
2773                                 rowPlan->iDist         = fftPlan->iDist;
2774
2775                                 rowPlan->precision     = fftPlan->precision;
2776                                 rowPlan->forwardScale  = 1.0f;
2777                                 rowPlan->backwardScale = 1.0f;
2778                                 rowPlan->tmpBufSize    = 0;
2779
2780                                 rowPlan->gen                    = fftPlan->gen;
2781                                 rowPlan->envelope               = fftPlan->envelope;
2782
2783                                 rowPlan->batchsize    = fftPlan->batchsize;
2784
2785                                 rowPlan->outStride[0]  = fftPlan->outStride[0];
2786                                 rowPlan->outStride.push_back(fftPlan->outStride[1]);
2787                                 rowPlan->oDist         = fftPlan->oDist;
2788
2789                                 //this 2d is decomposed from 3d
2790                                 for (size_t index=2; index < fftPlan->length.size(); index++)
2791                                 {
2792                                         rowPlan->length.push_back(fftPlan->length[index]);
2793                                         rowPlan->inStride.push_back(fftPlan->inStride[index]);
2794                                         rowPlan->outStride.push_back(fftPlan->outStride[index]);
2795                                 }
2796
2797                                 //Set callback data if set on top level plan
2798                                 if (fftPlan->hasPreCallback)
2799                                 {
2800                                         rowPlan->hasPreCallback = true;
2801                                         rowPlan->preCallback = fftPlan->preCallback;
2802                                         rowPlan->precallUserData = fftPlan->precallUserData;
2803                                 }
2804
2805                                 OPENCL_V(clfftBakePlan(fftPlan->planX, numQueues, commQueueFFT, NULL, NULL ), _T( "BakePlan for planX failed" ) );
2806
2807                                 if( (rowPlan->inStride[0] == 1) && (rowPlan->outStride[0] == 1) &&
2808                                         ( ((rowPlan->inStride[1] == Nt*2) && (rowPlan->placeness == CLFFT_INPLACE)) ||
2809                                           ((rowPlan->inStride[1] == length0) && (rowPlan->placeness == CLFFT_OUTOFPLACE)) )
2810                                         && (rowPlan->outStride[1] == Nt) )
2811                                 {
2812                                         // calc temp buf size
2813                                         if (fftPlan->tmpBufSize==0)
2814                                         {
2815                                                 fftPlan->tmpBufSize = Nt * length1 * fftPlan->batchsize * fftPlan->ElementSize();
2816
2817                                                 for (size_t index=2; index < fftPlan->length.size(); index++)
2818                                                 {
2819                                                         fftPlan->tmpBufSize *= fftPlan->length[index];
2820                                                 }
2821                                         }
2822
2823                                         // create first transpose plan
2824                                         
2825                                         //Transpose 
2826                                         // output --> tmp
2827                                         size_t transLengths[2] = { length0, length1 };
2828                                         OPENCL_V(clfftCreateDefaultPlanInternal( &fftPlan->planTX, fftPlan->context, CLFFT_2D, transLengths ),
2829                                                 _T( "CreateDefaultPlan for planTX transpose failed" ) );
2830
2831                                         FFTPlan* trans1Plan     = NULL;
2832                                         lockRAII* trans1Lock    = NULL;
2833                                         OPENCL_V( fftRepo.getPlan( fftPlan->planTX, trans1Plan, trans1Lock ), _T( "fftRepo.getPlan failed" ) );
2834
2835                                         trans1Plan->transflag = true;
2836
2837                                         transLengths[0] = Nt;
2838                                         OPENCL_V(clfftSetPlanLength( fftPlan->planTX, CLFFT_2D, transLengths ),
2839                                                 _T( "clfftSetPlanLength for planTX transpose failed" ) );
2840
2841                                         switch(fftPlan->outputLayout)
2842                                         {
2843                                         case CLFFT_HERMITIAN_INTERLEAVED:
2844                                                 {
2845                                                         trans1Plan->outputLayout = CLFFT_COMPLEX_INTERLEAVED;
2846                                                         trans1Plan->inputLayout  = CLFFT_COMPLEX_INTERLEAVED;
2847                                                 }
2848                                                 break;
2849                                         case CLFFT_HERMITIAN_PLANAR:
2850                                                 {
2851                                                         trans1Plan->outputLayout = CLFFT_COMPLEX_INTERLEAVED;
2852                                                         trans1Plan->inputLayout  = CLFFT_COMPLEX_PLANAR;
2853                                                 }
2854                                                 break;
2855                                         default: assert(false);
2856                                         }
2857
2858                                         trans1Plan->placeness     = CLFFT_OUTOFPLACE;
2859                                         trans1Plan->precision     = fftPlan->precision;
2860                                         trans1Plan->tmpBufSize    = 0;
2861                                         trans1Plan->batchsize     = fftPlan->batchsize;
2862                                         trans1Plan->envelope      = fftPlan->envelope;
2863                                         trans1Plan->forwardScale  = 1.0f;
2864                                         trans1Plan->backwardScale = 1.0f;
2865
2866                                         trans1Plan->inStride[0]   = 1;
2867                                         trans1Plan->inStride[1]   = Nt;
2868                                         trans1Plan->outStride[0]  = 1;
2869                                         trans1Plan->outStride[1]  = length1;
2870                                         trans1Plan->iDist         = rowPlan->oDist;
2871                                         trans1Plan->oDist                 = Nt*length1;
2872                                         trans1Plan->transOutHorizontal = true;
2873
2874                                         trans1Plan->gen           = Transpose_GCN;
2875
2876
2877                                         for (size_t index=2; index < fftPlan->length.size(); index++)
2878                                         {
2879                                                 trans1Plan->length.push_back(fftPlan->length[index]);
2880                                                 trans1Plan->inStride.push_back(rowPlan->outStride[index]);
2881                                                 trans1Plan->outStride.push_back(trans1Plan->oDist);
2882                                                 trans1Plan->oDist *= fftPlan->length[index];
2883                                         }
2884
2885                                         OPENCL_V(clfftBakePlan(fftPlan->planTX, numQueues, commQueueFFT, NULL, NULL ),
2886                                                 _T( "BakePlan for planTX failed" ) );
2887
2888
2889                                         // Create column plan as a row plan
2890                                         OPENCL_V(clfftCreateDefaultPlanInternal( &fftPlan->planY, fftPlan->context, CLFFT_1D, &fftPlan->length[ DimY ] ),
2891                                                 _T( "CreateDefaultPlan for planY failed" ) );
2892
2893                                         FFTPlan* colPlan        = NULL;
2894                                         lockRAII* colLock       = NULL;
2895                                         OPENCL_V( fftRepo.getPlan( fftPlan->planY, colPlan, colLock ), _T( "fftRepo.getPlan failed" ) );
2896
2897                                         colPlan->outputLayout  = trans1Plan->outputLayout;
2898                                         colPlan->inputLayout   = trans1Plan->outputLayout;
2899                                         colPlan->placeness     = CLFFT_INPLACE;
2900                                         colPlan->length.push_back(Nt);
2901
2902                                         colPlan->inStride[0]  = 1;
2903                                         colPlan->inStride.push_back(length1);
2904                                         colPlan->iDist         = Nt*length1;
2905
2906                                         colPlan->outStride[0]  = 1;
2907                                         colPlan->outStride.push_back(length1);
2908                                         colPlan->oDist         = Nt*length1;
2909
2910                                         colPlan->precision     = fftPlan->precision;
2911                                         colPlan->forwardScale  = fftPlan->forwardScale;
2912                                         colPlan->backwardScale = fftPlan->backwardScale;
2913                                         colPlan->tmpBufSize    = 0;
2914
2915                                         colPlan->gen                    = fftPlan->gen;
2916                                         colPlan->envelope               = fftPlan->envelope;
2917
2918                                         colPlan->batchsize    = fftPlan->batchsize;
2919
2920                                         //this 2d is decomposed from 3d
2921                                         for (size_t index=2; index < fftPlan->length.size(); index++)
2922                                         {
2923                                                 colPlan->length.push_back(fftPlan->length[index]);
2924                                                 colPlan->inStride.push_back(colPlan->iDist);
2925                                                 colPlan->outStride.push_back(colPlan->oDist);
2926                                                 colPlan->iDist *= fftPlan->length[index];
2927                                                 colPlan->oDist *= fftPlan->length[index];
2928                                         }
2929
2930                                         OPENCL_V(clfftBakePlan(fftPlan->planY, numQueues, commQueueFFT, NULL, NULL ),
2931                                                 _T( "BakePlan for planY failed" ) );
2932
2933                                         if (fftPlan->transposed == CLFFT_TRANSPOSED)
2934                                         {
2935                                                 fftPlan->baked = true;
2936                                                 return  CLFFT_SUCCESS;
2937                                         }
2938
2939                                         // create second transpose plan
2940                                         
2941                                         //Transpose 
2942                                         //tmp --> output
2943                                         size_t trans2Lengths[2] = { length1, length0 };
2944                                         OPENCL_V(clfftCreateDefaultPlanInternal( &fftPlan->planTY, fftPlan->context, CLFFT_2D, trans2Lengths ),
2945                                                 _T( "CreateDefaultPlan for planTY transpose failed" ) );
2946
2947                                         FFTPlan* trans2Plan     = NULL;
2948                                         lockRAII* trans2Lock    = NULL;
2949                                         OPENCL_V( fftRepo.getPlan( fftPlan->planTY, trans2Plan, trans2Lock ), _T( "fftRepo.getPlan failed" ) );
2950
2951                                         trans2Plan->transflag = true;
2952
2953                                         trans2Lengths[1] = Nt;
2954                                         OPENCL_V(clfftSetPlanLength( fftPlan->planTY, CLFFT_2D, trans2Lengths ),
2955                                                 _T( "clfftSetPlanLength for planTY transpose failed" ) );
2956
2957                                         switch(fftPlan->outputLayout)
2958                                         {
2959                                         case CLFFT_HERMITIAN_INTERLEAVED:
2960                                                 {
2961                                                         trans2Plan->outputLayout = CLFFT_COMPLEX_INTERLEAVED;
2962                                                         trans2Plan->inputLayout  = CLFFT_COMPLEX_INTERLEAVED;
2963                                                 }
2964                                                 break;
2965                                         case CLFFT_HERMITIAN_PLANAR:
2966                                                 {
2967                                                         trans2Plan->outputLayout = CLFFT_COMPLEX_PLANAR;
2968                                                         trans2Plan->inputLayout  = CLFFT_COMPLEX_INTERLEAVED;
2969                                                 }
2970                                                 break;
2971                                         default: assert(false);
2972                                         }
2973
2974                                         trans2Plan->placeness     = CLFFT_OUTOFPLACE;
2975                                         trans2Plan->precision     = fftPlan->precision;
2976                                         trans2Plan->tmpBufSize    = 0;
2977                                         trans2Plan->batchsize     = fftPlan->batchsize;
2978                                         trans2Plan->envelope      = fftPlan->envelope;
2979                                         trans2Plan->forwardScale  = 1.0f;
2980                                         trans2Plan->backwardScale = 1.0f;
2981
2982                                         trans2Plan->inStride[0]   = 1;
2983                                         trans2Plan->inStride[1]   = length1;
2984                                         trans2Plan->outStride[0]  = 1;
2985                                         trans2Plan->outStride[1]  = Nt;
2986                                         trans2Plan->iDist         = Nt*length1;
2987                                         trans2Plan->oDist                 = fftPlan->oDist;
2988
2989                                         trans2Plan->gen           = Transpose_GCN;
2990                                         trans2Plan->transflag     = true;
2991
2992                                         for (size_t index=2; index < fftPlan->length.size(); index++)
2993                                         {
2994                                                 trans2Plan->length.push_back(fftPlan->length[index]);
2995                                                 trans2Plan->inStride.push_back(trans2Plan->iDist);
2996                                                 trans2Plan->iDist *= fftPlan->length[index];
2997                                                 trans2Plan->outStride.push_back(fftPlan->outStride[index]);
2998
2999                                         }
3000
3001                                         //Set callback data if set on top level plan
3002                                         if (fftPlan->hasPostCallback)
3003                                         {
3004                                                 trans2Plan->hasPostCallback = true;
3005                                                 trans2Plan->postCallbackParam = fftPlan->postCallbackParam;
3006                                                 trans2Plan->postcallUserData = fftPlan->postcallUserData;
3007                                         }
3008
3009                                         OPENCL_V(clfftBakePlan(fftPlan->planTY, numQueues, commQueueFFT, NULL, NULL ),
3010                                                 _T( "BakePlan for planTY failed" ) );
3011
3012                                 }
3013                                 else
3014                                 {
3015                                         // create col plan
3016                                         // complex to complex
3017
3018                                         OPENCL_V(clfftCreateDefaultPlanInternal( &fftPlan->planY, fftPlan->context, CLFFT_1D, &fftPlan->length[ DimY ] ),
3019                                                 _T( "CreateDefaultPlan for planY failed" ) );
3020
3021                                         FFTPlan* colPlan        = NULL;
3022                                         lockRAII* colLock       = NULL;
3023                                         OPENCL_V( fftRepo.getPlan( fftPlan->planY, colPlan, colLock ), _T( "fftRepo.getPlan failed" ) );
3024
3025                                         switch(fftPlan->outputLayout)
3026                                         {
3027                                         case CLFFT_HERMITIAN_INTERLEAVED:
3028                                                 {
3029                                                         colPlan->outputLayout = CLFFT_COMPLEX_INTERLEAVED;
3030                                                         colPlan->inputLayout  = CLFFT_COMPLEX_INTERLEAVED;
3031                                                 }
3032                                                 break;
3033                                         case CLFFT_HERMITIAN_PLANAR:
3034                                                 {
3035                                                         colPlan->outputLayout = CLFFT_COMPLEX_PLANAR;
3036                                                         colPlan->inputLayout  = CLFFT_COMPLEX_PLANAR;
3037                                                 }
3038                                                 break;
3039                                         default: assert(false);
3040                                         }
3041
3042                                         colPlan->placeness     = CLFFT_INPLACE;
3043                                         colPlan->length.push_back(Nt);
3044
3045                                         colPlan->outStride[0]  = fftPlan->outStride[1];
3046                                         colPlan->outStride.push_back(fftPlan->outStride[0]);
3047                                         colPlan->oDist         = fftPlan->oDist;
3048
3049
3050                                         colPlan->precision     = fftPlan->precision;
3051                                         colPlan->forwardScale  = fftPlan->forwardScale;
3052                                         colPlan->backwardScale = fftPlan->backwardScale;
3053                                         colPlan->tmpBufSize    = fftPlan->tmpBufSize;
3054
3055                                         colPlan->gen                    = fftPlan->gen;
3056                                         colPlan->envelope                       = fftPlan->envelope;
3057
3058                                         colPlan->batchsize = fftPlan->batchsize;
3059
3060                                         colPlan->inStride[0]  = rowPlan->outStride[1];
3061                                         colPlan->inStride.push_back(rowPlan->outStride[0]);
3062                                         colPlan->iDist         = rowPlan->oDist;
3063
3064                                         //this 2d is decomposed from 3d
3065                                         for (size_t index=2; index < fftPlan->length.size(); index++)
3066                                         {
3067                                                 colPlan->length.push_back(fftPlan->length[index]);
3068                                                 colPlan->outStride.push_back(fftPlan->outStride[index]);
3069                                                 colPlan->inStride.push_back(rowPlan->outStride[index]);
3070                                         }
3071
3072                                         //Set callback data if set on top level plan
3073                                         if (fftPlan->hasPostCallback)
3074                                         {
3075                                                 colPlan->hasPostCallback = true;
3076                                                 colPlan->postCallbackParam = fftPlan->postCallbackParam;
3077                                                 colPlan->postcallUserData = fftPlan->postcallUserData;
3078                                         }
3079
3080                                         OPENCL_V(clfftBakePlan(fftPlan->planY, numQueues, commQueueFFT, NULL, NULL ), _T( "BakePlan for planY failed" ) );
3081                                 }
3082
3083                         }
3084                         else if(fftPlan->outputLayout == CLFFT_REAL)
3085                         {
3086                                 length0 = fftPlan->length[0];
3087                                 length1 = fftPlan->length[1];
3088
3089                                 size_t Nt = (1 + length0/2);
3090                                 if (fftPlan->tmpBufSize==0)
3091                                 {
3092                                         fftPlan->tmpBufSize = Nt * length1 * fftPlan->batchsize * fftPlan->ElementSize();
3093                                         for (size_t index=2; index < fftPlan->length.size(); index++)
3094                                                 fftPlan->tmpBufSize *= fftPlan->length[index];
3095                                 }
3096
3097                                 if ((fftPlan->tmpBufSizeC2R==0) && (fftPlan->placeness == CLFFT_OUTOFPLACE) && (fftPlan->length.size() == 2))
3098                                 {
3099                                         fftPlan->tmpBufSizeC2R = fftPlan->tmpBufSize;
3100                                 }
3101
3102                                 if( (fftPlan->inStride[0] == 1) && (fftPlan->outStride[0] == 1) &&
3103                                         ( ((fftPlan->outStride[1] == Nt*2) && (fftPlan->oDist == Nt*2*length1) && (fftPlan->placeness == CLFFT_INPLACE)) ||
3104                                                 ((fftPlan->outStride[1] == length0) && (fftPlan->oDist == length0*length1) && (fftPlan->placeness == CLFFT_OUTOFPLACE)) )
3105                                         && (fftPlan->inStride[1] == Nt) && (fftPlan->iDist == Nt*length1) )
3106                                 {
3107                                         // create first transpose plan
3108                                         
3109                                         //Transpose 
3110                                         // input --> tmp
3111                                         size_t transLengths[2] = { length0, length1 };
3112                                         OPENCL_V(clfftCreateDefaultPlanInternal( &fftPlan->planTY, fftPlan->context, CLFFT_2D, transLengths ),
3113                                                 _T( "CreateDefaultPlan for planTY transpose failed" ) );
3114
3115                                         FFTPlan* trans1Plan     = NULL;
3116                                         lockRAII* trans1Lock    = NULL;
3117                                         OPENCL_V( fftRepo.getPlan( fftPlan->planTY, trans1Plan, trans1Lock ), _T( "fftRepo.getPlan failed" ) );
3118
3119                                         trans1Plan->transflag = true;
3120
3121                                         transLengths[0] = Nt;
3122                                         OPENCL_V(clfftSetPlanLength( fftPlan->planTY, CLFFT_2D, transLengths ),
3123                                                 _T( "clfftSetPlanLength for planTY transpose failed" ) );
3124
3125                                         switch(fftPlan->inputLayout)
3126                                         {
3127                                         case CLFFT_HERMITIAN_INTERLEAVED:
3128                                                 {
3129                                                         trans1Plan->outputLayout = CLFFT_COMPLEX_INTERLEAVED;
3130                                                         trans1Plan->inputLayout  = CLFFT_COMPLEX_INTERLEAVED;
3131                                                 }
3132                                                 break;
3133                                         case CLFFT_HERMITIAN_PLANAR:
3134                                                 {
3135                                                         trans1Plan->outputLayout = CLFFT_COMPLEX_INTERLEAVED;
3136                                                         trans1Plan->inputLayout  = CLFFT_COMPLEX_PLANAR;
3137                                                 }
3138                                                 break;
3139                                         default: assert(false);
3140                                         }
3141
3142                                         trans1Plan->placeness     = CLFFT_OUTOFPLACE;
3143                                         trans1Plan->precision     = fftPlan->precision;
3144                                         trans1Plan->tmpBufSize    = 0;
3145                                         trans1Plan->batchsize     = fftPlan->batchsize;
3146                                         trans1Plan->envelope      = fftPlan->envelope;
3147                                         trans1Plan->forwardScale  = 1.0f;
3148                                         trans1Plan->backwardScale = 1.0f;
3149
3150                                         trans1Plan->inStride[0]   = 1;
3151                                         trans1Plan->inStride[1]   = Nt;
3152                                         trans1Plan->outStride[0]  = 1;
3153                                         trans1Plan->outStride[1]  = length1;
3154                                         trans1Plan->iDist         = fftPlan->iDist;
3155                                         trans1Plan->oDist                 = Nt*length1;
3156                                         trans1Plan->transOutHorizontal = true;
3157
3158                                         trans1Plan->gen           = Transpose_GCN;
3159
3160
3161                                         for (size_t index=2; index < fftPlan->length.size(); index++)
3162                                         {
3163                                                 trans1Plan->length.push_back(fftPlan->length[index]);
3164                                                 trans1Plan->inStride.push_back(fftPlan->inStride[index]);
3165                                                 trans1Plan->outStride.push_back(trans1Plan->oDist);
3166                                                 trans1Plan->oDist *= fftPlan->length[index];
3167                                         }
3168
3169                                         //Set callback data if set on top level plan
3170                                         if (fftPlan->hasPreCallback)
3171                                         {
3172                                                 trans1Plan->hasPreCallback = true;
3173                                                 trans1Plan->preCallback = fftPlan->preCallback;
3174                                                 trans1Plan->precallUserData = fftPlan->precallUserData;
3175                                         }
3176
3177                                         OPENCL_V(clfftBakePlan(fftPlan->planTY, numQueues, commQueueFFT, NULL, NULL ),
3178                                                 _T( "BakePlan for planTY failed" ) );
3179
3180                                         // create col plan
3181                                         // complex to complex
3182
3183                                         OPENCL_V(clfftCreateDefaultPlanInternal( &fftPlan->planY, fftPlan->context, CLFFT_1D, &fftPlan->length[ DimY ] ),
3184                                                 _T( "CreateDefaultPlan for planY failed" ) );
3185
3186                                         FFTPlan* colPlan        = NULL;
3187                                         lockRAII* colLock       = NULL;
3188                                         OPENCL_V( fftRepo.getPlan( fftPlan->planY, colPlan, colLock ), _T( "fftRepo.getPlan failed" ) );
3189
3190                                         colPlan->length.push_back(Nt);
3191
3192                                         colPlan->inStride[0]  = 1;
3193                                         colPlan->inStride.push_back(length1);
3194                                         colPlan->iDist         = trans1Plan->oDist;
3195
3196                                         colPlan->placeness = CLFFT_INPLACE;
3197                                         colPlan->inputLayout = CLFFT_COMPLEX_INTERLEAVED;
3198                                         colPlan->outputLayout = CLFFT_COMPLEX_INTERLEAVED;
3199
3200                                         colPlan->outStride[0]  = colPlan->inStride[0];
3201                                         colPlan->outStride.push_back(colPlan->inStride[1]);
3202                                         colPlan->oDist         = colPlan->iDist;
3203
3204                                         for (size_t index=2; index < fftPlan->length.size(); index++)
3205                                         {
3206                                                 colPlan->length.push_back(fftPlan->length[index]);
3207                                                 colPlan->inStride.push_back(trans1Plan->outStride[index]);
3208                                                 colPlan->outStride.push_back(trans1Plan->outStride[index]);
3209                                         }
3210
3211
3212                                         colPlan->precision     = fftPlan->precision;
3213                                         colPlan->forwardScale  = 1.0f;
3214                                         colPlan->backwardScale = 1.0f;
3215                                         colPlan->tmpBufSize    = 0;
3216
3217                                         colPlan->gen                    = fftPlan->gen;
3218                                         colPlan->envelope               = fftPlan->envelope;
3219
3220                                         colPlan->batchsize = fftPlan->batchsize;
3221
3222                                         OPENCL_V(clfftBakePlan(fftPlan->planY, numQueues, commQueueFFT, NULL, NULL ), _T( "BakePlan for planY failed" ) );
3223
3224                                         // create second transpose plan
3225                                         
3226                                         //Transpose 
3227                                         //tmp --> output
3228                                         size_t trans2Lengths[2] = { length1, length0 };
3229                                         OPENCL_V(clfftCreateDefaultPlanInternal( &fftPlan->planTX, fftPlan->context, CLFFT_2D, trans2Lengths ),
3230                                                 _T( "CreateDefaultPlan for planTX transpose failed" ) );
3231
3232                                         FFTPlan* trans2Plan     = NULL;
3233                                         lockRAII* trans2Lock    = NULL;
3234                                         OPENCL_V( fftRepo.getPlan( fftPlan->planTX, trans2Plan, trans2Lock ), _T( "fftRepo.getPlan failed" ) );
3235
3236                                         trans2Plan->transflag = true;
3237
3238                                         trans2Lengths[1] = Nt;
3239                                         OPENCL_V(clfftSetPlanLength( fftPlan->planTX, CLFFT_2D, trans2Lengths ),
3240                                                 _T( "clfftSetPlanLength for planTX transpose failed" ) );
3241
3242
3243                                         trans2Plan->outputLayout = CLFFT_COMPLEX_INTERLEAVED;
3244                                         trans2Plan->inputLayout  = CLFFT_COMPLEX_INTERLEAVED;
3245
3246
3247                                         trans2Plan->placeness     = CLFFT_OUTOFPLACE;
3248                                         trans2Plan->precision     = fftPlan->precision;
3249                                         trans2Plan->tmpBufSize    = 0;
3250                                         trans2Plan->batchsize     = fftPlan->batchsize;
3251                                         trans2Plan->envelope      = fftPlan->envelope;
3252                                         trans2Plan->forwardScale  = 1.0f;
3253                                         trans2Plan->backwardScale = 1.0f;
3254
3255                                         trans2Plan->inStride[0]   = 1;
3256                                         trans2Plan->inStride[1]   = length1;
3257                                         trans2Plan->outStride[0]  = 1;
3258                                         trans2Plan->outStride[1]  = Nt;
3259                                         trans2Plan->iDist         = colPlan->oDist;
3260                                         trans2Plan->oDist                 = Nt*length1;
3261
3262                                         trans2Plan->gen           = Transpose_GCN;
3263                                         trans2Plan->transflag     = true;
3264
3265                                         for (size_t index=2; index < fftPlan->length.size(); index++)
3266                                         {
3267                                                 trans2Plan->length.push_back(fftPlan->length[index]);
3268                                                 trans2Plan->inStride.push_back(colPlan->outStride[index]);
3269                                                 trans2Plan->outStride.push_back(trans2Plan->oDist);
3270                                                 trans2Plan->oDist *= fftPlan->length[index];
3271
3272                                         }
3273
3274                                         OPENCL_V(clfftBakePlan(fftPlan->planTX, numQueues, commQueueFFT, NULL, NULL ),
3275                                                 _T( "BakePlan for planTX failed" ) );
3276
3277                                         // create row plan
3278                                         // hermitian to real
3279
3280                                         //create row plan
3281                                         OPENCL_V(clfftCreateDefaultPlanInternal( &fftPlan->planX, fftPlan->context, CLFFT_1D, &fftPlan->length[ DimX ] ),
3282                                                 _T( "CreateDefaultPlan for planX failed" ) );
3283
3284                                         FFTPlan* rowPlan        = NULL;
3285                                         lockRAII* rowLock       = NULL;
3286                                         OPENCL_V( fftRepo.getPlan( fftPlan->planX, rowPlan, rowLock ), _T( "fftRepo.getPlan failed" ) );
3287
3288                                         rowPlan->outputLayout  = fftPlan->outputLayout;
3289                                         rowPlan->inputLayout   = CLFFT_HERMITIAN_INTERLEAVED;
3290
3291                                         rowPlan->length.push_back(length1);
3292
3293                                         rowPlan->outStride[0]  = fftPlan->outStride[0];
3294                                         rowPlan->outStride.push_back(fftPlan->outStride[1]);
3295                                         rowPlan->oDist         = fftPlan->oDist;
3296
3297                                         rowPlan->inStride[0]  = trans2Plan->outStride[0];
3298                                         rowPlan->inStride.push_back(trans2Plan->outStride[1]);
3299                                         rowPlan->iDist         = trans2Plan->oDist;
3300
3301                                         for (size_t index=2; index < fftPlan->length.size(); index++)
3302                                         {
3303                                                 rowPlan->length.push_back(fftPlan->length[index]);
3304                                                 rowPlan->inStride.push_back(trans2Plan->outStride[index]);
3305                                                 rowPlan->outStride.push_back(fftPlan->outStride[index]);
3306                                         }
3307
3308                                         if (fftPlan->placeness == CLFFT_INPLACE)
3309                                         {
3310                                                 rowPlan->placeness     = CLFFT_INPLACE;
3311                                         }
3312                                         else
3313                                         {
3314                                                 rowPlan->placeness     = CLFFT_OUTOFPLACE;
3315                                         }                               
3316
3317
3318                                         rowPlan->precision     = fftPlan->precision;
3319                                         rowPlan->forwardScale  = fftPlan->forwardScale;
3320                                         rowPlan->backwardScale = fftPlan->backwardScale;
3321                                         rowPlan->tmpBufSize    = 0;
3322
3323                                         rowPlan->gen                    = fftPlan->gen;
3324                                         rowPlan->envelope               = fftPlan->envelope;
3325
3326                                         rowPlan->batchsize    = fftPlan->batchsize;
3327
3328                                         //Set callback data if set on top level plan
3329                                         if (fftPlan->hasPostCallback)
3330                                         {
3331                                                 rowPlan->hasPostCallback = true;
3332                                                 rowPlan->postCallbackParam = fftPlan->postCallbackParam;
3333                                                 rowPlan->postcallUserData = fftPlan->postcallUserData;
3334                                         }
3335
3336                                         OPENCL_V(clfftBakePlan(fftPlan->planX, numQueues, commQueueFFT, NULL, NULL ), _T( "BakePlan for planX failed" ) );
3337                                 }
3338                                 else
3339                                 {
3340
3341                                         // create col plan
3342                                         // complex to complex
3343
3344                                         OPENCL_V(clfftCreateDefaultPlanInternal( &fftPlan->planY, fftPlan->context, CLFFT_1D, &fftPlan->length[ DimY ] ),
3345                                                 _T( "CreateDefaultPlan for planY failed" ) );
3346
3347                                         FFTPlan* colPlan        = NULL;
3348                                         lockRAII* colLock       = NULL;
3349                                         OPENCL_V( fftRepo.getPlan( fftPlan->planY, colPlan, colLock ), _T( "fftRepo.getPlan failed" ) );
3350
3351
3352                                         switch(fftPlan->inputLayout)
3353                                         {
3354                                         case CLFFT_HERMITIAN_INTERLEAVED:
3355                                                 {
3356                                                         colPlan->outputLayout = CLFFT_COMPLEX_INTERLEAVED;
3357                                                         colPlan->inputLayout  = CLFFT_COMPLEX_INTERLEAVED;
3358                                                 }
3359                                                 break;
3360                                         case CLFFT_HERMITIAN_PLANAR:
3361                                                 {
3362                                                         colPlan->outputLayout = CLFFT_COMPLEX_INTERLEAVED;
3363                                                         colPlan->inputLayout  = CLFFT_COMPLEX_PLANAR;
3364                                                 }
3365                                                 break;
3366                                         default: assert(false);
3367                                         }
3368
3369
3370                                         colPlan->length.push_back(Nt);
3371
3372                                         colPlan->inStride[0]  = fftPlan->inStride[1];
3373                                         colPlan->inStride.push_back(fftPlan->inStride[0]);
3374                                         colPlan->iDist         = fftPlan->iDist;
3375
3376
3377                                         if (fftPlan->placeness == CLFFT_INPLACE)
3378                                         {
3379                                                 colPlan->placeness = CLFFT_INPLACE;
3380                                         }
3381                                         else
3382                                         {
3383                                                 if(fftPlan->length.size() > 2)
3384                                                         colPlan->placeness = CLFFT_INPLACE;
3385                                                 else
3386                                                         colPlan->placeness = CLFFT_OUTOFPLACE;
3387                                         }
3388
3389                                         if(colPlan->placeness == CLFFT_INPLACE)
3390                                         {
3391                                                 colPlan->outStride[0]  = colPlan->inStride[0];
3392                                                 colPlan->outStride.push_back(colPlan->inStride[1]);
3393                                                 colPlan->oDist         = colPlan->iDist;
3394
3395                                                 for (size_t index=2; index < fftPlan->length.size(); index++)
3396                                                 {
3397                                                         colPlan->length.push_back(fftPlan->length[index]);
3398                                                         colPlan->inStride.push_back(fftPlan->inStride[index]);
3399                                                         colPlan->outStride.push_back(fftPlan->inStride[index]);
3400                                                 }
3401                                         }
3402                                         else
3403                                         {
3404                                                 colPlan->outStride[0]  = Nt;
3405                                                 colPlan->outStride.push_back(1);
3406                                                 colPlan->oDist         = Nt*length1;
3407
3408                                                 for (size_t index=2; index < fftPlan->length.size(); index++)
3409                                                 {
3410                                                         colPlan->length.push_back(fftPlan->length[index]);
3411                                                         colPlan->inStride.push_back(fftPlan->inStride[index]);
3412                                                         colPlan->outStride.push_back(colPlan->oDist);
3413                                                         colPlan->oDist *= fftPlan->length[index];
3414                                                 }
3415                                         }
3416
3417                                         colPlan->precision     = fftPlan->precision;
3418                                         colPlan->forwardScale  = 1.0f;
3419                                         colPlan->backwardScale = 1.0f;
3420                                         colPlan->tmpBufSize    = 0;
3421
3422                                         colPlan->gen                    = fftPlan->gen;
3423                                         colPlan->envelope                       = fftPlan->envelope;
3424
3425                                         colPlan->batchsize = fftPlan->batchsize;
3426
3427                                         //Set callback data if set on top level plan
3428                                         if (fftPlan->hasPreCallback)
3429                                         {
3430                                                 colPlan->hasPreCallback = true;
3431                                                 colPlan->preCallback = fftPlan->preCallback;
3432                                                 colPlan->precallUserData = fftPlan->precallUserData;
3433                                         }
3434
3435                                         OPENCL_V(clfftBakePlan(fftPlan->planY, numQueues, commQueueFFT, NULL, NULL ), _T( "BakePlan for planY failed" ) );
3436
3437                                         // create row plan
3438                                         // hermitian to real
3439
3440                                         //create row plan
3441                                         OPENCL_V(clfftCreateDefaultPlanInternal( &fftPlan->planX, fftPlan->context, CLFFT_1D, &fftPlan->length[ DimX ] ),
3442                                                 _T( "CreateDefaultPlan for planX failed" ) );
3443
3444                                         FFTPlan* rowPlan        = NULL;
3445                                         lockRAII* rowLock       = NULL;
3446                                         OPENCL_V( fftRepo.getPlan( fftPlan->planX, rowPlan, rowLock ), _T( "fftRepo.getPlan failed" ) );
3447
3448                                         rowPlan->outputLayout  = fftPlan->outputLayout;
3449                                         rowPlan->inputLayout   = CLFFT_HERMITIAN_INTERLEAVED;
3450
3451                                         rowPlan->length.push_back(length1);
3452
3453                                         rowPlan->outStride[0]  = fftPlan->outStride[0];
3454                                         rowPlan->outStride.push_back(fftPlan->outStride[1]);
3455                                         rowPlan->oDist         = fftPlan->oDist;
3456
3457                                         if (fftPlan->placeness == CLFFT_INPLACE)
3458                                         {
3459                                                 rowPlan->placeness     = CLFFT_INPLACE;
3460
3461                                                 rowPlan->inStride[0]  = colPlan->outStride[1];
3462                                                 rowPlan->inStride.push_back(colPlan->outStride[0]);
3463                                                 rowPlan->iDist         = colPlan->oDist;
3464
3465                                                 for (size_t index=2; index < fftPlan->length.size(); index++)
3466                                                 {
3467                                                         rowPlan->length.push_back(fftPlan->length[index]);
3468                                                         rowPlan->inStride.push_back(colPlan->outStride[index]);
3469                                                         rowPlan->outStride.push_back(fftPlan->outStride[index]);
3470                                                 }
3471                                         }
3472                                         else
3473                                         {
3474                                                 rowPlan->placeness     = CLFFT_OUTOFPLACE;
3475
3476                                                 rowPlan->inStride[0]   = 1;
3477                                                 rowPlan->inStride.push_back(Nt);
3478                                                 rowPlan->iDist         = Nt*length1;
3479
3480                                                 for (size_t index=2; index < fftPlan->length.size(); index++)
3481                                                 {
3482                                                         rowPlan->length.push_back(fftPlan->length[index]);
3483                                                         rowPlan->outStride.push_back(fftPlan->outStride[index]);
3484                                                         rowPlan->inStride.push_back(rowPlan->iDist);                                            
3485                                                         rowPlan->iDist *= fftPlan->length[index];
3486                                                 }
3487                                         }
3488                                 
3489
3490                                         rowPlan->precision     = fftPlan->precision;
3491                                         rowPlan->forwardScale  = fftPlan->forwardScale;
3492                                         rowPlan->backwardScale = fftPlan->backwardScale;
3493                                         rowPlan->tmpBufSize    = 0;
3494
3495                                         rowPlan->gen                    = fftPlan->gen;
3496                                         rowPlan->envelope               = fftPlan->envelope;
3497
3498                                         rowPlan->batchsize    = fftPlan->batchsize;
3499
3500                                         //Set callback data if set on top level plan
3501                                         if (fftPlan->hasPostCallback)
3502                                         {
3503                                                 rowPlan->hasPostCallback = true;
3504                                                 rowPlan->postCallbackParam = fftPlan->postCallbackParam;
3505                                                 rowPlan->postcallUserData = fftPlan->postcallUserData;
3506                                         }
3507
3508                                         OPENCL_V(clfftBakePlan(fftPlan->planX, numQueues, commQueueFFT, NULL, NULL ), _T( "BakePlan for planX failed" ) );
3509                                 }
3510                         }
3511                         else
3512                         {
3513                                 if (fftPlan->tmpBufSize==0 && fftPlan->length.size()<=2)
3514                                 {
3515                                         fftPlan->tmpBufSize = length0 * length1 *
3516                                                 fftPlan->batchsize * fftPlan->ElementSize();
3517                                 }
3518
3519                                 //create row plan
3520                                 OPENCL_V(clfftCreateDefaultPlanInternal( &fftPlan->planX, fftPlan->context, CLFFT_1D, &fftPlan->length[ DimX ] ),
3521                                         _T( "CreateDefaultPlan for planX failed" ) );
3522
3523                                 FFTPlan* rowPlan        = NULL;
3524                                 lockRAII* rowLock       = NULL;
3525                                 OPENCL_V( fftRepo.getPlan( fftPlan->planX, rowPlan, rowLock ), _T( "fftRepo.getPlan failed" ) );
3526
3527                                 rowPlan->inputLayout   = fftPlan->inputLayout;
3528                                 if (fftPlan->large2D || fftPlan->length.size()>2)
3529                                 {
3530                                         rowPlan->outputLayout  = fftPlan->outputLayout;
3531                                         rowPlan->placeness     = fftPlan->placeness;
3532                                         rowPlan->outStride[0]  = fftPlan->outStride[0];
3533                                         rowPlan->outStride.push_back(fftPlan->outStride[1]);
3534                                         rowPlan->oDist         = fftPlan->oDist;
3535                                 }
3536                                 else
3537                                 {
3538                                         rowPlan->outputLayout  = CLFFT_COMPLEX_INTERLEAVED;
3539                                         rowPlan->placeness     = CLFFT_OUTOFPLACE;
3540                                         rowPlan->outStride[0]  = length1;//1;
3541                                         rowPlan->outStride.push_back(1);//length0);
3542                                         rowPlan->oDist         = length0 * length1;
3543                                 }
3544                                 rowPlan->precision     = fftPlan->precision;
3545                                 rowPlan->forwardScale  = 1.0f;
3546                                 rowPlan->backwardScale = 1.0f;
3547                                 rowPlan->tmpBufSize    = fftPlan->tmpBufSize;
3548
3549                                 rowPlan->gen                    = fftPlan->gen;
3550                                 rowPlan->envelope                       = fftPlan->envelope;
3551
3552                                 // This is the row fft, the first elements distance between the first two FFTs is the distance of the first elements
3553                                 // of the first two rows in the original buffer.
3554                                 rowPlan->batchsize    = fftPlan->batchsize;
3555                                 rowPlan->inStride[0]  = fftPlan->inStride[0];
3556
3557                                 //pass length and other info to kernel, so the kernel knows this is decomposed from higher dimension
3558                                 rowPlan->length.push_back(fftPlan->length[1]);
3559                                 rowPlan->inStride.push_back(fftPlan->inStride[1]);
3560
3561                                 //this 2d is decomposed from 3d
3562                                 if (fftPlan->length.size()>2)
3563                                 {
3564                                         rowPlan->length.push_back(fftPlan->length[2]);
3565                                         rowPlan->inStride.push_back(fftPlan->inStride[2]);
3566                                         rowPlan->outStride.push_back(fftPlan->outStride[2]);
3567                                 }
3568
3569                                 rowPlan->iDist    = fftPlan->iDist;
3570
3571                                 //Set callback data if set on top level plan
3572                                 if (fftPlan->hasPreCallback)
3573                                 {
3574                                         rowPlan->hasPreCallback = true;
3575                                         rowPlan->preCallback = fftPlan->preCallback;
3576                                         rowPlan->precallUserData = fftPlan->precallUserData;
3577                                 }
3578
3579                                 OPENCL_V(clfftBakePlan(fftPlan->planX, numQueues, commQueueFFT, NULL, NULL ), _T( "BakePlan for planX failed" ) );
3580
3581                                 //create col plan
3582                                 OPENCL_V(clfftCreateDefaultPlanInternal( &fftPlan->planY, fftPlan->context, CLFFT_1D, &fftPlan->length[ DimY ] ),
3583                                         _T( "CreateDefaultPlan for planY failed" ) );
3584
3585                                 FFTPlan* colPlan        = NULL;
3586                                 lockRAII* colLock       = NULL;
3587                                 OPENCL_V( fftRepo.getPlan( fftPlan->planY, colPlan, colLock ), _T( "fftRepo.getPlan failed" ) );
3588
3589                                 if (fftPlan->large2D || fftPlan->length.size()>2)
3590                                 {
3591                                         colPlan->inputLayout   = fftPlan->outputLayout;
3592                                         colPlan->placeness     = CLFFT_INPLACE;
3593                                         colPlan->inStride[0]   = fftPlan->outStride[1];
3594                                         colPlan->inStride.push_back(fftPlan->outStride[0]);
3595                                         colPlan->iDist         = fftPlan->oDist;
3596                                 }
3597                                 else
3598                                 {
3599                                         colPlan->inputLayout   = CLFFT_COMPLEX_INTERLEAVED;
3600                                         colPlan->placeness     = CLFFT_OUTOFPLACE;
3601                                         colPlan->inStride[0]   = 1;//length0;
3602                                         colPlan->inStride.push_back(length1);//1);
3603                                         colPlan->iDist         = length0 * length1;
3604                                 }
3605
3606                                 colPlan->outputLayout  = fftPlan->outputLayout;
3607                                 colPlan->precision     = fftPlan->precision;
3608                                 colPlan->forwardScale  = fftPlan->forwardScale;
3609                                 colPlan->backwardScale = fftPlan->backwardScale;
3610                                 colPlan->tmpBufSize    = fftPlan->tmpBufSize;
3611
3612                                 colPlan->gen                    = fftPlan->gen;
3613                                 colPlan->envelope                       = fftPlan->envelope;
3614
3615                                 // This is a column FFT, the first elements distance between each FFT is the distance of the first two
3616                                 // elements in the original buffer. Like a transpose of the matrix
3617                                 colPlan->batchsize = fftPlan->batchsize;
3618                                 colPlan->outStride[0] = fftPlan->outStride[1];
3619
3620                                 //pass length and other info to kernel, so the kernel knows this is decomposed from higher dimension
3621                                 colPlan->length.push_back(fftPlan->length[0]);
3622                                 colPlan->outStride.push_back(fftPlan->outStride[0]);
3623                                 colPlan->oDist    = fftPlan->oDist;
3624
3625                                 //this 2d is decomposed from 3d
3626                                 if (fftPlan->length.size()>2)
3627                                 {
3628                                         //assert(fftPlan->large2D);
3629                                         colPlan->length.push_back(fftPlan->length[2]);
3630                                         colPlan->inStride.push_back(fftPlan->outStride[2]);
3631                                         colPlan->outStride.push_back(fftPlan->outStride[2]);
3632                                 }
3633
3634                                 //Set callback data if set on top level plan
3635                                 if (fftPlan->hasPostCallback)
3636                                 {
3637                                         colPlan->hasPostCallback = true;
3638                                         colPlan->postCallbackParam = fftPlan->postCallbackParam;
3639                                         colPlan->postcallUserData = fftPlan->postcallUserData;
3640                                 }
3641
3642                                 OPENCL_V(clfftBakePlan(fftPlan->planY, numQueues, commQueueFFT, NULL, NULL ), _T( "BakePlan for planY failed" ) );
3643                         }
3644
3645                         fftPlan->baked = true;
3646                         return  CLFFT_SUCCESS;
3647                 }
3648         case CLFFT_3D:
3649                 {
3650                         if(fftPlan->inputLayout == CLFFT_REAL)
3651                         {
3652
3653                                 size_t length0 = fftPlan->length[ DimX ];
3654                                 size_t length1 = fftPlan->length[ DimY ];
3655                                 size_t length2 = fftPlan->length[ DimZ ];
3656
3657                                 size_t Nt = (1 + length0/2);
3658
3659
3660                                 //create 2D xy plan
3661                                 size_t clLengths[] = { length0, length1, 0 };
3662                                 OPENCL_V(clfftCreateDefaultPlanInternal( &fftPlan->planX, fftPlan->context, CLFFT_2D, clLengths ),
3663                                         _T( "CreateDefaultPlan 2D planX failed" ) );
3664
3665                                 FFTPlan* xyPlan = NULL;
3666                                 lockRAII* rowLock       = NULL;
3667                                 OPENCL_V( fftRepo.getPlan( fftPlan->planX, xyPlan, rowLock ), _T( "fftRepo.getPlan failed" ) );
3668
3669                                 xyPlan->inputLayout   = fftPlan->inputLayout;
3670                                 xyPlan->outputLayout  = fftPlan->outputLayout;
3671                                 xyPlan->placeness     = fftPlan->placeness;
3672                                 xyPlan->precision     = fftPlan->precision;
3673                                 xyPlan->forwardScale  = 1.0f;
3674                                 xyPlan->backwardScale = 1.0f;
3675                                 xyPlan->tmpBufSize    = fftPlan->tmpBufSize;
3676
3677                                 xyPlan->gen                      = fftPlan->gen;
3678                                 xyPlan->envelope                         = fftPlan->envelope;
3679
3680                                 // This is the xy fft, the first elements distance between the first two FFTs is the distance of the first elements
3681                                 // of the first two rows in the original buffer.
3682                                 xyPlan->batchsize    = fftPlan->batchsize;
3683                                 xyPlan->inStride[0]  = fftPlan->inStride[0];
3684                                 xyPlan->inStride[1]  = fftPlan->inStride[1];
3685                                 xyPlan->outStride[0] = fftPlan->outStride[0];
3686                                 xyPlan->outStride[1] = fftPlan->outStride[1];
3687
3688                                 //pass length and other info to kernel, so the kernel knows this is decomposed from higher dimension
3689                                 xyPlan->length.push_back(fftPlan->length[2]);
3690                                 xyPlan->inStride.push_back(fftPlan->inStride[2]);
3691                                 xyPlan->outStride.push_back(fftPlan->outStride[2]);
3692                                 xyPlan->iDist    = fftPlan->iDist;
3693                                 xyPlan->oDist    = fftPlan->oDist;
3694
3695                                 //this 3d is decomposed from 4d
3696                                 for (size_t index=3; index < fftPlan->length.size(); index++)
3697                                 {
3698                                         xyPlan->length.push_back(fftPlan->length[index]);
3699                                         xyPlan->inStride.push_back(fftPlan->inStride[index]);
3700                                         xyPlan->outStride.push_back(fftPlan->outStride[index]);
3701                                 }
3702
3703                                 //Set callback data if set on top level plan
3704                                 if (fftPlan->hasPreCallback)
3705                                 {
3706                                         xyPlan->hasPreCallback = true;
3707                                         xyPlan->preCallback = fftPlan->preCallback;
3708                                         xyPlan->precallUserData = fftPlan->precallUserData;
3709                                 }
3710
3711                                 OPENCL_V(clfftBakePlan(fftPlan->planX, numQueues, commQueueFFT, NULL, NULL ), _T( "BakePlan 3D->2D planX failed" ) );
3712
3713                                 if( (xyPlan->inStride[0] == 1) && (xyPlan->outStride[0] == 1) &&
3714                                         (xyPlan->outStride[2] == Nt*length1) &&
3715                                         ( ((xyPlan->inStride[2] == Nt*2*length1) && (xyPlan->placeness == CLFFT_INPLACE)) ||
3716                                           ((xyPlan->inStride[2] == length0*length1) && (xyPlan->placeness == CLFFT_OUTOFPLACE)) ) )
3717                                 {
3718
3719                                         if (fftPlan->tmpBufSize==0)
3720                                         {
3721                                                 fftPlan->tmpBufSize = Nt * length1 * length2 * fftPlan->batchsize * fftPlan->ElementSize();
3722
3723                                                 for (size_t index=3; index < fftPlan->length.size(); index++)
3724                                                 {
3725                                                         fftPlan->tmpBufSize *= fftPlan->length[index];
3726                                                 }
3727                                         }
3728
3729                                         // create first transpose plan
3730                                         
3731                                         //Transpose 
3732                                         // output --> tmp
3733                                         size_t transLengths[2] = { length0*length1, length2 };
3734                                         OPENCL_V(clfftCreateDefaultPlanInternal( &fftPlan->planTX, fftPlan->context, CLFFT_2D, transLengths ),
3735                                                 _T( "CreateDefaultPlan for planTX transpose failed" ) );
3736
3737                                         FFTPlan* trans1Plan     = NULL;
3738                                         lockRAII* trans1Lock    = NULL;
3739                                         OPENCL_V( fftRepo.getPlan( fftPlan->planTX, trans1Plan, trans1Lock ), _T( "fftRepo.getPlan failed" ) );
3740
3741                                         trans1Plan->transflag = true;
3742
3743                                         transLengths[0] = Nt*length1;
3744                                         OPENCL_V(clfftSetPlanLength( fftPlan->planTX, CLFFT_2D, transLengths ),
3745                                                 _T( "clfftSetPlanLength for planTX transpose failed" ) );
3746
3747                                         switch(fftPlan->outputLayout)
3748                                         {
3749                                         case CLFFT_HERMITIAN_INTERLEAVED:
3750                                                 {
3751                                                         trans1Plan->outputLayout = CLFFT_COMPLEX_INTERLEAVED;
3752                                                         trans1Plan->inputLayout  = CLFFT_COMPLEX_INTERLEAVED;
3753                                                 }
3754                                                 break;
3755                                         case CLFFT_HERMITIAN_PLANAR:
3756                                                 {
3757                                                         trans1Plan->outputLayout = CLFFT_COMPLEX_INTERLEAVED;
3758                                                         trans1Plan->inputLayout  = CLFFT_COMPLEX_PLANAR;
3759                                                 }
3760                                                 break;
3761                                         default: assert(false);
3762                                         }
3763
3764                                         trans1Plan->placeness     = CLFFT_OUTOFPLACE;
3765                                         trans1Plan->precision     = fftPlan->precision;
3766                                         trans1Plan->tmpBufSize    = 0;
3767                                         trans1Plan->batchsize     = fftPlan->batchsize;
3768                                         trans1Plan->envelope      = fftPlan->envelope;
3769                                         trans1Plan->forwardScale  = 1.0f;
3770                                         trans1Plan->backwardScale = 1.0f;
3771
3772                                         trans1Plan->inStride[0]   = 1;
3773                                         trans1Plan->inStride[1]   = Nt*length1;
3774                                         trans1Plan->outStride[0]  = 1;
3775                                         trans1Plan->outStride[1]  = length2;
3776                                         trans1Plan->iDist         = xyPlan->oDist;
3777                                         trans1Plan->oDist                 = Nt*length1*length2;
3778                                         trans1Plan->transOutHorizontal = true;
3779
3780                                         trans1Plan->gen           = Transpose_GCN;
3781
3782
3783                                         for (size_t index=3; index < fftPlan->length.size(); index++)
3784                                         {
3785                                                 trans1Plan->length.push_back(fftPlan->length[index]);
3786                                                 trans1Plan->inStride.push_back(xyPlan->outStride[index]);
3787                                                 trans1Plan->outStride.push_back(trans1Plan->oDist);
3788                                                 trans1Plan->oDist *= fftPlan->length[index];
3789                                         }
3790
3791                                         OPENCL_V(clfftBakePlan(fftPlan->planTX, numQueues, commQueueFFT, NULL, NULL ),
3792                                                 _T( "BakePlan for planTX failed" ) );
3793
3794                                         // Create column plan as a row plan
3795                                         OPENCL_V(clfftCreateDefaultPlanInternal( &fftPlan->planZ, fftPlan->context, CLFFT_1D, &fftPlan->length[ DimZ ] ),
3796                                                 _T( "CreateDefaultPlan for planZ failed" ) );
3797
3798                                         FFTPlan* colPlan        = NULL;
3799                                         lockRAII* colLock       = NULL;
3800                                         OPENCL_V( fftRepo.getPlan( fftPlan->planZ, colPlan, colLock ), _T( "fftRepo.getPlan failed" ) );
3801
3802                                         colPlan->outputLayout  = trans1Plan->outputLayout;
3803                                         colPlan->inputLayout   = trans1Plan->outputLayout;
3804                                         colPlan->placeness     = CLFFT_INPLACE;
3805                                         colPlan->length.push_back(Nt*length1);
3806
3807                                         colPlan->inStride[0]  = 1;
3808                                         colPlan->inStride.push_back(length2);
3809                                         colPlan->iDist         = Nt*length1*length2;
3810
3811                                         colPlan->outStride[0]  = 1;
3812                                         colPlan->outStride.push_back(length2);
3813                                         colPlan->oDist         = Nt*length1*length2;
3814
3815                                         colPlan->precision     = fftPlan->precision;
3816                                         colPlan->forwardScale  = fftPlan->forwardScale;
3817                                         colPlan->backwardScale = fftPlan->backwardScale;
3818                                         colPlan->tmpBufSize    = 0;
3819
3820                                         colPlan->gen                    = fftPlan->gen;
3821                                         colPlan->envelope               = fftPlan->envelope;
3822
3823                                         colPlan->batchsize    = fftPlan->batchsize;
3824
3825                                         //this 3d is decomposed from 4d
3826                                         for (size_t index=3; index < fftPlan->length.size(); index++)
3827                                         {
3828                                                 colPlan->length.push_back(fftPlan->length[index]);
3829                                                 colPlan->inStride.push_back(colPlan->iDist);
3830                                                 colPlan->outStride.push_back(colPlan->oDist);
3831                                                 colPlan->iDist *= fftPlan->length[index];
3832                                                 colPlan->oDist *= fftPlan->length[index];
3833                                         }
3834
3835                                         OPENCL_V(clfftBakePlan(fftPlan->planZ, numQueues, commQueueFFT, NULL, NULL ),
3836                                                 _T( "BakePlan for planZ failed" ) );
3837
3838                                         if (fftPlan->transposed == CLFFT_TRANSPOSED)
3839                                         {
3840                                                 fftPlan->baked = true;
3841                                                 return  CLFFT_SUCCESS;
3842                                         }
3843
3844                                         // create second transpose plan
3845                                         
3846                                         //Transpose 
3847                                         //tmp --> output
3848                                         size_t trans2Lengths[2] = { length2, length0*length1 };
3849                                         OPENCL_V(clfftCreateDefaultPlanInternal( &fftPlan->planTY, fftPlan->context, CLFFT_2D, trans2Lengths ),
3850                                                 _T( "CreateDefaultPlan for planTY transpose failed" ) );
3851
3852                                         FFTPlan* trans2Plan     = NULL;
3853                                         lockRAII* trans2Lock    = NULL;
3854                                         OPENCL_V( fftRepo.getPlan( fftPlan->planTY, trans2Plan, trans2Lock ), _T( "fftRepo.getPlan failed" ) );
3855
3856                                         trans2Plan->transflag = true;
3857
3858                                         trans2Lengths[1] = Nt*length1;
3859                                         OPENCL_V(clfftSetPlanLength( fftPlan->planTY, CLFFT_2D, trans2Lengths ),
3860                                                 _T( "clfftSetPlanLength for planTY transpose failed" ) );
3861
3862                                         switch(fftPlan->outputLayout)
3863                                         {
3864                                         case CLFFT_HERMITIAN_INTERLEAVED:
3865                                                 {
3866                                                         trans2Plan->outputLayout = CLFFT_COMPLEX_INTERLEAVED;
3867                                                         trans2Plan->inputLayout  = CLFFT_COMPLEX_INTERLEAVED;
3868                                                 }
3869                                                 break;
3870                                         case CLFFT_HERMITIAN_PLANAR:
3871                                                 {
3872                                                         trans2Plan->outputLayout = CLFFT_COMPLEX_PLANAR;
3873                                                         trans2Plan->inputLayout  = CLFFT_COMPLEX_INTERLEAVED;
3874                                                 }
3875                                                 break;
3876                                         default: assert(false);
3877                                         }
3878
3879                                         trans2Plan->placeness     = CLFFT_OUTOFPLACE;
3880                                         trans2Plan->precision     = fftPlan->precision;
3881                                         trans2Plan->tmpBufSize    = 0;
3882                                         trans2Plan->batchsize     = fftPlan->batchsize;
3883                                         trans2Plan->envelope      = fftPlan->envelope;
3884                                         trans2Plan->forwardScale  = 1.0f;
3885                                         trans2Plan->backwardScale = 1.0f;
3886
3887                                         trans2Plan->inStride[0]   = 1;
3888                                         trans2Plan->inStride[1]   = length2;
3889                                         trans2Plan->outStride[0]  = 1;
3890                                         trans2Plan->outStride[1]  = Nt*length1;
3891                                         trans2Plan->iDist         = Nt*length1*length2;
3892                                         trans2Plan->oDist                 = fftPlan->oDist;
3893
3894                                         trans2Plan->gen           = Transpose_GCN;
3895                                         trans2Plan->transflag     = true;
3896
3897                                         for (size_t index=3; index < fftPlan->length.size(); index++)
3898                                         {
3899                                                 trans2Plan->length.push_back(fftPlan->length[index]);
3900                                                 trans2Plan->inStride.push_back(trans2Plan->iDist);
3901                                                 trans2Plan->iDist *= fftPlan->length[index];
3902                                                 trans2Plan->outStride.push_back(fftPlan->outStride[index]);
3903                                         }
3904
3905                                         //Set callback data if set on top level plan
3906                                         if (fftPlan->hasPostCallback)
3907                                         {
3908                                                 trans2Plan->hasPostCallback = true;
3909                                                 trans2Plan->postCallbackParam = fftPlan->postCallbackParam;
3910                                                 trans2Plan->postcallUserData = fftPlan->postcallUserData;
3911                                         }
3912
3913                                         OPENCL_V(clfftBakePlan(fftPlan->planTY, numQueues, commQueueFFT, NULL, NULL ),
3914                                                 _T( "BakePlan for planTY failed" ) );
3915
3916
3917                                 }
3918                                 else
3919                                 {
3920
3921                                         clLengths[0] = fftPlan->length[ DimZ ];
3922                                         clLengths[1] = clLengths[2] = 0;
3923                                         //create 1D col plan
3924                                         OPENCL_V(clfftCreateDefaultPlanInternal( &fftPlan->planZ, fftPlan->context, CLFFT_1D, clLengths ),
3925                                                 _T( "CreateDefaultPlan for planZ failed" ) );
3926
3927                                         FFTPlan* colPlan        = NULL;
3928                                         lockRAII* colLock       = NULL;
3929                                         OPENCL_V( fftRepo.getPlan( fftPlan->planZ, colPlan, colLock ), _T( "fftRepo.getPlan failed" ) );
3930
3931                                         switch(fftPlan->outputLayout)
3932                                         {
3933                                         case CLFFT_HERMITIAN_INTERLEAVED:
3934                                                 {
3935                                                         colPlan->outputLayout = CLFFT_COMPLEX_INTERLEAVED;
3936                                                         colPlan->inputLayout  = CLFFT_COMPLEX_INTERLEAVED;
3937                                                 }
3938                                                 break;
3939                                         case CLFFT_HERMITIAN_PLANAR:
3940                                                 {
3941                                                         colPlan->outputLayout = CLFFT_COMPLEX_PLANAR;
3942                                                         colPlan->inputLayout  = CLFFT_COMPLEX_PLANAR;
3943                                                 }
3944                                                 break;
3945                                         default: assert(false);
3946                                         }
3947
3948                                         colPlan->placeness     = CLFFT_INPLACE;
3949                                         colPlan->precision     = fftPlan->precision;
3950                                         colPlan->forwardScale  = fftPlan->forwardScale;
3951                                         colPlan->backwardScale = fftPlan->backwardScale;
3952                                         colPlan->tmpBufSize    = fftPlan->tmpBufSize;
3953
3954                                         colPlan->gen                     = fftPlan->gen;
3955                                         colPlan->envelope                        = fftPlan->envelope;
3956
3957                                         // This is a column FFT, the first elements distance between each FFT is the distance of the first two
3958                                         // elements in the original buffer. Like a transpose of the matrix
3959                                         colPlan->batchsize = fftPlan->batchsize;
3960                                         colPlan->inStride[0] = fftPlan->outStride[2];
3961                                         colPlan->outStride[0] = fftPlan->outStride[2];
3962
3963                                         //pass length and other info to kernel, so the kernel knows this is decomposed from higher dimension
3964                                         colPlan->length.push_back(1 + fftPlan->length[0]/2);
3965                                         colPlan->length.push_back(fftPlan->length[1]);
3966                                         colPlan->inStride.push_back(fftPlan->outStride[0]);
3967                                         colPlan->inStride.push_back(fftPlan->outStride[1]);
3968                                         colPlan->outStride.push_back(fftPlan->outStride[0]);
3969                                         colPlan->outStride.push_back(fftPlan->outStride[1]);
3970                                         colPlan->iDist    = fftPlan->oDist;
3971                                         colPlan->oDist    = fftPlan->oDist;
3972
3973                                         //this 3d is decomposed from 4d
3974                                         for (size_t index=3; index < fftPlan->length.size(); index++)
3975                                         {
3976                                                 colPlan->length.push_back(fftPlan->length[index]);
3977                                                 colPlan->inStride.push_back(xyPlan->outStride[index]);
3978                                                 colPlan->outStride.push_back(fftPlan->outStride[index]);
3979                                         }
3980
3981                                         //Set callback data if set on top level plan
3982                                         if (fftPlan->hasPostCallback)
3983                                         {
3984                                                 colPlan->hasPostCallback = true;
3985                                                 colPlan->postCallbackParam = fftPlan->postCallbackParam;
3986                                                 colPlan->postcallUserData = fftPlan->postcallUserData;
3987                                         }
3988
3989                                         OPENCL_V(clfftBakePlan(fftPlan->planZ, numQueues, commQueueFFT, NULL, NULL ), _T( "BakePlan 3D->1D planZ failed" ) );
3990                                 }
3991                         }
3992                         else if(fftPlan->outputLayout == CLFFT_REAL)
3993                         {
3994                                 size_t length0 = fftPlan->length[ DimX ];
3995                                 size_t length1 = fftPlan->length[ DimY ];
3996                                 size_t length2 = fftPlan->length[ DimZ ];
3997
3998                                 size_t Nt = (1 + length0/2);
3999
4000                                 if (fftPlan->tmpBufSize == 0)
4001                                 {
4002                                         fftPlan->tmpBufSize = Nt * length1 * length2 * fftPlan->batchsize * fftPlan->ElementSize();
4003                                         for (size_t index=3; index < fftPlan->length.size(); index++)
4004                                                 fftPlan->tmpBufSize *= fftPlan->length[index];
4005                                 }
4006
4007                                 if ((fftPlan->tmpBufSizeC2R==0) && (fftPlan->placeness == CLFFT_OUTOFPLACE))
4008                                 {
4009                                         fftPlan->tmpBufSizeC2R = fftPlan->tmpBufSize;
4010                                 }
4011
4012                                 if( (fftPlan->inStride[0] == 1) && (fftPlan->outStride[0] == 1) &&
4013                                         ( ((fftPlan->outStride[2] == Nt*2*length1) && (fftPlan->oDist == Nt*2*length1*length2) && (fftPlan->placeness == CLFFT_INPLACE)) ||
4014                                                 ((fftPlan->outStride[2] == length0*length1) && (fftPlan->oDist == length0*length1*length2) && (fftPlan->placeness == CLFFT_OUTOFPLACE)) )
4015                                         && (fftPlan->inStride[2] == Nt*length1) && (fftPlan->iDist == Nt*length1*length2))
4016                                 {
4017                                         // create first transpose plan
4018                                         
4019                                         //Transpose 
4020                                         // input --> tmp
4021                                         size_t transLengths[2] = { length0*length1, length2 };
4022                                         OPENCL_V(clfftCreateDefaultPlanInternal( &fftPlan->planTZ, fftPlan->context, CLFFT_2D, transLengths ),
4023                                                 _T( "CreateDefaultPlan for planTZ transpose failed" ) );
4024
4025                                         FFTPlan* trans1Plan     = NULL;
4026                                         lockRAII* trans1Lock    = NULL;
4027                                         OPENCL_V( fftRepo.getPlan( fftPlan->planTZ, trans1Plan, trans1Lock ), _T( "fftRepo.getPlan failed" ) );
4028
4029                                         trans1Plan->transflag = true;
4030
4031                                         transLengths[0] = Nt*length1;
4032                                         OPENCL_V(clfftSetPlanLength( fftPlan->planTZ, CLFFT_2D, transLengths ),
4033                                                 _T( "clfftSetPlanLength for planTZ transpose failed" ) );
4034
4035                                         switch(fftPlan->inputLayout)
4036                                         {
4037                                         case CLFFT_HERMITIAN_INTERLEAVED:
4038                                                 {
4039                                                         trans1Plan->outputLayout = CLFFT_COMPLEX_INTERLEAVED;
4040                                                         trans1Plan->inputLayout  = CLFFT_COMPLEX_INTERLEAVED;
4041                                                 }
4042                                                 break;
4043                                         case CLFFT_HERMITIAN_PLANAR:
4044                                                 {
4045                                                         trans1Plan->outputLayout = CLFFT_COMPLEX_INTERLEAVED;
4046                                                         trans1Plan->inputLayout  = CLFFT_COMPLEX_PLANAR;
4047                                                 }
4048                                                 break;
4049                                         default: assert(false);
4050                                         }
4051
4052                                         trans1Plan->placeness     = CLFFT_OUTOFPLACE;
4053                                         trans1Plan->precision     = fftPlan->precision;
4054                                         trans1Plan->tmpBufSize    = 0;
4055                                         trans1Plan->batchsize     = fftPlan->batchsize;
4056                                         trans1Plan->envelope      = fftPlan->envelope;
4057                                         trans1Plan->forwardScale  = 1.0f;
4058                                         trans1Plan->backwardScale = 1.0f;
4059
4060                                         trans1Plan->inStride[0]   = 1;
4061                                         trans1Plan->inStride[1]   = Nt*length1;
4062                                         trans1Plan->outStride[0]  = 1;
4063                                         trans1Plan->outStride[1]  = length2;
4064                                         trans1Plan->iDist         = fftPlan->iDist;
4065                                         trans1Plan->oDist                 = Nt*length1*length2;
4066                                         trans1Plan->transOutHorizontal = true;
4067
4068                                         trans1Plan->gen           = Transpose_GCN;
4069
4070
4071                                         for (size_t index=3; index < fftPlan->length.size(); index++)
4072                                         {
4073                                                 trans1Plan->length.push_back(fftPlan->length[index]);
4074                                                 trans1Plan->inStride.push_back(fftPlan->inStride[index]);
4075                                                 trans1Plan->outStride.push_back(trans1Plan->oDist);
4076                                                 trans1Plan->oDist *= fftPlan->length[index];
4077                                         }
4078
4079                                         //Set callback data if set on top level plan
4080                                         if (fftPlan->hasPreCallback)
4081                                         {
4082                                                 trans1Plan->hasPreCallback = true;
4083                                                 trans1Plan->preCallback = fftPlan->preCallback;
4084                                                 trans1Plan->precallUserData = fftPlan->precallUserData;
4085                                         }
4086
4087                                         OPENCL_V(clfftBakePlan(fftPlan->planTZ, numQueues, commQueueFFT, NULL, NULL ),
4088                                                 _T( "BakePlan for planTZ failed" ) );
4089
4090                                         // create col plan
4091                                         // complex to complex
4092
4093                                         OPENCL_V(clfftCreateDefaultPlanInternal( &fftPlan->planZ, fftPlan->context, CLFFT_1D, &fftPlan->length[ DimZ ] ),
4094                                                 _T( "CreateDefaultPlan for planZ failed" ) );
4095
4096                                         FFTPlan* colPlan        = NULL;
4097                                         lockRAII* colLock       = NULL;
4098                                         OPENCL_V( fftRepo.getPlan( fftPlan->planZ, colPlan, colLock ), _T( "fftRepo.getPlan failed" ) );
4099
4100                                         colPlan->length.push_back(Nt*length1);
4101
4102                                         colPlan->inStride[0]  = 1;
4103                                         colPlan->inStride.push_back(length2);
4104                                         colPlan->iDist        = trans1Plan->oDist;
4105
4106                                         colPlan->placeness = CLFFT_INPLACE;
4107                                         colPlan->inputLayout = CLFFT_COMPLEX_INTERLEAVED;
4108                                         colPlan->outputLayout = CLFFT_COMPLEX_INTERLEAVED;
4109
4110                                         colPlan->outStride[0]  = colPlan->inStride[0];
4111                                         colPlan->outStride.push_back(colPlan->inStride[1]);
4112                                         colPlan->oDist         = colPlan->iDist;
4113
4114                                         for (size_t index=3; index < fftPlan->length.size(); index++)
4115                                         {
4116                                                 colPlan->length.push_back(fftPlan->length[index]);
4117                                                 colPlan->inStride.push_back(trans1Plan->outStride[index-1]);
4118                                                 colPlan->outStride.push_back(trans1Plan->outStride[index-1]);
4119                                         }
4120
4121
4122                                         colPlan->precision     = fftPlan->precision;
4123                                         colPlan->forwardScale  = 1.0f;
4124                                         colPlan->backwardScale = 1.0f;
4125                                         colPlan->tmpBufSize    = 0;
4126
4127                                         colPlan->gen                    = fftPlan->gen;
4128                                         colPlan->envelope               = fftPlan->envelope;
4129
4130                                         colPlan->batchsize = fftPlan->batchsize;
4131
4132                                         OPENCL_V(clfftBakePlan(fftPlan->planZ, numQueues, commQueueFFT, NULL, NULL ), _T( "BakePlan for planZ failed" ) );
4133
4134                                         // create second transpose plan
4135                                         
4136                                         //Transpose 
4137                                         //tmp --> output
4138                                         size_t trans2Lengths[2] = { length2, length0*length1 };
4139                                         OPENCL_V(clfftCreateDefaultPlanInternal( &fftPlan->planTX, fftPlan->context, CLFFT_2D, trans2Lengths ),
4140                                                 _T( "CreateDefaultPlan for planTX transpose failed" ) );
4141
4142                                         FFTPlan* trans2Plan     = NULL;
4143                                         lockRAII* trans2Lock    = NULL;
4144                                         OPENCL_V( fftRepo.getPlan( fftPlan->planTX, trans2Plan, trans2Lock ), _T( "fftRepo.getPlan failed" ) );
4145
4146                                         trans2Plan->transflag = true;
4147
4148                                         trans2Lengths[1] = Nt*length1;
4149                                         OPENCL_V(clfftSetPlanLength( fftPlan->planTX, CLFFT_2D, trans2Lengths ),
4150                                                 _T( "clfftSetPlanLength for planTX transpose failed" ) );
4151
4152
4153                                         trans2Plan->outputLayout = CLFFT_COMPLEX_INTERLEAVED;
4154                                         trans2Plan->inputLayout  = CLFFT_COMPLEX_INTERLEAVED;
4155
4156
4157                                         trans2Plan->placeness     = CLFFT_OUTOFPLACE;
4158                                         trans2Plan->precision     = fftPlan->precision;
4159                                         trans2Plan->tmpBufSize    = 0;
4160                                         trans2Plan->batchsize     = fftPlan->batchsize;
4161                                         trans2Plan->envelope      = fftPlan->envelope;
4162                                         trans2Plan->forwardScale  = 1.0f;
4163                                         trans2Plan->backwardScale = 1.0f;
4164
4165                                         trans2Plan->inStride[0]   = 1;
4166                                         trans2Plan->inStride[1]   = length2;
4167                                         trans2Plan->outStride[0]  = 1;
4168                                         trans2Plan->outStride[1]  = Nt*length1;
4169                                         trans2Plan->iDist         = colPlan->oDist;
4170                                         trans2Plan->oDist                 = Nt*length1*length2;
4171
4172                                         trans2Plan->gen           = Transpose_GCN;
4173                                         trans2Plan->transflag     = true;
4174
4175                                         for (size_t index=3; index < fftPlan->length.size(); index++)
4176                                         {
4177                                                 trans2Plan->length.push_back(fftPlan->length[index]);
4178                                                 trans2Plan->inStride.push_back(colPlan->outStride[index-1]);
4179                                                 trans2Plan->outStride.push_back(trans2Plan->oDist);
4180                                                 trans2Plan->oDist *= fftPlan->length[index];
4181
4182                                         }
4183
4184                                         OPENCL_V(clfftBakePlan(fftPlan->planTX, numQueues, commQueueFFT, NULL, NULL ),
4185                                                 _T( "BakePlan for planTX failed" ) );
4186
4187                                         // create row plan
4188                                         // hermitian to real
4189
4190                                         //create 2D xy plan
4191                                         size_t clLengths[] = { length0, length1, 0 };
4192                                         OPENCL_V(clfftCreateDefaultPlanInternal( &fftPlan->planX, fftPlan->context, CLFFT_2D, clLengths ),
4193                                                 _T( "CreateDefaultPlan for 2D planX failed" ) );
4194
4195                                         FFTPlan* rowPlan        = NULL;
4196                                         lockRAII* rowLock       = NULL;
4197                                         OPENCL_V( fftRepo.getPlan( fftPlan->planX, rowPlan, rowLock ), _T( "fftRepo.getPlan failed" ) );
4198
4199                                         rowPlan->outputLayout  = fftPlan->outputLayout;
4200                                         rowPlan->inputLayout   = CLFFT_HERMITIAN_INTERLEAVED;
4201
4202                                         rowPlan->length.push_back(length2);
4203
4204                                         rowPlan->outStride[0]  = fftPlan->outStride[0];
4205                                         rowPlan->outStride[1]  = fftPlan->outStride[1];
4206                                         rowPlan->outStride.push_back(fftPlan->outStride[2]);
4207                                         rowPlan->oDist         = fftPlan->oDist;
4208
4209                                         rowPlan->inStride[0]  = trans2Plan->outStride[0];
4210                                         rowPlan->inStride[1]  = Nt;
4211                                         rowPlan->inStride.push_back(Nt*length1);
4212                                         rowPlan->iDist         = trans2Plan->oDist;
4213
4214                                         for (size_t index=3; index < fftPlan->length.size(); index++)
4215                                         {
4216                                                 rowPlan->length.push_back(fftPlan->length[index]);
4217                                                 rowPlan->inStride.push_back(trans2Plan->outStride[index-1]);
4218                                                 rowPlan->outStride.push_back(fftPlan->outStride[index]);
4219                                         }
4220
4221                                         if (fftPlan->placeness == CLFFT_INPLACE)
4222                                         {
4223                                                 rowPlan->placeness     = CLFFT_INPLACE;
4224                                         }
4225                                         else
4226                                         {
4227                                                 rowPlan->placeness     = CLFFT_OUTOFPLACE;
4228                                         }                               
4229
4230
4231                                         rowPlan->precision     = fftPlan->precision;
4232                                         rowPlan->forwardScale  = fftPlan->forwardScale;
4233                                         rowPlan->backwardScale = fftPlan->backwardScale;
4234                                         rowPlan->tmpBufSize    = 0;
4235
4236                                         rowPlan->gen                    = fftPlan->gen;
4237                                         rowPlan->envelope               = fftPlan->envelope;
4238
4239                                         rowPlan->batchsize    = fftPlan->batchsize;
4240
4241                                         //Set callback data if set on top level plan
4242                                         if (fftPlan->hasPostCallback)
4243                                         {
4244                                                 rowPlan->hasPostCallback = true;
4245                                                 rowPlan->postCallbackParam = fftPlan->postCallbackParam;
4246                                                 rowPlan->postcallUserData = fftPlan->postcallUserData;
4247                                         }
4248
4249                                         OPENCL_V(clfftBakePlan(fftPlan->planX, numQueues, commQueueFFT, NULL, NULL ), _T( "BakePlan for planX failed" ) );
4250                                 }
4251                                 else
4252                                 {
4253
4254                                         size_t clLengths[] = { 1, 0, 0 };
4255
4256                                         clLengths[0] = fftPlan->length[ DimZ ];
4257
4258                                         //create 1D col plan
4259                                         OPENCL_V(clfftCreateDefaultPlanInternal( &fftPlan->planZ, fftPlan->context, CLFFT_1D, clLengths ),
4260                                                 _T( "CreateDefaultPlan for planZ failed" ) );
4261
4262                                         FFTPlan* colPlan        = NULL;
4263                                         lockRAII* colLock       = NULL;
4264                                         OPENCL_V( fftRepo.getPlan( fftPlan->planZ, colPlan, colLock ), _T( "fftRepo.getPlan failed" ) );
4265
4266                                         switch(fftPlan->inputLayout)
4267                                         {
4268                                         case CLFFT_HERMITIAN_INTERLEAVED:
4269                                                 {
4270                                                         colPlan->outputLayout = CLFFT_COMPLEX_INTERLEAVED;
4271                                                         colPlan->inputLayout  = CLFFT_COMPLEX_INTERLEAVED;
4272                                                 }
4273                                                 break;
4274                                         case CLFFT_HERMITIAN_PLANAR:
4275                                                 {
4276                                                         colPlan->outputLayout = CLFFT_COMPLEX_INTERLEAVED;
4277                                                         colPlan->inputLayout  = CLFFT_COMPLEX_PLANAR;
4278                                                 }
4279                                                 break;
4280                                         default: assert(false);
4281                                         }
4282
4283                                         colPlan->length.push_back(Nt);
4284                                         colPlan->length.push_back(length1);
4285
4286                                         colPlan->inStride[0]  = fftPlan->inStride[2];
4287                                         colPlan->inStride.push_back(fftPlan->inStride[0]);
4288                                         colPlan->inStride.push_back(fftPlan->inStride[1]);
4289                                         colPlan->iDist         = fftPlan->iDist;
4290
4291
4292                                         if (fftPlan->placeness == CLFFT_INPLACE)
4293                                         {
4294                                                 colPlan->placeness = CLFFT_INPLACE;
4295
4296                                                 colPlan->outStride[0]  = colPlan->inStride[0];
4297                                                 colPlan->outStride.push_back(colPlan->inStride[1]);
4298                                                 colPlan->outStride.push_back(colPlan->inStride[2]);
4299                                                 colPlan->oDist         = colPlan->iDist;
4300
4301                                                 for (size_t index=3; index < fftPlan->length.size(); index++)
4302                                                 {
4303                                                         colPlan->length.push_back(fftPlan->length[index]);
4304                                                         colPlan->inStride.push_back(fftPlan->inStride[index]);
4305                                                         colPlan->outStride.push_back(fftPlan->inStride[index]);
4306                                                 }
4307                                         }
4308                                         else
4309                                         {
4310                                                 colPlan->placeness = CLFFT_OUTOFPLACE;
4311
4312                                                 colPlan->outStride[0]  = Nt*length1;
4313                                                 colPlan->outStride.push_back(1);
4314                                                 colPlan->outStride.push_back(Nt);
4315                                                 colPlan->oDist         = Nt*length1*length2;
4316
4317                                                 for (size_t index=3; index < fftPlan->length.size(); index++)
4318                                                 {
4319                                                         colPlan->length.push_back(fftPlan->length[index]);
4320                                                         colPlan->inStride.push_back(fftPlan->inStride[index]);
4321                                                         colPlan->outStride.push_back(colPlan->oDist);
4322                                                         colPlan->oDist *= fftPlan->length[index];
4323                                                 }
4324                                         }
4325
4326                                 
4327                                         colPlan->precision     = fftPlan->precision;
4328                                         colPlan->forwardScale  = 1.0f;
4329                                         colPlan->backwardScale = 1.0f;
4330                                         colPlan->tmpBufSize    = 0;
4331
4332                                         colPlan->gen                     = fftPlan->gen;
4333                                         colPlan->envelope                = fftPlan->envelope;
4334
4335                                         colPlan->batchsize = fftPlan->batchsize;
4336
4337                                         //Set callback data if set on top level plan
4338                                         if (fftPlan->hasPreCallback)
4339                                         {
4340                                                 colPlan->hasPreCallback = true;
4341                                                 colPlan->preCallback = fftPlan->preCallback;
4342                                                 colPlan->precallUserData = fftPlan->precallUserData;
4343                                         }
4344                                 
4345                                         OPENCL_V(clfftBakePlan(fftPlan->planZ, numQueues, commQueueFFT, NULL, NULL ), _T( "BakePlan 3D->1D planZ failed" ) );
4346
4347
4348                                         clLengths[0] = fftPlan->length[ DimX ];
4349                                         clLengths[1] = fftPlan->length[ DimY ];
4350
4351                                         //create 2D xy plan
4352                                         OPENCL_V(clfftCreateDefaultPlanInternal( &fftPlan->planX, fftPlan->context, CLFFT_2D, clLengths ),
4353                                                 _T( "CreateDefaultPlan 2D planX failed" ) );
4354
4355                                         FFTPlan* xyPlan = NULL;
4356                                         lockRAII* rowLock       = NULL;
4357                                         OPENCL_V( fftRepo.getPlan( fftPlan->planX, xyPlan, rowLock ), _T( "fftRepo.getPlan failed" ) );
4358
4359                                         xyPlan->inputLayout   = CLFFT_HERMITIAN_INTERLEAVED;
4360                                         xyPlan->outputLayout  = fftPlan->outputLayout;
4361
4362                                         xyPlan->length.push_back(length2);
4363                         
4364                                         xyPlan->outStride[0]  = fftPlan->outStride[0];
4365                                         xyPlan->outStride[1]  = fftPlan->outStride[1];
4366                                         xyPlan->outStride.push_back(fftPlan->outStride[2]);
4367                                         xyPlan->oDist         = fftPlan->oDist;
4368
4369                                         if (fftPlan->placeness == CLFFT_INPLACE)
4370                                         {
4371                                                 xyPlan->placeness     = CLFFT_INPLACE;
4372
4373                                                 xyPlan->inStride[0]  = colPlan->outStride[1];
4374                                                 xyPlan->inStride[1]  = colPlan->outStride[2];
4375                                                 xyPlan->inStride.push_back(colPlan->outStride[0]);
4376                                                 xyPlan->iDist         = colPlan->oDist;
4377
4378                                                 for (size_t index=3; index < fftPlan->length.size(); index++)
4379                                                 {
4380                                                         xyPlan->length.push_back(fftPlan->length[index]);
4381                                                         xyPlan->inStride.push_back(colPlan->outStride[index]);
4382                                                         xyPlan->outStride.push_back(fftPlan->outStride[index]);
4383                                                 }
4384                                         }
4385                                         else
4386                                         {
4387                                                 xyPlan->placeness     = CLFFT_OUTOFPLACE;
4388
4389                                                 xyPlan->inStride[0]   = 1;
4390                                                 xyPlan->inStride[1]   = Nt;
4391                                                 xyPlan->inStride.push_back(Nt*length1);
4392                                                 xyPlan->iDist         = Nt*length1*length2;
4393
4394                                                 for (size_t index=3; index < fftPlan->length.size(); index++)
4395                                                 {
4396                                                         xyPlan->length.push_back(fftPlan->length[index]);
4397                                                         xyPlan->outStride.push_back(fftPlan->outStride[index]);
4398                                                         xyPlan->inStride.push_back(xyPlan->iDist);                                              
4399                                                         xyPlan->iDist *= fftPlan->length[index];
4400                                                 }
4401                                         }
4402
4403
4404                                         xyPlan->precision     = fftPlan->precision;
4405                                         xyPlan->forwardScale  = fftPlan->forwardScale;
4406                                         xyPlan->backwardScale = fftPlan->backwardScale;
4407                                         xyPlan->tmpBufSize    = fftPlan->tmpBufSize;
4408
4409                                         xyPlan->gen                      = fftPlan->gen;
4410                                         xyPlan->envelope         = fftPlan->envelope;
4411
4412                                         xyPlan->batchsize    = fftPlan->batchsize;
4413
4414                                         //Set callback data if set on top level plan
4415                                         if (fftPlan->hasPostCallback)
4416                                         {
4417                                                 xyPlan->hasPostCallback = true;
4418                                                 xyPlan->postCallbackParam = fftPlan->postCallbackParam;
4419                                                 xyPlan->postcallUserData = fftPlan->postcallUserData;
4420                                         }
4421
4422                                         OPENCL_V(clfftBakePlan(fftPlan->planX, numQueues, commQueueFFT, NULL, NULL ), _T( "BakePlan 3D->2D planX failed" ) );
4423                                 }
4424                         }
4425                         else
4426                         {
4427                                 if (fftPlan->tmpBufSize==0 && (
4428                                         fftPlan->length[0] > Large1DThreshold ||
4429                                         fftPlan->length[1] > Large1DThreshold ||
4430                                         fftPlan->length[2] > Large1DThreshold
4431                                         ))
4432                                 {
4433                                         fftPlan->tmpBufSize = fftPlan->length[0] * fftPlan->length[1] * fftPlan->length[2] *
4434                                                 fftPlan->batchsize * fftPlan->ElementSize();
4435                                 }
4436
4437                                 size_t clLengths[] = { 1, 1, 0 };
4438                                 clLengths[0] = fftPlan->length[ DimX ];
4439                                 clLengths[1] = fftPlan->length[ DimY ];
4440
4441                                 //create 2D xy plan
4442                                 OPENCL_V(clfftCreateDefaultPlanInternal( &fftPlan->planX, fftPlan->context, CLFFT_2D, clLengths ),
4443                                         _T( "CreateDefaultPlan 2D planX failed" ) );
4444
4445                                 FFTPlan* xyPlan = NULL;
4446                                 lockRAII* rowLock       = NULL;
4447                                 OPENCL_V( fftRepo.getPlan( fftPlan->planX, xyPlan, rowLock ), _T( "fftRepo.getPlan failed" ) );
4448
4449                                 xyPlan->inputLayout   = fftPlan->inputLayout;
4450                                 xyPlan->outputLayout  = fftPlan->outputLayout;
4451                                 xyPlan->placeness     = fftPlan->placeness;
4452                                 xyPlan->precision     = fftPlan->precision;
4453                                 xyPlan->forwardScale  = 1.0f;
4454                                 xyPlan->backwardScale = 1.0f;
4455                                 xyPlan->tmpBufSize    = fftPlan->tmpBufSize;
4456
4457                                 xyPlan->gen                      = fftPlan->gen;
4458                                 xyPlan->envelope                         = fftPlan->envelope;
4459
4460                                 // This is the xy fft, the first elements distance between the first two FFTs is the distance of the first elements
4461                                 // of the first two rows in the original buffer.
4462                                 xyPlan->batchsize    = fftPlan->batchsize;
4463                                 xyPlan->inStride[0]  = fftPlan->inStride[0];
4464                                 xyPlan->inStride[1]  = fftPlan->inStride[1];
4465                                 xyPlan->outStride[0] = fftPlan->outStride[0];
4466                                 xyPlan->outStride[1] = fftPlan->outStride[1];
4467
4468                                 //pass length and other info to kernel, so the kernel knows this is decomposed from higher dimension
4469                                 xyPlan->length.push_back(fftPlan->length[2]);
4470                                 xyPlan->inStride.push_back(fftPlan->inStride[2]);
4471                                 xyPlan->outStride.push_back(fftPlan->outStride[2]);
4472                                 xyPlan->iDist    = fftPlan->iDist;
4473                                 xyPlan->oDist    = fftPlan->oDist;
4474
4475                                 //Set callback data if set on top level plan
4476                                 if (fftPlan->hasPreCallback)
4477                                 {
4478                                         xyPlan->hasPreCallback = true;
4479                                         xyPlan->preCallback = fftPlan->preCallback;
4480                                         xyPlan->precallUserData = fftPlan->precallUserData;
4481                                 }
4482
4483                                 OPENCL_V(clfftBakePlan(fftPlan->planX, numQueues, commQueueFFT, NULL, NULL ), _T( "BakePlan 3D->2D planX failed" ) );
4484
4485                                 clLengths[0] = fftPlan->length[ DimZ ];
4486                                 clLengths[1] = clLengths[2] = 0;
4487                                 //create 1D col plan
4488                                 OPENCL_V(clfftCreateDefaultPlanInternal( &fftPlan->planZ, fftPlan->context, CLFFT_1D, clLengths ),
4489                                         _T( "CreateDefaultPlan for planZ failed" ) );
4490
4491                                 FFTPlan* colPlan        = NULL;
4492                                 lockRAII* colLock       = NULL;
4493                                 OPENCL_V( fftRepo.getPlan( fftPlan->planZ, colPlan, colLock ), _T( "fftRepo.getPlan failed" ) );
4494
4495                                 colPlan->inputLayout   = fftPlan->outputLayout;
4496                                 colPlan->outputLayout  = fftPlan->outputLayout;
4497                                 colPlan->placeness     = CLFFT_INPLACE;
4498                                 colPlan->precision     = fftPlan->precision;
4499                                 colPlan->forwardScale  = fftPlan->forwardScale;
4500                                 colPlan->backwardScale = fftPlan->backwardScale;
4501                                 colPlan->tmpBufSize    = fftPlan->tmpBufSize;
4502
4503                                 colPlan->gen                     = fftPlan->gen;
4504                                 colPlan->envelope                        = fftPlan->envelope;
4505
4506                                 // This is a column FFT, the first elements distance between each FFT is the distance of the first two
4507                                 // elements in the original buffer. Like a transpose of the matrix
4508                                 colPlan->batchsize = fftPlan->batchsize;
4509                                 colPlan->inStride[0] = fftPlan->outStride[2];
4510                                 colPlan->outStride[0] = fftPlan->outStride[2];
4511
4512                                 //pass length and other info to kernel, so the kernel knows this is decomposed from higher dimension
4513                                 colPlan->length.push_back(fftPlan->length[0]);
4514                                 colPlan->length.push_back(fftPlan->length[1]);
4515                                 colPlan->inStride.push_back(fftPlan->outStride[0]);
4516                                 colPlan->inStride.push_back(fftPlan->outStride[1]);
4517                                 colPlan->outStride.push_back(fftPlan->outStride[0]);
4518                                 colPlan->outStride.push_back(fftPlan->outStride[1]);
4519                                 colPlan->iDist    = fftPlan->oDist;
4520                                 colPlan->oDist    = fftPlan->oDist;
4521
4522                                 //Set callback data if set on top level plan
4523                                 if (fftPlan->hasPostCallback)
4524                                 {
4525                                         colPlan->hasPostCallback = true;
4526                                         colPlan->postCallbackParam = fftPlan->postCallbackParam;
4527                                         colPlan->postcallUserData = fftPlan->postcallUserData;
4528                                 }
4529
4530                                 OPENCL_V(clfftBakePlan(fftPlan->planZ, numQueues, commQueueFFT, NULL, NULL ), _T( "BakePlan 3D->1D planZ failed" ) );
4531                         }
4532
4533                         fftPlan->baked = true;
4534                         return  CLFFT_SUCCESS;
4535                 }
4536         }
4537
4538         
4539         clfftStatus err = selectAction(fftPlan, fftPlan->action, commQueueFFT);
4540
4541         //      Allocate resources
4542         OPENCL_V( fftPlan->AllocateBuffers (), _T("AllocateBuffers() failed"));
4543
4544         fftPlan->ConstructAndEnqueueConstantBuffers( commQueueFFT );
4545
4546         //      Record that we baked the plan
4547         fftPlan->baked          = true;
4548
4549         return  CLFFT_SUCCESS;
4550 }
4551
4552 clfftStatus clfftCopyPlan( clfftPlanHandle* out_plHandle, cl_context new_context, clfftPlanHandle in_plHandle )
4553 {
4554         FFTRepo& fftRepo        = FFTRepo::getInstance( );
4555         FFTPlan* in_fftPlan     = NULL, *out_fftPlan = NULL;
4556         lockRAII* in_planLock = NULL, *out_planLock = NULL;
4557
4558         OPENCL_V( fftRepo.getPlan( in_plHandle, in_fftPlan, in_planLock ), _T( "fftRepo.getPlan failed" ) );
4559
4560         OPENCL_V( clfftCreateDefaultPlan( out_plHandle, new_context, in_fftPlan->dim, &in_fftPlan->length[ 0 ] ),
4561                 _T( "clfftCreateDefaultPlan failed" ) );
4562
4563         OPENCL_V( fftRepo.getPlan( *out_plHandle, out_fftPlan, out_planLock ), _T( "fftRepo.getPlan failed" ) );
4564
4565         //      Let other operations complete before attempting to copy the plan
4566         scopedLock sLock( *in_planLock, _T( "clfftCopyPlan" ) );
4567
4568         out_fftPlan->baked = false;
4569         out_fftPlan->gen = in_fftPlan->gen;
4570         out_fftPlan->envelope = in_fftPlan->envelope;
4571         out_fftPlan->dim = in_fftPlan->dim;
4572         out_fftPlan->inputLayout = in_fftPlan->inputLayout;
4573         out_fftPlan->outputLayout = in_fftPlan->outputLayout;
4574         out_fftPlan->placeness = in_fftPlan->placeness;
4575         out_fftPlan->precision = in_fftPlan->precision;
4576         out_fftPlan->forwardScale = in_fftPlan->forwardScale;
4577         out_fftPlan->backwardScale = in_fftPlan->backwardScale;
4578         out_fftPlan->iDist = in_fftPlan->iDist;
4579         out_fftPlan->oDist = in_fftPlan->oDist;
4580         out_fftPlan->length = in_fftPlan->length;
4581         out_fftPlan->inStride = in_fftPlan->inStride;
4582         out_fftPlan->outStride = in_fftPlan->outStride;
4583         out_fftPlan->batchsize = in_fftPlan->batchsize;
4584         out_fftPlan->transposed = in_fftPlan->transposed;
4585
4586         return  CLFFT_SUCCESS;
4587 }
4588
4589 clfftStatus FFTPlan::ConstructAndEnqueueConstantBuffers( cl_command_queue* commQueueFFT )
4590 {
4591         //      Construct the constant buffer and call clEnqueueWriteBuffer
4592         //
4593         cb_t ConstantBufferParams [CLFFT_CB_SIZE];
4594         memset (& ConstantBufferParams, 0, sizeof (ConstantBufferParams));
4595
4596         ConstantBufferParams[0].u = std::max<cl_uint> (1, cl_uint (/*fftPlan->*/batchsize));
4597
4598
4599         OPENCL_V(clEnqueueWriteBuffer( *commQueueFFT,
4600                 /*fftPlan->*/const_buffer,
4601                 1,              // TODO? non-blocking write?
4602                 0,
4603                 sizeof(ConstantBufferParams),
4604                 &ConstantBufferParams,
4605                 0,
4606                 NULL,
4607                 NULL), _T("clEnqueueWriteBuffer failed") );
4608
4609         return CLFFT_SUCCESS;
4610 }
4611
4612
4613 clfftStatus     clfftDestroyPlan( clfftPlanHandle* plHandle )
4614 {
4615         FFTRepo& fftRepo        = FFTRepo::getInstance( );
4616         FFTPlan* fftPlan        = NULL;
4617         lockRAII* planLock      = NULL;
4618
4619         OPENCL_V( fftRepo.getPlan( *plHandle, fftPlan, planLock ), _T( "fftRepo.getPlan failed" ) );
4620
4621         //      Recursively destroy subplans, that are used for higher dimensional FFT's
4622         if( fftPlan->planX )
4623                 clfftDestroyPlan( &fftPlan->planX );
4624         if( fftPlan->planY )
4625                 clfftDestroyPlan( &fftPlan->planY );
4626         if( fftPlan->planZ )
4627                 clfftDestroyPlan( &fftPlan->planZ );
4628         if( fftPlan->planTX )
4629                 clfftDestroyPlan( &fftPlan->planTX );
4630         if( fftPlan->planTY )
4631                 clfftDestroyPlan( &fftPlan->planTY );
4632         if( fftPlan->planTZ )
4633                 clfftDestroyPlan( &fftPlan->planTZ );
4634         if( fftPlan->planRCcopy )
4635                 clfftDestroyPlan( &fftPlan->planRCcopy );
4636         if( fftPlan->planCopy )
4637                 clfftDestroyPlan( &fftPlan->planCopy );
4638
4639         fftRepo.deletePlan( plHandle );
4640
4641         return  CLFFT_SUCCESS;
4642 }
4643
4644 //      This routine will query the OpenCL context for it's devices
4645 //      and their hardware limitations, which we synthesize into a
4646 //      hardware "envelope".
4647 //      We only query the devices the first time we're called after
4648 //      the object's context is set.  On 2nd and subsequent calls,
4649 //      we just return the pointer.
4650 //
4651 clfftStatus FFTPlan::SetEnvelope ()
4652 {
4653
4654         // TODO  The caller has already acquired the lock on *this
4655         //      However, we shouldn't depend on it.
4656
4657         if (0 == envelope.limit_LocalMemSize) do {
4658                 //      First time, query OpenCL for the device info
4659                 //
4660                 memset (&envelope, 0, sizeof(envelope));
4661
4662                 //      Get the size needed for the device list
4663                 //
4664                 size_t deviceListSize = 0;
4665                 OPENCL_V( ::clGetContextInfo( context, CL_CONTEXT_DEVICES, 0, NULL, &deviceListSize ),
4666                         _T("Getting device array size ( ::clGetContextInfo() )" ));
4667                 cl_uint n = cl_uint (deviceListSize / sizeof(cl_device_id));
4668                 if (n == 0) break;
4669
4670                 std::vector< cl_device_id > devices( n+1 );
4671                 //      Get the device list
4672                 //
4673                 OPENCL_V( ::clGetContextInfo( context, CL_CONTEXT_DEVICES, deviceListSize, &devices[ 0 ], NULL ),
4674                         _T("Getting device array ( ::clGetContextInfo() )") );
4675
4676                 //      Get the # of devices
4677                 //
4678                 cl_uint cContextDevices = 0;
4679
4680                 size_t deviceVersionSize        = 0;
4681                 OPENCL_V( ::clGetDeviceInfo( devices[0], CL_DEVICE_VERSION, 0, NULL, &deviceVersionSize ),
4682                         _T("Getting CL_DEVICE_VERSION Info string size ( ::clGetDeviceInfo() )" ));
4683
4684                 std::vector< char > szDeviceVersion( deviceVersionSize );
4685                 OPENCL_V( ::clGetDeviceInfo( devices[0], CL_DEVICE_VERSION, deviceVersionSize, &szDeviceVersion[ 0 ], NULL ),
4686                         _T("Getting CL_DEVICE_VERSION Platform Info string ( ::clGetDeviceInfo() )" ));
4687
4688                 char openclstr[11]="OpenCL 1.0";
4689
4690                 if (!strncmp((const char*)&szDeviceVersion[ 0 ], openclstr, 10))
4691                 {
4692                         cContextDevices = 1;
4693                 }
4694                 else
4695                 {
4696                         OPENCL_V( ::clGetContextInfo( context, CL_CONTEXT_NUM_DEVICES, sizeof( cContextDevices ), &cContextDevices, NULL ),
4697                                 _T("Getting number of context devices ( ::clGetContextInfo() )" ));
4698                 }
4699
4700                 cContextDevices = std::min<cl_uint> (cContextDevices, n);
4701                 if (0 == cContextDevices)
4702                         break;
4703
4704                 envelope.limit_LocalMemSize  = 32768;
4705                 envelope.limit_WorkGroupSize = 256;
4706                 envelope.limit_Dimensions    = countOf (envelope.limit_Size);
4707                 for (size_t u = 0; u < countOf (envelope.limit_Size); ++u) {
4708                         envelope.limit_Size[u] = 256;
4709                 }
4710
4711                 for( cl_uint i = 0; i < cContextDevices; ++i )
4712                 {
4713                         cl_device_id devId = devices[i];
4714
4715                         cl_ulong memsize = 0;
4716                         unsigned int maxdim = 0;
4717                         size_t temp[countOf (envelope.limit_Size)];
4718                         memset (&temp, 0, sizeof(temp));
4719
4720                         OPENCL_V( ::clGetDeviceInfo( devId, CL_DEVICE_LOCAL_MEM_SIZE, sizeof( cl_ulong ), &memsize, NULL ),
4721                                 _T("Getting CL_DEVICE_LOCAL_MEM_SIZE device info ( ::clGetDeviceInfo() )") );
4722                         envelope.limit_LocalMemSize = std::min<size_t> (envelope.limit_LocalMemSize, memsize);
4723
4724                         OPENCL_V( ::clGetDeviceInfo( devId, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof( unsigned int ), &maxdim, NULL ),
4725                                 _T("Getting CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS device info ( ::clGetDeviceInfo() )") );
4726                         BUG_CHECK (countOf (envelope.limit_Size) >= maxdim);
4727                         envelope.limit_Dimensions = std::min<size_t> (envelope.limit_Dimensions, maxdim);
4728
4729                         OPENCL_V( ::clGetDeviceInfo( devId, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof( size_t ), &temp[0], NULL ),
4730                                 _T("Getting CL_DEVICE_MAX_WORK_GROUP_SIZE device info ( ::clGetDeviceInfo() )") );
4731                         envelope.limit_WorkGroupSize = std::min<size_t> (envelope.limit_WorkGroupSize, temp[0]);
4732
4733                         OPENCL_V( ::clGetDeviceInfo( devId, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof( temp ), &temp[0], NULL ),
4734                                 _T("Getting CL_DEVICE_MAX_WORK_ITEM_SIZES device info ( ::clGetDeviceInfo() )") );
4735                         for (size_t u = 0; u < envelope.limit_Dimensions; ++u) {
4736                                 BUG_CHECK (temp[u] > 0)
4737                                 envelope.limit_Size[u] = std::min<size_t> (envelope.limit_Size[u], temp[u]);
4738                         }
4739                 }
4740
4741                 BUG_CHECK (envelope.limit_LocalMemSize >= 1024)
4742         } while (0);
4743
4744         return CLFFT_SUCCESS;
4745 }
4746
4747 clfftStatus FFTPlan::AllocateBuffers ()
4748 {
4749         cl_int status = CL_SUCCESS;
4750
4751         assert (NULL == const_buffer);
4752         ReleaseBuffers ();
4753
4754         assert(4 == sizeof(int));
4755
4756         do {
4757                 const_buffer = clCreateBuffer (context,
4758                                                                                 CL_MEM_READ_ONLY,
4759                                                                                 CLFFT_CB_SIZE * sizeof (int),
4760                                                                                 0,
4761                                                                                 &status);
4762                 if (CL_SUCCESS != status)
4763                         break;
4764         } while (0);
4765
4766         return  (clfftStatus) status;
4767 }
4768
4769 clfftStatus FFTPlan::ReleaseBuffers ()
4770 {
4771         clfftStatus result = CLFFT_SUCCESS;
4772         clfftStatus tmp;
4773
4774         if( NULL != const_buffer )
4775         {
4776                 tmp = static_cast< clfftStatus >( clReleaseMemObject( const_buffer ) );
4777                 const_buffer = NULL;
4778                 if( CLFFT_SUCCESS == result )
4779                         result = tmp;
4780         }
4781
4782         if( (NULL != intBuffer) && libCreatedIntBuffer )
4783         {
4784                 tmp = static_cast< clfftStatus >( clReleaseMemObject( intBuffer ) );
4785                 intBuffer = NULL;
4786                 if( CLFFT_SUCCESS == result )
4787                         result = tmp;
4788         }
4789
4790         if( NULL != intBufferRC )
4791         {
4792                 tmp = static_cast< clfftStatus >( clReleaseMemObject( intBufferRC ) );
4793                 intBufferRC = NULL;
4794                 if( CLFFT_SUCCESS == result )
4795                         result = tmp;
4796         }
4797         
4798         if( NULL != intBufferC2R )
4799         {
4800                 tmp = static_cast< clfftStatus >( clReleaseMemObject( intBufferC2R ) );
4801                 intBufferC2R = NULL;
4802                 if( CLFFT_SUCCESS == result )
4803                         result = tmp;
4804         }
4805
4806         return  result;
4807 }
4808
4809
4810
4811 clfftStatus FFTPlan::GetMax1DLength (size_t *longest ) const
4812 {
4813         switch(gen)
4814         {
4815         case Stockham:          return GetMax1DLengthStockham(longest);
4816     case Transpose_GCN:                 *longest = 4096; return CLFFT_SUCCESS;
4817     case Transpose_SQUARE:              *longest = 4096; return CLFFT_SUCCESS;
4818         case Transpose_NONSQUARE:       *longest = 4096; return CLFFT_SUCCESS;
4819     case Copy:                                  *longest = 4096; return CLFFT_SUCCESS;
4820         default:                        assert(false); return CLFFT_NOTIMPLEMENTED;
4821         }
4822 }
4823
4824 clfftStatus FFTPlan::GetEnvelope (const FFTEnvelope ** ppEnvelope) const
4825 {
4826         if( &envelope == NULL )
4827     { 
4828         assert( false );
4829         return CLFFT_NOTIMPLEMENTED;
4830     }
4831
4832         *ppEnvelope = &envelope;
4833         return CLFFT_SUCCESS;
4834 }
4835
4836 size_t FFTPlan::ElementSize() const
4837 {
4838         return ( ((precision == CLFFT_DOUBLE) || (precision == CLFFT_DOUBLE_FAST)) ? sizeof( std::complex<double> ) : sizeof( std::complex<float> ) );
4839 }
4840