1 /* ************************************************************************
2 * Copyright 2013 Advanced Micro Devices, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 * ************************************************************************/
17 ////////////////////////////////////////////
19 // clfft.plan.cpp : Plan creation and baking for clFFT (default plan setup, kernel naming, action selection, clfftBakePlan).
27 #include "generator.stockham.h"
28 #include "../include/convenienceFunctions.h"
30 #include "fft_binary_lookup.h"
34 const std::string beginning_of_binary( "<[�_beginning_of_binary_�]>" );
35 const std::string end_of_binary( "<[�_I_may_be_a_sorry_case,_but_I_don't_write_jokes_in_base_13_�]>" );
36 const std::string end_of_file( "<[�_You're_off_the_edge_of_the_map,_mate._Here_there_be_monsters_�]>" );
// pow235: helper that decides whether num is built only from the radices 2, 3 and 5.
// pow2, pow3 and pow5 are reference parameters; the caller visible in this file zeroes them first and
// then relies on them satisfying num = 2^pow2 * 3^pow3 * 5^pow5.
// NOTE(review): only the first guard of the body is visible in this excerpt -- confirm the exact
// return contract and whether the exponents are accumulated or overwritten.
38 static bool pow235(size_t num, size_t &pow2, size_t &pow3, size_t &pow5)
40 //a helper function to decide if a number is only radix 2, 3 and 5
// guard condition: true when num is divisible by none of 2, 3 and 5
41 if (num % 2 != 0 && num % 3 != 0 && num % 5 != 0)
// split1D_for_inplace: records how a large 1D length is split into 2D sizes for in-place transposes.
//   num       - the 1D length to split
//   splitNums - output; every split appends one pair { temp*divide_factor, temp }
//   precision - forwarded unchanged to the recursive calls
//   threshold - forwarded unchanged to the recursive calls
// The two factors of each split are split again recursively and the results are ANDed into the
// returned status.
// NOTE(review): the branches that pick divide_factor and the condition that stops the recursion are
// not visible in this excerpt -- presumably recursion ends once num is small enough; confirm.
69 static bool split1D_for_inplace(size_t num, vector<vector<size_t> > &splitNums, clfftPrecision precision, size_t threshold)
71 /* a helper function to split big 1D to friendly 2D sizes for inplace transpose kernels
72 currently only radix 2, 3 and 5 are supported
73 the algorithm looks for ways to split up the 1D into 2D such that one of the dimensions is multiples of the other dimension.
74 And this mupliple is radix2, 3 or 5.
75 each splited dimentsion should be further splited until that it is smaller than 4096
79 if (num % 2 != 0 && num % 3 != 0 && num % 5 != 0)
82 //let's figure out pow2, pow3 and pow5 such that num = 2^pow2 * 3^pow3 * 5^pow5
83 size_t pow2, pow3, pow5;
84 pow2 = pow3 = pow5 = 0;
85 bool status = pow235(num, pow2, pow3, pow5);
95 //pow2 and pow3 are odd
98 //pow2, pow3 and pow5 are odd
99 //one dimension is 2*3*5 = 30 times bigger than the other dimension
100 divide_factor = 2 * 3 * 5;
104 //pow2 and pow3 are odd, pow5 is even
105 //one dimension is 2*3 = 6 times bigger than the other dimension
106 divide_factor = 2 * 3;
111 //pow2 is odd, pow3 is even
114 //pow2 and pow5 are odd, pow3 is even
115 divide_factor = 2 * 5;
119 //pow2 is odd, pow3 and pow5 are even
130 //pow3 is odd, pow2 is even
133 //pow2 is even, pow3 and pow5 are odd
134 divide_factor = 3 * 5;
138 //pow2 and pow5 are even, pow3 is odd
147 //pow5 is odd, pow2 and pow3 are even
158 //special cases: these exact lengths override the divide_factor chosen above
159 if (num == 2687385600)
160 divide_factor = 2 * 2 * 3 * 3;
161 if (num == 2916000000)
162 divide_factor = 2 * 2 * 3 * 3 * 5 * 5;
163 if (num == 3057647616)
164 divide_factor = 2 * 2 * 3 * 3;
166 num = num / divide_factor;
167 //now the remaining num should have even powers of 2, 3 and 5 (i.e. be a perfect square), so we can take the sqrt
168 size_t temp = (size_t)sqrt((double)num);
// record this split as { larger dimension, smaller dimension }
169 vector<size_t> splitVec;
170 splitVec.push_back(temp*divide_factor);
171 splitVec.push_back(temp);
172 splitNums.push_back(splitVec);
// split both factors further; && short-circuits, so once status is false no further recursion happens
174 status = status && split1D_for_inplace(temp*divide_factor, splitNums, precision, threshold);
175 status = status && split1D_for_inplace(temp, splitNums, precision, threshold);
180 // Returns CLFFT_SUCCESS if the extension named by ext is listed for the device, CLFFT_DEVICE_NO_DOUBLE if it is not found.
// The error code is fp64-specific because the callers visible in this file only pass the
// "cl_amd_fp64" and "cl_khr_fp64" extension names.
// The lookup is a plain substring search over the CL_DEVICE_EXTENSIONS string, so a name that is a
// prefix of a longer extension name also matches.
// NOTE(review): OPENCL_V is assumed to return from this function on a non-success status -- confirm.
181 clfftStatus checkDevExt( std::string ext, const cl_device_id &device )
// first query: only the size of the extension string
183 size_t deviceExtSize = 0;
184 OPENCL_V( ::clGetDeviceInfo( device, CL_DEVICE_EXTENSIONS, 0, NULL, &deviceExtSize ),
185 "Getting CL_DEVICE_EXTENSIONS Platform Info string size ( ::clGetDeviceInfo() )" );
// second query: fetch the string into a buffer of exactly that size
// NOTE(review): &szDeviceExt[ 0 ] assumes deviceExtSize is non-zero -- confirm the runtime never reports 0
187 std::vector< char > szDeviceExt( deviceExtSize );
188 OPENCL_V( ::clGetDeviceInfo( device, CL_DEVICE_EXTENSIONS, deviceExtSize, &szDeviceExt[ 0 ], NULL ),
189 "Getting CL_DEVICE_EXTENSIONS Platform Info string ( ::clGetDeviceInfo() )" );
// the buffer is read as a C string, i.e. it relies on the returned data being NUL-terminated
191 std::string strDeviceExt = &szDeviceExt[ 0 ];
193 if( strDeviceExt.find( ext.c_str( ), 0 ) == std::string::npos )
194 return CLFFT_DEVICE_NO_DOUBLE;
197 return CLFFT_SUCCESS;
// clfftCreateDefaultPlanInternal: creates a plan in the FFTRepo and fills it with default settings.
//   plHandle  - output; receives the handle of the plan created by fftRepo.createPlan
//   context   - stored in the plan; the plan takes a reference on it with clRetainContext
//   dim       - dimensionality requested by the caller; the lengths and default strides are set accordingly
//   clLengths - transform lengths; NULL gives CLFFT_INVALID_HOST_PTR, a zero length gives
//               CLFFT_INVALID_ARG_VALUE, a length rejected by IsASupportedLength gives CLFFT_NOTIMPLEMENTED
// Defaults written below: not baked, in-place, complex-interleaved input and output, single precision,
// forward scale 1, backward scale 1/(lenX*lenY*lenZ), batch size 1, Stockham generator, unit stride in X
// with packed higher dimensions, and iDist/oDist equal to the product of the lengths.
// Returns CLFFT_SUCCESS once the plan is initialised.
// NOTE(review): plHandle is dereferenced without a NULL check in the visible code.
200 clfftStatus clfftCreateDefaultPlanInternal( clfftPlanHandle* plHandle, cl_context context, const clfftDim dim,
201 const size_t* clLengths )
203 if( clLengths == NULL )
204 return CLFFT_INVALID_HOST_PTR;
// unused dimensions keep length 1 so that the products below stay correct
206 size_t lenX = 1, lenY = 1, lenZ = 1;
212 // Minimum length size is 1 (only the X length is checked in this branch)
213 if( clLengths[ DimX ] == 0 )
214 return CLFFT_INVALID_ARG_VALUE;
216 if( !IsASupportedLength( clLengths[ DimX ] ) )
218 return CLFFT_NOTIMPLEMENTED;
221 lenX = clLengths[ DimX ];
226 // Minimum length size is 1 (X and Y lengths are checked in this branch)
227 if( clLengths[ DimX ] == 0 || clLengths[ DimY ] == 0 )
228 return CLFFT_INVALID_ARG_VALUE;
230 if( !IsASupportedLength( clLengths[ DimX ] ) || !IsASupportedLength( clLengths[ DimY ] ) )
232 return CLFFT_NOTIMPLEMENTED;
235 lenX = clLengths[ DimX ];
236 lenY = clLengths[ DimY ];
241 // Minimum length size is 1 (X, Y and Z lengths are checked in this branch)
242 if( clLengths[ DimX ] == 0 || clLengths[ DimY ] == 0 || clLengths[ DimZ ] == 0 )
243 return CLFFT_INVALID_ARG_VALUE;
245 if( !IsASupportedLength( clLengths[ DimX ] ) || !IsASupportedLength( clLengths[ DimY ] ) ||
246 !IsASupportedLength( clLengths[ DimZ ] ))
248 return CLFFT_NOTIMPLEMENTED;
251 lenX = clLengths[ DimX ];
252 lenY = clLengths[ DimY ];
253 lenZ = clLengths[ DimZ ];
257 return CLFFT_NOTIMPLEMENTED;
// NOTE(review): the message below says insertPlan although createPlan is the call being checked
261 FFTPlan *fftPlan = NULL;
262 FFTRepo& fftRepo = FFTRepo::getInstance( );
263 OPENCL_V( fftRepo.createPlan( plHandle, fftPlan ), _T( "fftRepo.insertPlan failed" ) );
265 fftPlan->baked = false;
267 fftPlan->placeness = CLFFT_INPLACE;
268 fftPlan->inputLayout = CLFFT_COMPLEX_INTERLEAVED;
269 fftPlan->outputLayout = CLFFT_COMPLEX_INTERLEAVED;
270 fftPlan->precision = CLFFT_SINGLE;
271 fftPlan->context = context;
272 fftPlan->forwardScale = 1.0;
273 fftPlan->backwardScale = 1.0 / static_cast< double >( lenX * lenY * lenZ );
274 fftPlan->batchsize = 1;
275 fftPlan->gen = Stockham; //default setting
277 OPENCL_V(fftPlan->SetEnvelope(), _T("SetEnvelope failed"));
// the plan holds its own reference on the context; the return value of clRetainContext is not checked
279 clRetainContext( fftPlan->context );
282 /////////////////////////////////////////////////////////////////
283 // Detect OpenCL devices
284 /////////////////////////////////////////////////////////////////
285 // First, get the size of device list data (in bytes)
286 size_t deviceListSize;
287 OPENCL_V( ::clGetContextInfo( context, CL_CONTEXT_DEVICES, 0, NULL, &deviceListSize ),
288 "Getting device array size ( ::clGetContextInfo() )" );
290 // Allocate memory for the devices (byte size divided by the element size gives the device count)
291 fftPlan->devices.resize( deviceListSize / sizeof( cl_device_id ) );
293 /* Now, get the device list data */
294 OPENCL_V( ::clGetContextInfo( context, CL_CONTEXT_DEVICES, deviceListSize, &fftPlan->devices[ 0 ], NULL ),
295 "Getting device array ( ::clGetContextInfo() )" );
298 // Need to devise a way to generate better names
299 tstringstream tstream;
300 tstream << _T( "plan_" ) << *plHandle;
// look the plan up again to obtain its lock object, which is then named "plan_<handle>"
302 lockRAII* planLock = NULL;
303 OPENCL_V( fftRepo.getPlan( *plHandle, fftPlan, planLock ), _T( "fftRepo.getPlan failed" ) );
304 planLock->setName( tstream.str( ) );
306 // Set the lengths and default strides/pitches depending on the dim that the user passes to us
// X only: unit stride, distance between batches = lenX
311 fftPlan->length.push_back( lenX );
312 fftPlan->inStride.push_back( 1 );
313 fftPlan->outStride.push_back( 1 );
314 fftPlan->iDist = lenX;
315 fftPlan->oDist = lenX;
// X and Y: rows of lenX elements are contiguous, distance between batches = lenX*lenY
320 fftPlan->length.push_back( lenX );
321 fftPlan->length.push_back( lenY );
322 fftPlan->inStride.push_back( 1 );
323 fftPlan->inStride.push_back( lenX );
324 fftPlan->outStride.push_back( 1 );
325 fftPlan->outStride.push_back( lenX );
326 fftPlan->iDist = lenX*lenY;
327 fftPlan->oDist = lenX*lenY;
// X, Y and Z: planes of lenX*lenY elements are contiguous, distance between batches = lenX*lenY*lenZ
332 fftPlan->length.push_back( lenX );
333 fftPlan->length.push_back( lenY );
334 fftPlan->length.push_back( lenZ );
335 fftPlan->inStride.push_back( 1 );
336 fftPlan->inStride.push_back( lenX );
337 fftPlan->inStride.push_back( lenX*lenY );
338 fftPlan->outStride.push_back( 1 );
339 fftPlan->outStride.push_back( lenX );
340 fftPlan->outStride.push_back( lenX*lenY );
341 fftPlan->iDist = lenX*lenY*lenZ;
342 fftPlan->oDist = lenX*lenY*lenZ;
// remember the handle inside the plan itself
347 fftPlan->plHandle = *plHandle;
349 return CLFFT_SUCCESS;
352 // This external entry-point should not be called from within the library. Use clfftCreateDefaultPlanInternal instead.
// It forwards all arguments to clfftCreateDefaultPlanInternal and, when that succeeds, additionally
// marks the new plan as a user (top-level) plan by setting userPlan to true.
// NOTE(review): the final return is not visible in this excerpt -- presumably ret is returned; confirm.
353 clfftStatus clfftCreateDefaultPlan( clfftPlanHandle* plHandle, cl_context context, const clfftDim dim,
354 const size_t* clLengths )
356 clfftStatus ret = clfftCreateDefaultPlanInternal(plHandle, context, dim, clLengths);
358 if(ret == CLFFT_SUCCESS)
// fetch the plan that was just created so that it can be flagged
360 FFTRepo& fftRepo = FFTRepo::getInstance( );
361 FFTPlan *fftPlan = NULL;
362 lockRAII* planLock = NULL;
363 OPENCL_V( fftRepo.getPlan( *plHandle, fftPlan, planLock ), _T( "fftRepo.getPlan failed" ) );
365 fftPlan->userPlan = true;
// getKernelName: builds the kernel name string "clfft.kernel." followed by the generator name and
// returns it by value.
//   gen          - generator type; selects the name appended after the prefix
//   plHandle     - plan handle that can be appended to the name
//   withPlHandle - NOTE(review): the condition guarding the plHandle append is not visible in this
//                  excerpt -- presumably the handle is appended only when this flag is true; confirm.
372 std::string getKernelName(const clfftGenerators gen, const clfftPlanHandle plHandle, bool withPlHandle)
374 // Logic to define a sensible filename
375 const std::string kernelPrefix( "clfft.kernel." );
376 std::string generatorName;
377 std::stringstream kernelPath;
// note: Transpose_GCN and Transpose_SQUARE both map to the same name "Transpose"
382 case Stockham: generatorName = "Stockham"; break;
383 case Transpose_GCN: generatorName = "Transpose"; break;
384 case Transpose_SQUARE: generatorName = "Transpose"; break;
385 case Transpose_NONSQUARE: generatorName = "TransposeNonSquare"; break;
386 case Copy: generatorName = "Copy"; break;
390 kernelPath << kernelPrefix << generatorName ;
393 kernelPath << plHandle;
397 return kernelPath.str();
// selectAction: creates the action object that matches fftPlan->gen and stores it in action.
//   fftPlan      - plan being baked; its gen field selects the action type, its plHandle is passed on
//   action       - output; receives a pointer to the newly allocated action
//   commQueueFFT - command queue pointer; it is dereferenced without a NULL check
// Returns CLFFT_SUCCESS when an action was created. A generator with no matching case is reported
// through OPENCL_V with CLFFT_NOTIMPLEMENTED.
// NOTE(review): the action is allocated with a raw new and who deletes it is not visible here; also
// confirm what happens to the allocated object when OPENCL_V reports a constructor error.
401 clfftStatus selectAction(FFTPlan * fftPlan, FFTAction *& action, cl_command_queue* commQueueFFT)
403 // set the action; we are baking a leaf plan
406 switch (fftPlan->gen)
410 // Instantiate the default stockham generator
411 action = new FFTGeneratedStockhamAction (fftPlan->plHandle, fftPlan, *commQueueFFT, err);
412 OPENCL_V( err, "FFTGeneratedStockhamAction() failed");
// transpose action
418 action = new FFTGeneratedTransposeGCNAction(fftPlan->plHandle, fftPlan, *commQueueFFT, err);
419 OPENCL_V( err, "FFTGeneratedTransposeGCNAction() failed");
// copy action
426 action = new FFTGeneratedCopyAction (fftPlan->plHandle, fftPlan, *commQueueFFT, err);
427 OPENCL_V( err, "FFTGeneratedCopyAction() failed");
// no action type is available for the requested generator
434 OPENCL_V( CLFFT_NOTIMPLEMENTED, "selectAction() failed");
438 return CLFFT_SUCCESS;
// PrecisionWidth: returns 1 for CLFFT_SINGLE and 2 for CLFFT_DOUBLE.
// Elsewhere in this file the result is used as a divisor for length limits
// (e.g. 262144/PrecisionWidth(precision)) and to pick the single-precision split table.
// Any other precision value trips the assert (when asserts are enabled) and yields 1; this includes
// CLFFT_DOUBLE_FAST, which clfftBakePlan accepts in its fp64 check.
442 inline size_t PrecisionWidth(clfftPrecision pr)
446 case CLFFT_SINGLE: return 1;
447 case CLFFT_DOUBLE: return 2;
448 default: assert(false); return 1;
454 clfftStatus clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_command_queue* commQueueFFT,
455 void (CL_CALLBACK *pfn_notify)( clfftPlanHandle plHandle, void *user_data ), void* user_data )
457 // We do not currently support multi-GPU transforms
459 return CLFFT_NOTIMPLEMENTED;
461 // Notification mechanism is not set up yet; BakePlan can be called recursively to decompose higher dimension FFT's into
462 // arrays of 1d transforms, and this must be implemented to make only a single callback to the user.
463 if( pfn_notify != NULL )
464 return CLFFT_NOTIMPLEMENTED;
466 if( user_data != NULL )
467 return CLFFT_NOTIMPLEMENTED;
469 FFTRepo& fftRepo = FFTRepo::getInstance( );
470 FFTPlan* fftPlan = NULL;
471 lockRAII* planLock = NULL;
473 OPENCL_V( fftRepo.getPlan( plHandle, fftPlan, planLock ), _T( "fftRepo.getPlan failed" ) );
474 scopedLock sLock( *planLock, _T( "clfftBakePlan" ) );
476 // if we have already baked the plan and nothing has changed since, we're done here
477 if( fftPlan->baked == true )
479 return CLFFT_SUCCESS;
482 // Store the device for which we are baking
483 clGetCommandQueueInfo(*commQueueFFT, CL_QUEUE_DEVICE, sizeof(cl_device_id), &fftPlan->bakeDevice, NULL);
485 //find the largest length in any dimension
486 size_t maxLengthInAnyDim = 1;
489 case CLFFT_3D: maxLengthInAnyDim = maxLengthInAnyDim > fftPlan->length[DimZ] ? maxLengthInAnyDim : fftPlan->length[DimZ];
490 case CLFFT_2D: maxLengthInAnyDim = maxLengthInAnyDim > fftPlan->length[DimY] ? maxLengthInAnyDim : fftPlan->length[DimY];
491 case CLFFT_1D: maxLengthInAnyDim = maxLengthInAnyDim > fftPlan->length[DimX] ? maxLengthInAnyDim : fftPlan->length[DimX];
494 const bool rc = (fftPlan->inputLayout == CLFFT_REAL) || (fftPlan->outputLayout == CLFFT_REAL);
496 // upper bounds on transform lengths - address this in the next release
497 size_t SP_MAX_LEN = 1 << 24;
498 size_t DP_MAX_LEN = 1 << 22;
499 if((fftPlan->precision == CLFFT_SINGLE) && (maxLengthInAnyDim > SP_MAX_LEN) && rc) return CLFFT_NOTIMPLEMENTED;
500 if((fftPlan->precision == CLFFT_DOUBLE) && (maxLengthInAnyDim > DP_MAX_LEN) && rc) return CLFFT_NOTIMPLEMENTED;
503 // release buffers, as these will be created only in EnqueueTransform
504 if( NULL != fftPlan->intBuffer ) { OPENCL_V( clReleaseMemObject( fftPlan->intBuffer ), _T( "Failed to release internal temporary buffer" ) ); fftPlan->intBuffer = NULL; }
505 if( NULL != fftPlan->intBufferRC ) { OPENCL_V( clReleaseMemObject( fftPlan->intBufferRC ), _T( "Failed to release internal temporary buffer" ) ); fftPlan->intBufferRC = NULL; }
506 if( NULL != fftPlan->intBufferC2R ) { OPENCL_V( clReleaseMemObject( fftPlan->intBufferC2R ), _T( "Failed to release internal temporary buffer" ) ); fftPlan->intBufferC2R = NULL; }
509 if( fftPlan->userPlan ) // confirm it is top-level plan (user plan)
511 if(fftPlan->placeness == CLFFT_INPLACE)
513 if( (fftPlan->inputLayout == CLFFT_HERMITIAN_PLANAR) || (fftPlan->outputLayout == CLFFT_HERMITIAN_PLANAR) )
514 return CLFFT_INVALID_PLAN;
517 // Make sure strides & distance are same for C-C transforms
518 if(fftPlan->placeness == CLFFT_INPLACE)
520 if( (fftPlan->inputLayout != CLFFT_REAL) && (fftPlan->outputLayout != CLFFT_REAL) )
523 for(size_t i=0; i<fftPlan->dim; i++)
524 if(fftPlan->inStride[i] != fftPlan->outStride[i])
525 return CLFFT_INVALID_PLAN;
528 if(fftPlan->iDist != fftPlan->oDist)
529 return CLFFT_INVALID_PLAN;
534 if(fftPlan->gen == Copy)
537 fftPlan->action = new FFTGeneratedCopyAction(plHandle, fftPlan, *commQueueFFT, err);
538 OPENCL_V( err, _T( "FFTGeneratedCopyAction() failed" ) );
539 fftPlan->baked = true;
540 return CLFFT_SUCCESS;
544 if( fftPlan->userPlan )
546 // If the user specifies double precision, check that the device supports double precision first
547 if( fftPlan->precision == CLFFT_DOUBLE || fftPlan->precision == CLFFT_DOUBLE_FAST )
549 clfftStatus retAmdFp64 = checkDevExt( "cl_amd_fp64", fftPlan->bakeDevice );
550 if( retAmdFp64 != CLFFT_SUCCESS )
552 // If AMD's extension is not supported, check for the Khronos extension
553 clfftStatus retKhrFp64 = checkDevExt( "cl_khr_fp64", fftPlan->bakeDevice );
554 if( retKhrFp64 != CLFFT_SUCCESS )
560 // Compress the plan by discarding length '1' dimensions
561 // decision to pick generator
562 if( fftPlan->userPlan && !rc ) // confirm it is top-level plan (user plan)
564 size_t dmnsn = fftPlan->dim;
565 bool pow2flag = true;
567 // switch case flows with no 'break' statements
572 if(fftPlan->length[DimZ] == 1)
575 fftPlan-> inStride.erase(fftPlan-> inStride.begin() + 2);
576 fftPlan->outStride.erase(fftPlan->outStride.begin() + 2);
577 fftPlan-> length.erase(fftPlan-> length.begin() + 2);
581 if( !IsPo2(fftPlan->length[DimZ])) pow2flag=false;
585 if(fftPlan->length[DimY] == 1)
588 fftPlan-> inStride.erase(fftPlan-> inStride.begin() + 1);
589 fftPlan->outStride.erase(fftPlan->outStride.begin() + 1);
590 fftPlan-> length.erase(fftPlan-> length.begin() + 1);
594 if( !IsPo2(fftPlan->length[DimY])) pow2flag=false;
599 if( (fftPlan->length[DimX] == 1) && (dmnsn > 1) )
602 fftPlan-> inStride.erase(fftPlan-> inStride.begin());
603 fftPlan->outStride.erase(fftPlan->outStride.begin());
604 fftPlan-> length.erase(fftPlan-> length.begin());
608 if( !IsPo2(fftPlan->length[DimX])) pow2flag=false;
612 fftPlan->dim = (clfftDim)dmnsn;
615 // first time check transposed
616 if (fftPlan->transposed != CLFFT_NOTRANSPOSE && fftPlan->dim != CLFFT_2D &&
617 fftPlan->dim == fftPlan->length.size())
618 return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
620 // The largest vector we can transform in a single pass
621 // depends on the GPU caps -- especially the amount of LDS
624 size_t Large1DThreshold = 0;
627 OPENCL_V(fftPlan->GetMax1DLength (&Large1DThreshold), _T("GetMax1DLength failed"));
628 BUG_CHECK (Large1DThreshold > 1);
630 // Verify that the data passed to us is packed
631 switch( fftPlan->dim )
635 if ( !Is1DPossible(fftPlan->length[0], Large1DThreshold) )
637 size_t clLengths[] = { 1, 1, 0 };
638 size_t in_1d, in_x, count;
640 BUG_CHECK (IsPo2 (Large1DThreshold))
643 if( IsPo2(fftPlan->length[0]) )
645 // Enable block compute under these conditions
646 if( (fftPlan->inStride[0] == 1) && (fftPlan->outStride[0] == 1) && !rc
647 && (fftPlan->length[0] <= 262144/PrecisionWidth(fftPlan->precision)) && (fftPlan->length.size() <= 1)
648 && (!clfftGetRequestLibNoMemAlloc() || (fftPlan->placeness == CLFFT_OUTOFPLACE)) )
650 fftPlan->blockCompute = true;
652 if(1 == PrecisionWidth(fftPlan->precision))
654 switch(fftPlan->length[0])
656 case 8192: clLengths[1] = 64; break;
657 case 16384: clLengths[1] = 64; break;
658 case 32768: clLengths[1] = 128; break;
659 case 65536: clLengths[1] = 256; break;
660 case 131072: clLengths[1] = 64; break;
661 case 262144: clLengths[1] = 64; break;
662 case 524288: clLengths[1] = 256; break;
663 case 1048576: clLengths[1] = 256; break;
664 default: assert(false);
669 switch(fftPlan->length[0])
671 case 4096: clLengths[1] = 64; break;
672 case 8192: clLengths[1] = 64; break;
673 case 16384: clLengths[1] = 64; break;
674 case 32768: clLengths[1] = 128; break;
675 case 65536: clLengths[1] = 64; break;
676 case 131072: clLengths[1] = 64; break;
677 case 262144: clLengths[1] = 128; break;
678 case 524288: clLengths[1] = 256; break;
679 default: assert(false);
685 if( clfftGetRequestLibNoMemAlloc() && !rc && (fftPlan->placeness == CLFFT_INPLACE) )
687 in_x = BitScanF(fftPlan->length[0]);
689 clLengths[1] = (size_t)1 << in_x;
691 else if( fftPlan->length[0] > (Large1DThreshold * Large1DThreshold) )
693 clLengths[1] = fftPlan->length[0] / Large1DThreshold;
697 in_1d = BitScanF (Large1DThreshold); // this is log2(LARGE1D_THRESHOLD)
698 in_x = BitScanF (fftPlan->length[0]); // this is log2(length)
699 BUG_CHECK (in_1d > 0)
701 if (count*in_1d < in_x)
704 in_1d = in_x / count;
705 if (in_1d * count < in_x) in_1d++;
707 clLengths[1] = (size_t)1 << in_1d;
713 // This array must be kept sorted in the ascending order
715 size_t supported[] = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 18, 20, 21, 22, 24,
716 25, 26, 27, 28, 30, 32, 33, 35, 36, 39, 40, 42, 44, 45, 48, 49, 50, 52, 54,
717 55, 56, 60, 63, 64, 65, 66, 70, 72, 75, 77, 78, 80, 81, 84, 88, 90, 91, 96,
718 98, 99, 100, 104, 105, 108, 110, 112, 117, 120, 121, 125, 126, 128, 130, 132,
719 135, 140, 143, 144, 147, 150, 154, 156, 160, 162, 165, 168, 169, 175, 176,
720 180, 182, 189, 192, 195, 196, 198, 200, 208, 210, 216, 220, 224, 225, 231,
721 234, 240, 242, 243, 245, 250, 252, 256, 260, 264, 270, 273, 275, 280, 286,
722 288, 294, 297, 300, 308, 312, 315, 320, 324, 325, 330, 336, 338, 343, 350,
723 351, 352, 360, 363, 364, 375, 378, 384, 385, 390, 392, 396, 400, 405, 416,
724 420, 429, 432, 440, 441, 448, 450, 455, 462, 468, 480, 484, 486, 490, 495,
725 500, 504, 507, 512, 520, 525, 528, 539, 540, 546, 550, 560, 567, 572, 576,
726 585, 588, 594, 600, 605, 616, 624, 625, 630, 637, 640, 648, 650, 660, 672,
727 675, 676, 686, 693, 700, 702, 704, 715, 720, 726, 728, 729, 735, 750, 756,
728 768, 770, 780, 784, 792, 800, 810, 819, 825, 832, 840, 845, 847, 858, 864,
729 875, 880, 882, 891, 896, 900, 910, 924, 936, 945, 960, 968, 972, 975, 980,
730 990, 1000, 1001, 1008, 1014, 1024, 1029, 1040, 1050, 1053, 1056, 1078, 1080,
731 1089, 1092, 1100, 1120, 1125, 1134, 1144, 1152, 1155, 1170, 1176, 1183, 1188,
732 1200, 1210, 1215, 1225, 1232, 1248, 1250, 1260, 1274, 1280, 1287, 1296, 1300,
733 1320, 1323, 1331, 1344, 1350, 1352, 1365, 1372, 1375, 1386, 1400, 1404, 1408,
734 1430, 1440, 1452, 1456, 1458, 1470, 1485, 1500, 1512, 1521, 1536, 1540, 1560,
735 1568, 1573, 1575, 1584, 1600, 1617, 1620, 1625, 1638, 1650, 1664, 1680, 1690,
736 1694, 1701, 1715, 1716, 1728, 1750, 1755, 1760, 1764, 1782, 1792, 1800, 1815,
737 1820, 1848, 1859, 1872, 1875, 1890, 1911, 1920, 1925, 1936, 1944, 1950, 1960,
738 1980, 2000, 2002, 2016, 2025, 2028, 2048, 2058, 2079, 2080, 2100, 2106, 2112,
739 2145, 2156, 2160, 2178, 2184, 2187, 2197, 2200, 2205, 2240, 2250, 2268, 2275,
740 2288, 2304, 2310, 2340, 2352, 2366, 2376, 2400, 2401, 2420, 2430, 2450, 2457,
741 2464, 2475, 2496, 2500, 2520, 2535, 2541, 2548, 2560, 2574, 2592, 2600, 2625,
742 2640, 2646, 2662, 2673, 2688, 2695, 2700, 2704, 2730, 2744, 2750, 2772, 2800,
743 2808, 2816, 2835, 2860, 2880, 2904, 2912, 2916, 2925, 2940, 2970, 3000, 3003,
744 3024, 3025, 3042, 3072, 3080, 3087, 3120, 3125, 3136, 3146, 3150, 3159, 3168,
745 3185, 3200, 3234, 3240, 3250, 3267, 3276, 3300, 3328, 3360, 3375, 3380, 3388,
746 3402, 3430, 3432, 3456, 3465, 3500, 3510, 3520, 3528, 3549, 3564, 3575, 3584,
747 3600, 3630, 3640, 3645, 3675, 3696, 3718, 3744, 3750, 3773, 3780, 3822, 3840,
748 3850, 3861, 3872, 3888, 3900, 3920, 3960, 3969, 3993, 4000, 4004, 4032, 4050,
751 size_t lenSupported = sizeof(supported)/sizeof(supported[0]);
752 size_t maxFactoredLength = (supported[lenSupported-1] < Large1DThreshold) ? supported[lenSupported-1] : Large1DThreshold;
754 size_t halfPowerLength = (size_t)1 << ( (StockhamGenerator::CeilPo2(fftPlan->length[0]) + 1) / 2 );
755 size_t factoredLengthStart = (halfPowerLength < maxFactoredLength) ? halfPowerLength : maxFactoredLength;
757 size_t indexStart = 0;
758 while(supported[indexStart] < factoredLengthStart) indexStart++;
760 for(size_t i = indexStart; i >= 1; i--)
762 if( fftPlan->length[0] % supported[i] == 0 )
764 if (Is1DPossible(supported[i], Large1DThreshold))
766 clLengths[1] = supported[i];
772 // add some special cases
774 if (fftPlan->length[0] == 10000)
775 clLengths[1] = 100;//100 x 100
776 if (fftPlan->length[0] == 100000)
777 clLengths[1] = 100;//100 x 1,000
778 if (fftPlan->length[0] == 10000000)
779 clLengths[1] = 1000;//1,000 x 10,000
780 if (fftPlan->length[0] == 100000000)
781 clLengths[1] = 10000;//10,000 x 10,000
782 if (fftPlan->length[0] == 1000000000)
783 clLengths[1] = 10000;//10,000 x 100,000
785 if (fftPlan->length[0] == 3099363912)
786 clLengths[1] = 78732;//39366 x 78732
787 if (fftPlan->length[0] == 39366)
788 clLengths[1] = 81;//81*486
789 if (fftPlan->length[0] == 78732)
790 clLengths[1] = 162;//162*486
791 if (fftPlan->length[0] == 354294)
794 size_t threshold = 4096;
795 if (fftPlan->precision == CLFFT_DOUBLE)
797 if (clfftGetRequestLibNoMemAlloc() &&
798 fftPlan->placeness == CLFFT_INPLACE &&
799 (fftPlan->inputLayout == fftPlan->outputLayout)
800 && fftPlan->length[0] > threshold)
802 //for inplace fft with inplace transpose, the split logic is different
803 vector<vector<size_t> > splitNums;
804 bool implemented = split1D_for_inplace(fftPlan->length[0], splitNums, fftPlan->precision, threshold);
806 clLengths[1] = splitNums[0][0];
809 clLengths[0] = fftPlan->length[0]/clLengths[1];
811 // Start of block where transposes are generated; 1D FFT
812 while (1 && (fftPlan->inputLayout != CLFFT_REAL) && (fftPlan->outputLayout != CLFFT_REAL))
814 if (fftPlan->length[0] <= Large1DThreshold) break;
816 if (fftPlan->inStride[0] != 1 || fftPlan->outStride[0] != 1) break;
818 if ( IsPo2(fftPlan->length[0]) &&
819 (fftPlan->length[0] <= 262144/PrecisionWidth(fftPlan->precision)) && (fftPlan->length.size() <= 1) &&
820 (!clfftGetRequestLibNoMemAlloc() || (fftPlan->placeness == CLFFT_OUTOFPLACE)) ) break;
822 if ( clLengths[0]<=32 && clLengths[1]<=32) break;
825 size_t biggerDim = clLengths[0] > clLengths[1] ? clLengths[0] : clLengths[1];
826 size_t smallerDim = biggerDim == clLengths[0] ? clLengths[1] : clLengths[0];
828 if( (smallerDim % 64 == 0) || (biggerDim % 64 == 0) )
831 clfftGenerators transGen = Transpose_GCN;
833 size_t dim_ratio = biggerDim / smallerDim;
834 size_t dim_residue = biggerDim % smallerDim;
835 // If this is an in-place transform the
836 // input and output layout, dimensions and strides
837 // *MUST* be the same.
839 bool inStrideEqualsOutStride = true;
840 for (size_t u = fftPlan->inStride.size(); u-- > 0; ) {
841 if (fftPlan->inStride[u] != fftPlan->outStride[u])
843 inStrideEqualsOutStride = false;
847 //packed data is required for inplace transpose
848 bool isDataPacked = true;
849 for (size_t u = 0; u < fftPlan->inStride.size(); u++)
853 if (fftPlan->inStride[0] == 1)
857 isDataPacked = false;
863 size_t packDataSize = 1;
864 for (size_t i = 0; i < u; i++)
865 packDataSize *= fftPlan->length[i];
866 if (fftPlan->inStride[u] == packDataSize)
870 isDataPacked = false;
875 if (clfftGetRequestLibNoMemAlloc() &&
877 ((dim_ratio % 2 == 0) ||
878 (dim_ratio % 3 == 0) ||
879 (dim_ratio % 5 == 0) ||
880 (dim_ratio % 10 == 0)) &&
881 fftPlan->placeness == CLFFT_INPLACE &&
882 (fftPlan->inputLayout == fftPlan->outputLayout) &&
883 (inStrideEqualsOutStride) && (isDataPacked))
886 fftPlan->allOpsInplace = true;
887 transGen = Transpose_NONSQUARE;
888 //std::cout << "Transpose_NONSQUARE" << std::endl;
891 if( clfftGetRequestLibNoMemAlloc() &&
892 (clLengths[0] == clLengths[1]) &&
893 fftPlan->placeness == CLFFT_INPLACE )
896 fftPlan->allOpsInplace = true;
897 transGen = Transpose_SQUARE;
900 if (fftPlan->tmpBufSize != 0)
903 if ( (fftPlan->tmpBufSize==0 ) && !fftPlan->allOpsInplace)
905 fftPlan->tmpBufSize = (smallerDim + padding) * biggerDim *
906 fftPlan->batchsize * fftPlan->ElementSize();
908 for (size_t index = 1; index < fftPlan->length.size(); index++)
910 fftPlan->tmpBufSize *= fftPlan->length[index];
915 //Input --> tmp buffer
916 OPENCL_V(clfftCreateDefaultPlanInternal( &fftPlan->planTX, fftPlan->context, CLFFT_2D, clLengths ),
917 _T( "CreateDefaultPlan Large1d transpose 1 failed" ) );
919 FFTPlan* trans1Plan = NULL;
920 lockRAII* trans1Lock = NULL;
921 OPENCL_V( fftRepo.getPlan( fftPlan->planTX, trans1Plan, trans1Lock ), _T( "fftRepo.getPlan failed" ) );
923 trans1Plan->placeness = fftPlan->allOpsInplace ? CLFFT_INPLACE : CLFFT_OUTOFPLACE;
924 trans1Plan->precision = fftPlan->precision;
925 trans1Plan->tmpBufSize = 0;
926 trans1Plan->batchsize = fftPlan->batchsize;
927 trans1Plan->envelope = fftPlan->envelope;
928 trans1Plan->inputLayout = fftPlan->inputLayout;
929 trans1Plan->outputLayout = fftPlan->allOpsInplace ? fftPlan->inputLayout : CLFFT_COMPLEX_INTERLEAVED;
930 trans1Plan->inStride[0] = fftPlan->inStride[0];
931 trans1Plan->inStride[1] = clLengths[0];
932 trans1Plan->outStride[0] = 1;
933 trans1Plan->outStride[1] = clLengths[1] + padding;
934 trans1Plan->iDist = fftPlan->iDist;
935 trans1Plan->oDist = clLengths[0] * trans1Plan->outStride[1];
936 trans1Plan->gen = transGen;
937 trans1Plan->transflag = true;
939 if (trans1Plan->gen == Transpose_NONSQUARE || trans1Plan->gen == Transpose_SQUARE)// inplace transpose
941 for (size_t index = 1; index < fftPlan->length.size(); index++)
943 //trans1Plan->length.push_back(fftPlan->length[index]);
945 replacing the line above with the two lines below since:
946 fftPlan is still 1D, thus the broken down transpose should be 2D not 3D
947 the batchSize for the transpose should increase accordingly.
948 the iDist should decrease accordingly. Push back to length will cause a 3D transpose
950 trans1Plan->batchsize = trans1Plan->batchsize * fftPlan->length[index];
951 trans1Plan->iDist = trans1Plan->iDist / fftPlan->length[index];
953 trans1Plan->inStride.push_back(fftPlan->inStride[index]);
954 trans1Plan->outStride.push_back(trans1Plan->oDist);
955 trans1Plan->oDist *= fftPlan->length[index];
960 for (size_t index = 1; index < fftPlan->length.size(); index++)
962 trans1Plan->length.push_back(fftPlan->length[index]);
964 trans1Plan->inStride.push_back(fftPlan->inStride[index]);
965 trans1Plan->outStride.push_back(trans1Plan->oDist);
966 trans1Plan->oDist *= fftPlan->length[index];
970 //Set callback data if set on top level plan
971 if (fftPlan->hasPreCallback)
973 trans1Plan->hasPreCallback = true;
974 trans1Plan->preCallback = fftPlan->preCallback;
975 trans1Plan->precallUserData = fftPlan->precallUserData;
978 OPENCL_V(clfftBakePlan(fftPlan->planTX, numQueues, commQueueFFT, NULL, NULL ),
979 _T( "BakePlan large1d trans1 plan failed" ) );
983 //size clLengths[1], batch clLengths[0], with length[0] twiddle factor multiplication
984 OPENCL_V(clfftCreateDefaultPlanInternal( &fftPlan->planX, fftPlan->context, CLFFT_1D, &clLengths[1] ),
985 _T( "CreateDefaultPlan Large1d column failed" ) );
987 FFTPlan* row1Plan = NULL;
988 lockRAII* row1Lock = NULL;
989 OPENCL_V( fftRepo.getPlan( fftPlan->planX, row1Plan, row1Lock ), _T( "fftRepo.getPlan failed" ) );
991 row1Plan->placeness = fftPlan->allOpsInplace ? CLFFT_INPLACE : CLFFT_OUTOFPLACE;
992 row1Plan->precision = fftPlan->precision;
993 row1Plan->forwardScale = 1.0f;
994 row1Plan->backwardScale = 1.0f;
995 row1Plan->tmpBufSize = 0;
996 row1Plan->batchsize = fftPlan->batchsize;
998 row1Plan->gen = fftPlan->gen;
999 row1Plan->envelope = fftPlan->envelope;
1001 // twiddling is done in row2
1002 row1Plan->large1D = 0;
1004 row1Plan->length.push_back(clLengths[0]);
1005 row1Plan->inputLayout = fftPlan->allOpsInplace ? fftPlan->inputLayout : CLFFT_COMPLEX_INTERLEAVED;
1006 row1Plan->outputLayout = fftPlan->outputLayout;
1007 row1Plan->inStride[0] = 1;
1008 row1Plan->outStride[0] = fftPlan->outStride[0];
1009 row1Plan->inStride.push_back(clLengths[1]+padding);
1010 row1Plan->outStride.push_back(clLengths[1]);
1011 row1Plan->iDist = clLengths[0] * row1Plan->inStride[1];
1012 row1Plan->oDist = fftPlan->oDist;
1014 for (size_t index = 1; index < fftPlan->length.size(); index++)
1016 row1Plan->length.push_back(fftPlan->length[index]);
1017 row1Plan->inStride.push_back(row1Plan->iDist);
1018 row1Plan->iDist *= fftPlan->length[index];
1019 row1Plan->outStride.push_back(fftPlan->outStride[index]);
1022 OPENCL_V(clfftBakePlan(fftPlan->planX, numQueues, commQueueFFT, NULL, NULL ),
1023 _T( "BakePlan large1d first row plan failed" ) );
1026 //Output --> tmp buffer
1027 clLengths[2] = clLengths[0];
1028 OPENCL_V(clfftCreateDefaultPlanInternal( &fftPlan->planTY, fftPlan->context, CLFFT_2D, &clLengths[1] ),
1029 _T( "CreateDefaultPlan Large1d transpose 2 failed" ) );
1031 FFTPlan* trans2Plan = NULL;
1032 lockRAII* trans2Lock = NULL;
1033 OPENCL_V( fftRepo.getPlan( fftPlan->planTY, trans2Plan, trans2Lock ), _T( "fftRepo.getPlan failed" ) );
1035 trans2Plan->placeness = fftPlan->allOpsInplace ? CLFFT_INPLACE : CLFFT_OUTOFPLACE;
1036 trans2Plan->precision = fftPlan->precision;
1037 trans2Plan->tmpBufSize = 0;
1038 trans2Plan->batchsize = fftPlan->batchsize;
1039 trans2Plan->envelope = fftPlan->envelope;
1040 trans2Plan->inputLayout = fftPlan->outputLayout;
1041 trans2Plan->outputLayout = fftPlan->allOpsInplace ? fftPlan->inputLayout : CLFFT_COMPLEX_INTERLEAVED;
1042 trans2Plan->inStride[0] = fftPlan->outStride[0];
1043 trans2Plan->inStride[1] = clLengths[1];
1044 trans2Plan->outStride[0] = 1;
1045 trans2Plan->outStride[1] = clLengths[0] + padding;
1046 trans2Plan->iDist = fftPlan->oDist;
1047 trans2Plan->oDist = clLengths[1] * trans2Plan->outStride[1];
1048 trans2Plan->gen = transGen;
1050 //if (transGen != Transpose_NONSQUARE)//twiddle
1051 trans2Plan->large1D = fftPlan->length[0];
1053 trans2Plan->transflag = true;
1055 if (trans2Plan->gen == Transpose_NONSQUARE || trans2Plan->gen == Transpose_SQUARE)// inplace transpose
1057 for (size_t index = 1; index < fftPlan->length.size(); index++)
1059 //trans2Plan->length.push_back(fftPlan->length[index]);
1061 replacing the line above with the two lines below since:
1062 fftPlan is still 1D, thus the broken down transpose should be 2D not 3D
1063 the batchSize for the transpose should increase accordingly.
1064 the iDist should decrease accordingly. Push back to length will cause a 3D transpose
1066 trans2Plan->batchsize = trans2Plan->batchsize * fftPlan->length[index];
1067 trans2Plan->iDist = trans2Plan->iDist / fftPlan->length[index];
1068 trans2Plan->inStride.push_back(fftPlan->outStride[index]);
1069 trans2Plan->outStride.push_back(trans2Plan->oDist);
1070 trans2Plan->oDist *= fftPlan->length[index];
1075 for (size_t index = 1; index < fftPlan->length.size(); index++)
1077 trans2Plan->length.push_back(fftPlan->length[index]);
1079 trans2Plan->inStride.push_back(fftPlan->outStride[index]);
1080 trans2Plan->outStride.push_back(trans2Plan->oDist);
1081 trans2Plan->oDist *= fftPlan->length[index];
1085 OPENCL_V(clfftBakePlan(fftPlan->planTY, numQueues, commQueueFFT, NULL, NULL ),
1086 _T( "BakePlan large1d trans2 plan failed" ) );
1090 //size clLengths[0], batch clLengths[1]
1091 OPENCL_V(clfftCreateDefaultPlanInternal( &fftPlan->planY, fftPlan->context, CLFFT_1D, &clLengths[0] ),
1092 _T( "CreateDefaultPlan Large1d second row plan failed" ) );
1094 FFTPlan* row2Plan = NULL;
1095 lockRAII* row2Lock = NULL;
1096 OPENCL_V( fftRepo.getPlan( fftPlan->planY, row2Plan, row2Lock ), _T( "fftRepo.getPlan failed" ) );
1098 row2Plan->placeness = CLFFT_INPLACE;
1099 row2Plan->precision = fftPlan->precision;
1100 row2Plan->forwardScale = fftPlan->forwardScale;
1101 row2Plan->backwardScale = fftPlan->backwardScale;
1102 row2Plan->tmpBufSize = 0;
1103 row2Plan->batchsize = fftPlan->batchsize;
1105 row2Plan->gen = fftPlan->gen;
1106 row2Plan->envelope = fftPlan->envelope;
1109 row2Plan->length.push_back(clLengths[1]);
1110 row2Plan->inputLayout = fftPlan->allOpsInplace ? fftPlan->inputLayout : CLFFT_COMPLEX_INTERLEAVED;
1111 row2Plan->outputLayout = fftPlan->allOpsInplace ? fftPlan->inputLayout : CLFFT_COMPLEX_INTERLEAVED;
1112 row2Plan->inStride[0] = 1;
1113 row2Plan->outStride[0] = 1;
1114 row2Plan->inStride.push_back(clLengths[0] + padding);
1115 row2Plan->outStride.push_back(clLengths[0] + padding);
1116 row2Plan->iDist = clLengths[1] * row2Plan->inStride[1];
1117 row2Plan->oDist = clLengths[1] * row2Plan->outStride[1];
1119 for (size_t index = 1; index < fftPlan->length.size(); index++)
1121 row2Plan->length.push_back(fftPlan->length[index]);
1122 row2Plan->inStride.push_back(row2Plan->iDist);
1123 row2Plan->outStride.push_back(row2Plan->oDist);
1124 row2Plan->iDist *= fftPlan->length[index];
1125 row2Plan->oDist *= fftPlan->length[index];
1128 //if (transGen != Transpose_NONSQUARE)//twiddle in transform
1130 // row2Plan->large1D = fftPlan->length[0];
1131 // row2Plan->twiddleFront = true;
1134 OPENCL_V(clfftBakePlan(fftPlan->planY, numQueues, commQueueFFT, NULL, NULL ),
1135 _T( "BakePlan large1d second row plan failed" ) );
1139 OPENCL_V(clfftCreateDefaultPlanInternal( &fftPlan->planTZ, fftPlan->context, CLFFT_2D, clLengths ),
1140 _T( "CreateDefaultPlan Large1d transpose 3 failed" ) );
1142 FFTPlan* trans3Plan = NULL;
1143 lockRAII* trans3Lock = NULL;
1144 OPENCL_V( fftRepo.getPlan( fftPlan->planTZ, trans3Plan, trans3Lock ), _T( "fftRepo.getPlan failed" ) );
1146 trans3Plan->placeness = fftPlan->allOpsInplace ? CLFFT_INPLACE : CLFFT_OUTOFPLACE;
1147 trans3Plan->precision = fftPlan->precision;
1148 trans3Plan->tmpBufSize = 0;
1149 trans3Plan->batchsize = fftPlan->batchsize;
1150 trans3Plan->envelope = fftPlan->envelope;
1151 trans3Plan->inputLayout = fftPlan->allOpsInplace ? fftPlan->inputLayout : CLFFT_COMPLEX_INTERLEAVED;
1152 trans3Plan->outputLayout = fftPlan->outputLayout;
1153 trans3Plan->inStride[0] = 1;
1154 trans3Plan->inStride[1] = clLengths[0] + padding;
1155 trans3Plan->outStride[0] = fftPlan->outStride[0];
1156 trans3Plan->outStride[1] = clLengths[1];
1157 trans3Plan->iDist = clLengths[1] * trans3Plan->inStride[1];
1158 trans3Plan->oDist = fftPlan->oDist;
1159 trans3Plan->gen = transGen;
1160 trans3Plan->transflag = true;
1161 trans3Plan->transOutHorizontal = true;
1164 if (trans3Plan->gen == Transpose_NONSQUARE)// inplace transpose
1166 for (size_t index = 1; index < fftPlan->length.size(); index++)
1168 //trans3Plan->length.push_back(fftPlan->length[index]);
1170 replacing the line above with the two lines below since:
1171 fftPlan is still 1D, thus the broken down transpose should be 2D not 3D
1172 the batchSize for the transpose should increase accordingly.
1173 the iDist should decrease accordingly. Push back to length will cause a 3D transpose
1175 trans3Plan->batchsize = trans3Plan->batchsize * fftPlan->length[index];
1176 //trans3Plan->iDist = trans3Plan->iDist / fftPlan->length[index];
1177 //trans3Plan->inStride.push_back(trans3Plan->iDist);
1178 trans3Plan->inStride.push_back(fftPlan->inStride[index]);
1179 //trans3Plan->iDist *= fftPlan->length[index];
1180 trans3Plan->outStride.push_back(fftPlan->outStride[index]);
1183 else if (trans3Plan->gen == Transpose_SQUARE)
1185 for (size_t index = 1; index < fftPlan->length.size(); index++)
1187 trans3Plan->batchsize = trans3Plan->batchsize * fftPlan->length[index];
1188 //trans3Plan->iDist = trans3Plan->iDist / fftPlan->length[index];
1189 //trans3Plan->inStride.push_back(trans3Plan->iDist);
1190 trans3Plan->inStride.push_back(fftPlan->inStride[index]);
1191 //trans3Plan->iDist *= fftPlan->length[index];
1192 trans3Plan->outStride.push_back(fftPlan->outStride[index]);
1197 for (size_t index = 1; index < fftPlan->length.size(); index++)
1199 trans3Plan->length.push_back(fftPlan->length[index]);
1201 trans3Plan->inStride.push_back(trans3Plan->iDist);
1202 trans3Plan->iDist *= fftPlan->length[index];
1203 trans3Plan->outStride.push_back(fftPlan->outStride[index]);
1207 //Set callback data if set on top level plan
1208 if (fftPlan->hasPostCallback)
1210 trans3Plan->hasPostCallback = true;
1211 trans3Plan->postCallbackParam = fftPlan->postCallbackParam;
1212 trans3Plan->postcallUserData = fftPlan->postcallUserData;
1215 OPENCL_V(clfftBakePlan(fftPlan->planTZ, numQueues, commQueueFFT, NULL, NULL ),
1216 _T( "BakePlan large1d trans3 plan failed" ) );
1218 fftPlan->transflag = true;
1219 fftPlan->baked = true;
1220 return CLFFT_SUCCESS;
1223 size_t length0 = clLengths[0];
1224 size_t length1 = clLengths[1];
1227 // For real transforms
1228 // Special case optimization with 5-step algorithm
1229 if( (fftPlan->inputLayout == CLFFT_REAL) && IsPo2(fftPlan->length[0])
1230 && (fftPlan->length.size() == 1)
1231 && (fftPlan->inStride[0] == 1) && (fftPlan->outStride[0] == 1)
1232 && (fftPlan->length[0] > 4096) && (fftPlan->length.size() == 1) )
1235 ARG_CHECK(clLengths[0] <= Large1DThreshold);
1238 size_t biggerDim = clLengths[0] > clLengths[1] ? clLengths[0] : clLengths[1];
1239 size_t smallerDim = biggerDim == clLengths[0] ? clLengths[1] : clLengths[0];
1241 if( (smallerDim % 64 == 0) || (biggerDim % 64 == 0) )
1245 if (fftPlan->tmpBufSize==0 )
1247 size_t Nf = (1 + smallerDim/2) * biggerDim;
1248 fftPlan->tmpBufSize = (smallerDim + padding) * biggerDim / 2;
1250 if(fftPlan->tmpBufSize < Nf)
1251 fftPlan->tmpBufSize = Nf;
1253 fftPlan->tmpBufSize *= ( fftPlan->batchsize * fftPlan->ElementSize() );
1255 for (size_t index=1; index < fftPlan->length.size(); index++)
1257 fftPlan->tmpBufSize *= fftPlan->length[index];
1261 if (fftPlan->tmpBufSizeRC==0 )
1263 fftPlan->tmpBufSizeRC = fftPlan->tmpBufSize;
1267 //Input --> tmp buffer
1268 OPENCL_V(clfftCreateDefaultPlanInternal( &fftPlan->planTX, fftPlan->context, CLFFT_2D, clLengths ),
1269 _T( "CreateDefaultPlan Large1d transpose 1 failed" ) );
1271 FFTPlan* trans1Plan = NULL;
1272 lockRAII* trans1Lock = NULL;
1273 OPENCL_V( fftRepo.getPlan( fftPlan->planTX, trans1Plan, trans1Lock ), _T( "fftRepo.getPlan failed" ) );
1275 trans1Plan->placeness = CLFFT_OUTOFPLACE;
1276 trans1Plan->precision = fftPlan->precision;
1277 trans1Plan->tmpBufSize = 0;
1278 trans1Plan->batchsize = fftPlan->batchsize;
1279 trans1Plan->envelope = fftPlan->envelope;
1280 trans1Plan->inputLayout = fftPlan->inputLayout;
1281 trans1Plan->outputLayout = CLFFT_REAL;
1282 trans1Plan->inStride[0] = fftPlan->inStride[0];
1283 trans1Plan->inStride[1] = clLengths[0];
1284 trans1Plan->outStride[0] = 1;
1285 trans1Plan->outStride[1] = clLengths[1] + padding;
1286 trans1Plan->iDist = fftPlan->iDist;
1287 trans1Plan->oDist = clLengths[0] * trans1Plan->outStride[1];
1288 trans1Plan->gen = Transpose_GCN;
1289 trans1Plan->transflag = true;
1291 //Set callback data if set on top level plan
1292 if (fftPlan->hasPreCallback)
1294 trans1Plan->hasPreCallback = true;
1295 trans1Plan->preCallback = fftPlan->preCallback;
1296 trans1Plan->precallUserData = fftPlan->precallUserData;
1299 OPENCL_V(clfftBakePlan(fftPlan->planTX, numQueues, commQueueFFT, NULL, NULL ),
1300 _T( "BakePlan large1d trans1 plan failed" ) );
1304 //size clLengths[1], batch clLengths[0]; no twiddle factor multiplication in this pass (large1D is 0, twiddling is done in row2)
1305 OPENCL_V(clfftCreateDefaultPlanInternal( &fftPlan->planX, fftPlan->context, CLFFT_1D, &clLengths[1] ),
1306 _T( "CreateDefaultPlan Large1d column failed" ) );
1308 FFTPlan* row1Plan = NULL;
1309 lockRAII* row1Lock = NULL;
1310 OPENCL_V( fftRepo.getPlan( fftPlan->planX, row1Plan, row1Lock ), _T( "fftRepo.getPlan failed" ) );
1312 row1Plan->placeness = CLFFT_OUTOFPLACE;
1313 row1Plan->precision = fftPlan->precision;
1314 row1Plan->forwardScale = 1.0f;
1315 row1Plan->backwardScale = 1.0f;
1316 row1Plan->tmpBufSize = 0;
1317 row1Plan->batchsize = fftPlan->batchsize;
1319 row1Plan->gen = fftPlan->gen;
1320 row1Plan->envelope = fftPlan->envelope;
1322 // twiddling is done in row2
1323 row1Plan->large1D = 0;
1325 row1Plan->length.push_back(clLengths[0]);
1326 row1Plan->inputLayout = CLFFT_REAL;
1327 row1Plan->outputLayout = CLFFT_HERMITIAN_INTERLEAVED;
1328 row1Plan->inStride[0] = 1;
1329 row1Plan->outStride[0] = 1;
1330 row1Plan->inStride.push_back(clLengths[1]+padding);
1331 row1Plan->outStride.push_back(1 + clLengths[1]/2);
1332 row1Plan->iDist = clLengths[0] * row1Plan->inStride[1];
1333 row1Plan->oDist = clLengths[0] * row1Plan->outStride[1];
1336 OPENCL_V(clfftBakePlan(fftPlan->planX, numQueues, commQueueFFT, NULL, NULL ),
1337 _T( "BakePlan large1d first row plan failed" ) );
1340 //Output --> tmp buffer
1341 clLengths[2] = clLengths[0];
1342 OPENCL_V(clfftCreateDefaultPlanInternal( &fftPlan->planTY, fftPlan->context, CLFFT_2D, &clLengths[1] ),
1343 _T( "CreateDefaultPlan Large1d transpose 2 failed" ) );
1345 FFTPlan* trans2Plan = NULL;
1346 lockRAII* trans2Lock = NULL;
1347 OPENCL_V( fftRepo.getPlan( fftPlan->planTY, trans2Plan, trans2Lock ), _T( "fftRepo.getPlan failed" ) );
1349 trans2Plan->transflag = true;
1351 size_t transLengths[2];
1352 transLengths[0] = 1 + clLengths[1]/2;
1353 transLengths[1] = clLengths[0];
1354 OPENCL_V(clfftSetPlanLength( fftPlan->planTY, CLFFT_2D, transLengths ),
1355 _T( "clfftSetPlanLength for planTY transpose failed" ) );
1359 trans2Plan->placeness = CLFFT_OUTOFPLACE;
1360 trans2Plan->precision = fftPlan->precision;
1361 trans2Plan->tmpBufSize = 0;
1362 trans2Plan->batchsize = fftPlan->batchsize;
1363 trans2Plan->envelope = fftPlan->envelope;
1364 trans2Plan->inputLayout = CLFFT_COMPLEX_INTERLEAVED;
1365 trans2Plan->outputLayout = CLFFT_COMPLEX_INTERLEAVED;
1366 trans2Plan->inStride[0] = 1;
1367 trans2Plan->inStride[1] = 1 + clLengths[1]/2;
1368 trans2Plan->outStride[0] = 1;
1369 trans2Plan->outStride[1] = clLengths[0];
1370 trans2Plan->iDist = clLengths[0] * trans2Plan->inStride[1];
1371 trans2Plan->oDist = (1 + clLengths[1]/2) * trans2Plan->outStride[1];
1372 trans2Plan->gen = Transpose_GCN;
1373 trans2Plan->transflag = true;
1374 trans2Plan->transOutHorizontal = true;
1376 OPENCL_V(clfftBakePlan(fftPlan->planTY, numQueues, commQueueFFT, NULL, NULL ),
1377 _T( "BakePlan large1d trans2 plan failed" ) );
1381 //size clLengths[0], batch clLengths[1]
1382 OPENCL_V(clfftCreateDefaultPlanInternal( &fftPlan->planY, fftPlan->context, CLFFT_1D, &clLengths[0] ),
1383 _T( "CreateDefaultPlan Large1d second row plan failed" ) );
1385 FFTPlan* row2Plan = NULL;
1386 lockRAII* row2Lock = NULL;
1387 OPENCL_V( fftRepo.getPlan( fftPlan->planY, row2Plan, row2Lock ), _T( "fftRepo.getPlan failed" ) );
1389 row2Plan->placeness = CLFFT_OUTOFPLACE;
1390 row2Plan->precision = fftPlan->precision;
1391 row2Plan->forwardScale = fftPlan->forwardScale;
1392 row2Plan->backwardScale = fftPlan->backwardScale;
1393 row2Plan->tmpBufSize = 0;
1394 row2Plan->batchsize = fftPlan->batchsize;
1396 row2Plan->gen = fftPlan->gen;
1397 row2Plan->envelope = fftPlan->envelope;
1400 row2Plan->length.push_back(1+clLengths[1]/2);
1401 row2Plan->inputLayout = CLFFT_COMPLEX_INTERLEAVED;
1402 row2Plan->outputLayout = CLFFT_COMPLEX_INTERLEAVED;
1403 row2Plan->inStride[0] = 1;
1404 row2Plan->outStride[0] = 1;
1405 row2Plan->inStride.push_back(clLengths[0]);
1406 row2Plan->outStride.push_back(1 + clLengths[0]/2);
1407 row2Plan->iDist = (1 + clLengths[1]/2) * row2Plan->inStride[1];
1408 row2Plan->oDist = clLengths[1] * row2Plan->outStride[1];
1410 row2Plan->large1D = fftPlan->length[0];
1411 row2Plan->twiddleFront = true;
1413 row2Plan->realSpecial = true;
1414 row2Plan->realSpecial_Nr = clLengths[1];
1416 OPENCL_V(clfftBakePlan(fftPlan->planY, numQueues, commQueueFFT, NULL, NULL ),
1417 _T( "BakePlan large1d second row plan failed" ) );
1421 OPENCL_V(clfftCreateDefaultPlanInternal( &fftPlan->planTZ, fftPlan->context, CLFFT_2D, clLengths ),
1422 _T( "CreateDefaultPlan Large1d transpose 3 failed" ) );
1424 FFTPlan* trans3Plan = NULL;
1425 lockRAII* trans3Lock = NULL;
1426 OPENCL_V( fftRepo.getPlan( fftPlan->planTZ, trans3Plan, trans3Lock ), _T( "fftRepo.getPlan failed" ) );
1428 trans3Plan->transflag = true;
1430 transLengths[0] = 1 + clLengths[0]/2;
1431 transLengths[1] = clLengths[1];
1432 OPENCL_V(clfftSetPlanLength( fftPlan->planTZ, CLFFT_2D, transLengths ),
1433 _T( "clfftSetPlanLength for planTZ transpose failed" ) );
1435 trans3Plan->placeness = CLFFT_OUTOFPLACE;
1436 trans3Plan->precision = fftPlan->precision;
1437 trans3Plan->tmpBufSize = 0;
1438 trans3Plan->batchsize = fftPlan->batchsize;
1439 trans3Plan->envelope = fftPlan->envelope;
1440 trans3Plan->inputLayout = CLFFT_COMPLEX_INTERLEAVED;
1441 if(fftPlan->outputLayout == CLFFT_HERMITIAN_PLANAR)
1442 trans3Plan->outputLayout = CLFFT_COMPLEX_PLANAR;
1444 trans3Plan->outputLayout = CLFFT_COMPLEX_INTERLEAVED;
1445 trans3Plan->inStride[0] = 1;
1446 trans3Plan->inStride[1] = 1 + clLengths[0]/2;
1447 trans3Plan->outStride[0] = 1;
1448 trans3Plan->outStride[1] = clLengths[1];
1449 trans3Plan->iDist = clLengths[1] * trans3Plan->inStride[1];
1450 trans3Plan->oDist = fftPlan->oDist;
1451 trans3Plan->gen = Transpose_GCN;
1452 trans3Plan->transflag = true;
1453 trans3Plan->realSpecial = true;
1454 trans3Plan->transOutHorizontal = true;
1456 //Set callback data if set on top level plan
1457 if (fftPlan->hasPostCallback)
1459 trans3Plan->hasPostCallback = true;
1460 trans3Plan->postCallbackParam = fftPlan->postCallbackParam;
1461 trans3Plan->postcallUserData = fftPlan->postcallUserData;
1464 OPENCL_V(clfftBakePlan(fftPlan->planTZ, numQueues, commQueueFFT, NULL, NULL ),
1465 _T( "BakePlan large1d trans3 plan failed" ) );
1467 fftPlan->transflag = true;
1468 fftPlan->baked = true;
1469 return CLFFT_SUCCESS;
1471 else if (fftPlan->inputLayout == CLFFT_REAL)
1473 if (fftPlan->tmpBufSizeRC == 0)
1475 fftPlan->tmpBufSizeRC = length0 * length1 *
1476 fftPlan->batchsize * fftPlan->ElementSize();
1477 for (size_t index = 1; index < fftPlan->length.size(); index++)
1479 fftPlan->tmpBufSizeRC *= fftPlan->length[index];
1483 // column FFT, size clLengths[1], batch clLengths[0], with length[0] twiddle factor multiplication
1484 // transposed output
1485 OPENCL_V(clfftCreateDefaultPlanInternal(&fftPlan->planX, fftPlan->context, CLFFT_1D, &clLengths[1]),
1486 _T("CreateDefaultPlan Large1d column failed"));
1488 FFTPlan* colTPlan = NULL;
1489 lockRAII* colLock = NULL;
1490 OPENCL_V(fftRepo.getPlan(fftPlan->planX, colTPlan, colLock), _T("fftRepo.getPlan failed"));
1492 // The current plan is to create an intermediate buffer that is packed and interleaved.
1493 // This is a column FFT: the distance between the first elements of consecutive FFTs is the distance
1494 // between the first two elements in the original buffer, like a transpose of the matrix.
1495 // We need to pass clLengths[0] and the input stride to the kernel, so the kernel can tell the difference.
1497 // this part is common to both passes
1498 colTPlan->placeness = CLFFT_OUTOFPLACE;
1499 colTPlan->precision = fftPlan->precision;
1500 colTPlan->forwardScale = 1.0f;
1501 colTPlan->backwardScale = 1.0f;
1502 colTPlan->tmpBufSize = 0;
1503 colTPlan->batchsize = fftPlan->batchsize;
1505 colTPlan->gen = fftPlan->gen;
1506 colTPlan->envelope = fftPlan->envelope;
1508 //Pass large1D flag to confirm we need multiply twiddle factor
1509 colTPlan->large1D = fftPlan->length[0];
1510 colTPlan->RCsimple = true;
1512 colTPlan->length.push_back(clLengths[0]);
1515 colTPlan->inputLayout = fftPlan->inputLayout;
1516 colTPlan->outputLayout = CLFFT_COMPLEX_INTERLEAVED;
1517 colTPlan->inStride[0] = fftPlan->inStride[0] * clLengths[0];
1518 colTPlan->outStride[0] = 1;
1519 colTPlan->iDist = fftPlan->iDist;
1520 colTPlan->oDist = length0 * length1;//fftPlan->length[0];
1521 colTPlan->inStride.push_back(fftPlan->inStride[0]);
1522 colTPlan->outStride.push_back(length1);//clLengths[1]);
1524 for (size_t index = 1; index < fftPlan->length.size(); index++)
1526 colTPlan->length.push_back(fftPlan->length[index]);
1527 colTPlan->inStride.push_back(fftPlan->inStride[index]);
1528 // tmp buffer is tightly packed
1529 colTPlan->outStride.push_back(colTPlan->oDist);
1530 colTPlan->oDist *= fftPlan->length[index];
1533 //Set callback data if set on top level plan
1534 if (fftPlan->hasPreCallback)
1536 colTPlan->hasPreCallback = true;
1537 colTPlan->preCallback = fftPlan->preCallback;
1538 colTPlan->precallUserData = fftPlan->precallUserData;
1541 OPENCL_V(clfftBakePlan(fftPlan->planX, numQueues, commQueueFFT, NULL, NULL), _T("BakePlan large1d first column plan failed"));
1543 //another column FFT, size clLengths[0], batch clLengths[1], output without transpose
1544 OPENCL_V(clfftCreateDefaultPlanInternal(&fftPlan->planY, fftPlan->context, CLFFT_1D, &clLengths[0]),
1545 _T("CreateDefaultPlan large1D row failed"));
1547 FFTPlan* col2Plan = NULL;
1548 lockRAII* rowLock = NULL;
1549 OPENCL_V(fftRepo.getPlan(fftPlan->planY, col2Plan, rowLock), _T("fftRepo.getPlan failed"));
1551 // This is second column fft, intermediate buffer is packed and interleaved
1552 // we need to pass clLengths[1] and instride size to kernel, so kernel can tell the difference
1554 col2Plan->precision = fftPlan->precision;
1555 col2Plan->forwardScale = fftPlan->forwardScale;
1556 col2Plan->backwardScale = fftPlan->backwardScale;
1557 col2Plan->tmpBufSize = 0;
1558 col2Plan->batchsize = fftPlan->batchsize;
1560 col2Plan->gen = fftPlan->gen;
1561 col2Plan->envelope = fftPlan->envelope;
1563 col2Plan->length.push_back(length1);
1565 col2Plan->inStride[0] = length1;
1566 col2Plan->inStride.push_back(1);
1567 col2Plan->iDist = length0 * length1;
1569 // make sure colTPlan (first column plan) does not recurse, otherwise large twiddle mul
1570 // cannot be done with this algorithm sequence
1571 assert(colTPlan->planX == 0);
1574 col2Plan->placeness = CLFFT_INPLACE;
1575 col2Plan->inputLayout = CLFFT_COMPLEX_INTERLEAVED;
1576 col2Plan->outputLayout = CLFFT_COMPLEX_INTERLEAVED;
1578 col2Plan->outStride[0] = length1;
1579 col2Plan->outStride.push_back(1);
1580 col2Plan->oDist = length0 * length1;
1582 for (size_t index = 1; index < fftPlan->length.size(); index++)
1584 col2Plan->length.push_back(fftPlan->length[index]);
1585 col2Plan->inStride.push_back(col2Plan->iDist);
1586 col2Plan->outStride.push_back(col2Plan->oDist);
1587 col2Plan->iDist *= fftPlan->length[index];
1588 col2Plan->oDist *= fftPlan->length[index];
1592 OPENCL_V(clfftBakePlan(fftPlan->planY, numQueues, commQueueFFT, NULL, NULL ), _T( "BakePlan large1d second column plan failed" ) );
1594 if ( (fftPlan->outputLayout == CLFFT_HERMITIAN_INTERLEAVED) ||
1595 (fftPlan->outputLayout == CLFFT_HERMITIAN_PLANAR) )
1597 // copy plan to get back to hermitian
1598 OPENCL_V(clfftCreateDefaultPlanInternal(&fftPlan->planRCcopy, fftPlan->context, CLFFT_1D, &fftPlan->length[0]),
1599 _T("CreateDefaultPlan RC copy failed"));
1601 FFTPlan* copyPlan = NULL;
1602 lockRAII* copyLock = NULL;
1603 OPENCL_V(fftRepo.getPlan(fftPlan->planRCcopy, copyPlan, copyLock), _T("fftRepo.getPlan failed"));
1605 // This is the copy pass: it reads the packed, interleaved intermediate buffer (unit stride)
1606 // and writes it out using the hermitian output layout and strides of the top-level plan
1608 // common part for both passes
1609 copyPlan->placeness = CLFFT_OUTOFPLACE;
1610 copyPlan->inputLayout = CLFFT_COMPLEX_INTERLEAVED;
1611 copyPlan->outputLayout = fftPlan->outputLayout;
1613 copyPlan->precision = fftPlan->precision;
1614 copyPlan->forwardScale = 1.0f;
1615 copyPlan->backwardScale = 1.0f;
1616 copyPlan->tmpBufSize = 0;
1617 copyPlan->batchsize = fftPlan->batchsize;
1619 copyPlan->gen = Copy;
1620 copyPlan->envelope = fftPlan->envelope;
1623 copyPlan->inStride[0] = 1;
1624 copyPlan->iDist = fftPlan->length[0];
1626 copyPlan->outStride[0] = fftPlan->outStride[0];
1627 copyPlan->oDist = fftPlan->oDist;
1629 for (size_t index = 1; index < fftPlan->length.size(); index++)
1631 copyPlan->length.push_back(fftPlan->length[index]);
1632 copyPlan->inStride.push_back(copyPlan->inStride[index - 1] * fftPlan->length[index - 1]);
1633 copyPlan->iDist *= fftPlan->length[index];
1634 copyPlan->outStride.push_back(fftPlan->outStride[index]);
1637 //Set callback data if set on top level plan
1638 if (fftPlan->hasPostCallback)
1640 copyPlan->hasPostCallback = true;
1641 copyPlan->postCallbackParam = fftPlan->postCallbackParam;
1642 copyPlan->postcallUserData = fftPlan->postcallUserData;
1645 OPENCL_V(clfftBakePlan(fftPlan->planRCcopy, numQueues, commQueueFFT, NULL, NULL), _T("BakePlan large1d RC copy plan failed"));
1649 else if(fftPlan->outputLayout == CLFFT_REAL)
1651 if (fftPlan->tmpBufSizeRC==0 )
1653 fftPlan->tmpBufSizeRC = length0 * length1 *
1654 fftPlan->batchsize * fftPlan->ElementSize();
1655 for (size_t index=1; index < fftPlan->length.size(); index++)
1657 fftPlan->tmpBufSizeRC *= fftPlan->length[index];
1661 if ((fftPlan->inputLayout == CLFFT_HERMITIAN_INTERLEAVED) ||
1662 (fftPlan->inputLayout == CLFFT_HERMITIAN_PLANAR))
1664 // copy plan to convert from hermitian to full complex
1665 OPENCL_V(clfftCreateDefaultPlanInternal(&fftPlan->planRCcopy, fftPlan->context, CLFFT_1D, &fftPlan->length[0]),
1666 _T("CreateDefaultPlan RC copy failed"));
1668 FFTPlan* copyPlan = NULL;
1669 lockRAII* copyLock = NULL;
1670 OPENCL_V(fftRepo.getPlan(fftPlan->planRCcopy, copyPlan, copyLock), _T("fftRepo.getPlan failed"));
1672 // This is the copy pass: it reads the hermitian input using the layout and strides of the top-level plan
1673 // and writes a packed, interleaved full-complex intermediate buffer (unit stride)
1675 // common part for both passes
1676 copyPlan->placeness = CLFFT_OUTOFPLACE;
1677 copyPlan->inputLayout = fftPlan->inputLayout;
1678 copyPlan->outputLayout = CLFFT_COMPLEX_INTERLEAVED;
1680 copyPlan->precision = fftPlan->precision;
1681 copyPlan->forwardScale = 1.0f;
1682 copyPlan->backwardScale = 1.0f;
1683 copyPlan->tmpBufSize = 0;
1684 copyPlan->batchsize = fftPlan->batchsize;
1686 copyPlan->gen = Copy;
1687 copyPlan->envelope = fftPlan->envelope;
1689 copyPlan->inStride[0] = fftPlan->inStride[0];
1690 copyPlan->iDist = fftPlan->iDist;
1692 copyPlan->outStride[0] = 1;
1693 copyPlan->oDist = fftPlan->length[0];
1695 for (size_t index = 1; index < fftPlan->length.size(); index++)
1697 copyPlan->length.push_back(fftPlan->length[index]);
1698 copyPlan->outStride.push_back(copyPlan->outStride[index - 1] * fftPlan->length[index - 1]);
1699 copyPlan->oDist *= fftPlan->length[index];
1700 copyPlan->inStride.push_back(fftPlan->inStride[index]);
1703 //Set callback data if set on top level plan
1704 if (fftPlan->hasPreCallback)
1706 copyPlan->hasPreCallback = true;
1707 copyPlan->preCallback = fftPlan->preCallback;
1708 copyPlan->precallUserData = fftPlan->precallUserData;
1711 OPENCL_V(clfftBakePlan(fftPlan->planRCcopy, numQueues, commQueueFFT, NULL, NULL), _T("BakePlan large1d RC copy plan failed"));
1714 // column FFT, size clLengths[1], batch clLengths[0], with length[0] twiddle factor multiplication
1715 // transposed output
1716 OPENCL_V(clfftCreateDefaultPlanInternal( &fftPlan->planX, fftPlan->context, CLFFT_1D, &clLengths[1] ),
1717 _T( "CreateDefaultPlan Large1d column failed" ) );
1719 FFTPlan* colTPlan = NULL;
1720 lockRAII* colLock = NULL;
1721 OPENCL_V( fftRepo.getPlan( fftPlan->planX, colTPlan, colLock ), _T( "fftRepo.getPlan failed" ) );
1723 // The current plan is to create an intermediate buffer that is packed and interleaved.
1724 // This is a column FFT: the distance between the first elements of consecutive FFTs is the distance
1725 // between the first two elements in the original buffer, like a transpose of the matrix.
1726 // We need to pass clLengths[0] and the input stride to the kernel, so the kernel can tell the difference.
1728 // this part is common to both passes
1729 colTPlan->precision = fftPlan->precision;
1730 colTPlan->forwardScale = 1.0f;
1731 colTPlan->backwardScale = 1.0f;
1732 colTPlan->tmpBufSize = 0;
1733 colTPlan->batchsize = fftPlan->batchsize;
1735 colTPlan->gen = fftPlan->gen;
1736 colTPlan->envelope = fftPlan->envelope;
1738 //Pass large1D flag to confirm we need multiply twiddle factor
1739 colTPlan->large1D = fftPlan->length[0];
1741 colTPlan->length.push_back(clLengths[0]);
1743 colTPlan->inputLayout = CLFFT_COMPLEX_INTERLEAVED;
1744 colTPlan->outputLayout = CLFFT_COMPLEX_INTERLEAVED;
1746 colTPlan->inStride[0] = length0;
1747 colTPlan->inStride.push_back(1);
1748 colTPlan->iDist = length0 * length1;
1750 colTPlan->outStride[0] = length0;
1751 colTPlan->outStride.push_back(1);
1752 colTPlan->oDist = length0 * length1;
1754 for (size_t index=1; index < fftPlan->length.size(); index++)
1756 colTPlan->length.push_back(fftPlan->length[index]);
1757 colTPlan->inStride.push_back(colTPlan->iDist);
1758 colTPlan->outStride.push_back(colTPlan->oDist);
1759 colTPlan->iDist *= fftPlan->length[index];
1760 colTPlan->oDist *= fftPlan->length[index];
1763 if ((fftPlan->inputLayout == CLFFT_HERMITIAN_INTERLEAVED) ||
1764 (fftPlan->inputLayout == CLFFT_HERMITIAN_PLANAR))
1766 colTPlan->placeness = CLFFT_INPLACE;
1770 colTPlan->placeness = CLFFT_OUTOFPLACE;
1773 OPENCL_V(clfftBakePlan(fftPlan->planX, numQueues, commQueueFFT, NULL, NULL ), _T( "BakePlan large1d first column plan failed" ) );
1775 //another column FFT, size clLengths[0], batch clLengths[1], output without transpose
1776 OPENCL_V(clfftCreateDefaultPlanInternal( &fftPlan->planY, fftPlan->context, CLFFT_1D, &clLengths[0] ),
1777 _T( "CreateDefaultPlan large1D row failed" ) );
1779 FFTPlan* col2Plan = NULL;
1780 lockRAII* rowLock = NULL;
1781 OPENCL_V( fftRepo.getPlan( fftPlan->planY, col2Plan, rowLock ), _T( "fftRepo.getPlan failed" ) );
1783 // This is second column fft, intermediate buffer is packed and interleaved
1784 // we need to pass clLengths[1] and instride size to kernel, so kernel can tell the difference
1786 // common part for both passes
1787 col2Plan->placeness = CLFFT_OUTOFPLACE;
1788 col2Plan->inputLayout = CLFFT_COMPLEX_INTERLEAVED;
1789 col2Plan->outputLayout = fftPlan->outputLayout;
1791 col2Plan->precision = fftPlan->precision;
1792 col2Plan->forwardScale = fftPlan->forwardScale;
1793 col2Plan->backwardScale = fftPlan->backwardScale;
1794 col2Plan->tmpBufSize = 0;
1795 col2Plan->batchsize = fftPlan->batchsize;
1797 col2Plan->gen = fftPlan->gen;
1798 col2Plan->envelope = fftPlan->envelope;
1800 col2Plan->RCsimple = true;
1801 col2Plan->length.push_back(length1);
1803 col2Plan->inStride[0] = 1;
1804 col2Plan->inStride.push_back(length0);
1805 col2Plan->iDist = length0 * length1;
1807 col2Plan->outStride[0] = length1 * fftPlan->outStride[0];
1808 col2Plan->outStride.push_back(fftPlan->outStride[0]);
1809 col2Plan->oDist = fftPlan->oDist;
1811 for (size_t index=1; index < fftPlan->length.size(); index++)
1813 col2Plan->length.push_back(fftPlan->length[index]);
1814 col2Plan->inStride.push_back(col2Plan->iDist);
1815 col2Plan->iDist *= fftPlan->length[index];
1816 col2Plan->outStride.push_back(fftPlan->outStride[index]);
1819 //Set callback data if set on top level plan
1820 if (fftPlan->hasPostCallback)
1822 col2Plan->hasPostCallback = true;
1823 col2Plan->postCallbackParam = fftPlan->postCallbackParam;
1824 col2Plan->postcallUserData = fftPlan->postcallUserData;
1827 OPENCL_V(clfftBakePlan(fftPlan->planY, numQueues, commQueueFFT, NULL, NULL ), _T( "BakePlan large1d second column plan failed" ) );
1832 if( (fftPlan->length[0] > 262144/PrecisionWidth(fftPlan->precision)) && fftPlan->blockCompute )
1834 assert(fftPlan->length[0] <= 1048576);
1837 size_t padding = 64;
1838 if (fftPlan->tmpBufSize==0 )
1840 fftPlan->tmpBufSize = (length1 + padding) * length0 *
1841 fftPlan->batchsize * fftPlan->ElementSize();
1842 for (size_t index=1; index < fftPlan->length.size(); index++)
1844 fftPlan->tmpBufSize *= fftPlan->length[index];
1848 // Algorithm in this case is
1849 // T(with pad, out_of_place), R (in_place), C(in_place), Unpad(out_of_place)
1851 size_t len[3] = { clLengths[1], clLengths[0], 1 };
1853 OPENCL_V(clfftCreateDefaultPlanInternal( &fftPlan->planTX, fftPlan->context, CLFFT_2D, len ),
1854 _T( "CreateDefaultPlan Large1d trans1 failed" ) );
1856 FFTPlan* trans1Plan = NULL;
1857 lockRAII* trans1Lock = NULL;
1858 OPENCL_V( fftRepo.getPlan( fftPlan->planTX, trans1Plan, trans1Lock ), _T( "fftRepo.getPlan failed" ) );
1860 trans1Plan->placeness = CLFFT_OUTOFPLACE;
1861 trans1Plan->precision = fftPlan->precision;
1862 trans1Plan->tmpBufSize = 0;
1863 trans1Plan->batchsize = fftPlan->batchsize;
1864 trans1Plan->envelope = fftPlan->envelope;
1865 trans1Plan->inputLayout = fftPlan->inputLayout;
1866 trans1Plan->outputLayout = CLFFT_COMPLEX_INTERLEAVED;
1867 trans1Plan->inStride[0] = fftPlan->inStride[0];
1868 trans1Plan->inStride[1] = length1;
1869 trans1Plan->outStride[0] = 1;
1870 trans1Plan->outStride[1] = length0 + padding;
1871 trans1Plan->iDist = fftPlan->iDist;
1872 trans1Plan->oDist = length1 * trans1Plan->outStride[1];
1873 trans1Plan->gen = Transpose_GCN;
1874 trans1Plan->transflag = true;
1876 for (size_t index=1; index < fftPlan->length.size(); index++)
1878 trans1Plan->length.push_back(fftPlan->length[index]);
1879 trans1Plan->inStride.push_back(fftPlan->inStride[index]);
1880 trans1Plan->outStride.push_back(trans1Plan->oDist);
1881 trans1Plan->oDist *= fftPlan->length[index];
1884 //Set callback data if set on top level plan
1885 if (fftPlan->hasPreCallback)
1887 trans1Plan->hasPreCallback = true;
1888 trans1Plan->preCallback = fftPlan->preCallback;
1889 trans1Plan->precallUserData = fftPlan->precallUserData;
1892 OPENCL_V(clfftBakePlan(fftPlan->planTX, numQueues, commQueueFFT, NULL, NULL ),
1893 _T( "BakePlan large1d trans1 plan failed" ) );
1897 OPENCL_V(clfftCreateDefaultPlanInternal( &fftPlan->planX, fftPlan->context, CLFFT_1D, &clLengths[0] ),
1898 _T( "CreateDefaultPlan Large1d column failed" ) );
1900 FFTPlan* rowPlan = NULL;
1901 lockRAII* rowLock = NULL;
1902 OPENCL_V( fftRepo.getPlan( fftPlan->planX, rowPlan, rowLock ), _T( "fftRepo.getPlan failed" ) );
1904 assert(fftPlan->large1D == 0);
1906 rowPlan->placeness = CLFFT_INPLACE;
1907 rowPlan->precision = fftPlan->precision;
1908 rowPlan->forwardScale = 1.0f;
1909 rowPlan->backwardScale = 1.0f;
1910 rowPlan->tmpBufSize = 0;
1911 rowPlan->batchsize = fftPlan->batchsize;
1913 rowPlan->gen = fftPlan->gen;
1914 rowPlan->envelope = fftPlan->envelope;
1916 rowPlan->length.push_back(length1);
1919 rowPlan->inputLayout = CLFFT_COMPLEX_INTERLEAVED;
1920 rowPlan->outputLayout = CLFFT_COMPLEX_INTERLEAVED;
1921 rowPlan->inStride[0] = 1;
1922 rowPlan->outStride[0] = 1;
1923 rowPlan->inStride.push_back(length0+padding);
1924 rowPlan->outStride.push_back(length0+padding);
1925 rowPlan->iDist = (length0+padding)*length1;
1926 rowPlan->oDist = (length0+padding)*length1;
1928 for (size_t index=1; index < fftPlan->length.size(); index++)
1930 rowPlan->length.push_back(fftPlan->length[index]);
1931 rowPlan->inStride.push_back(rowPlan->iDist);
1932 rowPlan->iDist *= fftPlan->length[index];
1933 rowPlan->outStride.push_back(rowPlan->oDist);
1934 rowPlan->oDist *= fftPlan->length[index];
1938 OPENCL_V(clfftBakePlan(fftPlan->planX, numQueues, commQueueFFT, NULL, NULL ), _T( "BakePlan large1d first row plan failed" ) );
1941 OPENCL_V(clfftCreateDefaultPlanInternal( &fftPlan->planY, fftPlan->context, CLFFT_1D, &clLengths[1] ),
1942 _T( "CreateDefaultPlan large1D column failed" ) );
1944 FFTPlan* col2Plan = NULL;
1945 lockRAII* colLock = NULL;
1946 OPENCL_V( fftRepo.getPlan( fftPlan->planY, col2Plan, colLock ), _T( "fftRepo.getPlan failed" ) );
1948 col2Plan->placeness = CLFFT_INPLACE;
1949 col2Plan->inputLayout = CLFFT_COMPLEX_INTERLEAVED;
1950 col2Plan->outputLayout = CLFFT_COMPLEX_INTERLEAVED;
1951 col2Plan->precision = fftPlan->precision;
1952 col2Plan->forwardScale = fftPlan->forwardScale;
1953 col2Plan->backwardScale = fftPlan->backwardScale;
1954 col2Plan->tmpBufSize = 0;
1955 col2Plan->batchsize = fftPlan->batchsize;
1957 col2Plan->gen = fftPlan->gen;
1958 col2Plan->envelope = fftPlan->envelope;
1960 col2Plan->large1D = fftPlan->length[0];
1961 col2Plan->twiddleFront = true;
1963 col2Plan->length.push_back(clLengths[0]);
1967 col2Plan->blockCompute = true;
1968 col2Plan->blockComputeType = BCT_C2C;
1970 col2Plan->inStride[0] = length0+padding;
1971 col2Plan->outStride[0] = length0+padding;
1972 col2Plan->iDist = (length0+padding) * length1;
1973 col2Plan->oDist = (length0+padding) * length1;
1974 col2Plan->inStride.push_back(1);
1975 col2Plan->outStride.push_back(1);
1978 for (size_t index=1; index < fftPlan->length.size(); index++)
1980 col2Plan->length.push_back(fftPlan->length[index]);
1981 col2Plan->inStride.push_back(col2Plan->iDist);
1982 col2Plan->outStride.push_back(col2Plan->oDist);
1983 col2Plan->iDist *= fftPlan->length[index];
1984 col2Plan->oDist *= fftPlan->length[index];
1988 OPENCL_V(clfftBakePlan(fftPlan->planY, numQueues, commQueueFFT, NULL, NULL ), _T( "BakePlan large1d second column plan failed" ) );
1991 // copy plan to get results back to packed output
1992 OPENCL_V(clfftCreateDefaultPlanInternal( &fftPlan->planCopy, fftPlan->context, CLFFT_1D, &clLengths[0] ),
1993 _T( "CreateDefaultPlan Copy failed" ) );
1995 FFTPlan* copyPlan = NULL;
1996 lockRAII* copyLock = NULL;
1997 OPENCL_V( fftRepo.getPlan( fftPlan->planCopy, copyPlan, copyLock ), _T( "fftRepo.getPlan failed" ) );
2000 copyPlan->placeness = CLFFT_OUTOFPLACE;
2001 copyPlan->inputLayout = CLFFT_COMPLEX_INTERLEAVED;
2002 copyPlan->outputLayout = fftPlan->outputLayout;
2004 copyPlan->precision = fftPlan->precision;
2005 copyPlan->forwardScale = 1.0f;
2006 copyPlan->backwardScale = 1.0f;
2007 copyPlan->tmpBufSize = 0;
2008 copyPlan->batchsize = fftPlan->batchsize;
2010 copyPlan->gen = Copy;
2011 copyPlan->envelope = fftPlan->envelope;
2013 copyPlan->length.push_back(length1);
2015 copyPlan->inStride[0] = 1;
2016 copyPlan->inStride.push_back(length0+padding);
2017 copyPlan->iDist = length1*(length0+padding);
2019 copyPlan->outStride[0] = fftPlan->outStride[0];
2020 copyPlan->outStride.push_back(length0);
2021 copyPlan->oDist = fftPlan->oDist;
2023 for (size_t index=1; index < fftPlan->length.size(); index++)
2025 copyPlan->length.push_back(fftPlan->length[index]);
2026 copyPlan->inStride.push_back(copyPlan->inStride[index] * copyPlan->length[index]);
2027 copyPlan->iDist *= fftPlan->length[index];
2028 copyPlan->outStride.push_back(fftPlan->outStride[index]);
2031 OPENCL_V(clfftBakePlan(fftPlan->planCopy, numQueues, commQueueFFT, NULL, NULL ), _T( "BakePlan large1d copy plan failed" ) );
2036 if (fftPlan->tmpBufSize==0 )
2038 fftPlan->tmpBufSize = length0 * length1 *
2039 fftPlan->batchsize * fftPlan->ElementSize();
2040 for (size_t index=1; index < fftPlan->length.size(); index++)
2042 fftPlan->tmpBufSize *= fftPlan->length[index];
2046 // column FFT, size clLengths[1], batch clLengths[0], with length[0] twiddle factor multiplication
2047 OPENCL_V(clfftCreateDefaultPlanInternal( &fftPlan->planX, fftPlan->context, CLFFT_1D, &clLengths[1] ),
2048 _T( "CreateDefaultPlan Large1d column failed" ) );
2050 FFTPlan* colTPlan = NULL;
2051 lockRAII* colLock = NULL;
2052 OPENCL_V( fftRepo.getPlan( fftPlan->planX, colTPlan, colLock ), _T( "fftRepo.getPlan failed" ) );
2054 assert(fftPlan->large1D == 0);
2056 // current plan is to create intermediate buffer, packed and interleave
2057 // This is a column FFT, the first elements distance between each FFT is the distance of the first two
2058 // elements in the original buffer. Like a transpose of the matrix
2059 // we need to pass clLengths[0] and instride size to kernel, so kernel can tell the difference
2061 //this part is common for both passes
2062 colTPlan->placeness = CLFFT_OUTOFPLACE;
2063 colTPlan->precision = fftPlan->precision;
2064 colTPlan->forwardScale = 1.0f;
2065 colTPlan->backwardScale = 1.0f;
2066 colTPlan->tmpBufSize = 0;
2067 colTPlan->batchsize = fftPlan->batchsize;
2069 colTPlan->gen = fftPlan->gen;
2070 colTPlan->envelope = fftPlan->envelope;
2072 //Pass large1D flag to confirm we need multiply twiddle factor
2073 colTPlan->large1D = fftPlan->length[0];
2075 colTPlan->length.push_back(length0);
2078 colTPlan->inputLayout = fftPlan->inputLayout;
2079 colTPlan->outputLayout = CLFFT_COMPLEX_INTERLEAVED;
2080 colTPlan->inStride[0] = fftPlan->inStride[0] * length0;
2081 colTPlan->outStride[0] = length0;
2082 colTPlan->iDist = fftPlan->iDist;
2083 colTPlan->oDist = length0 * length1;
2084 colTPlan->inStride.push_back(fftPlan->inStride[0]);
2085 colTPlan->outStride.push_back(1);
2087 //Set callback data if set on top level plan
2088 if (fftPlan->hasPreCallback)
2090 colTPlan->hasPreCallback = true;
2091 colTPlan->preCallback = fftPlan->preCallback;
2092 colTPlan->precallUserData = fftPlan->precallUserData;
2095 // Enabling block column compute
2096 if( (colTPlan->inStride[0] == length0) && IsPo2(fftPlan->length[0]) && (fftPlan->length[0] < 524288) )
2098 colTPlan->blockCompute = true;
2099 colTPlan->blockComputeType = BCT_C2C;
2102 for (size_t index=1; index < fftPlan->length.size(); index++)
2104 colTPlan->length.push_back(fftPlan->length[index]);
2105 colTPlan->inStride.push_back(fftPlan->inStride[index]);
2106 // tmp buffer is tightly packed
2107 colTPlan->outStride.push_back(colTPlan->oDist);
2108 colTPlan->oDist *= fftPlan->length[index];
2112 OPENCL_V(clfftBakePlan(fftPlan->planX, numQueues, commQueueFFT, NULL, NULL ), _T( "BakePlan large1d first column plan failed" ) );
2114 //another column FFT, size clLengths[0], batch clLengths[1], output with transpose
2115 OPENCL_V(clfftCreateDefaultPlanInternal( &fftPlan->planY, fftPlan->context, CLFFT_1D, &clLengths[0] ),
2116 _T( "CreateDefaultPlan large1D row failed" ) );
2118 FFTPlan* col2Plan = NULL;
2119 lockRAII* rowLock = NULL;
2120 OPENCL_V( fftRepo.getPlan( fftPlan->planY, col2Plan, rowLock ), _T( "fftRepo.getPlan failed" ) );
2122 // This is second column fft, intermediate buffer is packed and interleaved
2123 // we need to pass clLengths[1] and instride size to kernel, so kernel can tell the difference
2125 // common part for both passes
2126 col2Plan->outputLayout = fftPlan->outputLayout;
2127 col2Plan->precision = fftPlan->precision;
2128 col2Plan->forwardScale = fftPlan->forwardScale;
2129 col2Plan->backwardScale = fftPlan->backwardScale;
2130 col2Plan->tmpBufSize = 0;
2131 col2Plan->batchsize = fftPlan->batchsize;
2132 col2Plan->oDist = fftPlan->oDist;
2134 col2Plan->gen = fftPlan->gen;
2135 col2Plan->envelope = fftPlan->envelope;
2138 col2Plan->length.push_back(clLengths[1]);
2140 bool integratedTranposes = true;
2143 if( colTPlan->blockCompute && (fftPlan->outStride[0] == 1) && clLengths[0] <= 256)
2145 col2Plan->blockCompute = true;
2146 col2Plan->blockComputeType = BCT_R2C;
2148 col2Plan->placeness = CLFFT_OUTOFPLACE;
2149 col2Plan->inputLayout = CLFFT_COMPLEX_INTERLEAVED;
2150 col2Plan->inStride[0] = 1;
2151 col2Plan->outStride[0] = length1;
2152 col2Plan->iDist = length0 * length1;
2153 col2Plan->inStride.push_back(length0);
2154 col2Plan->outStride.push_back(1);
2156 else if( colTPlan->blockCompute && (fftPlan->outStride[0] == 1) )
2158 integratedTranposes = false;
2160 col2Plan->placeness = CLFFT_INPLACE;
2161 col2Plan->inputLayout = CLFFT_COMPLEX_INTERLEAVED;
2162 col2Plan->outputLayout = CLFFT_COMPLEX_INTERLEAVED;
2163 col2Plan->inStride[0] = 1;
2164 col2Plan->outStride[0] = 1;
2165 col2Plan->iDist = length0 * length1;
2166 col2Plan->oDist = length0 * length1;
2167 col2Plan->inStride.push_back(length0);
2168 col2Plan->outStride.push_back(length0);
2172 //first layer, large 1D from tmp buffer to output buffer
2173 col2Plan->placeness = CLFFT_OUTOFPLACE;
2174 col2Plan->inputLayout = CLFFT_COMPLEX_INTERLEAVED;
2175 col2Plan->inStride[0] = 1;
2176 col2Plan->outStride[0] = fftPlan->outStride[0] * clLengths[1];
2177 col2Plan->iDist = length0 * length1; //fftPlan->length[0];
2178 col2Plan->inStride.push_back(length0);
2179 col2Plan->outStride.push_back(fftPlan->outStride[0]);
2182 if(!integratedTranposes)
2184 for (size_t index=1; index < fftPlan->length.size(); index++)
2186 col2Plan->length.push_back(fftPlan->length[index]);
2187 col2Plan->inStride.push_back(col2Plan->iDist);
2188 col2Plan->outStride.push_back(col2Plan->oDist);
2189 col2Plan->iDist *= fftPlan->length[index];
2190 col2Plan->oDist *= fftPlan->length[index];
2195 for (size_t index=1; index < fftPlan->length.size(); index++)
2197 col2Plan->length.push_back(fftPlan->length[index]);
2198 col2Plan->inStride.push_back(col2Plan->iDist);
2199 col2Plan->outStride.push_back(fftPlan->outStride[index]);
2200 col2Plan->iDist *= fftPlan->length[index];
2204 //Set callback data if set on top level plan
2205 if (fftPlan->hasPostCallback && integratedTranposes)
2207 col2Plan->hasPostCallback = true;
2208 col2Plan->postCallbackParam = fftPlan->postCallbackParam;
2209 col2Plan->postcallUserData = fftPlan->postcallUserData;
2212 OPENCL_V(clfftBakePlan(fftPlan->planY, numQueues, commQueueFFT, NULL, NULL ), _T( "BakePlan large1d second column plan failed" ) );
2214 if(!integratedTranposes)
2218 OPENCL_V(clfftCreateDefaultPlanInternal( &fftPlan->planTZ, fftPlan->context, CLFFT_2D, clLengths ),
2219 _T( "CreateDefaultPlan Large1d transpose failed" ) );
2221 FFTPlan* trans3Plan = NULL;
2222 lockRAII* trans3Lock = NULL;
2223 OPENCL_V( fftRepo.getPlan( fftPlan->planTZ, trans3Plan, trans3Lock ), _T( "fftRepo.getPlan failed" ) );
2225 trans3Plan->placeness = CLFFT_OUTOFPLACE;
2226 trans3Plan->precision = fftPlan->precision;
2227 trans3Plan->tmpBufSize = 0;
2228 trans3Plan->batchsize = fftPlan->batchsize;
2229 trans3Plan->envelope = fftPlan->envelope;
2230 trans3Plan->inputLayout = CLFFT_COMPLEX_INTERLEAVED;
2231 trans3Plan->outputLayout = fftPlan->outputLayout;
2232 trans3Plan->inStride[0] = 1;
2233 trans3Plan->inStride[1] = clLengths[0];
2234 trans3Plan->outStride[0] = fftPlan->outStride[0];
2235 trans3Plan->outStride[1] = clLengths[1] * fftPlan->outStride[0];
2236 trans3Plan->iDist = fftPlan->length[0];
2237 trans3Plan->oDist = fftPlan->oDist;
2238 trans3Plan->gen = Transpose_GCN;
2239 trans3Plan->transflag = true;
2241 for (size_t index=1; index < fftPlan->length.size(); index++)
2243 trans3Plan->length.push_back(fftPlan->length[index]);
2244 trans3Plan->inStride.push_back(trans3Plan->iDist);
2245 trans3Plan->iDist *= fftPlan->length[index];
2246 trans3Plan->outStride.push_back(fftPlan->outStride[index]);
2249 //Set callback data if set on top level plan
2250 if (fftPlan->hasPostCallback)
2252 trans3Plan->hasPostCallback = true;
2253 trans3Plan->postCallbackParam = fftPlan->postCallbackParam;
2254 trans3Plan->postcallUserData = fftPlan->postcallUserData;
2257 OPENCL_V(clfftBakePlan(fftPlan->planTZ, numQueues, commQueueFFT, NULL, NULL ),
2258 _T( "BakePlan large1d trans plan failed" ) );
2263 fftPlan->baked = true;
2264 return CLFFT_SUCCESS;
2271 if (fftPlan->transflag) //Transpose for 2D
2273 clfftStatus err = CLFFT_SUCCESS;
2274 if(fftPlan->gen == Transpose_GCN)
2275 fftPlan->action = new FFTGeneratedTransposeGCNAction(plHandle, fftPlan, *commQueueFFT, err);
2276 else if (fftPlan->gen == Transpose_SQUARE)
2277 fftPlan->action = new FFTGeneratedTransposeSquareAction(plHandle, fftPlan, *commQueueFFT, err);
2278 else if (fftPlan->gen == Transpose_NONSQUARE)
2280 if(fftPlan->nonSquareKernelType != NON_SQUARE_TRANS_PARENT)
2281 fftPlan->action = new FFTGeneratedTransposeNonSquareAction(plHandle, fftPlan, *commQueueFFT, err);
2284 size_t clLengths[] = { 1, 1, 0 };
2285 clLengths[0] = fftPlan->length[0];
2286 clLengths[1] = fftPlan->length[1];
2288 //NON_SQUARE_KERNEL_ORDER currKernelOrder;
2289 // controlling the transpose and swap kernel order
2290 // if leading dim is larger than the other dim it makes sense to swap and transpose
2291 if (clLengths[0] > clLengths[1])
2293 //Twiddling will be done in the swap kernel, regardless of the order
2294 fftPlan->nonSquareKernelOrder = SWAP_AND_TRANSPOSE;
2298 if (fftPlan->large1D != 0 && 0)
2300 //this is not going to happen anymore
2301 fftPlan->nonSquareKernelOrder = TRANSPOSE_LEADING_AND_SWAP;
2305 //twiddling can be done in swap
2306 fftPlan->nonSquareKernelOrder = TRANSPOSE_AND_SWAP;
2310 //std::cout << "currKernelOrder = " << fftPlan->nonSquareKernelOrder << std::endl;
2311 //ends transpose kernel order
2314 OPENCL_V(clfftCreateDefaultPlanInternal(&fftPlan->planTX, fftPlan->context, CLFFT_2D, clLengths),
2315 _T("CreateDefaultPlan transpose_nsq_stage1 plan failed"));
2317 FFTPlan* trans1Plan = NULL;
2318 lockRAII* trans1Lock = NULL;
2319 OPENCL_V(fftRepo.getPlan(fftPlan->planTX, trans1Plan, trans1Lock), _T("fftRepo.getPlan failed"));
2321 trans1Plan->placeness = CLFFT_INPLACE;
2322 trans1Plan->precision = fftPlan->precision;
2323 trans1Plan->tmpBufSize = 0;
2324 trans1Plan->batchsize = fftPlan->batchsize;
2325 trans1Plan->envelope = fftPlan->envelope;
2326 trans1Plan->inputLayout = fftPlan->inputLayout;
2327 trans1Plan->outputLayout = fftPlan->outputLayout;
2328 trans1Plan->inStride[0] = fftPlan->inStride[0];
2329 trans1Plan->outStride[0] = fftPlan->outStride[0];
2330 trans1Plan->inStride[1] = fftPlan->inStride[1];
2331 trans1Plan->outStride[1] = fftPlan->outStride[1];
2332 trans1Plan->iDist = fftPlan->iDist;
2333 trans1Plan->oDist = fftPlan->oDist;
2334 trans1Plan->gen = Transpose_NONSQUARE;
2335 trans1Plan->nonSquareKernelOrder = fftPlan->nonSquareKernelOrder;
2336 if(fftPlan->nonSquareKernelOrder == SWAP_AND_TRANSPOSE)
2337 trans1Plan->nonSquareKernelType = NON_SQUARE_TRANS_SWAP;
2338 else if (fftPlan->nonSquareKernelOrder == TRANSPOSE_AND_SWAP)
2339 trans1Plan->nonSquareKernelType = NON_SQUARE_TRANS_TRANSPOSE_BATCHED;
2340 else if(fftPlan->nonSquareKernelOrder == TRANSPOSE_LEADING_AND_SWAP)
2341 trans1Plan->nonSquareKernelType = NON_SQUARE_TRANS_TRANSPOSE_BATCHED_LEADING;
2342 trans1Plan->transflag = true;
2343 trans1Plan->large1D = fftPlan->large1D;//twiddling may happen in this kernel
2345 if (trans1Plan->nonSquareKernelType == NON_SQUARE_TRANS_TRANSPOSE_BATCHED)
2347 //TODO: this should be in a function to avoid duplicate code
2348 //need to treat a non-square matrix as a square matrix with a bigger batch size
2349 size_t lengthX = trans1Plan->length[0];
2350 size_t lengthY = trans1Plan->length[1];
2352 size_t BatchFactor = (lengthX > lengthY) ? (lengthX / lengthY) : (lengthY / lengthX);
2353 trans1Plan->transposeMiniBatchSize = BatchFactor;
2354 trans1Plan->batchsize *= BatchFactor;
2355 trans1Plan->iDist = trans1Plan->iDist / BatchFactor;
2356 if (lengthX > lengthY)
2358 trans1Plan->length[0] = lengthX / BatchFactor;
2359 trans1Plan->inStride[1] = lengthX / BatchFactor;
2361 else if (lengthX < lengthY)
2363 trans1Plan->length[1] = lengthY / BatchFactor;
2364 trans1Plan->inStride[1] = lengthX;
2368 for (size_t index = 2; index < fftPlan->length.size(); index++)
2370 trans1Plan->length.push_back(fftPlan->length[index]);
2371 trans1Plan->inStride.push_back(fftPlan->inStride[index]);
2372 trans1Plan->outStride.push_back(fftPlan->outStride[index]);
2375 if (fftPlan->hasPreCallback)
2377 trans1Plan->hasPreCallback = true;
2378 trans1Plan->preCallback = fftPlan->preCallback;
2379 trans1Plan->precallUserData = fftPlan->precallUserData;
2383 OPENCL_V(clfftBakePlan(fftPlan->planTX, numQueues, commQueueFFT, NULL, NULL),
2384 _T("BakePlan transpose_nsq_stage1 plan failed"));
2388 OPENCL_V(clfftCreateDefaultPlanInternal(&fftPlan->planTY, fftPlan->context, CLFFT_2D, clLengths),
2389 _T("CreateDefaultPlan transpose_nsq_stage2 plan failed"));
2391 FFTPlan* trans2Plan = NULL;
2392 lockRAII* trans2Lock = NULL;
2393 OPENCL_V(fftRepo.getPlan(fftPlan->planTY, trans2Plan, trans2Lock), _T("fftRepo.getPlan failed"));
2395 trans2Plan->placeness = CLFFT_INPLACE;
2396 trans2Plan->precision = fftPlan->precision;
2397 trans2Plan->tmpBufSize = 0;
2398 trans2Plan->batchsize = fftPlan->batchsize;
2399 trans2Plan->envelope = fftPlan->envelope;
2400 trans2Plan->inputLayout = fftPlan->inputLayout;
2401 trans2Plan->outputLayout = fftPlan->outputLayout;
2402 trans2Plan->inStride[0] = fftPlan->inStride[0];
2403 trans2Plan->outStride[0] = fftPlan->outStride[0];
2404 trans2Plan->inStride[1] = fftPlan->inStride[1];
2405 trans2Plan->outStride[1] = fftPlan->outStride[1];
2406 trans2Plan->iDist = fftPlan->iDist;
2407 trans2Plan->oDist = fftPlan->oDist;
2408 trans2Plan->gen = Transpose_NONSQUARE;
2409 trans2Plan->nonSquareKernelOrder = fftPlan->nonSquareKernelOrder;
2410 if (fftPlan->nonSquareKernelOrder == SWAP_AND_TRANSPOSE)
2411 trans2Plan->nonSquareKernelType = NON_SQUARE_TRANS_TRANSPOSE_BATCHED;
2412 else if(fftPlan->nonSquareKernelOrder == TRANSPOSE_AND_SWAP)
2413 trans2Plan->nonSquareKernelType = NON_SQUARE_TRANS_SWAP;
2414 else if(fftPlan->nonSquareKernelOrder == TRANSPOSE_LEADING_AND_SWAP)
2415 trans2Plan->nonSquareKernelType = NON_SQUARE_TRANS_SWAP;
2416 trans2Plan->transflag = true;
2417 trans2Plan->large1D = fftPlan->large1D;//twiddling may happen in this kernel
2419 if (trans2Plan->nonSquareKernelType == NON_SQUARE_TRANS_TRANSPOSE_BATCHED)
2421 //need to treat a non-square matrix as a square matrix with a bigger batch size
2422 size_t lengthX = trans2Plan->length[0];
2423 size_t lengthY = trans2Plan->length[1];
2425 size_t BatchFactor = (lengthX > lengthY) ? (lengthX/lengthY) : (lengthY/lengthX);
2426 trans2Plan->transposeMiniBatchSize = BatchFactor;
2427 trans2Plan->batchsize *= BatchFactor;
2428 trans2Plan->iDist = trans2Plan->iDist / BatchFactor;
2429 if (lengthX > lengthY)
2431 trans2Plan->length[0] = lengthX / BatchFactor;
2432 trans2Plan->inStride[1] = lengthX / BatchFactor;
2434 else if(lengthX < lengthY)
2436 trans2Plan->length[1] = lengthY / BatchFactor;
2437 trans2Plan->inStride[1] = lengthX;
2441 for (size_t index = 2; index < fftPlan->length.size(); index++)
2443 trans2Plan->length.push_back(fftPlan->length[index]);
2444 trans2Plan->inStride.push_back(fftPlan->inStride[index]);
2445 trans2Plan->outStride.push_back(fftPlan->outStride[index]);
2448 if (fftPlan->hasPostCallback)
2450 trans2Plan->hasPostCallback = true;
2451 trans2Plan->postCallbackParam = fftPlan->postCallbackParam;
2452 trans2Plan->postcallUserData = fftPlan->postcallUserData;
2455 OPENCL_V(clfftBakePlan(fftPlan->planTY, numQueues, commQueueFFT, NULL, NULL),
2456 _T("BakePlan transpose_nsq_stage2 plan failed"));
2460 fftPlan->action = new FFTGeneratedTransposeGCNAction(plHandle, fftPlan, *commQueueFFT, err);
2462 OPENCL_V( err, "FFTGeneratedTransposeXXXAction failed");
2464 fftPlan->baked = true;
2465 return CLFFT_SUCCESS;
2468 size_t length0 = fftPlan->length[0];
2469 size_t length1 = fftPlan->length[1];
2472 if (fftPlan->length[0] > Large1DThreshold ||
2473 fftPlan->length[1] > Large1DThreshold)
2474 fftPlan->large2D = true;
2476 while (1 && (fftPlan->inputLayout != CLFFT_REAL) && (fftPlan->outputLayout != CLFFT_REAL))
2481 // TODO : Check for a better way to do this.
2482 bool isnvidia = false;
2483 for (size_t Idx = 0; !isnvidia && Idx < numQueues; Idx++)
2485 cl_command_queue QIdx = commQueueFFT[Idx];
2486 cl_device_id Device;
2487 clGetCommandQueueInfo(QIdx, CL_QUEUE_DEVICE, sizeof(Device), &Device, NULL);
2489 clGetDeviceInfo(Device, CL_DEVICE_VENDOR, sizeof(Vendor), &Vendor, NULL);
2490 isnvidia |= (strncmp(Vendor, "NVIDIA", 6) == 0);
2492 // nvidia gpus are failing when doing transpose for 2D FFTs
2493 if (isnvidia) break;
2495 if (fftPlan->length.size() != 2) break;
2496 if (!(IsPo2(fftPlan->length[0])) || !(IsPo2(fftPlan->length[1])))
2498 if (fftPlan->length[1] < 32) break;
2499 //TBD: restrict the use of large2D in the x!=y case because we will need two temp buffers
2500 // (1) for 2D usage (2) for 1D large usage
2501 //if (fftPlan->large2D) break;
2502 //Performance shows 512 is the good case with transpose
2503 //if the user wants the result to be transposed, then we will.
2505 if (fftPlan->length[0] < 64) break;
2506 //x!=y case, we need tmp buffer, currently temp buffer only support interleaved format
2507 //if (fftPlan->length[0] != fftPlan->length[1] && fftPlan->outputLayout == CLFFT_COMPLEX_PLANAR) break;
2508 if (fftPlan->inStride[0] != 1 || fftPlan->outStride[0] != 1 ||
2509 fftPlan->inStride[1] != fftPlan->length[0] || fftPlan->outStride[1] != fftPlan->length[0])
2511 //if (fftPlan->placeness != CLFFT_INPLACE || fftPlan->inputLayout != CLFFT_COMPLEX_PLANAR)
2513 //if (fftPlan->batchsize != 1) break;
2514 //if (fftPlan->precision != CLFFT_SINGLE) break;
2516 fftPlan->transflag = true;
2519 // x=y & x!=y, In->In for inplace, In->out for outofplace
2520 OPENCL_V(clfftCreateDefaultPlanInternal( &fftPlan->planX, fftPlan->context, CLFFT_1D, &fftPlan->length[ DimX ] ),
2521 _T( "CreateDefaultPlan for planX failed" ) );
2523 FFTPlan* rowPlan = NULL;
2524 lockRAII* rowLock = NULL;
2525 OPENCL_V( fftRepo.getPlan( fftPlan->planX, rowPlan, rowLock ), _T( "fftRepo.getPlan failed" ) );
2527 rowPlan->inputLayout = fftPlan->inputLayout;
2528 rowPlan->outputLayout = fftPlan->outputLayout;
2529 rowPlan->placeness = fftPlan->placeness;
2530 rowPlan->outStride[0] = fftPlan->outStride[0];
2531 rowPlan->outStride.push_back(fftPlan->outStride[1]);
2532 rowPlan->oDist = fftPlan->oDist;
2533 rowPlan->precision = fftPlan->precision;
2534 rowPlan->forwardScale = 1.0f;
2535 rowPlan->backwardScale = 1.0f;
2536 rowPlan->tmpBufSize = 0;
2538 rowPlan->gen = fftPlan->gen;
2539 rowPlan->envelope = fftPlan->envelope;
2540 rowPlan->batchsize = fftPlan->batchsize;
2541 rowPlan->inStride[0] = fftPlan->inStride[0];
2542 rowPlan->length.push_back(fftPlan->length[1]);
2543 rowPlan->inStride.push_back(fftPlan->inStride[1]);
2544 rowPlan->iDist = fftPlan->iDist;
2546 //Set callback data if set on top level plan
2547 if (fftPlan->hasPreCallback)
2549 rowPlan->hasPreCallback = true;
2550 rowPlan->preCallback = fftPlan->preCallback;
2551 rowPlan->precallUserData = fftPlan->precallUserData;
2554 OPENCL_V(clfftBakePlan(fftPlan->planX, numQueues, commQueueFFT, NULL, NULL ),
2555 _T( "BakePlan for planX failed" ) );
2557 //Create transpose plan for first transpose
2558 //x=y: inplace. x!=y inplace: in->tmp, outofplace out->tmp
2559 size_t clLengths[] = { 1, 1, 0 };
2560 clLengths[0] = fftPlan->length[0];
2561 clLengths[1] = fftPlan->length[1];
2563 size_t biggerDim = clLengths[0] > clLengths[1] ? clLengths[0] : clLengths[1];
2564 size_t smallerDim = biggerDim == clLengths[0] ? clLengths[1] : clLengths[0];
2567 fftPlan->transpose_in_2d_inplace = (clLengths[0]==clLengths[1]) ? true : false;
2568 if ( (!fftPlan->transpose_in_2d_inplace) && fftPlan->tmpBufSize==0 && fftPlan->length.size()<=2 )
2570 if ((smallerDim % 64 == 0) || (biggerDim % 64 == 0))
2574 // we need tmp buffer for x!=y case
2575 // we assume the tmp buffer is packed interleaved
2576 fftPlan->tmpBufSize = (smallerDim + padding) * biggerDim *
2577 fftPlan->batchsize * fftPlan->ElementSize();
2580 OPENCL_V(clfftCreateDefaultPlanInternal( &fftPlan->planTX, fftPlan->context, CLFFT_2D, clLengths ),
2581 _T( "CreateDefaultPlan for planT failed" ) );
2583 FFTPlan* transPlanX = NULL;
2584 lockRAII* transLockX = NULL;
2585 OPENCL_V( fftRepo.getPlan( fftPlan->planTX, transPlanX, transLockX ), _T( "fftRepo.getPlan failed" ) );
2587 transPlanX->inputLayout = fftPlan->outputLayout;
2588 transPlanX->precision = fftPlan->precision;
2589 transPlanX->tmpBufSize = 0;
2591 transPlanX->envelope = fftPlan->envelope;
2592 transPlanX->batchsize = fftPlan->batchsize;
2593 transPlanX->inStride[0] = fftPlan->outStride[0];
2594 transPlanX->inStride[1] = fftPlan->outStride[1];
2595 transPlanX->iDist = fftPlan->oDist;
2596 transPlanX->transflag = true;
2598 if (!fftPlan->transpose_in_2d_inplace)
2600 transPlanX->gen = Transpose_GCN;
2601 transPlanX->outputLayout = CLFFT_COMPLEX_INTERLEAVED;
2602 transPlanX->placeness = CLFFT_OUTOFPLACE;
2603 transPlanX->outStride[0] = 1;
2604 transPlanX->outStride[1] = clLengths[1] + padding;
2605 transPlanX->oDist = clLengths[0] * transPlanX->outStride[1];
2609 transPlanX->gen = Transpose_SQUARE;
2610 transPlanX->outputLayout = fftPlan->outputLayout;
2611 transPlanX->placeness = CLFFT_INPLACE;
2612 transPlanX->outStride[0] = fftPlan->outStride[0];
2613 transPlanX->outStride[1] = fftPlan->outStride[1];
2614 transPlanX->oDist = fftPlan->oDist;
2617 OPENCL_V(clfftBakePlan(fftPlan->planTX, numQueues, commQueueFFT, NULL, NULL ),
2618 _T( "BakePlan for planTX failed" ) );
2620 //create second row plan
2621 //x!=y: tmp->tmp, x=y case: In->In or Out->Out
2622 //if Transposed result is a choice x!=y: tmp->In or out
2623 OPENCL_V(clfftCreateDefaultPlanInternal( &fftPlan->planY, fftPlan->context, CLFFT_1D, &fftPlan->length[ DimY ] ),
2624 _T( "CreateDefaultPlan for planY failed" ) );
2626 FFTPlan* colPlan = NULL;
2627 lockRAII* colLock = NULL;
2628 OPENCL_V( fftRepo.getPlan( fftPlan->planY, colPlan, colLock ), _T( "fftRepo.getPlan failed" ) );
2630 if (!fftPlan->transpose_in_2d_inplace)
2632 colPlan->inputLayout = CLFFT_COMPLEX_INTERLEAVED;
2633 colPlan->inStride[0] = 1;
2634 colPlan->inStride.push_back(clLengths[1] + padding);
2635 colPlan->iDist = clLengths[0] * colPlan->inStride[1];
2637 if (fftPlan->transposed == CLFFT_NOTRANSPOSE)
2639 colPlan->outputLayout = CLFFT_COMPLEX_INTERLEAVED;
2640 colPlan->outStride[0] = 1;
2641 colPlan->outStride.push_back(clLengths[1] + padding);
2642 colPlan->oDist = clLengths[0] * colPlan->outStride[1];
2643 colPlan->placeness = CLFFT_INPLACE;
2647 colPlan->outputLayout = fftPlan->outputLayout;
2648 colPlan->outStride[0] = fftPlan->outStride[0];
2649 colPlan->outStride.push_back(clLengths[1] * fftPlan->outStride[0]);
2650 colPlan->oDist = fftPlan->oDist;
2651 colPlan->placeness = CLFFT_OUTOFPLACE;
2656 colPlan->inputLayout = fftPlan->outputLayout;
2657 colPlan->outputLayout = fftPlan->outputLayout;
2658 colPlan->outStride[0] = fftPlan->outStride[0];
2659 colPlan->outStride.push_back(fftPlan->outStride[1]);
2660 colPlan->oDist = fftPlan->oDist;
2661 colPlan->inStride[0] = fftPlan->outStride[0];
2662 colPlan->inStride.push_back(fftPlan->outStride[1]);
2663 colPlan->iDist = fftPlan->oDist;
2664 colPlan->placeness = CLFFT_INPLACE;
2667 colPlan->precision = fftPlan->precision;
2668 colPlan->forwardScale = fftPlan->forwardScale;
2669 colPlan->backwardScale = fftPlan->backwardScale;
2670 colPlan->tmpBufSize = 0;
2672 colPlan->gen = fftPlan->gen;
2673 colPlan->envelope = fftPlan->envelope;
2674 colPlan->batchsize = fftPlan->batchsize;
2675 colPlan->length.push_back(fftPlan->length[0]);
2677 OPENCL_V(clfftBakePlan(fftPlan->planY, numQueues, commQueueFFT, NULL, NULL ),
2678 _T( "BakePlan for planY failed" ) );
2680 if (fftPlan->transposed == CLFFT_TRANSPOSED)
2682 fftPlan->baked = true;
2683 return CLFFT_SUCCESS;
2686 //Create transpose plan for second transpose
2687 //x!=y case tmp->In or Out, x=y case In->In or Out->out
2688 size_t clLengthsY[2] = { clLengths[1], clLengths[0] };
2689 OPENCL_V(clfftCreateDefaultPlanInternal( &fftPlan->planTY, fftPlan->context, CLFFT_2D, clLengthsY ),
2690 _T( "CreateDefaultPlan for planTY failed" ) );
2692 FFTPlan* transPlanY = NULL;
2693 lockRAII* transLockY = NULL;
2694 OPENCL_V( fftRepo.getPlan( fftPlan->planTY, transPlanY, transLockY ), _T( "fftRepo.getPlan failed" ) );
2696 if (!fftPlan->transpose_in_2d_inplace)
2698 transPlanY->gen = Transpose_GCN;
2699 transPlanY->inputLayout = CLFFT_COMPLEX_INTERLEAVED;
2700 transPlanY->placeness = CLFFT_OUTOFPLACE;
2701 transPlanY->inStride[0] = 1;
2702 transPlanY->inStride[1] = clLengths[1] + padding;
2703 transPlanY->iDist = clLengths[0] * transPlanY->inStride[1];
2704 transPlanY->transOutHorizontal = true;
2708 transPlanY->gen = Transpose_SQUARE;
2709 transPlanY->inputLayout = fftPlan->outputLayout;
2710 transPlanY->placeness = CLFFT_INPLACE;
2711 transPlanY->inStride[0] = fftPlan->outStride[0];
2712 transPlanY->inStride[1] = fftPlan->outStride[1];
2713 transPlanY->iDist = fftPlan->oDist;
2715 transPlanY->outputLayout = fftPlan->outputLayout;
2716 transPlanY->outStride[0] = fftPlan->outStride[0];
2717 transPlanY->outStride[1] = fftPlan->outStride[1];
2718 transPlanY->oDist = fftPlan->oDist;
2719 transPlanY->precision = fftPlan->precision;
2720 transPlanY->tmpBufSize = 0;
2722 transPlanY->envelope = fftPlan->envelope;
2723 transPlanY->batchsize = fftPlan->batchsize;
2724 transPlanY->transflag = true;
2726 //Set callback data if set on top level plan
2727 if (fftPlan->hasPostCallback)
2729 transPlanY->hasPostCallback = true;
2730 transPlanY->postCallbackParam = fftPlan->postCallbackParam;
2731 transPlanY->postcallUserData = fftPlan->postcallUserData;
2734 OPENCL_V(clfftBakePlan(fftPlan->planTY, numQueues, commQueueFFT, NULL, NULL ),
2735 _T( "BakePlan for planTY failed" ) );
2737 fftPlan->baked = true;
2738 return CLFFT_SUCCESS;
2742 if (fftPlan->transposed != CLFFT_NOTRANSPOSE)
2743 return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
2746 if(fftPlan->inputLayout == CLFFT_REAL)
2748 length0 = fftPlan->length[0];
2749 length1 = fftPlan->length[1];
2751 size_t Nt = (1 + length0/2);
2755 // real to hermitian
2758 OPENCL_V(clfftCreateDefaultPlanInternal( &fftPlan->planX, fftPlan->context, CLFFT_1D, &fftPlan->length[ DimX ] ),
2759 _T( "CreateDefaultPlan for planX failed" ) );
2761 FFTPlan* rowPlan = NULL;
2762 lockRAII* rowLock = NULL;
2763 OPENCL_V( fftRepo.getPlan( fftPlan->planX, rowPlan, rowLock ), _T( "fftRepo.getPlan failed" ) );
2766 rowPlan->outputLayout = fftPlan->outputLayout;
2767 rowPlan->inputLayout = fftPlan->inputLayout;
2768 rowPlan->placeness = fftPlan->placeness;
2769 rowPlan->length.push_back(length1);
2771 rowPlan->inStride[0] = fftPlan->inStride[0];
2772 rowPlan->inStride.push_back(fftPlan->inStride[1]);
2773 rowPlan->iDist = fftPlan->iDist;
2775 rowPlan->precision = fftPlan->precision;
2776 rowPlan->forwardScale = 1.0f;
2777 rowPlan->backwardScale = 1.0f;
2778 rowPlan->tmpBufSize = 0;
2780 rowPlan->gen = fftPlan->gen;
2781 rowPlan->envelope = fftPlan->envelope;
2783 rowPlan->batchsize = fftPlan->batchsize;
2785 rowPlan->outStride[0] = fftPlan->outStride[0];
2786 rowPlan->outStride.push_back(fftPlan->outStride[1]);
2787 rowPlan->oDist = fftPlan->oDist;
2789 //this 2d is decomposed from 3d
2790 for (size_t index=2; index < fftPlan->length.size(); index++)
2792 rowPlan->length.push_back(fftPlan->length[index]);
2793 rowPlan->inStride.push_back(fftPlan->inStride[index]);
2794 rowPlan->outStride.push_back(fftPlan->outStride[index]);
2797 //Set callback data if set on top level plan
2798 if (fftPlan->hasPreCallback)
2800 rowPlan->hasPreCallback = true;
2801 rowPlan->preCallback = fftPlan->preCallback;
2802 rowPlan->precallUserData = fftPlan->precallUserData;
2805 OPENCL_V(clfftBakePlan(fftPlan->planX, numQueues, commQueueFFT, NULL, NULL ), _T( "BakePlan for planX failed" ) );
2807 if( (rowPlan->inStride[0] == 1) && (rowPlan->outStride[0] == 1) &&
2808 ( ((rowPlan->inStride[1] == Nt*2) && (rowPlan->placeness == CLFFT_INPLACE)) ||
2809 ((rowPlan->inStride[1] == length0) && (rowPlan->placeness == CLFFT_OUTOFPLACE)) )
2810 && (rowPlan->outStride[1] == Nt) )
2812 // calc temp buf size
2813 if (fftPlan->tmpBufSize==0)
2815 fftPlan->tmpBufSize = Nt * length1 * fftPlan->batchsize * fftPlan->ElementSize();
2817 for (size_t index=2; index < fftPlan->length.size(); index++)
2819 fftPlan->tmpBufSize *= fftPlan->length[index];
2823 // create first transpose plan
2827 size_t transLengths[2] = { length0, length1 };
2828 OPENCL_V(clfftCreateDefaultPlanInternal( &fftPlan->planTX, fftPlan->context, CLFFT_2D, transLengths ),
2829 _T( "CreateDefaultPlan for planTX transpose failed" ) );
2831 FFTPlan* trans1Plan = NULL;
2832 lockRAII* trans1Lock = NULL;
2833 OPENCL_V( fftRepo.getPlan( fftPlan->planTX, trans1Plan, trans1Lock ), _T( "fftRepo.getPlan failed" ) );
2835 trans1Plan->transflag = true;
2837 transLengths[0] = Nt;
2838 OPENCL_V(clfftSetPlanLength( fftPlan->planTX, CLFFT_2D, transLengths ),
2839 _T( "clfftSetPlanLength for planTX transpose failed" ) );
2841 switch(fftPlan->outputLayout)
2843 case CLFFT_HERMITIAN_INTERLEAVED:
2845 trans1Plan->outputLayout = CLFFT_COMPLEX_INTERLEAVED;
2846 trans1Plan->inputLayout = CLFFT_COMPLEX_INTERLEAVED;
2849 case CLFFT_HERMITIAN_PLANAR:
2851 trans1Plan->outputLayout = CLFFT_COMPLEX_INTERLEAVED;
2852 trans1Plan->inputLayout = CLFFT_COMPLEX_PLANAR;
2855 default: assert(false);
2858 trans1Plan->placeness = CLFFT_OUTOFPLACE;
2859 trans1Plan->precision = fftPlan->precision;
2860 trans1Plan->tmpBufSize = 0;
2861 trans1Plan->batchsize = fftPlan->batchsize;
2862 trans1Plan->envelope = fftPlan->envelope;
2863 trans1Plan->forwardScale = 1.0f;
2864 trans1Plan->backwardScale = 1.0f;
2866 trans1Plan->inStride[0] = 1;
2867 trans1Plan->inStride[1] = Nt;
2868 trans1Plan->outStride[0] = 1;
2869 trans1Plan->outStride[1] = length1;
2870 trans1Plan->iDist = rowPlan->oDist;
2871 trans1Plan->oDist = Nt*length1;
2872 trans1Plan->transOutHorizontal = true;
2874 trans1Plan->gen = Transpose_GCN;
2877 for (size_t index=2; index < fftPlan->length.size(); index++)
2879 trans1Plan->length.push_back(fftPlan->length[index]);
2880 trans1Plan->inStride.push_back(rowPlan->outStride[index]);
2881 trans1Plan->outStride.push_back(trans1Plan->oDist);
2882 trans1Plan->oDist *= fftPlan->length[index];
2885 OPENCL_V(clfftBakePlan(fftPlan->planTX, numQueues, commQueueFFT, NULL, NULL ),
2886 _T( "BakePlan for planTX failed" ) );
2889 // Create column plan as a row plan
2890 OPENCL_V(clfftCreateDefaultPlanInternal( &fftPlan->planY, fftPlan->context, CLFFT_1D, &fftPlan->length[ DimY ] ),
2891 _T( "CreateDefaultPlan for planY failed" ) );
2893 FFTPlan* colPlan = NULL;
2894 lockRAII* colLock = NULL;
2895 OPENCL_V( fftRepo.getPlan( fftPlan->planY, colPlan, colLock ), _T( "fftRepo.getPlan failed" ) );
2897 colPlan->outputLayout = trans1Plan->outputLayout;
2898 colPlan->inputLayout = trans1Plan->outputLayout;
2899 colPlan->placeness = CLFFT_INPLACE;
2900 colPlan->length.push_back(Nt);
2902 colPlan->inStride[0] = 1;
2903 colPlan->inStride.push_back(length1);
2904 colPlan->iDist = Nt*length1;
2906 colPlan->outStride[0] = 1;
2907 colPlan->outStride.push_back(length1);
2908 colPlan->oDist = Nt*length1;
2910 colPlan->precision = fftPlan->precision;
2911 colPlan->forwardScale = fftPlan->forwardScale;
2912 colPlan->backwardScale = fftPlan->backwardScale;
2913 colPlan->tmpBufSize = 0;
2915 colPlan->gen = fftPlan->gen;
2916 colPlan->envelope = fftPlan->envelope;
2918 colPlan->batchsize = fftPlan->batchsize;
2920 //this 2d is decomposed from 3d
2921 for (size_t index=2; index < fftPlan->length.size(); index++)
2923 colPlan->length.push_back(fftPlan->length[index]);
2924 colPlan->inStride.push_back(colPlan->iDist);
2925 colPlan->outStride.push_back(colPlan->oDist);
2926 colPlan->iDist *= fftPlan->length[index];
2927 colPlan->oDist *= fftPlan->length[index];
2930 OPENCL_V(clfftBakePlan(fftPlan->planY, numQueues, commQueueFFT, NULL, NULL ),
2931 _T( "BakePlan for planY failed" ) );
2933 if (fftPlan->transposed == CLFFT_TRANSPOSED)
2935 fftPlan->baked = true;
2936 return CLFFT_SUCCESS;
2939 // create second transpose plan
2943 size_t trans2Lengths[2] = { length1, length0 };
2944 OPENCL_V(clfftCreateDefaultPlanInternal( &fftPlan->planTY, fftPlan->context, CLFFT_2D, trans2Lengths ),
2945 _T( "CreateDefaultPlan for planTY transpose failed" ) );
2947 FFTPlan* trans2Plan = NULL;
2948 lockRAII* trans2Lock = NULL;
2949 OPENCL_V( fftRepo.getPlan( fftPlan->planTY, trans2Plan, trans2Lock ), _T( "fftRepo.getPlan failed" ) );
2951 trans2Plan->transflag = true;
2953 trans2Lengths[1] = Nt;
2954 OPENCL_V(clfftSetPlanLength( fftPlan->planTY, CLFFT_2D, trans2Lengths ),
2955 _T( "clfftSetPlanLength for planTY transpose failed" ) );
2957 switch(fftPlan->outputLayout)
2959 case CLFFT_HERMITIAN_INTERLEAVED:
2961 trans2Plan->outputLayout = CLFFT_COMPLEX_INTERLEAVED;
2962 trans2Plan->inputLayout = CLFFT_COMPLEX_INTERLEAVED;
2965 case CLFFT_HERMITIAN_PLANAR:
2967 trans2Plan->outputLayout = CLFFT_COMPLEX_PLANAR;
2968 trans2Plan->inputLayout = CLFFT_COMPLEX_INTERLEAVED;
2971 default: assert(false);
2974 trans2Plan->placeness = CLFFT_OUTOFPLACE;
2975 trans2Plan->precision = fftPlan->precision;
2976 trans2Plan->tmpBufSize = 0;
2977 trans2Plan->batchsize = fftPlan->batchsize;
2978 trans2Plan->envelope = fftPlan->envelope;
2979 trans2Plan->forwardScale = 1.0f;
2980 trans2Plan->backwardScale = 1.0f;
2982 trans2Plan->inStride[0] = 1;
2983 trans2Plan->inStride[1] = length1;
2984 trans2Plan->outStride[0] = 1;
2985 trans2Plan->outStride[1] = Nt;
2986 trans2Plan->iDist = Nt*length1;
2987 trans2Plan->oDist = fftPlan->oDist;
2989 trans2Plan->gen = Transpose_GCN;
2990 trans2Plan->transflag = true;
2992 for (size_t index=2; index < fftPlan->length.size(); index++)
2994 trans2Plan->length.push_back(fftPlan->length[index]);
2995 trans2Plan->inStride.push_back(trans2Plan->iDist);
2996 trans2Plan->iDist *= fftPlan->length[index];
2997 trans2Plan->outStride.push_back(fftPlan->outStride[index]);
3001 //Set callback data if set on top level plan
3002 if (fftPlan->hasPostCallback)
3004 trans2Plan->hasPostCallback = true;
3005 trans2Plan->postCallbackParam = fftPlan->postCallbackParam;
3006 trans2Plan->postcallUserData = fftPlan->postcallUserData;
3009 OPENCL_V(clfftBakePlan(fftPlan->planTY, numQueues, commQueueFFT, NULL, NULL ),
3010 _T( "BakePlan for planTY failed" ) );
3016 // complex to complex
3018 OPENCL_V(clfftCreateDefaultPlanInternal( &fftPlan->planY, fftPlan->context, CLFFT_1D, &fftPlan->length[ DimY ] ),
3019 _T( "CreateDefaultPlan for planY failed" ) );
3021 FFTPlan* colPlan = NULL;
3022 lockRAII* colLock = NULL;
3023 OPENCL_V( fftRepo.getPlan( fftPlan->planY, colPlan, colLock ), _T( "fftRepo.getPlan failed" ) );
3025 switch(fftPlan->outputLayout)
3027 case CLFFT_HERMITIAN_INTERLEAVED:
3029 colPlan->outputLayout = CLFFT_COMPLEX_INTERLEAVED;
3030 colPlan->inputLayout = CLFFT_COMPLEX_INTERLEAVED;
3033 case CLFFT_HERMITIAN_PLANAR:
3035 colPlan->outputLayout = CLFFT_COMPLEX_PLANAR;
3036 colPlan->inputLayout = CLFFT_COMPLEX_PLANAR;
3039 default: assert(false);
3042 colPlan->placeness = CLFFT_INPLACE;
3043 colPlan->length.push_back(Nt);
3045 colPlan->outStride[0] = fftPlan->outStride[1];
3046 colPlan->outStride.push_back(fftPlan->outStride[0]);
3047 colPlan->oDist = fftPlan->oDist;
3050 colPlan->precision = fftPlan->precision;
3051 colPlan->forwardScale = fftPlan->forwardScale;
3052 colPlan->backwardScale = fftPlan->backwardScale;
3053 colPlan->tmpBufSize = fftPlan->tmpBufSize;
3055 colPlan->gen = fftPlan->gen;
3056 colPlan->envelope = fftPlan->envelope;
3058 colPlan->batchsize = fftPlan->batchsize;
3060 colPlan->inStride[0] = rowPlan->outStride[1];
3061 colPlan->inStride.push_back(rowPlan->outStride[0]);
3062 colPlan->iDist = rowPlan->oDist;
3064 //this 2d is decomposed from 3d
3065 for (size_t index=2; index < fftPlan->length.size(); index++)
3067 colPlan->length.push_back(fftPlan->length[index]);
3068 colPlan->outStride.push_back(fftPlan->outStride[index]);
3069 colPlan->inStride.push_back(rowPlan->outStride[index]);
3072 //Set callback data if set on top level plan
3073 if (fftPlan->hasPostCallback)
3075 colPlan->hasPostCallback = true;
3076 colPlan->postCallbackParam = fftPlan->postCallbackParam;
3077 colPlan->postcallUserData = fftPlan->postcallUserData;
3080 OPENCL_V(clfftBakePlan(fftPlan->planY, numQueues, commQueueFFT, NULL, NULL ), _T( "BakePlan for planY failed" ) );
3084 else if(fftPlan->outputLayout == CLFFT_REAL)
3086 length0 = fftPlan->length[0];
3087 length1 = fftPlan->length[1];
3089 size_t Nt = (1 + length0/2);
3090 if (fftPlan->tmpBufSize==0)
3092 fftPlan->tmpBufSize = Nt * length1 * fftPlan->batchsize * fftPlan->ElementSize();
3093 for (size_t index=2; index < fftPlan->length.size(); index++)
3094 fftPlan->tmpBufSize *= fftPlan->length[index];
3097 if ((fftPlan->tmpBufSizeC2R==0) && (fftPlan->placeness == CLFFT_OUTOFPLACE) && (fftPlan->length.size() == 2))
3099 fftPlan->tmpBufSizeC2R = fftPlan->tmpBufSize;
3102 if( (fftPlan->inStride[0] == 1) && (fftPlan->outStride[0] == 1) &&
3103 ( ((fftPlan->outStride[1] == Nt*2) && (fftPlan->oDist == Nt*2*length1) && (fftPlan->placeness == CLFFT_INPLACE)) ||
3104 ((fftPlan->outStride[1] == length0) && (fftPlan->oDist == length0*length1) && (fftPlan->placeness == CLFFT_OUTOFPLACE)) )
3105 && (fftPlan->inStride[1] == Nt) && (fftPlan->iDist == Nt*length1) )
3107 // create first transpose plan
3111 size_t transLengths[2] = { length0, length1 };
3112 OPENCL_V(clfftCreateDefaultPlanInternal( &fftPlan->planTY, fftPlan->context, CLFFT_2D, transLengths ),
3113 _T( "CreateDefaultPlan for planTY transpose failed" ) );
3115 FFTPlan* trans1Plan = NULL;
3116 lockRAII* trans1Lock = NULL;
3117 OPENCL_V( fftRepo.getPlan( fftPlan->planTY, trans1Plan, trans1Lock ), _T( "fftRepo.getPlan failed" ) );
3119 trans1Plan->transflag = true;
3121 transLengths[0] = Nt;
3122 OPENCL_V(clfftSetPlanLength( fftPlan->planTY, CLFFT_2D, transLengths ),
3123 _T( "clfftSetPlanLength for planTY transpose failed" ) );
3125 switch(fftPlan->inputLayout)
3127 case CLFFT_HERMITIAN_INTERLEAVED:
3129 trans1Plan->outputLayout = CLFFT_COMPLEX_INTERLEAVED;
3130 trans1Plan->inputLayout = CLFFT_COMPLEX_INTERLEAVED;
3133 case CLFFT_HERMITIAN_PLANAR:
3135 trans1Plan->outputLayout = CLFFT_COMPLEX_INTERLEAVED;
3136 trans1Plan->inputLayout = CLFFT_COMPLEX_PLANAR;
3139 default: assert(false);
3142 trans1Plan->placeness = CLFFT_OUTOFPLACE;
3143 trans1Plan->precision = fftPlan->precision;
3144 trans1Plan->tmpBufSize = 0;
3145 trans1Plan->batchsize = fftPlan->batchsize;
3146 trans1Plan->envelope = fftPlan->envelope;
3147 trans1Plan->forwardScale = 1.0f;
3148 trans1Plan->backwardScale = 1.0f;
3150 trans1Plan->inStride[0] = 1;
3151 trans1Plan->inStride[1] = Nt;
3152 trans1Plan->outStride[0] = 1;
3153 trans1Plan->outStride[1] = length1;
3154 trans1Plan->iDist = fftPlan->iDist;
3155 trans1Plan->oDist = Nt*length1;
3156 trans1Plan->transOutHorizontal = true;
3158 trans1Plan->gen = Transpose_GCN;
3161 for (size_t index=2; index < fftPlan->length.size(); index++)
3163 trans1Plan->length.push_back(fftPlan->length[index]);
3164 trans1Plan->inStride.push_back(fftPlan->inStride[index]);
3165 trans1Plan->outStride.push_back(trans1Plan->oDist);
3166 trans1Plan->oDist *= fftPlan->length[index];
3169 //Set callback data if set on top level plan
3170 if (fftPlan->hasPreCallback)
3172 trans1Plan->hasPreCallback = true;
3173 trans1Plan->preCallback = fftPlan->preCallback;
3174 trans1Plan->precallUserData = fftPlan->precallUserData;
3177 OPENCL_V(clfftBakePlan(fftPlan->planTY, numQueues, commQueueFFT, NULL, NULL ),
3178 _T( "BakePlan for planTY failed" ) );
3181 // complex to complex
3183 OPENCL_V(clfftCreateDefaultPlanInternal( &fftPlan->planY, fftPlan->context, CLFFT_1D, &fftPlan->length[ DimY ] ),
3184 _T( "CreateDefaultPlan for planY failed" ) );
3186 FFTPlan* colPlan = NULL;
3187 lockRAII* colLock = NULL;
3188 OPENCL_V( fftRepo.getPlan( fftPlan->planY, colPlan, colLock ), _T( "fftRepo.getPlan failed" ) );
3190 colPlan->length.push_back(Nt);
3192 colPlan->inStride[0] = 1;
3193 colPlan->inStride.push_back(length1);
3194 colPlan->iDist = trans1Plan->oDist;
3196 colPlan->placeness = CLFFT_INPLACE;
3197 colPlan->inputLayout = CLFFT_COMPLEX_INTERLEAVED;
3198 colPlan->outputLayout = CLFFT_COMPLEX_INTERLEAVED;
3200 colPlan->outStride[0] = colPlan->inStride[0];
3201 colPlan->outStride.push_back(colPlan->inStride[1]);
3202 colPlan->oDist = colPlan->iDist;
3204 for (size_t index=2; index < fftPlan->length.size(); index++)
3206 colPlan->length.push_back(fftPlan->length[index]);
3207 colPlan->inStride.push_back(trans1Plan->outStride[index]);
3208 colPlan->outStride.push_back(trans1Plan->outStride[index]);
3212 colPlan->precision = fftPlan->precision;
3213 colPlan->forwardScale = 1.0f;
3214 colPlan->backwardScale = 1.0f;
3215 colPlan->tmpBufSize = 0;
3217 colPlan->gen = fftPlan->gen;
3218 colPlan->envelope = fftPlan->envelope;
3220 colPlan->batchsize = fftPlan->batchsize;
3222 OPENCL_V(clfftBakePlan(fftPlan->planY, numQueues, commQueueFFT, NULL, NULL ), _T( "BakePlan for planY failed" ) );
3224 // create second transpose plan
3228 size_t trans2Lengths[2] = { length1, length0 };
3229 OPENCL_V(clfftCreateDefaultPlanInternal( &fftPlan->planTX, fftPlan->context, CLFFT_2D, trans2Lengths ),
3230 _T( "CreateDefaultPlan for planTX transpose failed" ) );
3232 FFTPlan* trans2Plan = NULL;
3233 lockRAII* trans2Lock = NULL;
3234 OPENCL_V( fftRepo.getPlan( fftPlan->planTX, trans2Plan, trans2Lock ), _T( "fftRepo.getPlan failed" ) );
3236 trans2Plan->transflag = true;
3238 trans2Lengths[1] = Nt;
3239 OPENCL_V(clfftSetPlanLength( fftPlan->planTX, CLFFT_2D, trans2Lengths ),
3240 _T( "clfftSetPlanLength for planTX transpose failed" ) );
3243 trans2Plan->outputLayout = CLFFT_COMPLEX_INTERLEAVED;
3244 trans2Plan->inputLayout = CLFFT_COMPLEX_INTERLEAVED;
3247 trans2Plan->placeness = CLFFT_OUTOFPLACE;
3248 trans2Plan->precision = fftPlan->precision;
3249 trans2Plan->tmpBufSize = 0;
3250 trans2Plan->batchsize = fftPlan->batchsize;
3251 trans2Plan->envelope = fftPlan->envelope;
3252 trans2Plan->forwardScale = 1.0f;
3253 trans2Plan->backwardScale = 1.0f;
3255 trans2Plan->inStride[0] = 1;
3256 trans2Plan->inStride[1] = length1;
3257 trans2Plan->outStride[0] = 1;
3258 trans2Plan->outStride[1] = Nt;
3259 trans2Plan->iDist = colPlan->oDist;
3260 trans2Plan->oDist = Nt*length1;
3262 trans2Plan->gen = Transpose_GCN;
3263 trans2Plan->transflag = true;
3265 for (size_t index=2; index < fftPlan->length.size(); index++)
3267 trans2Plan->length.push_back(fftPlan->length[index]);
3268 trans2Plan->inStride.push_back(colPlan->outStride[index]);
3269 trans2Plan->outStride.push_back(trans2Plan->oDist);
3270 trans2Plan->oDist *= fftPlan->length[index];
3274 OPENCL_V(clfftBakePlan(fftPlan->planTX, numQueues, commQueueFFT, NULL, NULL ),
3275 _T( "BakePlan for planTX failed" ) );
3278 // hermitian to real
3281 OPENCL_V(clfftCreateDefaultPlanInternal( &fftPlan->planX, fftPlan->context, CLFFT_1D, &fftPlan->length[ DimX ] ),
3282 _T( "CreateDefaultPlan for planX failed" ) );
3284 FFTPlan* rowPlan = NULL;
3285 lockRAII* rowLock = NULL;
3286 OPENCL_V( fftRepo.getPlan( fftPlan->planX, rowPlan, rowLock ), _T( "fftRepo.getPlan failed" ) );
3288 rowPlan->outputLayout = fftPlan->outputLayout;
3289 rowPlan->inputLayout = CLFFT_HERMITIAN_INTERLEAVED;
3291 rowPlan->length.push_back(length1);
3293 rowPlan->outStride[0] = fftPlan->outStride[0];
3294 rowPlan->outStride.push_back(fftPlan->outStride[1]);
3295 rowPlan->oDist = fftPlan->oDist;
3297 rowPlan->inStride[0] = trans2Plan->outStride[0];
3298 rowPlan->inStride.push_back(trans2Plan->outStride[1]);
3299 rowPlan->iDist = trans2Plan->oDist;
3301 for (size_t index=2; index < fftPlan->length.size(); index++)
3303 rowPlan->length.push_back(fftPlan->length[index]);
3304 rowPlan->inStride.push_back(trans2Plan->outStride[index]);
3305 rowPlan->outStride.push_back(fftPlan->outStride[index]);
3308 if (fftPlan->placeness == CLFFT_INPLACE)
3310 rowPlan->placeness = CLFFT_INPLACE;
3314 rowPlan->placeness = CLFFT_OUTOFPLACE;
3318 rowPlan->precision = fftPlan->precision;
3319 rowPlan->forwardScale = fftPlan->forwardScale;
3320 rowPlan->backwardScale = fftPlan->backwardScale;
3321 rowPlan->tmpBufSize = 0;
3323 rowPlan->gen = fftPlan->gen;
3324 rowPlan->envelope = fftPlan->envelope;
3326 rowPlan->batchsize = fftPlan->batchsize;
3328 //Set callback data if set on top level plan
3329 if (fftPlan->hasPostCallback)
3331 rowPlan->hasPostCallback = true;
3332 rowPlan->postCallbackParam = fftPlan->postCallbackParam;
3333 rowPlan->postcallUserData = fftPlan->postcallUserData;
3336 OPENCL_V(clfftBakePlan(fftPlan->planX, numQueues, commQueueFFT, NULL, NULL ), _T( "BakePlan for planX failed" ) );
3342 // complex to complex
3344 OPENCL_V(clfftCreateDefaultPlanInternal( &fftPlan->planY, fftPlan->context, CLFFT_1D, &fftPlan->length[ DimY ] ),
3345 _T( "CreateDefaultPlan for planY failed" ) );
3347 FFTPlan* colPlan = NULL;
3348 lockRAII* colLock = NULL;
3349 OPENCL_V( fftRepo.getPlan( fftPlan->planY, colPlan, colLock ), _T( "fftRepo.getPlan failed" ) );
3352 switch(fftPlan->inputLayout)
3354 case CLFFT_HERMITIAN_INTERLEAVED:
3356 colPlan->outputLayout = CLFFT_COMPLEX_INTERLEAVED;
3357 colPlan->inputLayout = CLFFT_COMPLEX_INTERLEAVED;
3360 case CLFFT_HERMITIAN_PLANAR:
3362 colPlan->outputLayout = CLFFT_COMPLEX_INTERLEAVED;
3363 colPlan->inputLayout = CLFFT_COMPLEX_PLANAR;
3366 default: assert(false);
3370 colPlan->length.push_back(Nt);
3372 colPlan->inStride[0] = fftPlan->inStride[1];
3373 colPlan->inStride.push_back(fftPlan->inStride[0]);
3374 colPlan->iDist = fftPlan->iDist;
3377 if (fftPlan->placeness == CLFFT_INPLACE)
3379 colPlan->placeness = CLFFT_INPLACE;
3383 if(fftPlan->length.size() > 2)
3384 colPlan->placeness = CLFFT_INPLACE;
3386 colPlan->placeness = CLFFT_OUTOFPLACE;
3389 if(colPlan->placeness == CLFFT_INPLACE)
3391 colPlan->outStride[0] = colPlan->inStride[0];
3392 colPlan->outStride.push_back(colPlan->inStride[1]);
3393 colPlan->oDist = colPlan->iDist;
3395 for (size_t index=2; index < fftPlan->length.size(); index++)
3397 colPlan->length.push_back(fftPlan->length[index]);
3398 colPlan->inStride.push_back(fftPlan->inStride[index]);
3399 colPlan->outStride.push_back(fftPlan->inStride[index]);
3404 colPlan->outStride[0] = Nt;
3405 colPlan->outStride.push_back(1);
3406 colPlan->oDist = Nt*length1;
3408 for (size_t index=2; index < fftPlan->length.size(); index++)
3410 colPlan->length.push_back(fftPlan->length[index]);
3411 colPlan->inStride.push_back(fftPlan->inStride[index]);
3412 colPlan->outStride.push_back(colPlan->oDist);
3413 colPlan->oDist *= fftPlan->length[index];
3417 colPlan->precision = fftPlan->precision;
3418 colPlan->forwardScale = 1.0f;
3419 colPlan->backwardScale = 1.0f;
3420 colPlan->tmpBufSize = 0;
3422 colPlan->gen = fftPlan->gen;
3423 colPlan->envelope = fftPlan->envelope;
3425 colPlan->batchsize = fftPlan->batchsize;
3427 //Set callback data if set on top level plan
3428 if (fftPlan->hasPreCallback)
3430 colPlan->hasPreCallback = true;
3431 colPlan->preCallback = fftPlan->preCallback;
3432 colPlan->precallUserData = fftPlan->precallUserData;
3435 OPENCL_V(clfftBakePlan(fftPlan->planY, numQueues, commQueueFFT, NULL, NULL ), _T( "BakePlan for planY failed" ) );
3438 // hermitian to real
3441 OPENCL_V(clfftCreateDefaultPlanInternal( &fftPlan->planX, fftPlan->context, CLFFT_1D, &fftPlan->length[ DimX ] ),
3442 _T( "CreateDefaultPlan for planX failed" ) );
3444 FFTPlan* rowPlan = NULL;
3445 lockRAII* rowLock = NULL;
3446 OPENCL_V( fftRepo.getPlan( fftPlan->planX, rowPlan, rowLock ), _T( "fftRepo.getPlan failed" ) );
3448 rowPlan->outputLayout = fftPlan->outputLayout;
3449 rowPlan->inputLayout = CLFFT_HERMITIAN_INTERLEAVED;
3451 rowPlan->length.push_back(length1);
3453 rowPlan->outStride[0] = fftPlan->outStride[0];
3454 rowPlan->outStride.push_back(fftPlan->outStride[1]);
3455 rowPlan->oDist = fftPlan->oDist;
3457 if (fftPlan->placeness == CLFFT_INPLACE)
3459 rowPlan->placeness = CLFFT_INPLACE;
3461 rowPlan->inStride[0] = colPlan->outStride[1];
3462 rowPlan->inStride.push_back(colPlan->outStride[0]);
3463 rowPlan->iDist = colPlan->oDist;
3465 for (size_t index=2; index < fftPlan->length.size(); index++)
3467 rowPlan->length.push_back(fftPlan->length[index]);
3468 rowPlan->inStride.push_back(colPlan->outStride[index]);
3469 rowPlan->outStride.push_back(fftPlan->outStride[index]);
3474 rowPlan->placeness = CLFFT_OUTOFPLACE;
3476 rowPlan->inStride[0] = 1;
3477 rowPlan->inStride.push_back(Nt);
3478 rowPlan->iDist = Nt*length1;
3480 for (size_t index=2; index < fftPlan->length.size(); index++)
3482 rowPlan->length.push_back(fftPlan->length[index]);
3483 rowPlan->outStride.push_back(fftPlan->outStride[index]);
3484 rowPlan->inStride.push_back(rowPlan->iDist);
3485 rowPlan->iDist *= fftPlan->length[index];
3490 rowPlan->precision = fftPlan->precision;
3491 rowPlan->forwardScale = fftPlan->forwardScale;
3492 rowPlan->backwardScale = fftPlan->backwardScale;
3493 rowPlan->tmpBufSize = 0;
3495 rowPlan->gen = fftPlan->gen;
3496 rowPlan->envelope = fftPlan->envelope;
3498 rowPlan->batchsize = fftPlan->batchsize;
3500 //Set callback data if set on top level plan
3501 if (fftPlan->hasPostCallback)
3503 rowPlan->hasPostCallback = true;
3504 rowPlan->postCallbackParam = fftPlan->postCallbackParam;
3505 rowPlan->postcallUserData = fftPlan->postcallUserData;
3508 OPENCL_V(clfftBakePlan(fftPlan->planX, numQueues, commQueueFFT, NULL, NULL ), _T( "BakePlan for planX failed" ) );
3513 if (fftPlan->tmpBufSize==0 && fftPlan->length.size()<=2)
3515 fftPlan->tmpBufSize = length0 * length1 *
3516 fftPlan->batchsize * fftPlan->ElementSize();
3520 OPENCL_V(clfftCreateDefaultPlanInternal( &fftPlan->planX, fftPlan->context, CLFFT_1D, &fftPlan->length[ DimX ] ),
3521 _T( "CreateDefaultPlan for planX failed" ) );
3523 FFTPlan* rowPlan = NULL;
3524 lockRAII* rowLock = NULL;
3525 OPENCL_V( fftRepo.getPlan( fftPlan->planX, rowPlan, rowLock ), _T( "fftRepo.getPlan failed" ) );
3527 rowPlan->inputLayout = fftPlan->inputLayout;
3528 if (fftPlan->large2D || fftPlan->length.size()>2)
3530 rowPlan->outputLayout = fftPlan->outputLayout;
3531 rowPlan->placeness = fftPlan->placeness;
3532 rowPlan->outStride[0] = fftPlan->outStride[0];
3533 rowPlan->outStride.push_back(fftPlan->outStride[1]);
3534 rowPlan->oDist = fftPlan->oDist;
3538 rowPlan->outputLayout = CLFFT_COMPLEX_INTERLEAVED;
3539 rowPlan->placeness = CLFFT_OUTOFPLACE;
3540 rowPlan->outStride[0] = length1;//1;
3541 rowPlan->outStride.push_back(1);//length0);
3542 rowPlan->oDist = length0 * length1;
3544 rowPlan->precision = fftPlan->precision;
3545 rowPlan->forwardScale = 1.0f;
3546 rowPlan->backwardScale = 1.0f;
3547 rowPlan->tmpBufSize = fftPlan->tmpBufSize;
3549 rowPlan->gen = fftPlan->gen;
3550 rowPlan->envelope = fftPlan->envelope;
3552 // This is the row FFT: the distance between the first elements of the first two FFTs is the distance between the first
3553 // elements of the first two rows in the original buffer.
3554 rowPlan->batchsize = fftPlan->batchsize;
3555 rowPlan->inStride[0] = fftPlan->inStride[0];
3557 //pass length and other info to kernel, so the kernel knows this is decomposed from higher dimension
3558 rowPlan->length.push_back(fftPlan->length[1]);
3559 rowPlan->inStride.push_back(fftPlan->inStride[1]);
3561 //this 2d is decomposed from 3d
3562 if (fftPlan->length.size()>2)
3564 rowPlan->length.push_back(fftPlan->length[2]);
3565 rowPlan->inStride.push_back(fftPlan->inStride[2]);
3566 rowPlan->outStride.push_back(fftPlan->outStride[2]);
3569 rowPlan->iDist = fftPlan->iDist;
3571 //Set callback data if set on top level plan
3572 if (fftPlan->hasPreCallback)
3574 rowPlan->hasPreCallback = true;
3575 rowPlan->preCallback = fftPlan->preCallback;
3576 rowPlan->precallUserData = fftPlan->precallUserData;
3579 OPENCL_V(clfftBakePlan(fftPlan->planX, numQueues, commQueueFFT, NULL, NULL ), _T( "BakePlan for planX failed" ) );
3582 OPENCL_V(clfftCreateDefaultPlanInternal( &fftPlan->planY, fftPlan->context, CLFFT_1D, &fftPlan->length[ DimY ] ),
3583 _T( "CreateDefaultPlan for planY failed" ) );
3585 FFTPlan* colPlan = NULL;
3586 lockRAII* colLock = NULL;
3587 OPENCL_V( fftRepo.getPlan( fftPlan->planY, colPlan, colLock ), _T( "fftRepo.getPlan failed" ) );
3589 if (fftPlan->large2D || fftPlan->length.size()>2)
3591 colPlan->inputLayout = fftPlan->outputLayout;
3592 colPlan->placeness = CLFFT_INPLACE;
3593 colPlan->inStride[0] = fftPlan->outStride[1];
3594 colPlan->inStride.push_back(fftPlan->outStride[0]);
3595 colPlan->iDist = fftPlan->oDist;
3599 colPlan->inputLayout = CLFFT_COMPLEX_INTERLEAVED;
3600 colPlan->placeness = CLFFT_OUTOFPLACE;
3601 colPlan->inStride[0] = 1;//length0;
3602 colPlan->inStride.push_back(length1);//1);
3603 colPlan->iDist = length0 * length1;
3606 colPlan->outputLayout = fftPlan->outputLayout;
3607 colPlan->precision = fftPlan->precision;
3608 colPlan->forwardScale = fftPlan->forwardScale;
3609 colPlan->backwardScale = fftPlan->backwardScale;
3610 colPlan->tmpBufSize = fftPlan->tmpBufSize;
3612 colPlan->gen = fftPlan->gen;
3613 colPlan->envelope = fftPlan->envelope;
3615 // This is a column FFT: the distance between the first elements of consecutive FFTs is the distance between the first two
3616 // elements in the original buffer, like a transpose of the matrix.
3617 colPlan->batchsize = fftPlan->batchsize;
3618 colPlan->outStride[0] = fftPlan->outStride[1];
3620 //pass length and other info to kernel, so the kernel knows this is decomposed from higher dimension
3621 colPlan->length.push_back(fftPlan->length[0]);
3622 colPlan->outStride.push_back(fftPlan->outStride[0]);
3623 colPlan->oDist = fftPlan->oDist;
3625 //this 2d is decomposed from 3d
3626 if (fftPlan->length.size()>2)
3628 //assert(fftPlan->large2D);
3629 colPlan->length.push_back(fftPlan->length[2]);
3630 colPlan->inStride.push_back(fftPlan->outStride[2]);
3631 colPlan->outStride.push_back(fftPlan->outStride[2]);
3634 //Set callback data if set on top level plan
3635 if (fftPlan->hasPostCallback)
3637 colPlan->hasPostCallback = true;
3638 colPlan->postCallbackParam = fftPlan->postCallbackParam;
3639 colPlan->postcallUserData = fftPlan->postcallUserData;
3642 OPENCL_V(clfftBakePlan(fftPlan->planY, numQueues, commQueueFFT, NULL, NULL ), _T( "BakePlan for planY failed" ) );
3645 fftPlan->baked = true;
3646 return CLFFT_SUCCESS;
3650 if(fftPlan->inputLayout == CLFFT_REAL)
3653 size_t length0 = fftPlan->length[ DimX ];
3654 size_t length1 = fftPlan->length[ DimY ];
3655 size_t length2 = fftPlan->length[ DimZ ];
3657 size_t Nt = (1 + length0/2);
3661 size_t clLengths[] = { length0, length1, 0 };
3662 OPENCL_V(clfftCreateDefaultPlanInternal( &fftPlan->planX, fftPlan->context, CLFFT_2D, clLengths ),
3663 _T( "CreateDefaultPlan 2D planX failed" ) );
3665 FFTPlan* xyPlan = NULL;
3666 lockRAII* rowLock = NULL;
3667 OPENCL_V( fftRepo.getPlan( fftPlan->planX, xyPlan, rowLock ), _T( "fftRepo.getPlan failed" ) );
3669 xyPlan->inputLayout = fftPlan->inputLayout;
3670 xyPlan->outputLayout = fftPlan->outputLayout;
3671 xyPlan->placeness = fftPlan->placeness;
3672 xyPlan->precision = fftPlan->precision;
3673 xyPlan->forwardScale = 1.0f;
3674 xyPlan->backwardScale = 1.0f;
3675 xyPlan->tmpBufSize = fftPlan->tmpBufSize;
3677 xyPlan->gen = fftPlan->gen;
3678 xyPlan->envelope = fftPlan->envelope;
3680 // This is the xy FFT: the distance between the first elements of the first two FFTs is the distance between the first
3681 // elements of the first two rows in the original buffer.
3682 xyPlan->batchsize = fftPlan->batchsize;
3683 xyPlan->inStride[0] = fftPlan->inStride[0];
3684 xyPlan->inStride[1] = fftPlan->inStride[1];
3685 xyPlan->outStride[0] = fftPlan->outStride[0];
3686 xyPlan->outStride[1] = fftPlan->outStride[1];
3688 //pass length and other info to kernel, so the kernel knows this is decomposed from higher dimension
3689 xyPlan->length.push_back(fftPlan->length[2]);
3690 xyPlan->inStride.push_back(fftPlan->inStride[2]);
3691 xyPlan->outStride.push_back(fftPlan->outStride[2]);
3692 xyPlan->iDist = fftPlan->iDist;
3693 xyPlan->oDist = fftPlan->oDist;
3695 //this 3d is decomposed from 4d
3696 for (size_t index=3; index < fftPlan->length.size(); index++)
3698 xyPlan->length.push_back(fftPlan->length[index]);
3699 xyPlan->inStride.push_back(fftPlan->inStride[index]);
3700 xyPlan->outStride.push_back(fftPlan->outStride[index]);
3703 //Set callback data if set on top level plan
3704 if (fftPlan->hasPreCallback)
3706 xyPlan->hasPreCallback = true;
3707 xyPlan->preCallback = fftPlan->preCallback;
3708 xyPlan->precallUserData = fftPlan->precallUserData;
3711 OPENCL_V(clfftBakePlan(fftPlan->planX, numQueues, commQueueFFT, NULL, NULL ), _T( "BakePlan 3D->2D planX failed" ) );
3713 if( (xyPlan->inStride[0] == 1) && (xyPlan->outStride[0] == 1) &&
3714 (xyPlan->outStride[2] == Nt*length1) &&
3715 ( ((xyPlan->inStride[2] == Nt*2*length1) && (xyPlan->placeness == CLFFT_INPLACE)) ||
3716 ((xyPlan->inStride[2] == length0*length1) && (xyPlan->placeness == CLFFT_OUTOFPLACE)) ) )
3719 if (fftPlan->tmpBufSize==0)
3721 fftPlan->tmpBufSize = Nt * length1 * length2 * fftPlan->batchsize * fftPlan->ElementSize();
3723 for (size_t index=3; index < fftPlan->length.size(); index++)
3725 fftPlan->tmpBufSize *= fftPlan->length[index];
3729 // create first transpose plan
3733 size_t transLengths[2] = { length0*length1, length2 };
3734 OPENCL_V(clfftCreateDefaultPlanInternal( &fftPlan->planTX, fftPlan->context, CLFFT_2D, transLengths ),
3735 _T( "CreateDefaultPlan for planTX transpose failed" ) );
3737 FFTPlan* trans1Plan = NULL;
3738 lockRAII* trans1Lock = NULL;
3739 OPENCL_V( fftRepo.getPlan( fftPlan->planTX, trans1Plan, trans1Lock ), _T( "fftRepo.getPlan failed" ) );
3741 trans1Plan->transflag = true;
3743 transLengths[0] = Nt*length1;
3744 OPENCL_V(clfftSetPlanLength( fftPlan->planTX, CLFFT_2D, transLengths ),
3745 _T( "clfftSetPlanLength for planTX transpose failed" ) );
3747 switch(fftPlan->outputLayout)
3749 case CLFFT_HERMITIAN_INTERLEAVED:
3751 trans1Plan->outputLayout = CLFFT_COMPLEX_INTERLEAVED;
3752 trans1Plan->inputLayout = CLFFT_COMPLEX_INTERLEAVED;
3755 case CLFFT_HERMITIAN_PLANAR:
3757 trans1Plan->outputLayout = CLFFT_COMPLEX_INTERLEAVED;
3758 trans1Plan->inputLayout = CLFFT_COMPLEX_PLANAR;
3761 default: assert(false);
3764 trans1Plan->placeness = CLFFT_OUTOFPLACE;
3765 trans1Plan->precision = fftPlan->precision;
3766 trans1Plan->tmpBufSize = 0;
3767 trans1Plan->batchsize = fftPlan->batchsize;
3768 trans1Plan->envelope = fftPlan->envelope;
3769 trans1Plan->forwardScale = 1.0f;
3770 trans1Plan->backwardScale = 1.0f;
3772 trans1Plan->inStride[0] = 1;
3773 trans1Plan->inStride[1] = Nt*length1;
3774 trans1Plan->outStride[0] = 1;
3775 trans1Plan->outStride[1] = length2;
3776 trans1Plan->iDist = xyPlan->oDist;
3777 trans1Plan->oDist = Nt*length1*length2;
3778 trans1Plan->transOutHorizontal = true;
3780 trans1Plan->gen = Transpose_GCN;
3783 for (size_t index=3; index < fftPlan->length.size(); index++)
3785 trans1Plan->length.push_back(fftPlan->length[index]);
3786 trans1Plan->inStride.push_back(xyPlan->outStride[index]);
3787 trans1Plan->outStride.push_back(trans1Plan->oDist);
3788 trans1Plan->oDist *= fftPlan->length[index];
3791 OPENCL_V(clfftBakePlan(fftPlan->planTX, numQueues, commQueueFFT, NULL, NULL ),
3792 _T( "BakePlan for planTX failed" ) );
3794 // Create column plan as a row plan
3795 OPENCL_V(clfftCreateDefaultPlanInternal( &fftPlan->planZ, fftPlan->context, CLFFT_1D, &fftPlan->length[ DimZ ] ),
3796 _T( "CreateDefaultPlan for planZ failed" ) );
3798 FFTPlan* colPlan = NULL;
3799 lockRAII* colLock = NULL;
3800 OPENCL_V( fftRepo.getPlan( fftPlan->planZ, colPlan, colLock ), _T( "fftRepo.getPlan failed" ) );
3802 colPlan->outputLayout = trans1Plan->outputLayout;
3803 colPlan->inputLayout = trans1Plan->outputLayout;
3804 colPlan->placeness = CLFFT_INPLACE;
3805 colPlan->length.push_back(Nt*length1);
3807 colPlan->inStride[0] = 1;
3808 colPlan->inStride.push_back(length2);
3809 colPlan->iDist = Nt*length1*length2;
3811 colPlan->outStride[0] = 1;
3812 colPlan->outStride.push_back(length2);
3813 colPlan->oDist = Nt*length1*length2;
3815 colPlan->precision = fftPlan->precision;
3816 colPlan->forwardScale = fftPlan->forwardScale;
3817 colPlan->backwardScale = fftPlan->backwardScale;
3818 colPlan->tmpBufSize = 0;
3820 colPlan->gen = fftPlan->gen;
3821 colPlan->envelope = fftPlan->envelope;
3823 colPlan->batchsize = fftPlan->batchsize;
3825 //this 2d is decomposed from 3d
3826 for (size_t index=3; index < fftPlan->length.size(); index++)
3828 colPlan->length.push_back(fftPlan->length[index]);
3829 colPlan->inStride.push_back(colPlan->iDist);
3830 colPlan->outStride.push_back(colPlan->oDist);
3831 colPlan->iDist *= fftPlan->length[index];
3832 colPlan->oDist *= fftPlan->length[index];
3835 OPENCL_V(clfftBakePlan(fftPlan->planZ, numQueues, commQueueFFT, NULL, NULL ),
3836 _T( "BakePlan for planZ failed" ) );
3838 if (fftPlan->transposed == CLFFT_TRANSPOSED)
3840 fftPlan->baked = true;
3841 return CLFFT_SUCCESS;
3844 // create second transpose plan
3848 size_t trans2Lengths[2] = { length2, length0*length1 };
3849 OPENCL_V(clfftCreateDefaultPlanInternal( &fftPlan->planTY, fftPlan->context, CLFFT_2D, trans2Lengths ),
3850 _T( "CreateDefaultPlan for planTY transpose failed" ) );
3852 FFTPlan* trans2Plan = NULL;
3853 lockRAII* trans2Lock = NULL;
3854 OPENCL_V( fftRepo.getPlan( fftPlan->planTY, trans2Plan, trans2Lock ), _T( "fftRepo.getPlan failed" ) );
3856 trans2Plan->transflag = true;
3858 trans2Lengths[1] = Nt*length1;
3859 OPENCL_V(clfftSetPlanLength( fftPlan->planTY, CLFFT_2D, trans2Lengths ),
3860 _T( "clfftSetPlanLength for planTY transpose failed" ) );
3862 switch(fftPlan->outputLayout)
3864 case CLFFT_HERMITIAN_INTERLEAVED:
3866 trans2Plan->outputLayout = CLFFT_COMPLEX_INTERLEAVED;
3867 trans2Plan->inputLayout = CLFFT_COMPLEX_INTERLEAVED;
3870 case CLFFT_HERMITIAN_PLANAR:
3872 trans2Plan->outputLayout = CLFFT_COMPLEX_PLANAR;
3873 trans2Plan->inputLayout = CLFFT_COMPLEX_INTERLEAVED;
3876 default: assert(false);
3879 trans2Plan->placeness = CLFFT_OUTOFPLACE;
3880 trans2Plan->precision = fftPlan->precision;
3881 trans2Plan->tmpBufSize = 0;
3882 trans2Plan->batchsize = fftPlan->batchsize;
3883 trans2Plan->envelope = fftPlan->envelope;
3884 trans2Plan->forwardScale = 1.0f;
3885 trans2Plan->backwardScale = 1.0f;
3887 trans2Plan->inStride[0] = 1;
3888 trans2Plan->inStride[1] = length2;
3889 trans2Plan->outStride[0] = 1;
3890 trans2Plan->outStride[1] = Nt*length1;
3891 trans2Plan->iDist = Nt*length1*length2;
3892 trans2Plan->oDist = fftPlan->oDist;
3894 trans2Plan->gen = Transpose_GCN;
3895 trans2Plan->transflag = true;
3897 for (size_t index=3; index < fftPlan->length.size(); index++)
3899 trans2Plan->length.push_back(fftPlan->length[index]);
3900 trans2Plan->inStride.push_back(trans2Plan->iDist);
3901 trans2Plan->iDist *= fftPlan->length[index];
3902 trans2Plan->outStride.push_back(fftPlan->outStride[index]);
3905 //Set callback data if set on top level plan
3906 if (fftPlan->hasPostCallback)
3908 trans2Plan->hasPostCallback = true;
3909 trans2Plan->postCallbackParam = fftPlan->postCallbackParam;
3910 trans2Plan->postcallUserData = fftPlan->postcallUserData;
3913 OPENCL_V(clfftBakePlan(fftPlan->planTY, numQueues, commQueueFFT, NULL, NULL ),
3914 _T( "BakePlan for planTY failed" ) );
3921 clLengths[0] = fftPlan->length[ DimZ ];
3922 clLengths[1] = clLengths[2] = 0;
3923 //create 1D col plan
3924 OPENCL_V(clfftCreateDefaultPlanInternal( &fftPlan->planZ, fftPlan->context, CLFFT_1D, clLengths ),
3925 _T( "CreateDefaultPlan for planZ failed" ) );
3927 FFTPlan* colPlan = NULL;
3928 lockRAII* colLock = NULL;
3929 OPENCL_V( fftRepo.getPlan( fftPlan->planZ, colPlan, colLock ), _T( "fftRepo.getPlan failed" ) );
3931 switch(fftPlan->outputLayout)
3933 case CLFFT_HERMITIAN_INTERLEAVED:
3935 colPlan->outputLayout = CLFFT_COMPLEX_INTERLEAVED;
3936 colPlan->inputLayout = CLFFT_COMPLEX_INTERLEAVED;
3939 case CLFFT_HERMITIAN_PLANAR:
3941 colPlan->outputLayout = CLFFT_COMPLEX_PLANAR;
3942 colPlan->inputLayout = CLFFT_COMPLEX_PLANAR;
3945 default: assert(false);
3948 colPlan->placeness = CLFFT_INPLACE;
3949 colPlan->precision = fftPlan->precision;
3950 colPlan->forwardScale = fftPlan->forwardScale;
3951 colPlan->backwardScale = fftPlan->backwardScale;
3952 colPlan->tmpBufSize = fftPlan->tmpBufSize;
3954 colPlan->gen = fftPlan->gen;
3955 colPlan->envelope = fftPlan->envelope;
3957 // This is a column FFT, the first elements distance between each FFT is the distance of the first two
3958 // elements in the original buffer. Like a transpose of the matrix
3959 colPlan->batchsize = fftPlan->batchsize;
3960 colPlan->inStride[0] = fftPlan->outStride[2];
3961 colPlan->outStride[0] = fftPlan->outStride[2];
3963 //pass length and other info to kernel, so the kernel knows this is decomposed from higher dimension
3964 colPlan->length.push_back(1 + fftPlan->length[0]/2);
3965 colPlan->length.push_back(fftPlan->length[1]);
3966 colPlan->inStride.push_back(fftPlan->outStride[0]);
3967 colPlan->inStride.push_back(fftPlan->outStride[1]);
3968 colPlan->outStride.push_back(fftPlan->outStride[0]);
3969 colPlan->outStride.push_back(fftPlan->outStride[1]);
3970 colPlan->iDist = fftPlan->oDist;
3971 colPlan->oDist = fftPlan->oDist;
3973 //this 3d is decomposed from 4d
3974 for (size_t index=3; index < fftPlan->length.size(); index++)
3976 colPlan->length.push_back(fftPlan->length[index]);
3977 colPlan->inStride.push_back(xyPlan->outStride[index]);
3978 colPlan->outStride.push_back(fftPlan->outStride[index]);
3981 //Set callback data if set on top level plan
3982 if (fftPlan->hasPostCallback)
3984 colPlan->hasPostCallback = true;
3985 colPlan->postCallbackParam = fftPlan->postCallbackParam;
3986 colPlan->postcallUserData = fftPlan->postcallUserData;
3989 OPENCL_V(clfftBakePlan(fftPlan->planZ, numQueues, commQueueFFT, NULL, NULL ), _T( "BakePlan 3D->1D planZ failed" ) );
3992 else if(fftPlan->outputLayout == CLFFT_REAL)
3994 size_t length0 = fftPlan->length[ DimX ];
3995 size_t length1 = fftPlan->length[ DimY ];
3996 size_t length2 = fftPlan->length[ DimZ ];
3998 size_t Nt = (1 + length0/2);
4000 if (fftPlan->tmpBufSize == 0)
4002 fftPlan->tmpBufSize = Nt * length1 * length2 * fftPlan->batchsize * fftPlan->ElementSize();
4003 for (size_t index=3; index < fftPlan->length.size(); index++)
4004 fftPlan->tmpBufSize *= fftPlan->length[index];
4007 if ((fftPlan->tmpBufSizeC2R==0) && (fftPlan->placeness == CLFFT_OUTOFPLACE))
4009 fftPlan->tmpBufSizeC2R = fftPlan->tmpBufSize;
4012 if( (fftPlan->inStride[0] == 1) && (fftPlan->outStride[0] == 1) &&
4013 ( ((fftPlan->outStride[2] == Nt*2*length1) && (fftPlan->oDist == Nt*2*length1*length2) && (fftPlan->placeness == CLFFT_INPLACE)) ||
4014 ((fftPlan->outStride[2] == length0*length1) && (fftPlan->oDist == length0*length1*length2) && (fftPlan->placeness == CLFFT_OUTOFPLACE)) )
4015 && (fftPlan->inStride[2] == Nt*length1) && (fftPlan->iDist == Nt*length1*length2))
4017 // create first transpose plan
4021 size_t transLengths[2] = { length0*length1, length2 };
4022 OPENCL_V(clfftCreateDefaultPlanInternal( &fftPlan->planTZ, fftPlan->context, CLFFT_2D, transLengths ),
4023 _T( "CreateDefaultPlan for planTZ transpose failed" ) );
4025 FFTPlan* trans1Plan = NULL;
4026 lockRAII* trans1Lock = NULL;
4027 OPENCL_V( fftRepo.getPlan( fftPlan->planTZ, trans1Plan, trans1Lock ), _T( "fftRepo.getPlan failed" ) );
4029 trans1Plan->transflag = true;
4031 transLengths[0] = Nt*length1;
4032 OPENCL_V(clfftSetPlanLength( fftPlan->planTZ, CLFFT_2D, transLengths ),
4033 _T( "clfftSetPlanLength for planTZ transpose failed" ) );
4035 switch(fftPlan->inputLayout)
4037 case CLFFT_HERMITIAN_INTERLEAVED:
4039 trans1Plan->outputLayout = CLFFT_COMPLEX_INTERLEAVED;
4040 trans1Plan->inputLayout = CLFFT_COMPLEX_INTERLEAVED;
4043 case CLFFT_HERMITIAN_PLANAR:
4045 trans1Plan->outputLayout = CLFFT_COMPLEX_INTERLEAVED;
4046 trans1Plan->inputLayout = CLFFT_COMPLEX_PLANAR;
4049 default: assert(false);
4052 trans1Plan->placeness = CLFFT_OUTOFPLACE;
4053 trans1Plan->precision = fftPlan->precision;
4054 trans1Plan->tmpBufSize = 0;
4055 trans1Plan->batchsize = fftPlan->batchsize;
4056 trans1Plan->envelope = fftPlan->envelope;
4057 trans1Plan->forwardScale = 1.0f;
4058 trans1Plan->backwardScale = 1.0f;
4060 trans1Plan->inStride[0] = 1;
4061 trans1Plan->inStride[1] = Nt*length1;
4062 trans1Plan->outStride[0] = 1;
4063 trans1Plan->outStride[1] = length2;
4064 trans1Plan->iDist = fftPlan->iDist;
4065 trans1Plan->oDist = Nt*length1*length2;
4066 trans1Plan->transOutHorizontal = true;
4068 trans1Plan->gen = Transpose_GCN;
4071 for (size_t index=3; index < fftPlan->length.size(); index++)
4073 trans1Plan->length.push_back(fftPlan->length[index]);
4074 trans1Plan->inStride.push_back(fftPlan->inStride[index]);
4075 trans1Plan->outStride.push_back(trans1Plan->oDist);
4076 trans1Plan->oDist *= fftPlan->length[index];
4079 //Set callback data if set on top level plan
4080 if (fftPlan->hasPreCallback)
4082 trans1Plan->hasPreCallback = true;
4083 trans1Plan->preCallback = fftPlan->preCallback;
4084 trans1Plan->precallUserData = fftPlan->precallUserData;
4087 OPENCL_V(clfftBakePlan(fftPlan->planTZ, numQueues, commQueueFFT, NULL, NULL ),
4088 _T( "BakePlan for planTZ failed" ) );
4091 // complex to complex
4093 OPENCL_V(clfftCreateDefaultPlanInternal( &fftPlan->planZ, fftPlan->context, CLFFT_1D, &fftPlan->length[ DimZ ] ),
4094 _T( "CreateDefaultPlan for planZ failed" ) );
4096 FFTPlan* colPlan = NULL;
4097 lockRAII* colLock = NULL;
4098 OPENCL_V( fftRepo.getPlan( fftPlan->planZ, colPlan, colLock ), _T( "fftRepo.getPlan failed" ) );
4100 colPlan->length.push_back(Nt*length1);
4102 colPlan->inStride[0] = 1;
4103 colPlan->inStride.push_back(length2);
4104 colPlan->iDist = trans1Plan->oDist;
4106 colPlan->placeness = CLFFT_INPLACE;
4107 colPlan->inputLayout = CLFFT_COMPLEX_INTERLEAVED;
4108 colPlan->outputLayout = CLFFT_COMPLEX_INTERLEAVED;
4110 colPlan->outStride[0] = colPlan->inStride[0];
4111 colPlan->outStride.push_back(colPlan->inStride[1]);
4112 colPlan->oDist = colPlan->iDist;
4114 for (size_t index=3; index < fftPlan->length.size(); index++)
4116 colPlan->length.push_back(fftPlan->length[index]);
4117 colPlan->inStride.push_back(trans1Plan->outStride[index-1]);
4118 colPlan->outStride.push_back(trans1Plan->outStride[index-1]);
4122 colPlan->precision = fftPlan->precision;
4123 colPlan->forwardScale = 1.0f;
4124 colPlan->backwardScale = 1.0f;
4125 colPlan->tmpBufSize = 0;
4127 colPlan->gen = fftPlan->gen;
4128 colPlan->envelope = fftPlan->envelope;
4130 colPlan->batchsize = fftPlan->batchsize;
4132 OPENCL_V(clfftBakePlan(fftPlan->planZ, numQueues, commQueueFFT, NULL, NULL ), _T( "BakePlan for planZ failed" ) );
4134 // create second transpose plan
4138 size_t trans2Lengths[2] = { length2, length0*length1 };
4139 OPENCL_V(clfftCreateDefaultPlanInternal( &fftPlan->planTX, fftPlan->context, CLFFT_2D, trans2Lengths ),
4140 _T( "CreateDefaultPlan for planTX transpose failed" ) );
4142 FFTPlan* trans2Plan = NULL;
4143 lockRAII* trans2Lock = NULL;
4144 OPENCL_V( fftRepo.getPlan( fftPlan->planTX, trans2Plan, trans2Lock ), _T( "fftRepo.getPlan failed" ) );
4146 trans2Plan->transflag = true;
4148 trans2Lengths[1] = Nt*length1;
4149 OPENCL_V(clfftSetPlanLength( fftPlan->planTX, CLFFT_2D, trans2Lengths ),
4150 _T( "clfftSetPlanLength for planTX transpose failed" ) );
4153 trans2Plan->outputLayout = CLFFT_COMPLEX_INTERLEAVED;
4154 trans2Plan->inputLayout = CLFFT_COMPLEX_INTERLEAVED;
4157 trans2Plan->placeness = CLFFT_OUTOFPLACE;
4158 trans2Plan->precision = fftPlan->precision;
4159 trans2Plan->tmpBufSize = 0;
4160 trans2Plan->batchsize = fftPlan->batchsize;
4161 trans2Plan->envelope = fftPlan->envelope;
4162 trans2Plan->forwardScale = 1.0f;
4163 trans2Plan->backwardScale = 1.0f;
4165 trans2Plan->inStride[0] = 1;
4166 trans2Plan->inStride[1] = length2;
4167 trans2Plan->outStride[0] = 1;
4168 trans2Plan->outStride[1] = Nt*length1;
4169 trans2Plan->iDist = colPlan->oDist;
4170 trans2Plan->oDist = Nt*length1*length2;
4172 trans2Plan->gen = Transpose_GCN;
4173 trans2Plan->transflag = true;
4175 for (size_t index=3; index < fftPlan->length.size(); index++)
4177 trans2Plan->length.push_back(fftPlan->length[index]);
4178 trans2Plan->inStride.push_back(colPlan->outStride[index-1]);
4179 trans2Plan->outStride.push_back(trans2Plan->oDist);
4180 trans2Plan->oDist *= fftPlan->length[index];
4184 OPENCL_V(clfftBakePlan(fftPlan->planTX, numQueues, commQueueFFT, NULL, NULL ),
4185 _T( "BakePlan for planTX failed" ) );
4188 // hermitian to real
4191 size_t clLengths[] = { length0, length1, 0 };
4192 OPENCL_V(clfftCreateDefaultPlanInternal( &fftPlan->planX, fftPlan->context, CLFFT_2D, clLengths ),
4193 _T( "CreateDefaultPlan for 2D planX failed" ) );
4195 FFTPlan* rowPlan = NULL;
4196 lockRAII* rowLock = NULL;
4197 OPENCL_V( fftRepo.getPlan( fftPlan->planX, rowPlan, rowLock ), _T( "fftRepo.getPlan failed" ) );
4199 rowPlan->outputLayout = fftPlan->outputLayout;
4200 rowPlan->inputLayout = CLFFT_HERMITIAN_INTERLEAVED;
4202 rowPlan->length.push_back(length2);
4204 rowPlan->outStride[0] = fftPlan->outStride[0];
4205 rowPlan->outStride[1] = fftPlan->outStride[1];
4206 rowPlan->outStride.push_back(fftPlan->outStride[2]);
4207 rowPlan->oDist = fftPlan->oDist;
4209 rowPlan->inStride[0] = trans2Plan->outStride[0];
4210 rowPlan->inStride[1] = Nt;
4211 rowPlan->inStride.push_back(Nt*length1);
4212 rowPlan->iDist = trans2Plan->oDist;
4214 for (size_t index=3; index < fftPlan->length.size(); index++)
4216 rowPlan->length.push_back(fftPlan->length[index]);
4217 rowPlan->inStride.push_back(trans2Plan->outStride[index-1]);
4218 rowPlan->outStride.push_back(fftPlan->outStride[index]);
4221 if (fftPlan->placeness == CLFFT_INPLACE)
4223 rowPlan->placeness = CLFFT_INPLACE;
4227 rowPlan->placeness = CLFFT_OUTOFPLACE;
4231 rowPlan->precision = fftPlan->precision;
4232 rowPlan->forwardScale = fftPlan->forwardScale;
4233 rowPlan->backwardScale = fftPlan->backwardScale;
4234 rowPlan->tmpBufSize = 0;
4236 rowPlan->gen = fftPlan->gen;
4237 rowPlan->envelope = fftPlan->envelope;
4239 rowPlan->batchsize = fftPlan->batchsize;
4241 //Set callback data if set on top level plan
4242 if (fftPlan->hasPostCallback)
4244 rowPlan->hasPostCallback = true;
4245 rowPlan->postCallbackParam = fftPlan->postCallbackParam;
4246 rowPlan->postcallUserData = fftPlan->postcallUserData;
4249 OPENCL_V(clfftBakePlan(fftPlan->planX, numQueues, commQueueFFT, NULL, NULL ), _T( "BakePlan for planX failed" ) );
4254 size_t clLengths[] = { 1, 0, 0 };
4256 clLengths[0] = fftPlan->length[ DimZ ];
4258 //create 1D col plan
4259 OPENCL_V(clfftCreateDefaultPlanInternal( &fftPlan->planZ, fftPlan->context, CLFFT_1D, clLengths ),
4260 _T( "CreateDefaultPlan for planZ failed" ) );
4262 FFTPlan* colPlan = NULL;
4263 lockRAII* colLock = NULL;
4264 OPENCL_V( fftRepo.getPlan( fftPlan->planZ, colPlan, colLock ), _T( "fftRepo.getPlan failed" ) );
4266 switch(fftPlan->inputLayout)
4268 case CLFFT_HERMITIAN_INTERLEAVED:
4270 colPlan->outputLayout = CLFFT_COMPLEX_INTERLEAVED;
4271 colPlan->inputLayout = CLFFT_COMPLEX_INTERLEAVED;
4274 case CLFFT_HERMITIAN_PLANAR:
4276 colPlan->outputLayout = CLFFT_COMPLEX_INTERLEAVED;
4277 colPlan->inputLayout = CLFFT_COMPLEX_PLANAR;
4280 default: assert(false);
4283 colPlan->length.push_back(Nt);
4284 colPlan->length.push_back(length1);
4286 colPlan->inStride[0] = fftPlan->inStride[2];
4287 colPlan->inStride.push_back(fftPlan->inStride[0]);
4288 colPlan->inStride.push_back(fftPlan->inStride[1]);
4289 colPlan->iDist = fftPlan->iDist;
4292 if (fftPlan->placeness == CLFFT_INPLACE)
4294 colPlan->placeness = CLFFT_INPLACE;
4296 colPlan->outStride[0] = colPlan->inStride[0];
4297 colPlan->outStride.push_back(colPlan->inStride[1]);
4298 colPlan->outStride.push_back(colPlan->inStride[2]);
4299 colPlan->oDist = colPlan->iDist;
4301 for (size_t index=3; index < fftPlan->length.size(); index++)
4303 colPlan->length.push_back(fftPlan->length[index]);
4304 colPlan->inStride.push_back(fftPlan->inStride[index]);
4305 colPlan->outStride.push_back(fftPlan->inStride[index]);
4310 colPlan->placeness = CLFFT_OUTOFPLACE;
4312 colPlan->outStride[0] = Nt*length1;
4313 colPlan->outStride.push_back(1);
4314 colPlan->outStride.push_back(Nt);
4315 colPlan->oDist = Nt*length1*length2;
4317 for (size_t index=3; index < fftPlan->length.size(); index++)
4319 colPlan->length.push_back(fftPlan->length[index]);
4320 colPlan->inStride.push_back(fftPlan->inStride[index]);
4321 colPlan->outStride.push_back(colPlan->oDist);
4322 colPlan->oDist *= fftPlan->length[index];
4327 colPlan->precision = fftPlan->precision;
4328 colPlan->forwardScale = 1.0f;
4329 colPlan->backwardScale = 1.0f;
4330 colPlan->tmpBufSize = 0;
4332 colPlan->gen = fftPlan->gen;
4333 colPlan->envelope = fftPlan->envelope;
4335 colPlan->batchsize = fftPlan->batchsize;
4337 //Set callback data if set on top level plan
4338 if (fftPlan->hasPreCallback)
4340 colPlan->hasPreCallback = true;
4341 colPlan->preCallback = fftPlan->preCallback;
4342 colPlan->precallUserData = fftPlan->precallUserData;
4345 OPENCL_V(clfftBakePlan(fftPlan->planZ, numQueues, commQueueFFT, NULL, NULL ), _T( "BakePlan 3D->1D planZ failed" ) );
4348 clLengths[0] = fftPlan->length[ DimX ];
4349 clLengths[1] = fftPlan->length[ DimY ];
4352 OPENCL_V(clfftCreateDefaultPlanInternal( &fftPlan->planX, fftPlan->context, CLFFT_2D, clLengths ),
4353 _T( "CreateDefaultPlan 2D planX failed" ) );
4355 FFTPlan* xyPlan = NULL;
4356 lockRAII* rowLock = NULL;
4357 OPENCL_V( fftRepo.getPlan( fftPlan->planX, xyPlan, rowLock ), _T( "fftRepo.getPlan failed" ) );
4359 xyPlan->inputLayout = CLFFT_HERMITIAN_INTERLEAVED;
4360 xyPlan->outputLayout = fftPlan->outputLayout;
4362 xyPlan->length.push_back(length2);
4364 xyPlan->outStride[0] = fftPlan->outStride[0];
4365 xyPlan->outStride[1] = fftPlan->outStride[1];
4366 xyPlan->outStride.push_back(fftPlan->outStride[2]);
4367 xyPlan->oDist = fftPlan->oDist;
4369 if (fftPlan->placeness == CLFFT_INPLACE)
4371 xyPlan->placeness = CLFFT_INPLACE;
4373 xyPlan->inStride[0] = colPlan->outStride[1];
4374 xyPlan->inStride[1] = colPlan->outStride[2];
4375 xyPlan->inStride.push_back(colPlan->outStride[0]);
4376 xyPlan->iDist = colPlan->oDist;
4378 for (size_t index=3; index < fftPlan->length.size(); index++)
4380 xyPlan->length.push_back(fftPlan->length[index]);
4381 xyPlan->inStride.push_back(colPlan->outStride[index]);
4382 xyPlan->outStride.push_back(fftPlan->outStride[index]);
4387 xyPlan->placeness = CLFFT_OUTOFPLACE;
4389 xyPlan->inStride[0] = 1;
4390 xyPlan->inStride[1] = Nt;
4391 xyPlan->inStride.push_back(Nt*length1);
4392 xyPlan->iDist = Nt*length1*length2;
4394 for (size_t index=3; index < fftPlan->length.size(); index++)
4396 xyPlan->length.push_back(fftPlan->length[index]);
4397 xyPlan->outStride.push_back(fftPlan->outStride[index]);
4398 xyPlan->inStride.push_back(xyPlan->iDist);
4399 xyPlan->iDist *= fftPlan->length[index];
4404 xyPlan->precision = fftPlan->precision;
4405 xyPlan->forwardScale = fftPlan->forwardScale;
4406 xyPlan->backwardScale = fftPlan->backwardScale;
4407 xyPlan->tmpBufSize = fftPlan->tmpBufSize;
4409 xyPlan->gen = fftPlan->gen;
4410 xyPlan->envelope = fftPlan->envelope;
4412 xyPlan->batchsize = fftPlan->batchsize;
4414 //Set callback data if set on top level plan
4415 if (fftPlan->hasPostCallback)
4417 xyPlan->hasPostCallback = true;
4418 xyPlan->postCallbackParam = fftPlan->postCallbackParam;
4419 xyPlan->postcallUserData = fftPlan->postcallUserData;
4422 OPENCL_V(clfftBakePlan(fftPlan->planX, numQueues, commQueueFFT, NULL, NULL ), _T( "BakePlan 3D->2D planX failed" ) );
4427 if (fftPlan->tmpBufSize==0 && (
4428 fftPlan->length[0] > Large1DThreshold ||
4429 fftPlan->length[1] > Large1DThreshold ||
4430 fftPlan->length[2] > Large1DThreshold
4433 fftPlan->tmpBufSize = fftPlan->length[0] * fftPlan->length[1] * fftPlan->length[2] *
4434 fftPlan->batchsize * fftPlan->ElementSize();
4437 size_t clLengths[] = { 1, 1, 0 };
4438 clLengths[0] = fftPlan->length[ DimX ];
4439 clLengths[1] = fftPlan->length[ DimY ];
4442 OPENCL_V(clfftCreateDefaultPlanInternal( &fftPlan->planX, fftPlan->context, CLFFT_2D, clLengths ),
4443 _T( "CreateDefaultPlan 2D planX failed" ) );
4445 FFTPlan* xyPlan = NULL;
4446 lockRAII* rowLock = NULL;
4447 OPENCL_V( fftRepo.getPlan( fftPlan->planX, xyPlan, rowLock ), _T( "fftRepo.getPlan failed" ) );
4449 xyPlan->inputLayout = fftPlan->inputLayout;
4450 xyPlan->outputLayout = fftPlan->outputLayout;
4451 xyPlan->placeness = fftPlan->placeness;
4452 xyPlan->precision = fftPlan->precision;
4453 xyPlan->forwardScale = 1.0f;
4454 xyPlan->backwardScale = 1.0f;
4455 xyPlan->tmpBufSize = fftPlan->tmpBufSize;
4457 xyPlan->gen = fftPlan->gen;
4458 xyPlan->envelope = fftPlan->envelope;
4460 // This is the xy fft, the first elements distance between the first two FFTs is the distance of the first elements
4461 // of the first two rows in the original buffer.
4462 xyPlan->batchsize = fftPlan->batchsize;
4463 xyPlan->inStride[0] = fftPlan->inStride[0];
4464 xyPlan->inStride[1] = fftPlan->inStride[1];
4465 xyPlan->outStride[0] = fftPlan->outStride[0];
4466 xyPlan->outStride[1] = fftPlan->outStride[1];
4468 //pass length and other info to kernel, so the kernel knows this is decomposed from higher dimension
4469 xyPlan->length.push_back(fftPlan->length[2]);
4470 xyPlan->inStride.push_back(fftPlan->inStride[2]);
4471 xyPlan->outStride.push_back(fftPlan->outStride[2]);
4472 xyPlan->iDist = fftPlan->iDist;
4473 xyPlan->oDist = fftPlan->oDist;
4475 //Set callback data if set on top level plan
4476 if (fftPlan->hasPreCallback)
4478 xyPlan->hasPreCallback = true;
4479 xyPlan->preCallback = fftPlan->preCallback;
4480 xyPlan->precallUserData = fftPlan->precallUserData;
4483 OPENCL_V(clfftBakePlan(fftPlan->planX, numQueues, commQueueFFT, NULL, NULL ), _T( "BakePlan 3D->2D planX failed" ) );
4485 clLengths[0] = fftPlan->length[ DimZ ];
4486 clLengths[1] = clLengths[2] = 0;
4487 //create 1D col plan
4488 OPENCL_V(clfftCreateDefaultPlanInternal( &fftPlan->planZ, fftPlan->context, CLFFT_1D, clLengths ),
4489 _T( "CreateDefaultPlan for planZ failed" ) );
4491 FFTPlan* colPlan = NULL;
4492 lockRAII* colLock = NULL;
4493 OPENCL_V( fftRepo.getPlan( fftPlan->planZ, colPlan, colLock ), _T( "fftRepo.getPlan failed" ) );
4495 colPlan->inputLayout = fftPlan->outputLayout;
4496 colPlan->outputLayout = fftPlan->outputLayout;
4497 colPlan->placeness = CLFFT_INPLACE;
4498 colPlan->precision = fftPlan->precision;
4499 colPlan->forwardScale = fftPlan->forwardScale;
4500 colPlan->backwardScale = fftPlan->backwardScale;
4501 colPlan->tmpBufSize = fftPlan->tmpBufSize;
4503 colPlan->gen = fftPlan->gen;
4504 colPlan->envelope = fftPlan->envelope;
4506 // This is a column FFT, the first elements distance between each FFT is the distance of the first two
4507 // elements in the original buffer. Like a transpose of the matrix
4508 colPlan->batchsize = fftPlan->batchsize;
4509 colPlan->inStride[0] = fftPlan->outStride[2];
4510 colPlan->outStride[0] = fftPlan->outStride[2];
4512 //pass length and other info to kernel, so the kernel knows this is decomposed from higher dimension
4513 colPlan->length.push_back(fftPlan->length[0]);
4514 colPlan->length.push_back(fftPlan->length[1]);
4515 colPlan->inStride.push_back(fftPlan->outStride[0]);
4516 colPlan->inStride.push_back(fftPlan->outStride[1]);
4517 colPlan->outStride.push_back(fftPlan->outStride[0]);
4518 colPlan->outStride.push_back(fftPlan->outStride[1]);
4519 colPlan->iDist = fftPlan->oDist;
4520 colPlan->oDist = fftPlan->oDist;
4522 //Set callback data if set on top level plan
4523 if (fftPlan->hasPostCallback)
4525 colPlan->hasPostCallback = true;
4526 colPlan->postCallbackParam = fftPlan->postCallbackParam;
4527 colPlan->postcallUserData = fftPlan->postcallUserData;
4530 OPENCL_V(clfftBakePlan(fftPlan->planZ, numQueues, commQueueFFT, NULL, NULL ), _T( "BakePlan 3D->1D planZ failed" ) );
4533 fftPlan->baked = true;
4534 return CLFFT_SUCCESS;
4539 clfftStatus err = selectAction(fftPlan, fftPlan->action, commQueueFFT);
4541 // Allocate resources
4542 OPENCL_V( fftPlan->AllocateBuffers (), _T("AllocateBuffers() failed"));
4544 fftPlan->ConstructAndEnqueueConstantBuffers( commQueueFFT );
4546 // Record that we baked the plan
4547 fftPlan->baked = true;
4549 return CLFFT_SUCCESS;
4552 clfftStatus clfftCopyPlan( clfftPlanHandle* out_plHandle, cl_context new_context, clfftPlanHandle in_plHandle )
4554 FFTRepo& fftRepo = FFTRepo::getInstance( );
4555 FFTPlan* in_fftPlan = NULL, *out_fftPlan = NULL;
4556 lockRAII* in_planLock = NULL, *out_planLock = NULL;
4558 OPENCL_V( fftRepo.getPlan( in_plHandle, in_fftPlan, in_planLock ), _T( "fftRepo.getPlan failed" ) );
4560 OPENCL_V( clfftCreateDefaultPlan( out_plHandle, new_context, in_fftPlan->dim, &in_fftPlan->length[ 0 ] ),
4561 _T( "clfftCreateDefaultPlan failed" ) );
4563 OPENCL_V( fftRepo.getPlan( *out_plHandle, out_fftPlan, out_planLock ), _T( "fftRepo.getPlan failed" ) );
4565 // Let other operations complete before attempting to copy the plan
4566 scopedLock sLock( *in_planLock, _T( "clfftCopyPlan" ) );
4568 out_fftPlan->baked = false;
4569 out_fftPlan->gen = in_fftPlan->gen;
4570 out_fftPlan->envelope = in_fftPlan->envelope;
4571 out_fftPlan->dim = in_fftPlan->dim;
4572 out_fftPlan->inputLayout = in_fftPlan->inputLayout;
4573 out_fftPlan->outputLayout = in_fftPlan->outputLayout;
4574 out_fftPlan->placeness = in_fftPlan->placeness;
4575 out_fftPlan->precision = in_fftPlan->precision;
4576 out_fftPlan->forwardScale = in_fftPlan->forwardScale;
4577 out_fftPlan->backwardScale = in_fftPlan->backwardScale;
4578 out_fftPlan->iDist = in_fftPlan->iDist;
4579 out_fftPlan->oDist = in_fftPlan->oDist;
4580 out_fftPlan->length = in_fftPlan->length;
4581 out_fftPlan->inStride = in_fftPlan->inStride;
4582 out_fftPlan->outStride = in_fftPlan->outStride;
4583 out_fftPlan->batchsize = in_fftPlan->batchsize;
4584 out_fftPlan->transposed = in_fftPlan->transposed;
4586 return CLFFT_SUCCESS;
// Builds the host-side image of the plan's constant buffer and writes it into
// const_buffer.
//
//   commQueueFFT  pointer to the command queue(s); only the queue it points at
//                 directly (*commQueueFFT) is used for the write
//
// Returns CLFFT_SUCCESS at the end of the function; the enqueue call itself is
// wrapped in OPENCL_V.
4589 clfftStatus FFTPlan::ConstructAndEnqueueConstantBuffers( cl_command_queue* commQueueFFT )
4591 // Construct the constant buffer and call clEnqueueWriteBuffer
// Host staging array of CLFFT_CB_SIZE entries, cleared to zero before use
4593 cb_t ConstantBufferParams [CLFFT_CB_SIZE];
4594 memset (& ConstantBufferParams, 0, sizeof (ConstantBufferParams));
// Entry 0 carries the batch size, clamped so that it is never below 1
4596 ConstantBufferParams[0].u = std::max<cl_uint> (1, cl_uint (/*fftPlan->*/batchsize));
// Copy the whole staging array into the device-side const_buffer
4599 OPENCL_V(clEnqueueWriteBuffer( *commQueueFFT,
4600 /*fftPlan->*/const_buffer,
// The write is requested as blocking (1 in the blocking_write position)
4601 1, // TODO? non-blocking write?
4603 sizeof(ConstantBufferParams),
4604 &ConstantBufferParams,
4607 NULL), _T("clEnqueueWriteBuffer failed") );
4609 return CLFFT_SUCCESS;
4613 clfftStatus clfftDestroyPlan( clfftPlanHandle* plHandle )
4615 FFTRepo& fftRepo = FFTRepo::getInstance( );
4616 FFTPlan* fftPlan = NULL;
4617 lockRAII* planLock = NULL;
4619 OPENCL_V( fftRepo.getPlan( *plHandle, fftPlan, planLock ), _T( "fftRepo.getPlan failed" ) );
4621 // Recursively destroy subplans, that are used for higher dimensional FFT's
4622 if( fftPlan->planX )
4623 clfftDestroyPlan( &fftPlan->planX );
4624 if( fftPlan->planY )
4625 clfftDestroyPlan( &fftPlan->planY );
4626 if( fftPlan->planZ )
4627 clfftDestroyPlan( &fftPlan->planZ );
4628 if( fftPlan->planTX )
4629 clfftDestroyPlan( &fftPlan->planTX );
4630 if( fftPlan->planTY )
4631 clfftDestroyPlan( &fftPlan->planTY );
4632 if( fftPlan->planTZ )
4633 clfftDestroyPlan( &fftPlan->planTZ );
4634 if( fftPlan->planRCcopy )
4635 clfftDestroyPlan( &fftPlan->planRCcopy );
4636 if( fftPlan->planCopy )
4637 clfftDestroyPlan( &fftPlan->planCopy );
4639 fftRepo.deletePlan( plHandle );
4641 return CLFFT_SUCCESS;
4644 // This routine will query the OpenCL context for its devices
4645 // and their hardware limitations, which we synthesize into a
4646 // hardware "envelope".
4647 // We only query the devices the first time we're called after
4648 // the object's context is set. On 2nd and subsequent calls,
4649 // the cached envelope is left untouched.
//
// The envelope holds, for each limit, the minimum over the considered devices
// (and over the built-in starting values assigned before the device loop):
// local memory size, work-group size, number of work-item dimensions and the
// per-dimension work-item sizes.
// Returns CLFFT_SUCCESS at the end of the function; every OpenCL query is
// wrapped in OPENCL_V.
4651 clfftStatus FFTPlan::SetEnvelope ()
4654 // TODO The caller has already acquired the lock on *this
4655 // However, we shouldn't depend on it.
// A zero limit_LocalMemSize is the marker for "envelope not queried yet"
4657 if (0 == envelope.limit_LocalMemSize) do {
4658 // First time, query OpenCL for the device info
4660 memset (&envelope, 0, sizeof(envelope));
4662 // Get the size needed for the device list
4664 size_t deviceListSize = 0;
4665 OPENCL_V( ::clGetContextInfo( context, CL_CONTEXT_DEVICES, 0, NULL, &deviceListSize ),
4666 _T("Getting device array size ( ::clGetContextInfo() )" ));
// The query reports a size in bytes; convert it to a number of device ids
4667 cl_uint n = cl_uint (deviceListSize / sizeof(cl_device_id));
// The vector is sized with one slot more than the n ids that are reported
4670 std::vector< cl_device_id > devices( n+1 );
4671 // Get the device list
4673 OPENCL_V( ::clGetContextInfo( context, CL_CONTEXT_DEVICES, deviceListSize, &devices[ 0 ], NULL ),
4674 _T("Getting device array ( ::clGetContextInfo() )") );
4676 // Get the # of devices
4678 cl_uint cContextDevices = 0;
// Read the CL_DEVICE_VERSION string of the first device: size first, then text
4680 size_t deviceVersionSize = 0;
4681 OPENCL_V( ::clGetDeviceInfo( devices[0], CL_DEVICE_VERSION, 0, NULL, &deviceVersionSize ),
4682 _T("Getting CL_DEVICE_VERSION Info string size ( ::clGetDeviceInfo() )" ));
4684 std::vector< char > szDeviceVersion( deviceVersionSize );
4685 OPENCL_V( ::clGetDeviceInfo( devices[0], CL_DEVICE_VERSION, deviceVersionSize, &szDeviceVersion[ 0 ], NULL ),
4686 _T("Getting CL_DEVICE_VERSION Platform Info string ( ::clGetDeviceInfo() )" ));
4688 char openclstr[11]="OpenCL 1.0";
// Only the first 10 characters of the version string are compared
4690 if (!strncmp((const char*)&szDeviceVersion[ 0 ], openclstr, 10))
// Version string starts with "OpenCL 1.0": the device count is set to 1
// NOTE(review): CL_CONTEXT_NUM_DEVICES appears to be an OpenCL 1.1 addition,
// which would explain this special case - confirm against the OpenCL spec
4692 cContextDevices = 1;
4696 OPENCL_V( ::clGetContextInfo( context, CL_CONTEXT_NUM_DEVICES, sizeof( cContextDevices ), &cContextDevices, NULL ),
4697 _T("Getting number of context devices ( ::clGetContextInfo() )" ));
// Never walk more devices than the id list actually holds
4700 cContextDevices = std::min<cl_uint> (cContextDevices, n);
4701 if (0 == cContextDevices)
// Starting values; the device loop below can only lower them
4704 envelope.limit_LocalMemSize = 32768;
4705 envelope.limit_WorkGroupSize = 256;
4706 envelope.limit_Dimensions = countOf (envelope.limit_Size);
4707 for (size_t u = 0; u < countOf (envelope.limit_Size); ++u) {
4708 envelope.limit_Size[u] = 256;
// Fold every device's limits into the envelope with std::min
4711 for( cl_uint i = 0; i < cContextDevices; ++i )
4713 cl_device_id devId = devices[i];
4715 cl_ulong memsize = 0;
4716 unsigned int maxdim = 0;
// Scratch array with as many entries as envelope.limit_Size
4717 size_t temp[countOf (envelope.limit_Size)];
4718 memset (&temp, 0, sizeof(temp));
4720 OPENCL_V( ::clGetDeviceInfo( devId, CL_DEVICE_LOCAL_MEM_SIZE, sizeof( cl_ulong ), &memsize, NULL ),
4721 _T("Getting CL_DEVICE_LOCAL_MEM_SIZE device info ( ::clGetDeviceInfo() )") );
4722 envelope.limit_LocalMemSize = std::min<size_t> (envelope.limit_LocalMemSize, memsize);
4724 OPENCL_V( ::clGetDeviceInfo( devId, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof( unsigned int ), &maxdim, NULL ),
4725 _T("Getting CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS device info ( ::clGetDeviceInfo() )") );
// The device must not report more dimensions than limit_Size has entries
4726 BUG_CHECK (countOf (envelope.limit_Size) >= maxdim);
4727 envelope.limit_Dimensions = std::min<size_t> (envelope.limit_Dimensions, maxdim);
// temp[0] is borrowed as the destination for the single work-group-size value
4729 OPENCL_V( ::clGetDeviceInfo( devId, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof( size_t ), &temp[0], NULL ),
4730 _T("Getting CL_DEVICE_MAX_WORK_GROUP_SIZE device info ( ::clGetDeviceInfo() )") );
4731 envelope.limit_WorkGroupSize = std::min<size_t> (envelope.limit_WorkGroupSize, temp[0]);
// Now temp is overwritten with the per-dimension work-item sizes
4733 OPENCL_V( ::clGetDeviceInfo( devId, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof( temp ), &temp[0], NULL ),
4734 _T("Getting CL_DEVICE_MAX_WORK_ITEM_SIZES device info ( ::clGetDeviceInfo() )") );
// Only the first limit_Dimensions entries are folded into the envelope
4735 for (size_t u = 0; u < envelope.limit_Dimensions; ++u) {
4736 BUG_CHECK (temp[u] > 0)
4737 envelope.limit_Size[u] = std::min<size_t> (envelope.limit_Size[u], temp[u]);
// Sanity check on the resulting local memory limit
4741 BUG_CHECK (envelope.limit_LocalMemSize >= 1024)
4744 return CLFFT_SUCCESS;
// Creates the plan's constant buffer (const_buffer) in the plan's OpenCL context.
// Returns the OpenCL status of the allocation, cast to clfftStatus.
4747 clfftStatus FFTPlan::AllocateBuffers ()
4749 cl_int status = CL_SUCCESS;
// Debug-build expectation: no constant buffer has been created yet
4751 assert (NULL == const_buffer);
// The buffer size below is computed with sizeof(int); this pins it to 4 bytes
4754 assert(4 == sizeof(int));
4757 const_buffer = clCreateBuffer (context,
// Requested size in bytes: CLFFT_CB_SIZE entries of sizeof(int) each
4759 CLFFT_CB_SIZE * sizeof (int),
4762 if (CL_SUCCESS != status)
4766 return (clfftStatus) status;
// Releases the OpenCL memory objects held by this plan: const_buffer,
// intBuffer, intBufferRC and intBufferC2R. Each one is released with
// clReleaseMemObject only when its handle is not NULL.
// NOTE(review): `result` starts as CLFFT_SUCCESS and is tested after every
// release, presumably so that the first failing status is the one reported -
// confirm against the assignments to `result`.
4769 clfftStatus FFTPlan::ReleaseBuffers ()
4771 clfftStatus result = CLFFT_SUCCESS;
// Constant buffer
4774 if( NULL != const_buffer )
4776 tmp = static_cast< clfftStatus >( clReleaseMemObject( const_buffer ) );
4777 const_buffer = NULL;
4778 if( CLFFT_SUCCESS == result )
// Intermediate buffer: released only when libCreatedIntBuffer is set
// (presumably meaning the library, not the caller, allocated it - verify)
4782 if( (NULL != intBuffer) && libCreatedIntBuffer )
4784 tmp = static_cast< clfftStatus >( clReleaseMemObject( intBuffer ) );
4786 if( CLFFT_SUCCESS == result )
// intBufferRC
4790 if( NULL != intBufferRC )
4792 tmp = static_cast< clfftStatus >( clReleaseMemObject( intBufferRC ) );
4794 if( CLFFT_SUCCESS == result )
// intBufferC2R
4798 if( NULL != intBufferC2R )
4800 tmp = static_cast< clfftStatus >( clReleaseMemObject( intBufferC2R ) );
4801 intBufferC2R = NULL;
4802 if( CLFFT_SUCCESS == result )
4811 clfftStatus FFTPlan::GetMax1DLength (size_t *longest ) const
4815 case Stockham: return GetMax1DLengthStockham(longest);
4816 case Transpose_GCN: *longest = 4096; return CLFFT_SUCCESS;
4817 case Transpose_SQUARE: *longest = 4096; return CLFFT_SUCCESS;
4818 case Transpose_NONSQUARE: *longest = 4096; return CLFFT_SUCCESS;
4819 case Copy: *longest = 4096; return CLFFT_SUCCESS;
4820 default: assert(false); return CLFFT_NOTIMPLEMENTED;
4824 clfftStatus FFTPlan::GetEnvelope (const FFTEnvelope ** ppEnvelope) const
4826 if( &envelope == NULL )
4829 return CLFFT_NOTIMPLEMENTED;
4832 *ppEnvelope = &envelope;
4833 return CLFFT_SUCCESS;
// Size in bytes of one complex element at this plan's precision:
// sizeof(std::complex<double>) for CLFFT_DOUBLE and CLFFT_DOUBLE_FAST,
// sizeof(std::complex<float>) for every other precision value.
4836 size_t FFTPlan::ElementSize() const
4838 return ( ((precision == CLFFT_DOUBLE) || (precision == CLFFT_DOUBLE_FAST)) ? sizeof( std::complex<double> ) : sizeof( std::complex<float> ) );