// ================================================================================================= // This file is part of the CodeVault project. The project is licensed under Apache Version 2.0. // CodeVault is part of the EU-project PRACE-4IP (WP7.3.C). // // Author(s): // Cedric Nugteren <cedric.nugteren@surfsara.nl> // // This example demonstrates the use of NVIDIA's fast Fourier transform library for CUDA: cuFFT. // The example is set-up to perform an in-place forward transform followed by an in-place inverse // transform such that the output signal should be equal to the input signal. The example uses pre- // processor defines to select the FFT-size and the number of batches. The FFTs performed are 1D // and single-precision complex-to-complex. // // See [http://docs.nvidia.com/cuda/cufft] for the full cuFFT documentation. // // ================================================================================================= #include <cstdio> #include <cstdlib> #include <cmath> #include <complex> // The NVIDIA CUDA cuFFT library #include <cufft.h> // Constants #define NAME "[3_spectral_cufft] " // Settings #define SIZE 128 // The size of the FFT to perform #define BATCHES 20 // The number of batches of FFTs of size 'SIZE' #define ERROR_MARGIN 1e-5 // Acceptable absolute error for validation // ================================================================================================= // Monolithic function without command-line arguments int main() { printf("\n"); // Allocates and initializes example data on the host (==CPU). This creates two arrays with the // same (random) contents: one to be processed (read/write) and one for verification purposes. // The 'std::complex<float>' data-type can also be replaced by 'cufftComplex'. std::complex<float>* example_data = new std::complex<float>[SIZE*BATCHES]; std::complex<float>* signal_host = new std::complex<float>[SIZE*BATCHES]; for (int i=0; i<SIZE*BATCHES; ++i) { example_data[i].real(rand() / ((float)RAND_MAX)); example_data[i].imag(rand() / ((float)RAND_MAX)); signal_host[i].real(example_data[i].real()); signal_host[i].imag(example_data[i].imag()); } // Allocates and initializes memory on the device (==GPU). Initialization is based on the host // input signal: 'cudaMemcpy' is used to perform the host-to-device transfer. printf("%sAllocating and initializing GPU memory\n", NAME); cufftComplex* signal_device; cudaMalloc(&signal_device, sizeof(cufftComplex)*SIZE*BATCHES); cudaMemcpy(signal_device, signal_host, sizeof(cufftComplex)*SIZE*BATCHES, cudaMemcpyHostToDevice); // Creates a cuFFT plan. This describes the size and type of FFTs to be performed. The given // arguments are: // 1) a pointer to the plan itself to store the plan configuration // 2) the FFT-size (e.g. 128-points transform) // 3) the FFT-type (e.g. complex-to-complex or complex-to-real) // 4) the number of batches // More complicated plans can be created using 'cufftPlanMany', which allows for 2D or 3D // transform and custom pitches and strides. printf("%sCreating a cuFFT plan\n", NAME); cufftHandle plan; cufftResult status; status = cufftPlan1d(&plan, SIZE, CUFFT_C2C, BATCHES); if (status != CUFFT_SUCCESS) { printf("%s CUFFT ERROR %d\n", NAME, status); } // Performs the forward FFTW based on the created plan. Note that this is in-place: the source // and destination buffer (both on the GPU!) are the same. This call is a-synchronous and thus // requires 'cudaDeviceSynchronize' afterwards to ensure completion of the work on the GPU. printf("%sPerforming a 1D forward C2C FFT on the GPU\n", NAME); status = cufftExecC2C(plan, signal_device, signal_device, CUFFT_FORWARD); if (status != CUFFT_SUCCESS) { printf("%s CUFFT ERROR %d\n", NAME, status); } cudaDeviceSynchronize(); // As above, but now performs an inverse FFT to 'undo' the previous FFT (allows for easier // validation). printf("%sPerforming a 1D inverse C2C FFT on the GPU\n", NAME); status = cufftExecC2C(plan, signal_device, signal_device, CUFFT_INVERSE); if (status != CUFFT_SUCCESS) { printf("%s CUFFT ERROR %d\n", NAME, status); } cudaDeviceSynchronize(); // Copies the results back from the device (==GPU) to the host (==CPU) printf("%sCopying the results back to the CPU\n", NAME); cudaMemcpy(signal_host, signal_device, sizeof(cufftComplex)*SIZE*BATCHES, cudaMemcpyDeviceToHost); // Divides the result by the signal size to obtain the original input signal again. Since cuFFT // does not normalize the signal, this is done on the host. for (int i=0; i<SIZE*BATCHES; ++i) { signal_host[i].real(signal_host[i].real() / ((float)SIZE)); signal_host[i].imag(signal_host[i].imag() / ((float)SIZE)); } // Iterates over all signal elements and verifies the resulting signal against the original // input. There might be some rounding errors, but they should be smaller than the absolute error // margin. int num_errors = 0; for (int i=0; i<SIZE*BATCHES; ++i) { if (fabs(example_data[i].real() - signal_host[i].real()) > ERROR_MARGIN || fabs(example_data[i].imag() - signal_host[i].imag()) > ERROR_MARGIN) { num_errors++; } } printf("%sValidation: %d out of %d correct\n", NAME, SIZE*BATCHES - num_errors, SIZE*BATCHES); // Cleans-up the CPU/GPU memory and cuFFT plan printf("%sAll done\n", NAME); cufftDestroy(plan); cudaFree(signal_device); delete signal_host; delete example_data; // End of the code example printf("\n"); return 0; } // =================================================================================================