Newer
Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
// =================================================================================================
// This file is part of the CodeVault project. The project is licensed under Apache Version 2.0.
// CodeVault is part of the EU-project PRACE-4IP (WP7.3.C).
//
// Author(s):
// Cedric Nugteren <cedric.nugteren@surfsara.nl>
//
// This example demonstrates the use of NVIDIA's fast Fourier transform library for CUDA: cuFFT.
// The example is set-up to perform an in-place forward transform followed by an in-place inverse
// transform such that the output signal should be equal to the input signal. The example uses pre-
// processor defines to select the FFT-size and the number of batches. The FFTs performed are 1D
// and single-precision complex-to-complex.
//
// See [http://docs.nvidia.com/cuda/cufft] for the full cuFFT documentation.
//
// =================================================================================================
#include <cstdio>
#include <cstdlib>
#include <cmath>
#include <complex>
// The NVIDIA CUDA cuFFT library
#include <cufft.h>
// Constants
#define NAME "[3_spectral_cufft] "
// Settings
#define SIZE 128 // The size of the FFT to perform
#define BATCHES 20 // The number of batches of FFTs of size 'SIZE'
#define ERROR_MARGIN 1e-5 // Acceptable absolute error for validation
// =================================================================================================
// Monolithic function without command-line arguments
int main() {
printf("\n");
// Allocates and initializes example data on the host (==CPU). This creates two arrays with the
// same (random) contents: one to be processed (read/write) and one for verification purposes.
// The 'std::complex<float>' data-type can also be replaced by 'cufftComplex'.
std::complex<float>* example_data = new std::complex<float>[SIZE*BATCHES];
std::complex<float>* signal_host = new std::complex<float>[SIZE*BATCHES];
for (int i=0; i<SIZE*BATCHES; ++i) {
example_data[i].real(rand() / ((float)RAND_MAX));
example_data[i].imag(rand() / ((float)RAND_MAX));
signal_host[i].real(example_data[i].real());
signal_host[i].imag(example_data[i].imag());
}
// Allocates and initializes memory on the device (==GPU). Initialization is based on the host
// input signal: 'cudaMemcpy' is used to perform the host-to-device transfer.
printf("%sAllocating and initializing GPU memory\n", NAME);
cufftComplex* signal_device;
cudaMalloc(&signal_device, sizeof(cufftComplex)*SIZE*BATCHES);
cudaMemcpy(signal_device, signal_host, sizeof(cufftComplex)*SIZE*BATCHES, cudaMemcpyHostToDevice);
// Creates a cuFFT plan. This describes the size and type of FFTs to be performed. The given
// arguments are:
// 1) a pointer to the plan itself to store the plan configuration
// 2) the FFT-size (e.g. 128-points transform)
// 3) the FFT-type (e.g. complex-to-complex or complex-to-real)
// 4) the number of batches
// More complicated plans can be created using 'cufftPlanMany', which allows for 2D or 3D
// transform and custom pitches and strides.
printf("%sCreating a cuFFT plan\n", NAME);
cufftHandle plan;
cufftResult status;
status = cufftPlan1d(&plan, SIZE, CUFFT_C2C, BATCHES);
if (status != CUFFT_SUCCESS) { printf("%s CUFFT ERROR %d\n", NAME, status); }
// Performs the forward FFTW based on the created plan. Note that this is in-place: the source
// and destination buffer (both on the GPU!) are the same. This call is a-synchronous and thus
// requires 'cudaDeviceSynchronize' afterwards to ensure completion of the work on the GPU.
printf("%sPerforming a 1D forward C2C FFT on the GPU\n", NAME);
status = cufftExecC2C(plan, signal_device, signal_device, CUFFT_FORWARD);
if (status != CUFFT_SUCCESS) { printf("%s CUFFT ERROR %d\n", NAME, status); }
cudaDeviceSynchronize();
// As above, but now performs an inverse FFT to 'undo' the previous FFT (allows for easier
// validation).
printf("%sPerforming a 1D inverse C2C FFT on the GPU\n", NAME);
status = cufftExecC2C(plan, signal_device, signal_device, CUFFT_INVERSE);
if (status != CUFFT_SUCCESS) { printf("%s CUFFT ERROR %d\n", NAME, status); }
cudaDeviceSynchronize();
// Copies the results back from the device (==GPU) to the host (==CPU)
printf("%sCopying the results back to the CPU\n", NAME);
cudaMemcpy(signal_host, signal_device, sizeof(cufftComplex)*SIZE*BATCHES, cudaMemcpyDeviceToHost);
// Divides the result by the signal size to obtain the original input signal again. Since cuFFT
// does not normalize the signal, this is done on the host.
for (int i=0; i<SIZE*BATCHES; ++i) {
signal_host[i].real(signal_host[i].real() / ((float)SIZE));
signal_host[i].imag(signal_host[i].imag() / ((float)SIZE));
}
// Iterates over all signal elements and verifies the resulting signal against the original
// input. There might be some rounding errors, but they should be smaller than the absolute error
// margin.
int num_errors = 0;
for (int i=0; i<SIZE*BATCHES; ++i) {
if (fabs(example_data[i].real() - signal_host[i].real()) > ERROR_MARGIN ||
fabs(example_data[i].imag() - signal_host[i].imag()) > ERROR_MARGIN) {
num_errors++;
}
}
printf("%sValidation: %d out of %d correct\n", NAME, SIZE*BATCHES - num_errors, SIZE*BATCHES);
// Cleans-up the CPU/GPU memory and cuFFT plan
printf("%sAll done\n", NAME);
cufftDestroy(plan);
cudaFree(signal_device);
delete signal_host;
delete example_data;
// End of the code example
printf("\n");
return 0;
}
// =================================================================================================