// =================================================================================================
// This file is part of the CodeVault project. The project is licensed under Apache Version 2.0.
// CodeVault is part of the EU-project PRACE-4IP (WP7.3.C).
//
// Author(s):
//   Cedric Nugteren <cedric.nugteren@surfsara.nl>
//
// This example demonstrates the use of NVIDIA's fast Fourier transform library for CUDA: cuFFT.
// The example is set-up to perform an in-place forward transform followed by an in-place inverse
// transform such that the output signal should be equal to the input signal. The example uses pre-
// processor defines to select the FFT-size and the number of batches. The FFTs performed are 1D
// and single-precision complex-to-complex.
//
// See [http://docs.nvidia.com/cuda/cufft] for the full cuFFT documentation.
//
// =================================================================================================

#include <cstdio>
#include <cstdlib>
#include <cmath>
#include <complex>

// The NVIDIA CUDA cuFFT library
#include <cufft.h>

// Constants
#define NAME "[3_spectral_cufft] "

// Settings
#define SIZE 128           // The size of the FFT to perform
#define BATCHES 20         // The number of batches of FFTs of size 'SIZE'
#define ERROR_MARGIN 1e-5  // Acceptable absolute error for validation

// =================================================================================================

// Monolithic function without command-line arguments
int main() {
  printf("\n");

  // Allocates and initializes example data on the host (==CPU). This creates two arrays with the
  // same (random) contents: one to be processed (read/write) and one for verification purposes.
  // The 'std::complex<float>' data-type can also be replaced by 'cufftComplex'.
  std::complex<float>* example_data = new std::complex<float>[SIZE*BATCHES];
  std::complex<float>* signal_host = new std::complex<float>[SIZE*BATCHES];
  for (int i=0; i<SIZE*BATCHES; ++i) {
    example_data[i].real(rand() / ((float)RAND_MAX));
    example_data[i].imag(rand() / ((float)RAND_MAX));
    signal_host[i].real(example_data[i].real());
    signal_host[i].imag(example_data[i].imag());
  }

  // Allocates and initializes memory on the device (==GPU). Initialization is based on the host
  // input signal: 'cudaMemcpy' is used to perform the host-to-device transfer.
  printf("%sAllocating and initializing GPU memory\n", NAME);
  cufftComplex* signal_device;
  cudaMalloc(&signal_device, sizeof(cufftComplex)*SIZE*BATCHES);
  cudaMemcpy(signal_device, signal_host, sizeof(cufftComplex)*SIZE*BATCHES, cudaMemcpyHostToDevice);

  // Creates a cuFFT plan. This describes the size and type of FFTs to be performed. The given
  // arguments are:
  //   1) a pointer to the plan itself to store the plan configuration
  //   2) the FFT-size (e.g. 128-points transform)
  //   3) the FFT-type (e.g. complex-to-complex or complex-to-real)
  //   4) the number of batches
  // More complicated plans can be created using 'cufftPlanMany', which allows for 2D or 3D
  // transform and custom pitches and strides.
  printf("%sCreating a cuFFT plan\n", NAME);
  cufftHandle plan;
  cufftResult status;
  status = cufftPlan1d(&plan, SIZE, CUFFT_C2C, BATCHES);
  if (status != CUFFT_SUCCESS) { printf("%s CUFFT ERROR %d\n", NAME, status); }

  // Performs the forward FFTW based on the created plan. Note that this is in-place: the source
  // and destination buffer (both on the GPU!) are the same. This call is a-synchronous and thus
  // requires 'cudaDeviceSynchronize' afterwards to ensure completion of the work on the GPU.
  printf("%sPerforming a 1D forward C2C FFT on the GPU\n", NAME);
  status = cufftExecC2C(plan, signal_device, signal_device, CUFFT_FORWARD);
  if (status != CUFFT_SUCCESS) { printf("%s CUFFT ERROR %d\n", NAME, status); }
  cudaDeviceSynchronize();

  // As above, but now performs an inverse FFT to 'undo' the previous FFT (allows for easier
  // validation).
  printf("%sPerforming a 1D inverse C2C FFT on the GPU\n", NAME);
  status = cufftExecC2C(plan, signal_device, signal_device, CUFFT_INVERSE);
  if (status != CUFFT_SUCCESS) { printf("%s CUFFT ERROR %d\n", NAME, status); }
  cudaDeviceSynchronize();

  // Copies the results back from the device (==GPU) to the host (==CPU)
  printf("%sCopying the results back to the CPU\n", NAME);
  cudaMemcpy(signal_host, signal_device, sizeof(cufftComplex)*SIZE*BATCHES, cudaMemcpyDeviceToHost);

  // Divides the result by the signal size to obtain the original input signal again. Since cuFFT
  // does not normalize the signal, this is done on the host.
  for (int i=0; i<SIZE*BATCHES; ++i) {
    signal_host[i].real(signal_host[i].real() / ((float)SIZE));
    signal_host[i].imag(signal_host[i].imag() / ((float)SIZE));
  }

  // Iterates over all signal elements and verifies the resulting signal against the original
  // input. There might be some rounding errors, but they should be smaller than the absolute error
  // margin.
  int num_errors = 0;
  for (int i=0; i<SIZE*BATCHES; ++i) {
    if (fabs(example_data[i].real() - signal_host[i].real()) > ERROR_MARGIN ||
        fabs(example_data[i].imag() - signal_host[i].imag()) > ERROR_MARGIN) {
      num_errors++;
    }
  }
  printf("%sValidation: %d out of %d correct\n", NAME, SIZE*BATCHES - num_errors, SIZE*BATCHES);

  // Cleans-up the CPU/GPU memory and cuFFT plan
  printf("%sAll done\n", NAME);
  cufftDestroy(plan);
  cudaFree(signal_device);
  delete signal_host;
  delete example_data;

  // End of the code example
  printf("\n");
  return 0;
}

// =================================================================================================