Commit 2c57323d authored by petros.anastasiadis

Update 27/09/2017 - Added more comments

parent 326a4535
@@ -11,11 +11,11 @@
#SBATCH --job-name=run_GPU # Job name
#SBATCH --output=J.out # Stdout (%j expands to jobId)
#SBATCH --error=J.err # Stderr (%j expands to jobId)
#SBATCH --ntasks=4 # Number of processor cores (i.e. tasks)
#SBATCH --nodes=4 # Number of nodes requested
#SBATCH --ntasks-per-node=1 # Tasks per node
#SBATCH --ntasks=16 # Number of processor cores (i.e. tasks)
#SBATCH --nodes=8 # Number of nodes requested
#SBATCH --ntasks-per-node=2 # Tasks per node
#SBATCH --cpus-per-task=1 # Threads per task
#SBATCH --gres=gpu:1 # GPUs per node
#SBATCH --gres=gpu:2 # GPUs per node
#SBATCH --time=00:40:00 # walltime
#SBATCH --mem=32G # memory per NODE
#SBATCH --partition=gpu # Partition
@@ -31,16 +31,19 @@ module load intelmpi
module load binutils
module load cuda
export I_MPI_FABRICS=shm:dapl
output="/users/guest/petyros/Training/Outputs" ##/Inputs
partition="gpu"
## Change this to the directory of your executable!
gpu_prog="/users/guest/petyros/Training/GPUs/cuBLAS"
gpu_prog1="/users/guest/petyros/Training/GPUs/cuBLAS_MultiGPU"
rm -f "$output/Single_GPU.$partition" "$output/Multi_GPU.$partition"
#rm -f "$output/Multi_GPU.$partition" "$output/Single_GPU.$partition"
## Important note: for full GPU utilization in the MultiGPU version, the gres value must equal ntasks-per-node (illustrated after this script)!
for n;
do
#srun $gpu_prog $n $n >> "$output/Single_GPU.$partition"
srun $gpu_prog1 $n $n >> "$output/Multi_GPU.$partition"
done
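The note in the script is easiest to see with the numbers side by side: each MPI task drives one GPU, so a node that requests `--gres=gpu:2` should also run two tasks. A minimal header sketch with the values used in the updated script (the `srun` line and the problem size are illustrative, not part of the commit):

```bash
#!/bin/bash
#SBATCH --nodes=8               # nodes requested
#SBATCH --ntasks-per-node=2     # two MPI tasks per node ...
#SBATCH --gres=gpu:2            # ... matching two GPUs per node (gres == ntasks-per-node)
#SBATCH --ntasks=16             # nodes * ntasks-per-node
#SBATCH --cpus-per-task=1
#SBATCH --partition=gpu
#SBATCH --time=00:40:00

# Each of the 16 tasks is expected to use one of its node's two GPUs.
srun ./cuBLAS_MultiGPU 25000 25000   # illustrative problem size
```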
@@ -5,6 +5,8 @@
07/09/2017: Completed
13/09/2017: Modified to use unified memory
->CUDA (under construction)
->cuBLAS_MultiGPU (cuBLAS implementation on multiple GPUs/Nodes)
26/09/2017: Completed
```
@@ -58,7 +58,9 @@ int main(int argc, char ** argv)
}
x = (double *) malloc(global_padded_nm[1] * sizeof(*x));
if (rank==0) {
/* Initialize proc 0 memory/data */
M = (double *) malloc(global_padded_nm[0] * global_padded_nm[1] * sizeof(*M));
M_cl = (double *) malloc(global_padded_nm[0] * global_padded_nm[1] * sizeof(*M_cl));
vec_init_rand_p(x, global_nm[1], global_padded_nm[1] - global_nm[1], 1.0);
@@ -109,7 +111,7 @@ int main(int argc, char ** argv)
cublasHandle_t handle;
stat = cublasCreate(&handle);
/* Initialize local GPU memory */
/* Initialize local GPU memory. Unified memory is not recommended for multi-GPU, multi-node runs because the data size tends to be large */
double * gpu_y = (double *) gpu_alloc(local_nm[0] * sizeof(*gpu_y)) ;
double * gpu_xc = (double *) gpu_alloc(local_nm[1] * sizeof(*gpu_xc)) ;
double * gpu_A = (double *) gpu_alloc(local_nm[0] * local_nm[1] * sizeof(*gpu_A)) ;
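gpu_alloc() and copy_to_gpu() are not shown in this diff; a plausible reading, consistent with the comment above about avoiding unified memory, is that they wrap plain cudaMalloc()/cudaMemcpy(). A minimal sketch under that assumption (the error handling is illustrative):

```c
#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>

/* Hypothetical helpers (the real gpu_alloc/copy_to_gpu are not in this diff):
   explicit device allocation and host-to-device copies, as opposed to
   cudaMallocManaged(), whose page migration can hurt when every node holds a
   large local block of the matrix. */
void * gpu_alloc(size_t count)
{
    void *ptr = NULL;
    if (cudaMalloc(&ptr, count) != cudaSuccess) {
        fprintf(stderr, "gpu_alloc: cudaMalloc(%zu) failed\n", count);
        exit(EXIT_FAILURE);
    }
    return ptr;
}

void copy_to_gpu(const void *host, void *device, size_t count)
{
    if (cudaMemcpy(device, host, count, cudaMemcpyHostToDevice) != cudaSuccess) {
        fprintf(stderr, "copy_to_gpu: cudaMemcpy(%zu) failed\n", count);
        exit(EXIT_FAILURE);
    }
}
```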
@@ -117,6 +119,7 @@ int main(int argc, char ** argv)
copy_to_gpu(local_y, gpu_y, local_nm[0] * sizeof(*local_y));
copy_to_gpu(x_c, gpu_xc, local_nm[1] * sizeof(*x_c));
copy_to_gpu(A, gpu_A, local_nm[0] * local_nm[1] * sizeof(*A));
/* Warmup */
stat=cublasDgemv(handle, CUBLAS_OP_N, local_nm[0], local_nm[1], &alf, gpu_A , local_nm[0], gpu_xc, 1, &beta, gpu_y, 1);
cudaDeviceSynchronize();
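The warmup call matters for the timings reported below: the first cuBLAS call on a device absorbs one-off setup costs, so it is issued once untimed, and the measured call is bracketed with cudaDeviceSynchronize(). The actual timing code is outside the shown context; a sketch of the likely pattern, reusing the variables from the hunk above:

```c
/* Hypothetical timing pattern around cublasDgemv (the full loop is not shown
   in this diff): one untimed warmup call, then a timed call fenced by
   cudaDeviceSynchronize() so comp_t measures completed GPU work only. */
double t0, comp_t;
stat = cublasDgemv(handle, CUBLAS_OP_N, local_nm[0], local_nm[1], &alf,
                   gpu_A, local_nm[0], gpu_xc, 1, &beta, gpu_y, 1); /* warmup */
cudaDeviceSynchronize();

t0 = MPI_Wtime();
stat = cublasDgemv(handle, CUBLAS_OP_N, local_nm[0], local_nm[1], &alf,
                   gpu_A, local_nm[0], gpu_xc, 1, &beta, gpu_y, 1);
cudaDeviceSynchronize();            /* wait for the GPU before stopping the clock */
comp_t = MPI_Wtime() - t0;
```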
@@ -142,6 +145,7 @@ int main(int argc, char ** argv)
MPI_Reduce(local_y, y, local_nm[0], MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
//MPI_Gather(local_y, local_nm[0], MPI_DOUBLE, y, local_nm[0], MPI_DOUBLE, 0, MPI_COMM_WORLD);
if (rank==0) comm_t = MPI_Wtime() - comm_t;
if (rank == 0) {
#ifdef _DEBUG_
/* Output y vector to a file for debugging */
......
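A plausible reading of this hunk (the decomposition code itself is outside the shown context, so this is an assumption): the matrix is split by columns across tasks, which matches the result lines below where M shrinks with the task count while N stays fixed. Each task computes a full-length partial y from its column block and its slice x_c of x, and MPI_Reduce with MPI_SUM adds the partial vectors on rank 0; the commented-out MPI_Gather would instead fit a row-block split, where ranks own disjoint slices of y. Schematically (names other than y are stand-ins, not from the commit):

```c
/* Column-block split: every rank computes a partial result of full length N,
   so the partial vectors are summed on rank 0. */
MPI_Reduce(y_partial, y, N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);

/* Row-block split (what the commented-out call would suit): every rank owns a
   disjoint N/P slice of y, so the slices are concatenated on rank 0. */
MPI_Gather(y_slice, N / P, MPI_DOUBLE, y, N / P, MPI_DOUBLE, 0, MPI_COMM_WORLD);
```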
@@ -11,8 +11,8 @@
#SBATCH --job-name=run_mpi # Job name
#SBATCH --output=J.out # Stdout (%j expands to jobId)
#SBATCH --error=J.err # Stderr (%j expands to jobId)
#SBATCH --ntasks=20 # Number of processor cores (i.e. tasks)
#SBATCH --nodes=20 # Number of nodes requested
#SBATCH --ntasks=256 # Number of processor cores (i.e. tasks)
#SBATCH --nodes=256 # Number of nodes requested
#SBATCH --ntasks-per-node=1 # Tasks per node
#SBATCH --cpus-per-task=20 # Threads per task
#SBATCH --time=00:10:00 # walltime
......
MPI Version(N=10000, M=10000, Tasks=1, Nodes=1, Tasks/Node=1, threads=1): comp_t= 56.147239 ms, comm_t= 282.535076 ms
MPI Version(N=10000, M=10000, Tasks=1, Nodes=1, Tasks/Node=1, threads=2): comp_t= 31.577201 ms, comm_t= 357.761860 ms
MPI Version(N=10000, M=10000, Tasks=1, Nodes=1, Tasks/Node=1, threads=5): comp_t= 14.052680 ms, comm_t= 331.711054 ms
MPI Version(N=10000, M=10000, Tasks=1, Nodes=1, Tasks/Node=1, threads=10): comp_t= 9.305849 ms, comm_t= 350.841999 ms
MPI Version(N=10000, M=10000, Tasks=1, Nodes=1, Tasks/Node=1, threads=20): comp_t= 8.766048 ms, comm_t= 351.128340 ms
MPI Version(N=10000, M=10000, Tasks=2, Nodes=1, Tasks/Node=2, threads=1): comp_t= 56.985431 ms, comm_t= 623.568058 ms
MPI Version(N=10000, M=10000, Tasks=2, Nodes=1, Tasks/Node=2, threads=2): comp_t= 33.769939 ms, comm_t= 564.308167 ms
MPI Version(N=10000, M=10000, Tasks=2, Nodes=1, Tasks/Node=2, threads=5): comp_t= 14.608409 ms, comm_t= 523.460150 ms
MPI Version(N=10000, M=10000, Tasks=2, Nodes=1, Tasks/Node=2, threads=10): comp_t= 9.698389 ms, comm_t= 781.961203 ms
MPI Version(N=10000, M=10000, Tasks=2, Nodes=1, Tasks/Node=2, threads=20): comp_t= 8.965359 ms, comm_t= 732.479095 ms
MPI Version(N=10000, M=10000, Tasks=2, Nodes=2, Tasks/Node=1, threads=1): comp_t= 28.428760 ms, comm_t= 212.807178 ms
MPI Version(N=10000, M=10000, Tasks=2, Nodes=2, Tasks/Node=1, threads=2): comp_t= 16.325829 ms, comm_t= 250.179052 ms
MPI Version(N=10000, M=10000, Tasks=2, Nodes=2, Tasks/Node=1, threads=5): comp_t= 7.145081 ms, comm_t= 240.736961 ms
MPI Version(N=10000, M=10000, Tasks=2, Nodes=2, Tasks/Node=1, threads=10): comp_t= 4.783680 ms, comm_t= 250.171900 ms
MPI Version(N=10000, M=10000, Tasks=2, Nodes=2, Tasks/Node=1, threads=20): comp_t= 4.393589 ms, comm_t= 250.375986 ms
MPI Version(N=10000, M=10000, Tasks=10, Nodes=10, Tasks/Node=1, threads=1): comp_t= 5.696092 ms, comm_t= 138.546705 ms
MPI Version(N=10000, M=10000, Tasks=10, Nodes=10, Tasks/Node=1, threads=2): comp_t= 3.199260 ms, comm_t= 148.300886 ms
MPI Version(N=10000, M=10000, Tasks=10, Nodes=10, Tasks/Node=1, threads=5): comp_t= 1.369910 ms, comm_t= 144.892931 ms
MPI Version(N=10000, M=10000, Tasks=10, Nodes=10, Tasks/Node=1, threads=10): comp_t= 0.935729 ms, comm_t= 146.556139 ms
MPI Version(N=10000, M=10000, Tasks=10, Nodes=10, Tasks/Node=1, threads=20): comp_t= 0.849500 ms, comm_t= 146.602154 ms
MPI Version(N=10000, M=10000, Tasks=10, Nodes=10, Tasks/Node=1, threads=1): comp_t= 5.799670 ms, comm_t= 136.942148 ms
MPI Version(N=10000, M=10000, Tasks=10, Nodes=10, Tasks/Node=1, threads=2): comp_t= 3.048239 ms, comm_t= 137.932062 ms
MPI Version(N=10000, M=10000, Tasks=10, Nodes=10, Tasks/Node=1, threads=5): comp_t= 1.597302 ms, comm_t= 138.103962 ms
MPI Version(N=10000, M=10000, Tasks=10, Nodes=10, Tasks/Node=1, threads=10): comp_t= 1.523440 ms, comm_t= 138.696909 ms
MPI Version(N=10000, M=10000, Tasks=10, Nodes=10, Tasks/Node=1, threads=20): comp_t= 0.845509 ms, comm_t= 147.261143 ms
MPI Version(N=10000, M=10000, Tasks=20, Nodes=20, Tasks/Node=1, threads=1): comp_t= 2.793679 ms, comm_t= 127.507210 ms
MPI Version(N=10000, M=10000, Tasks=20, Nodes=20, Tasks/Node=1, threads=2): comp_t= 1.446950 ms, comm_t= 127.897024 ms
MPI Version(N=10000, M=10000, Tasks=20, Nodes=20, Tasks/Node=1, threads=5): comp_t= 0.753410 ms, comm_t= 127.707005 ms
MPI Version(N=10000, M=10000, Tasks=20, Nodes=20, Tasks/Node=1, threads=10): comp_t= 0.681820 ms, comm_t= 128.866911 ms
MPI Version(N=10000, M=10000, Tasks=20, Nodes=20, Tasks/Node=1, threads=20): comp_t= 0.168869 ms, comm_t= 133.314133 ms
MPI Version(N=10000, M=10000, Tasks=20, Nodes=20, Tasks/Node=1, threads=1): comp_t= 2.796841 ms, comm_t= 127.683878 ms
MPI Version(N=10000, M=10000, Tasks=20, Nodes=20, Tasks/Node=1, threads=2): comp_t= 1.036122 ms, comm_t= 133.171797 ms
MPI Version(N=10000, M=10000, Tasks=20, Nodes=20, Tasks/Node=1, threads=5): comp_t= 0.494668 ms, comm_t= 133.764267 ms
MPI Version(N=10000, M=10000, Tasks=20, Nodes=20, Tasks/Node=1, threads=10): comp_t= 0.266960 ms, comm_t= 135.045052 ms
MPI Version(N=10000, M=10000, Tasks=20, Nodes=20, Tasks/Node=1, threads=20): comp_t= 0.166440 ms, comm_t= 133.502960 ms
MPI Version(N=25000, M=25000, Tasks=2, Nodes=2, Tasks/Node=1, threads=1): comp_t= 194.178829 ms, comm_t= 1332.824230 ms
MPI Version(N=25000, M=25000, Tasks=2, Nodes=2, Tasks/Node=1, threads=2): comp_t= 110.103779 ms, comm_t= 1798.905134 ms
MPI Version(N=25000, M=25000, Tasks=2, Nodes=2, Tasks/Node=1, threads=5): comp_t= 47.298059 ms, comm_t= 1719.721079 ms
MPI Version(N=25000, M=25000, Tasks=2, Nodes=2, Tasks/Node=1, threads=10): comp_t= 30.680361 ms, comm_t= 1669.413805 ms
MPI Version(N=25000, M=25000, Tasks=2, Nodes=2, Tasks/Node=1, threads=20): comp_t= 27.329772 ms, comm_t= 1805.709839 ms
MPI Version(N=25000, M=25000, Tasks=4, Nodes=4, Tasks/Node=1, threads=1): comp_t= 97.456789 ms, comm_t= 1045.057058 ms
MPI Version(N=25000, M=25000, Tasks=4, Nodes=4, Tasks/Node=1, threads=2): comp_t= 55.045478 ms, comm_t= 1212.171316 ms
MPI Version(N=25000, M=25000, Tasks=4, Nodes=4, Tasks/Node=1, threads=5): comp_t= 23.733768 ms, comm_t= 1144.655228 ms
MPI Version(N=25000, M=25000, Tasks=4, Nodes=4, Tasks/Node=1, threads=10): comp_t= 15.427949 ms, comm_t= 1174.911976 ms
MPI Version(N=25000, M=25000, Tasks=4, Nodes=4, Tasks/Node=1, threads=20): comp_t= 13.703361 ms, comm_t= 1173.395872 ms
MPI Version(N=25000, M=25000, Tasks=8, Nodes=8, Tasks/Node=1, threads=1): comp_t= 48.671141 ms, comm_t= 909.532785 ms
MPI Version(N=25000, M=25000, Tasks=8, Nodes=8, Tasks/Node=1, threads=2): comp_t= 27.521832 ms, comm_t= 966.041803 ms
MPI Version(N=25000, M=25000, Tasks=8, Nodes=8, Tasks/Node=1, threads=5): comp_t= 11.969309 ms, comm_t= 951.106071 ms
MPI Version(N=25000, M=25000, Tasks=8, Nodes=8, Tasks/Node=1, threads=10): comp_t= 7.720699 ms, comm_t= 964.529991 ms
MPI Version(N=25000, M=25000, Tasks=8, Nodes=8, Tasks/Node=1, threads=20): comp_t= 6.873391 ms, comm_t= 965.454817 ms
MPI Version(N=25000, M=25000, Tasks=16, Nodes=16, Tasks/Node=1, threads=1): comp_t= 24.376562 ms, comm_t= 851.062775 ms
MPI Version(N=25000, M=25000, Tasks=16, Nodes=16, Tasks/Node=1, threads=2): comp_t= 13.838730 ms, comm_t= 876.939058 ms
MPI Version(N=25000, M=25000, Tasks=16, Nodes=16, Tasks/Node=1, threads=5): comp_t= 6.032262 ms, comm_t= 869.652748 ms
MPI Version(N=25000, M=25000, Tasks=16, Nodes=16, Tasks/Node=1, threads=10): comp_t= 3.906670 ms, comm_t= 872.004032 ms
MPI Version(N=25000, M=25000, Tasks=16, Nodes=16, Tasks/Node=1, threads=20): comp_t= 3.491812 ms, comm_t= 880.733728 ms
MPI Version(N=25000, M=25000, Tasks=32, Nodes=32, Tasks/Node=1, threads=1): comp_t= 12.220020 ms, comm_t= 817.686081 ms
MPI Version(N=25000, M=25000, Tasks=32, Nodes=32, Tasks/Node=1, threads=2): comp_t= 6.964960 ms, comm_t= 830.610991 ms
MPI Version(N=25000, M=25000, Tasks=32, Nodes=32, Tasks/Node=1, threads=5): comp_t= 3.068931 ms, comm_t= 823.021889 ms
MPI Version(N=25000, M=25000, Tasks=32, Nodes=32, Tasks/Node=1, threads=10): comp_t= 2.005892 ms, comm_t= 833.809614 ms
MPI Version(N=25000, M=25000, Tasks=32, Nodes=32, Tasks/Node=1, threads=20): comp_t= 1.813321 ms, comm_t= 825.877905 ms
Multi GPU CUDA-MPI Version(N=10000, M=2500, GPUs/Node=1, Nodes=4, Tasks/Node=1): comp_t= 1.087799 ms, comm_t= 213.823795 ms
Multi GPU CUDA-MPI Version(N=25000, M=12500, GPUs/Node=2, Nodes=1, Tasks/Node=2): comp_t= 13.342609 ms, comm_t= 1756.819963 ms
Multi GPU CUDA-MPI Version(N=25000, M=6250, GPUs/Node=2, Nodes=2, Tasks/Node=2): comp_t= 6.707978 ms, comm_t= 1122.361183 ms
Multi GPU CUDA-MPI Version(N=25000, M=3125, GPUs/Node=2, Nodes=4, Tasks/Node=2): comp_t= 3.318169 ms, comm_t= 955.436945 ms
Multi GPU CUDA-MPI Version(N=25000, M=1563, GPUs/Node=2, Nodes=8, Tasks/Node=2): comp_t= 3.125570 ms, comm_t= 966.177940 ms
Multi GPU CUDA-MPI Version(N=25000, M=782, GPUs/Node=2, Nodes=16, Tasks/Node=2): comp_t= 2.513990 ms, comm_t= 871.755123 ms
OpenMP Version(N=10000, M=10000, Threads=1): t= 177.114110 ms
OpenMP Version(N=10000, M=10000, Threads=2): t= 70.624578 ms
OpenMP Version(N=10000, M=10000, Threads=5): t= 32.878809 ms
OpenMP Version(N=10000, M=10000, Threads=10): t= 28.578160 ms
OpenMP Version(N=10000, M=10000, Threads=20): t= 30.562098 ms
OpenMP Version(N=10000, M=10000, Threads=40): t= 29.978011 ms
OpenMP Version(N=25000, M=25000, Threads=1): t= 1170.268281 ms
OpenMP Version(N=25000, M=25000, Threads=2): t= 462.748051 ms
OpenMP Version(N=25000, M=25000, Threads=5): t= 233.253169 ms
OpenMP Version(N=25000, M=25000, Threads=10): t= 180.470340 ms
OpenMP Version(N=25000, M=25000, Threads=20): t= 189.864190 ms
OpenMP Version(N=25000, M=25000, Threads=40): t= 186.259170 ms
OpenMP Version(N=10000, M=10000, Threads=1): t= 68.127999 ms
OpenMP Version(N=10000, M=10000, Threads=2): t= 30.999739 ms
OpenMP Version(N=10000, M=10000, Threads=5): t= 13.679490 ms
OpenMP Version(N=10000, M=10000, Threads=10): t= 7.440660 ms
OpenMP Version(N=10000, M=10000, Threads=20): t= 4.669971 ms
OpenMP Version(N=10000, M=10000, Threads=40): t= 4.422231 ms
OpenMP Version(N=25000, M=25000, Threads=1): t= 456.858909 ms
OpenMP Version(N=25000, M=25000, Threads=2): t= 228.760691 ms
OpenMP Version(N=25000, M=25000, Threads=5): t= 94.036951 ms
OpenMP Version(N=25000, M=25000, Threads=10): t= 49.852111 ms
OpenMP Version(N=25000, M=25000, Threads=20): t= 30.018451 ms
OpenMP Version(N=25000, M=25000, Threads=40): t= 27.299979 ms
cd /users/guest/petyros/Training/Serial
sbatch Serial.slurm
cd /users/guest/petyros/Training/Multicore/OpenMP
sbatch OpenMP.slurm
cd /users/guest/petyros/Training/GPUs/Single_GPU/cuBLAS
sbatch S_GPU_cuB.slurm
sbatch Serial.slurm 50000
cd /users/guest/petyros/Training/OpenMP
sbatch OpenMP.slurm 50000
cd /users/guest/petyros/Training/GPUs
sbatch GPU.slurm 50000
cd /users/guest/petyros/Training/MPI
sbatch MPI.slurm 50000
Single GPU CUDA Version(N=25000, M=25000): t= 26.747921 ms