Newer
Older
/*
* A Hybrid MPI-OpenMP implementation of the Matrix-Vector multiplication
*
* Author: Petros Anastasiadis(panastas@cslab.ece.ntua.gr)
*
* For more info about hybrid MPI-OpenMP see http://openmp.org/wp-content/uploads/HybridPP_Slides.pdf
*/
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <sys/time.h>
#include <mpi.h>
/* Need to include External_Functions for these */
#include "matrix_op.h"
#include "util.h"
#include "input.h"
int main(int argc, char ** argv) {
int rank,size;
int global_nm[2],local_nm[2]; //global matrix dimensions and local matrix dimensions (2D-domain, 2D-subdomain)
int global_padded_nm[2]; //padded global matrix dimensions (if padding is not needed, global_padded=global)
int i,j,k;
double * M, * A, * x, * y, *local_y, comm_t, comp_t;
/* MPI basic initializations */
MPI_Init(&argc,&argv);
MPI_Comm_size(MPI_COMM_WORLD,&size);
MPI_Comm_rank(MPI_COMM_WORLD,&rank);
if (argc < 3) error("Usage: ./Program N M");
else if ( argc == 3) { /*./Program N M */
global_nm[0] = atoi(argv[1]);
global_nm[1] = atoi(argv[2]);
local_nm[1]=global_nm[1];
global_padded_nm[1]=global_nm[1];
if (global_nm[0]%size==0) {
local_nm[0]=global_nm[0]/size;
global_padded_nm[0]=global_nm[0];
}
else {
local_nm[0]=(global_nm[0]/size)+1;
global_padded_nm[0]=local_nm[0]*size;
}
x = (double *) malloc(global_padded_nm[1] * sizeof(*x));
if (rank==0) {
M = (double *) malloc(global_padded_nm[0] * global_padded_nm[1] * sizeof(*M));
vec_init_rand(x, global_padded_nm[1], 1.0);
y = (double *) malloc(global_padded_nm[0] * sizeof(*y));
vec_init(y, global_padded_nm[0], 0.0);
if( !y || !x || !M ) error("memory allocation failed");
/* Initialize matrices */
ser_matrix_init_rand_p(M, global_nm[0], global_nm[1], global_padded_nm[1] * (global_padded_nm[0] - global_nm[0]), 1.0); /* Normal matrices generated randomly */
}
local_y = (double *) malloc(local_nm[0] * sizeof(*local_y));
vec_init(local_y, local_nm[0], 0.0);
//if(rank==0) printf("Local[0]=%d Local[1]=%d global_padded[0]=%d global_padded[1]=%d\n",local[0],local[1],global_padded[0],global_padded[1]);
A = (double *) malloc(local_nm[0] * local_nm[1] * sizeof(*A));
#pragma omp parallel for schedule(static) /* Initialize data for each thread in corresponding socket with first-touch policy */
for( i=0 ; i<local_nm[0] ; ++i){
for ( j=0 ; j< local_nm[1] ; ++j) A[i*local_nm[1]+j]=0.0;
//printf( "Initialize data Thread=%d i=%d\n", omp_get_thread_num(), i);
}
double * gsendbuf;
if (rank == 0){
gsendbuf = &(M[0]);
comm_t= MPI_Wtime();
}
MPI_Scatter(gsendbuf, local_nm[1] * local_nm[0], MPI_DOUBLE, A, local_nm[1] * local_nm[0], MPI_DOUBLE, 0, MPI_COMM_WORLD);
//if (rank == 0) printf("Scatter complete\n");
MPI_Bcast(x, global_padded_nm[1], MPI_DOUBLE, 0, MPI_COMM_WORLD);
if (rank==0) {
printf("MPI-OpenMP Version(N=%d, M=%d, Tasks=%d, Nodes=%s, Tasks/Node=%s, threads=%s): ", global_nm[0], global_nm[1], size,
getenv("SLURM_JOB_NUM_NODES"), getenv("SLURM_NTASKS_PER_NODE"), getenv("OMP_NUM_THREADS"));
comp_t= MPI_Wtime();
}
for (i = 0; i < NR_ITER; ++i){
register double yi = 0;
#pragma omp parallel for private(j,yi) shared(local_nm,A,local_y) schedule(static)
for (k = 0; k < local_nm[0]; ++k) {
//printf( "Compute Thread=%d i=%d\n", omp_get_thread_num(), k);
yi = 0.0;
for (j = 0; j < local_nm[1]; ++j) yi += A[k*local_nm[1]+j]*x[j];
local_y[k] = yi;
}
}
MPI_Barrier(MPI_COMM_WORLD);
if (rank==0) comp_t= MPI_Wtime() - comp_t;
MPI_Gather(local_y, local_nm[0], MPI_DOUBLE, y, local_nm[0], MPI_DOUBLE, 0, MPI_COMM_WORLD);
if (rank==0) {
comm_t = MPI_Wtime() - comm_t - comp_t;
/* Output y vector to a file for debugging */
char filename[] = "MPI-OpenMP.debug" ;
if(( fp = fopen( filename, "w")) == NULL) error("Output file creation failed\n");
for (k = 0; k < global_nm[0]; ++k) fprintf(fp, "%lf ", y[k]) ;
fclose(fp) ;
#endif
report_mpi_results(comm_t, comp_t);
/* Free rank 0 local memmory */