mklspgemm.c

/*includes {{{*/
#include <stdio.h>
#include <assert.h>
#include <math.h>
#include <mkl.h>
#include "mmio.h"
/*}}}*/

/* global varibles and defines {{{*/
int micdev = 0;
__declspec(target(mic)) int nrepeat = 100;
__declspec(target(mic)) int option_print_matrices = 0;
#define OPTION_NOPRINT_MATRICES 0
#define OPTION_PRINT_MATRICES 1
__declspec(target(mic)) char transa = 'n';
typedef int csi;
typedef double csv;
csv zero = 0.0;
#define CS_MAX(a,b) (((a) > (b)) ? (a) : (b))
__declspec(target(mic)) int nthreads;
typedef struct csr_t {
  csi  m; 
  csi  n; 
  csi  nzmax;
  csi  nr; 
  csi* r; 
  csi* p; 
  csi* j; 
  csv* x; 
} csr;
/*}}}*/

/* method declarations {{{*/
int  main(int argc, char* argv[]);
__declspec(target(mic)) void printmm_one(int m, double* Aval, int* AJ, int* AI);
__declspec(target(mic)) void printfilemm_one(char* file, int m, int n, double* Aval, int* AJ, int* AI);
__declspec(target(mic)) void printmm_zero(int m, double* Aval, int* AJ, int* AI);
csr *csr_spfree(csr *A);
/*}}}*/

/*csr util{{{*/

/* free workspace and return a sparse matrix result */
csr *csr_done(csr *C, void *w, void *x, csi ok) {
        return(ok ? C : csr_spfree(C)); /* return result if OK, else free it */
}

/* wrapper for free */
void *cs_free(void *p) {
        if (p)
                free(p); /* free p if it is not already NULL */
        return(NULL); /* return NULL to simplify the use of cs_free */
}

/* wrapper for realloc */
void *csr_realloc(void *p, csi n, size_t size, csi *ok) {
        void *pnew = NULL;
        pnew = realloc(p, CS_MAX(n, 1) * size); /* realloc the block */
        *ok = (pnew != NULL); /* realloc fails if pnew is NULL */
        if (pnew == NULL) {
                printf("%d:reallocation failed, pnew is NULL\n", __LINE__);
        }
//printf("%s:%d: n=%d ok=%d\n", __FUNCTION__, __LINE__, n, *ok);
        return((*ok) ? pnew : p); /* return original p if failure */
}

/* wrapper for realloc */
void *cs_realloc(void *p, csi n, size_t size, csi *ok) {
        void *pnew = NULL;
        pnew = realloc(p, CS_MAX(n, 1) * size); /* realloc the block */
        *ok = (pnew != NULL); /* realloc fails if pnew is NULL */
        if (pnew == NULL) {
                printf("reallocation failed\n");
        }
        return((*ok) ? pnew : p); /* return original p if failure */
}

/* change the max # of entries sparse matrix */
csi csr_sprealloc(csr *A, csi nzmax) {
        csi ok, oki = 0, okj = 1, okx = 1;
        if (!A)
                return(0);
        if (nzmax <= 0)
                nzmax = A->p[A->m];
        A->j = (int*)csr_realloc(A->j, nzmax, sizeof(csi), &oki);
        if (A->x)
                A->x = (csv*)csr_realloc(A->x, nzmax, sizeof(csv), &okx);
        ok = (oki && okj && okx);
        if (ok)
                A->nzmax = nzmax;
        return(ok);
}

/* free a sparse matrix */
csr *csr_spfree(csr *A) {
        if (!A)
                return(NULL); /* do nothing if A already NULL */
        cs_free(A->p);
        A->p = NULL;
        cs_free(A->j);
        A->j = NULL;
        cs_free(A->x);
        A->x = NULL;
        cs_free(A->r);
        A->r = NULL;
        cs_free(A); /* free the cs struct and return NULL */
    return NULL;
}

/* allocate a sparse matrix (triplet form or compressed-ROW form) */
csr *csr_spalloc(csi m, csi n, csi nzmax, int values, int triplet, csv f) {
        csr* A = (csr*)calloc(1, sizeof(csr)); /* allocate the cs struct */
        if (!A) {
                perror("sparse allocation failed");
                return(NULL); /* out of memory */
        }
        A->m = m; /* define dimensions and nzmax */
        A->n = n;
        A->nzmax = nzmax = CS_MAX(nzmax, 0);
        A->nr = 0;  // number of nonzero rows
        A->p = (csi*)calloc(m + 2, sizeof(csi));
        A->j = (csi*)calloc(CS_MAX(nzmax,1), sizeof(csi));
        A->x = (csv*)calloc(CS_MAX(nzmax,1), sizeof(csv));
        return((!A->p || !A->j || !A->x) ? csr_spfree(A) : A);

}/*}}}*/

/** Multiply two sparse matrices which are stored in CSR format. MKL is used */
csr *csr_multiply(csi Am, csi An, csi Anzmax, const csi* Ap, const csi* Aj, const csv* Ax, csi Bm, csi Bn, csi Bnzmax, const csi* Bp, const csi* Bj, const csv* Bx, long* nummult, csi* xb, csv* x) { /*{{{*/
    csv tf = 0;
    csi p, jp, j, kp, k, i, nz = 0, anz, *Cp, *Cj, m, n,
        bnz, values = 1;
    csv *Cx;
    csr *C;
    if (An != Bm)
        return(NULL);
    if (Anzmax == 0 || Bnzmax == 0) {
        C = csr_spalloc(Am, Bn, 0, values, 0, tf);
        return C;
    }
    m = Am;
    anz = Ap[Am];
    n = Bn;
    bnz = Bp[Bm];
    for(i = 0; i < n; i++) xb[i] = 0;
    for(i = 0; i < n; i++)
        xb[i] = 0;
    values = (Ax != NULL) && (Bx != NULL);
    csi tnz = (anz + bnz) * 2;
    C = csr_spalloc(m, n, tnz, values, 0, tf); /* allocate result */
    if (!C || !xb || (values && !x))
        return (csr_done(C, xb, x, 0));
    Cp = C->p;
    for (i = 0; i < m; i++) {
        if ( ( (nz + n) > C->nzmax ) ) {
            if(!csr_sprealloc(C, (2 * (C->nzmax) + n) ) ) {
                return (csr_done(C, xb, x, 0)); // out of memory
            } else {
            }
        }
        Cj = C->j;
        Cx = C->x; /* C->j and C->x may be reallocated */
        Cp[i] = nz; /* row i of C starts here */
        for (jp = Ap[i]; jp < Ap[i + 1]; jp++) {
            j = Aj[jp];
            for (kp = Bp[j]; kp < Bp[j + 1]; kp++) {
                k = Bj[kp]; /* B(i,j) is nonzero */
                if (xb[k] != i + 1) {
                    xb[k] = i + 1; /* i is new entry in column j */
                    Cj[nz++] = k; /* add i to pattern of C(:,j) */
                    if (x) {
                        x[k] = Ax[jp] * Bx[kp]; /* x(i) = beta*A(i,j) */
                        (*nummult)++;
                    }
                } else if (x) {
                    x[k] += (Ax[jp] * Bx[kp]); /* i exists in C(:,j) already */
                    (*nummult)++;
                }
            }
        }
        if (values)
            for (p = Cp[i]; p < nz; p++)
                Cx[p] = x[Cj[p]];
    }
    Cp[m] = nz; /* finalize the last row of C */
    csr_sprealloc(C, 0); /* remove extra space from C */
    xb = NULL;
    x = NULL;
    return C;
}/*}}}*/

/** Multiply two sparse matrices which are stored in CSR format. MKL is used */
void mkl_cpu_spgemm(MKL_INT Am, MKL_INT An, MKL_INT Annz, double* Aval, MKL_INT* AJ, MKL_INT* AI, MKL_INT Bn, MKL_INT Bnnz, double* Bval, MKL_INT* BJ, MKL_INT* BI, double** pCval, MKL_INT** pCJ, MKL_INT** pCI, double* time) {  /*{{{*/

MKL_INT* CJ = NULL;
double* Cval = NULL;
MKL_INT sort = 3;    // sort everything
MKL_INT* CI = (MKL_INT*)mkl_malloc( (Am+2) * sizeof( MKL_INT ), 64 );
MKL_INT nnzmax = 0;    // nnzmax is zero in case of symbolic&numeric usage of mkl_?csrmultcsr
MKL_INT ierr;
MKL_INT request = 1;    // symbolic multiplication
mkl_dcsrmultcsr(&transa, &request, &sort, &Am, &An, &Bn, Aval, AJ, AI, Bval, BJ, BI, Cval, CJ, CI, &nnzmax, &ierr);

request = 2;        //numeric multiplication
int Cnnz = CI[Am]-1;
int Cval_size = Cnnz + 1;
CJ = (MKL_INT*)mkl_malloc( Cval_size    *    sizeof( MKL_INT ), 64 );
Cval = (double*)mkl_malloc( Cval_size    *    sizeof( double ), 64 );
double time_st = dsecnd();
int i;
for(i = 0; i < nrepeat; i++) {
    mkl_dcsrmultcsr(&transa, &request, &sort, &Am, &An, &Bn, Aval, AJ, AI, Bval, BJ, BI, Cval, CJ, CI, &nnzmax, &ierr);
}
double time_end = dsecnd();
*time = (time_end - time_st)/nrepeat;
*pCval = Cval; *pCJ = CJ; *pCI = CI;
} /*}}}*/ 
void mkl_mic_spgemm(MKL_INT Am, MKL_INT An, MKL_INT Annz, double* Aval, MKL_INT* AJ, MKL_INT* AI, MKL_INT Bn, MKL_INT Bnnz, double* Bval, MKL_INT* BJ, MKL_INT* BI, double** pCval, MKL_INT** pCJ, MKL_INT** pCI, double* time) {/*{{{*/

	int i;
	#pragma offload target(mic:micdev) inout(nthreads)
	{
		if (nthreads > 0) {
			mkl_set_num_threads(nthreads);
		} else {
			nthreads = mkl_get_max_threads();
		}
	}  // printf("MIC started \n"); fflush(stdout);
	#pragma offload target(mic:micdev) \
		in(transa) \
		in(Am) \
		in(An) \
		in(Bn) \
		in(Aval:length(Annz)	free_if(0)) \
		in(AI:length(Am+1)	free_if(0)) \
		in(AJ:length(Annz)	free_if(0)) \
		in(Bval:length(Bnnz)	free_if(0)) \
		in(BI:length(An+1)	free_if(0)) \
		in(BJ:length(Bnnz)	free_if(0)) 
	{}
	

	//void mkl_dcsrmultcsr (char *transa, MKL_INT *job, MKL_INT *sort, MKL_INT *m, MKL_INT *n, MKL_INT *k, double *a, MKL_INT *ja, MKL_INT *ia, double *b, MKL_INT *jb, MKL_INT *ib, double *c, MKL_INT *jc, MKL_INT *ic, MKL_INT *nnzmax, MKL_INT *ierr);


//	double s_initial = dsecnd();
//	double s_elapsed = dsecnd() - s_initial; // seconds

	MKL_INT	Cnnz_host = -1;
	MKL_INT* CI = NULL;
	MKL_INT* CJ = NULL;
	double* Cval = NULL;
	MKL_INT nnzmax = 0;	// nnzmax is zero in case of symbolic&numeric usage of mkl_?csrmultcsr
	MKL_INT ierr;
	MKL_INT request = 2;	//numeric multiplication
	MKL_INT sort = 3;	// sort everything
	double time_mic_symbolic_mm = 0.0;
	#pragma offload target(mic:micdev) in(i) in(ierr) in(nnzmax) in(request) in(sort) in(transa) in(Am)  in(An) in(Bn) \
		out(time_mic_symbolic_mm) out(Cnnz_host)\
		in(Aval:length(Annz)	alloc_if(0)	free_if(0)) \
		in(AI:length(Am+1)	alloc_if(0)	free_if(0)) \
		in(AJ:length(Annz)	alloc_if(0)	free_if(0)) \
		in(Bval:length(Bnnz)	alloc_if(0)	free_if(0)) \
		in(BI:length(An+1)	alloc_if(0)	free_if(0)) \
		in(BJ:length(Bnnz)	alloc_if(0)	free_if(0)) \
		nocopy(CI:	alloc_if(0)	free_if(0)) \
		nocopy(CJ:	alloc_if(0)	free_if(0)) \
		nocopy(Cval:	alloc_if(0)	free_if(0))
	{
		CI = (MKL_INT*)mkl_malloc( (Am+2) * sizeof( MKL_INT ), 64 );
		MKL_INT nnzmax = 0;	// nnzmax is zero in case of symbolic&numeric usage of mkl_?csrmultcsr
		MKL_INT ierr;
		MKL_INT request = 1;	// symbolic multiplication
		//MKL_INT sort = 7;	// do not sort anything

		for(i = 0; i < 10; i++) {
			mkl_dcsrmultcsr(&transa, &request, &sort, &Am, &An, &Bn, Aval, AJ, AI, Bval, BJ, BI, Cval, CJ, CI, &nnzmax, &ierr);
		}	
		double s_initial = dsecnd();
		for(i = 0; i < nrepeat; i++) {
			mkl_dcsrmultcsr(&transa, &request, &sort, &Am, &An, &Bn, Aval, AJ, AI, Bval, BJ, BI, Cval, CJ, CI, &nnzmax, &ierr);
		}	
		time_mic_symbolic_mm = (dsecnd() - s_initial) / nrepeat; // seconds

		request = 2;		//numeric multiplication
		int Cnnz = CI[Am]-1;
		int Cval_size = Cnnz + 1; Cnnz_host = Cnnz;
		CJ		= (MKL_INT*)mkl_malloc( Cval_size	*	sizeof( MKL_INT ), 64 );
		Cval		= (double*)mkl_malloc( Cval_size	*	sizeof( double ), 64 );
		mkl_dcsrmultcsr(&transa, &request, &sort, &Am, &An, &An, Aval, AJ, AI, Bval, BJ, BI, Cval, CJ, CI, &nnzmax, &ierr);
		printmm_one(Am, Cval, CJ, CI);//printf("Cnnz=%d\n", Cnnz);
		sort = 7;		// do not sort anything
		request = 2;		// numeric multiplication
	}//printf("Cnnz_mic: %d\n", Cnnz_host);exit(-1);
	double time_mic_numeric_mm = 0.0;
	#pragma offload target(mic:micdev) nocopy(request) nocopy(sort) nocopy(i) nocopy(ierr) nocopy(nnzmax)  nocopy(transa) nocopy(Am) nocopy(An) nocopy(Bn) \
		out(time_mic_numeric_mm) \
		nocopy(Aval:length(Annz)	alloc_if(0)	free_if(0)) \
		nocopy(AI:length(Am+1)	alloc_if(0)	free_if(0)) \
		nocopy(AJ:length(Annz)	alloc_if(0)	free_if(0)) \
		nocopy(Bval:length(Bnnz)	alloc_if(0)	free_if(0)) \
		nocopy(BI:length(An+1)	alloc_if(0)	free_if(0)) \
		nocopy(BJ:length(Bnnz)	alloc_if(0)	free_if(0)) \
		nocopy(CI:	alloc_if(0)	free_if(0)) \
		nocopy(CJ:	alloc_if(0)	free_if(0)) \
		nocopy(Cval:	alloc_if(0)	free_if(0))
	{
		//sort = 7
		//request = 1; 2'ye gore sure iki katina cikiyor
		//request = 0; 2'ye gore sure uc katina cikiyor

		//request = 2
		//sort = 3; 7'ye gore %30 daha yavas
		//printf("sort:%d request:%d\n", sort, request); // prints sort:7 request:2
		for(i = 0; i < 10; i++) {
			mkl_dcsrmultcsr(&transa, &request, &sort, &Am, &An, &An, Aval, AJ, AI, Bval, BJ, BI, Cval, CJ, CI, &nnzmax, &ierr);
		}
		double s_initial = dsecnd();
		for(i = 0; i < nrepeat; i++) {
			mkl_dcsrmultcsr(&transa, &request, &sort, &Am, &An, &An, Aval, AJ, AI, Bval, BJ, BI, Cval, CJ, CI, &nnzmax, &ierr);
		}
		time_mic_numeric_mm = (dsecnd() - s_initial) / nrepeat; // seconds
	}
	if (0) {
	int nelm_CI = Am + 2; 
	int nelm_CJ = Cnnz_host + 1; 
	int nelm_Cval = Cnnz_host + 1; 
	__declspec(target(mic)) MKL_INT* CI_host = (MKL_INT*)mkl_malloc( (nelm_CI) * sizeof( MKL_INT ), 64 );
	__declspec(target(mic)) MKL_INT* CJ_host = (MKL_INT*)mkl_malloc( (nelm_CJ) * sizeof( MKL_INT ), 64 );
	__declspec(target(mic)) double* Cval_host = (double*)mkl_malloc( (nelm_Cval) * sizeof( double ), 64 );
	#pragma offload target(mic:micdev) in(nelm_CI)\ 
		inout(CI_host:length(nelm_CI)	alloc_if(1)	free_if(0)) \
		inout(CJ_host:length(nelm_CJ)	alloc_if(1)	free_if(0)) \
		inout(Cval_host:length(nelm_Cval)	alloc_if(1)	free_if(0)) \
		nocopy(CI:	alloc_if(0)	free_if(0)) \
		nocopy(CJ:	alloc_if(0)	free_if(0)) \
		nocopy(Cval:	alloc_if(0)	free_if(0))
	{
		int i;
		for(i = 0; i < nelm_CI; i++) CI_host[i] = CI[i];
		for(i = 0; i < nelm_CJ; i++) {CJ_host[i] = CJ[i]; Cval_host[i] = Cval[i];}
	}
	*pCval = Cval_host; *pCJ = CJ_host; *pCI = CI_host; //printf("Cnnz_host:%d\n", Cnnz_host);
	}
} /* ENDOF mkl_mic_spmm }}}*/

/** Read Matrix Market file into COO matrix */
void read_mm(char* strpath, int* pM, int* pN, int* prealnnz, int** pI, int** pJ, double** pval){ /*{{{*/
/* 
*   taken from Matrix Market I/O library for ANSI C
*
*   See http://math.nist.gov/MatrixMarket for details.
*
*
*/

int i, M, N, nz, *I, *J;
double* val;
int ret_code;
MM_typecode matcode;
FILE* f;
if ((f = fopen(strpath, "r")) == NULL) {fprintf(stderr, "Input matrix file %s cannot be opened to read.", strpath);exit(1);}
/* READ MATRIX */
if (mm_read_banner(f, &matcode) != 0) {
    printf("Could not process Matrix Market banner.\n");    exit(1);
}
/*  This is how one can screen matrix types if their application */
/*  only supports a subset of the Matrix Market data types.      */
if (mm_is_complex(matcode) && mm_is_matrix(matcode) && mm_is_sparse(matcode) ) {
    printf("Sorry, this application does not support ");
    printf("Market Market type: [%s]\n", mm_typecode_to_str(matcode));exit(1);
}
/* find out size of sparse matrix .... */
if ((ret_code = mm_read_mtx_crd_size(f, &M, &N, &nz)) !=0) exit(1);
/* reseve memory for matrices */
I = (int *) malloc((mm_is_symmetric(matcode) ? 2*nz : nz) * sizeof(int));
J = (int *) malloc((mm_is_symmetric(matcode) ? 2*nz : nz) * sizeof(int));
val = (double *) malloc((mm_is_symmetric(matcode) ? 2*nz : nz) * sizeof(double));
*pI = I;
*pJ = J;
*pval = val;
/* NOTE: when reading in doubles, ANSI C requires the use of the "l"  */
/*   specifier as in "%lg", "%lf", "%le", otherwise errors will occur */
/*  (ANSI C X3.159-1989, Sec. 4.9.6.2, p. 136 lines 13-15)            */
int realnnz = 0;
for (i=0; i<nz; i++) {
    if(mm_is_pattern(matcode)) {
        fscanf(f, "%d %d\n", &I[realnnz], &J[realnnz]);
        val[realnnz] = 1.0;
    }
    else
        fscanf(f, "%d %d %lg\n", &I[realnnz], &J[realnnz], &val[realnnz]);
    I[realnnz]--;  /* adjust from 1-based to 0-based */
        J[realnnz]--;
    if(mm_is_symmetric(matcode) && I[realnnz] != J[realnnz]) {
        I[realnnz+1] = J[realnnz];
        J[realnnz+1] = I[realnnz];
        val[realnnz+1] = val[realnnz];
        realnnz++;
    }
    realnnz++;
}
if (f !=stdin) fclose(f);
*pM = M;
*pN = N;
*prealnnz = realnnz;
} /* ENDOF read_mm }}}*/

/** Converts COO matrix to CSR matrix */
void coo_to_csr(int m, int nnz, int* I, int* J, double* val, MKL_INT* AI, MKL_INT* AJ, double* Aval) { /*{{{*/

MKL_INT info = 0;
MKL_INT job[8];   
//job[1]=0; // zero based indexing in csr
job[1]=1; // one based indexing in csr

job[2]=0; // zero based indexing in coo
job[3]=2; // I don't know
job[4]=nnz; // nnz

job[0]=1;  // coo to csr
job[5]=0;  // Acsr and AJR allocated by user
//void mkl_dcsrcoo (MKL_INT * job, MKL_INT * n, double *Acsr, MKL_INT * AJR, MKL_INT *AIR, MKL_INT * nnz, double *Acoo, MKL_INT * ir, MKL_INT * jc, MKL_INT * info);
mkl_dcsrcoo (job,&m, Aval, AJ, AI, &nnz, val, I, J, &info);
} /* ENDOF coo_to_csr }}}*/

int main(int argc, char* argv[]) { /*{{{*/
/** usage */
int nrequired_args = 7;
if (argc != nrequired_args){
    fprintf(stderr, "NAME:\n\tmkl_spgemm - multiply two sparse matrices\n");
    fprintf(stderr, "\nSYNOPSIS:\n");
    fprintf(stderr, "\tmkl_spgemm MATRIX_A MATRIX_B MATRIX_C NUMBER_OF_THREADS MIC_ID PRINT_MATRICES\n");
    fprintf(stderr, "\nDESCRIPTION:\n");
    fprintf(stderr, "\tNUMBER_OF_THREADS: {0,1,2,...}\n");
    fprintf(stderr, "\t\t0: Use number of threads determined by MKL\n");
    fprintf(stderr, "\tPRINT_MATRICES: PRINT_YES, PRINT_NO\n");
    fprintf(stderr, "\tMIC_ID: {0,1,2,...}\n");
    fprintf(stderr, "\t\t0: The device ID\n");
    fprintf(stderr, "\nSAMPLE EXECUTION:\n");
    fprintf(stderr, "\texport OFFLOAD_INIT=on_offload;%s test.mtx test.mtx out.mtx 2 0 PRINT_YES\n", argv[0]);
    exit(1);
}
/** parse arguments */
int iarg = 1;
char* strpathA = argv[iarg];    iarg++;
char* strpathB = argv[iarg];    iarg++;
char* strpathC = argv[iarg];    iarg++;
nthreads = atoi(argv[iarg]);    iarg++;
if (nthreads > 0) {
    mkl_set_num_threads(nthreads);
} else {
    nthreads = mkl_get_max_threads();
}
micdev = atoi(argv[iarg]);    iarg++;
option_print_matrices = strcmp(argv[iarg], "PRINT_YES")==0?OPTION_PRINT_MATRICES:OPTION_NOPRINT_MATRICES;    iarg++;
assert(nrequired_args == iarg);

/** read matrix market file for A matrix */
MKL_INT Am, An, Annz;
MKL_INT *Ax, *Ay;  
double *Anz;
read_mm(strpathA, &Am, &An, &Annz, &Ax, &Ay, &Anz);

/** construct csr storage for A matrix */
MKL_INT* AJ = (MKL_INT*) mkl_malloc( Annz * sizeof( MKL_INT ), 64 );
MKL_INT* AI = (MKL_INT*) mkl_malloc( (Am+1) * sizeof( MKL_INT ), 64 );
double* Aval = (double*) mkl_malloc( Annz * sizeof( double ),  64 );
coo_to_csr(Am, Annz, Ax, Ay, Anz, AI, AJ, Aval);    

/** read matrix market file for B matrix */
MKL_INT Bm, Bn, Bnnz;   
MKL_INT *Bx, *By;  
double *Bnz;
read_mm(strpathB, &Bm, &Bn, &Bnnz, &Bx, &By, &Bnz);

/** construct csr storage for B matrix */
MKL_INT* BJ = (MKL_INT*) mkl_malloc( Bnnz * sizeof( MKL_INT ), 64 );
MKL_INT* BI = (MKL_INT*) mkl_malloc( (Bm+1) * sizeof( MKL_INT ), 64 );
double* Bval = (double*) mkl_malloc( Bnnz * sizeof( double ),  64 );
coo_to_csr(Bm, Bnnz, Bx, By, Bnz, BI, BJ, Bval);    

/** multiply two matrices */
double* Cval, time; 
MKL_INT* CJ; MKL_INT* CI;
mkl_mic_spgemm(Am, An, Annz, Aval, AJ, AI, Bn, Bnnz, Bval, BJ, BI, &Cval, &CJ, &CI, &time);

int i;
for(i=0;i<=Am;i++)AI[i]--;    for(i=0;i<Annz;i++)AJ[i]--;
for(i=0;i<=Bm;i++)BI[i]--;    for(i=0;i<Bnnz;i++)BJ[i]--;
printmm_one(Am, Cval, CJ, CI);

/** In order to write the output C matrix to file, uncomment the following line */
//printfilemm_one(strpathC, Am, Bn, Cval, CJ, CI);

/** run my SpGEMM routine in order to find number of multiply-and-add operations */
long nummult = 0;
csi* xb = (csi*)calloc(Bn, sizeof(csi));
csv* x = (csv*)calloc(Bn, sizeof(csv));
csr* C = csr_multiply(Am, An, Annz, AI, AJ, Aval, Bm, Bn, Bnnz, BI, BJ, Bval, &nummult, xb, x);
double gflop = 2 * (double) nummult / 1e9;

/** print gflop per second and time */
printf("%d\t", nthreads);
printf("%g\t", (gflop/time));
printf("%g\t", time);
printf("%s\t", strpathA);
printf("%s\t", strpathB);
printf("%s\t", strpathC);
printf("\n");

/** free allocated space */
mkl_free(AI);
mkl_free(AJ);
mkl_free(Aval);
mkl_free(BI);
mkl_free(BJ);
mkl_free(Bval);
return 0;
} /* ENDOF main }}}*/

/** Prints matrix in CSR format */
void printmm_one(int m, double* Aval, int* AJ, int* AI){ //{{{

    if (option_print_matrices == OPTION_NOPRINT_MATRICES)
        return;
    int i;
    for(i = 0; i < m; i++) {
        printf("%d: ", i+1);
        int j;
        for(j = AI[i]-1; j < AI[i+1]-1; j++) {
            printf("%d:%g  ", AJ[j], Aval[j]);
        }
        printf("\n");
    }
    printf("\n");
}//}}}

/** Writes matrix in CSR format in to a file using Matrix Market format */
void printfilemm_one(char* file, int m, int n, double* Aval, int* AJ, int* AI){//{{{

    FILE* f = fopen(file, "w");
    if(f == NULL){
        printf("%s %s %d: %s cannot be opened to write matrix\n", __FILE__, __PRETTY_FUNCTION__, __LINE__, file);
        exit(1); 
    }
    int i;
    fprintf(f, "%%%%MatrixMarket matrix coordinate real general\n");
    fprintf(f, "%d %d %d\n", m, n, AI[m]-1);
    for(i = 0; i < m; i++) {
        int j;
        for(j = AI[i]-1; j < AI[i+1]-1; j++) {
            fprintf(f, "%d %d %g\n", i+1, AJ[j], Aval[j]);
        }
    }
    fclose(f);
}//}}}