/*includes {{{*/ #include #include #include #include "mkl.h" #include "mkl_types.h" #include "mkl_spblas.h" #include "commonmm.h" #include "mmio.h" /*}}}*/ /* global varibles and defines {{{*/ int option_host; #define OPTION_HOST_CPU -1 #define OPTION_HOST_MIC 0 int micdev; __declspec(target(mic)) int max_threads = 0; __declspec(target(mic)) int nworker = 0; __declspec(target(mic)) int num_repeat; __declspec(target(mic)) int option_print_matrices = 0; #define OPTION_NOPRINT_MATRICES 0 #define OPTION_PRINT_MATRICES 1 __declspec(target(mic)) int option_printfile_matrices = 0; #define OPTION_NOPRINTFILE_MATRICES 0 #define OPTION_PRINTFILE_MATRICES 1 __declspec(target(mic)) char transa = 'n'; __declspec(target(mic)) double time_mic_numeric_mm; __declspec(target(mic)) double time_mic_symbolic_mm; /*}}}*/ /* method declarations {{{*/ int main(int argc, char* argv[]); __declspec(target(mic)) void printmm_one(int m, double* Aval, int* AJ, int* AI); __declspec(target(mic)) void printfilemm_one(char* file, int m, int n, double* Aval, int* AJ, int* AI); void printmm_zero(int m, double* Aval, int* AJ, int* AI); /*}}}*/ /*csr util{{{*/ //#include "../../csr/sspmxm.h" #define CS_MAX(a,b) (((a) > (b)) ? (a) : (b)) csr *csr_spfree(csr *A); /* free workspace and return a sparse matrix result */ csr *csr_done(csr *C, void *w, void *x, csi ok) { //#ifdef KADEBUG // printf("%d:%s:%s():%d, %s(%lu,%lu,%lu)\n", me, __FILE__, __FUNCTION__, // __LINE__, C->name, C->m, C->n, C->nzmax); //#endif // cs_free(w); /* free workspace */ // cs_free(x); return(ok ? C : csr_spfree(C)); /* return result if OK, else free it */ } /* wrapper for free */ void *cs_free(void *p) { if (p) free(p); /* free p if it is not already NULL */ return(NULL); /* return NULL to simplify the use of cs_free */ } /* wrapper for realloc */ void *csr_realloc(void *p, csi n, size_t size, csi *ok) { void *pnew = NULL; pnew = realloc(p, CS_MAX(n, 1) * size); /* realloc the block */ *ok = (pnew != NULL); /* realloc fails if pnew is NULL */ if (pnew == NULL) { printf("%d:reallocation failed, pnew is NULL\n", __LINE__); } //printf("%s:%d: n=%d ok=%d\n", __FUNCTION__, __LINE__, n, *ok); return((*ok) ? pnew : p); /* return original p if failure */ } /* wrapper for realloc */ void *cs_realloc(void *p, csi n, size_t size, csi *ok) { void *pnew = NULL; pnew = realloc(p, CS_MAX(n, 1) * size); /* realloc the block */ *ok = (pnew != NULL); /* realloc fails if pnew is NULL */ if (pnew == NULL) { printf("reallocation failed\n"); } return((*ok) ? pnew : p); /* return original p if failure */ } /* change the max # of entries sparse matrix */ csi csr_sprealloc(csr *A, csi nzmax) { //printf("%d: nzmax=%d\n", __LINE__, nzmax); csi ok, oki = 0, okj = 1, okx = 1; if (!A) return(0); if (nzmax <= 0) nzmax = A->p[A->m]; A->j = (int*)csr_realloc(A->j, nzmax, sizeof(csi), &oki); if (A->x) A->x = (csv*)csr_realloc(A->x, nzmax, sizeof(csv), &okx); //printf("%d: ok=%d oki=%d okj=%d okx=%d\n", __LINE__, ok, oki, okj, okx); ok = (oki && okj && okx); //printf("%d: ok=%d oki=%d okj=%d okx=%d\n", __LINE__, ok, oki, okj, okx); //printf("%s:%d: A->nzmax=%d nzmax=%d\n", __FUNCTION__, __LINE__, A->nzmax, nzmax); if (ok) A->nzmax = nzmax; //printf("%s:%d: A->nzmax=%d nzmax=%d\n", __FUNCTION__, __LINE__, A->nzmax, nzmax); return(ok); } /* free a sparse matrix */ csr *csr_spfree(csr *A) { if (!A) return(NULL); /* do nothing if A already NULL */ cs_free(A->p); A->p = NULL; cs_free(A->j); A->j = NULL; cs_free(A->x); A->x = NULL; cs_free(A->r); A->r = NULL; // cs_free(A->name); cs_free(A); /* free the cs struct and return NULL */ return NULL; } /* allocate a sparse matrix (triplet form or compressed-ROW form) */ csr *csr_spalloc(csi m, csi n, csi nzmax, int values, int triplet, csv f) // floatType f is to suppress compile error { //printf("m=%d n=%d nzmax=%d\n", m, n, nzmax); csr* A = (csr*)calloc(1, sizeof(csr)); /* allocate the cs struct */ // A->name = calloc(MTX_NAME_LEN, sizeof(char)); // sprintf(A->name, "N/A"); if (!A) { perror("sparse allocation failed"); return(NULL); /* out of memory */ } A->m = m; /* define dimensions and nzmax */ A->n = n; A->nzmax = nzmax = CS_MAX(nzmax, 0); //printf("A m=%d n=%d nzmax=%d\n", A->m, A->n, A->nzmax); A->nr = 0; // number of nonzero rows A->p = (csi*)calloc(m + 2, sizeof(csi)); A->j = (csi*)calloc(CS_MAX(nzmax,1), sizeof(csi)); A->x = (csv*)calloc(CS_MAX(nzmax,1), sizeof(csv)); return((!A->p || !A->j || !A->x) ? csr_spfree(A) : A); }/*}}}*/ /*csr_multiply{{{*/ /* C = A*B If nonzero counts of matrices are zero or number of rows/cols is zero, it is advised not to call this function. */ // run 8 CUT 0.1 NNZ MTX ~/matrices/nlpkkt200.mtx ~/matrices/nlpkkt200.mtx tmp stdout MNAC SERIAL csr *csr_multiply(csi Am, csi An, csi Anzmax, const csi* Ap, const csi* Aj, const csv* Ax, csi Bm, csi Bn, csi Bnzmax, const csi* Bp, const csi* Bj, const csv* Bx, long* nummult, csi* xb, csv* x) { csv tf = 0; csi p, jp, j, kp, k, i, nz = 0, anz, *Cp, *Cj, m, n, bnz, values = 1; csv *Cx; csr *C; //printf("%s:%d\n", __FILE__, __LINE__); if (An != Bm) return(NULL); //printf("%s:%d\n", __FILE__, __LINE__); if (Anzmax == 0 || Bnzmax == 0) { C = csr_spalloc(Am, Bn, 0, values, 0, tf); /*#ifdef KADEBUG printf("%d:%s:%s():%d, A(%s,%lu,%lu,%lu), B(%s,%lu,%lu,%lu), \ One of matrice is Zero! C(%lu,%lu,%lu)\n", me, __FILE__, __FUNCTION__, __LINE__, A->name, A->m, A->n, A->nzmax, B->name, B->m, B->n, B->nzmax, C->m, C->n, C->nzmax); #endif*/ return C; } // printf("%s:%d\n", __FILE__, __LINE__); m = Am; anz = Ap[Am]; n = Bn; bnz = Bp[Bm]; for(i = 0; i < n; i++) xb[i] = 0; //csi* xb = ka_calloc(n, sizeof(csi)); /* get workspace */ for(i = 0; i < n; i++) xb[i] = 0; values = (Ax != NULL) && (Bx != NULL); //csv* x = values ? ka_calloc(n, sizeof(csv)) : NULL; /* get workspace */ csi tnz = (anz + bnz) * 2; //printf("A->m=%d B->n=%d tnz=%d\n", A->m, B->n, tnz); C = csr_spalloc(m, n, tnz, values, 0, tf); /* allocate result */ // sprintf(C->name, "C=(%s)*(%s)", A->name, B->name); /*#ifdef KADEBUG printf("%d:%s:%s():%d, A(%s,%lu,%lu,%lu), B(%s,%lu,%lu,%lu), \ allocated C(%lu,%lu,%lu)\n", me, __FILE__, __FUNCTION__, __LINE__, A->name, A->m, A->n, A->nzmax, B->name, B->m, B->n, B->nzmax, C->m, C->n, C->nzmax); #endif*/ if (!C || !xb || (values && !x)) return (csr_done(C, xb, x, 0)); // csi zero = 0; // for(i = 0; i < n; i++) // xb[i] = zero; Cp = C->p; //csi* oldj = C->j; //printf("C->m=%d C->n=%d C->nzmax=%d C->j[0]=%d\n", C->m, C->n, C->nzmax, C->j[0]); for (i = 0; i < m; i++) { //C->j != oldj ? printf("Changed old=%u new=%u\n", oldj, C->j):0; if ( ( (nz + n) > C->nzmax ) ) { if(!csr_sprealloc(C, (2 * (C->nzmax) + n) ) ) { /*#ifdef KADEBUG printf("%d:%s:%s():%d, A(%s,%lu,%lu,%lu), B(%s,%lu,%lu,%lu), CANNOT BE\ enlargened C(%lu,%lu,%lu)\n", me, __FILE__, __FUNCTION__, __LINE__, A->name, A->m, A->n, A->nzmax, B->name, B->m, B->n, B->nzmax, C->m, C->n, C->nzmax); #endif */ return (csr_done(C, xb, x, 0)); // out of memory } else { /*#ifdef KADEBUG printf("%d:%s:%s():%d, A(%s,%lu,%lu,%lu), B(%s,%lu,%lu,%lu), enlargened \ C(%lu,%lu,%lu)\n", me, __FILE__, __FUNCTION__, __LINE__, A->name, A->m, A->n, A->nzmax, B->name, B->m, B->n, B->nzmax, C->m, C->n, C->nzmax); #endif */ } //printf("After expansion, C->nzmax=%d\n", C->nzmax); } Cj = C->j; Cx = C->x; /* C->j and C->x may be reallocated */ //printf("i=%d C->j[0]=%d\n", i, C->j[0]); Cp[i] = nz; /* row i of C starts here */ for (jp = Ap[i]; jp < Ap[i + 1]; jp++) { j = Aj[jp]; for (kp = Bp[j]; kp < Bp[j + 1]; kp++) { k = Bj[kp]; /* B(i,j) is nonzero */ //if(k > n || k < 0)printf("k=%d kp=%d n=%d: %d\n", k, kp, n,__LINE__); if (xb[k] != i + 1) { xb[k] = i + 1; /* i is new entry in column j */ // if(nz > C->nzmax) { // printf("%d: Error nz>nzmax %d>%d\n", __LINE__, nz, C->nzmax); // } //if(nz > C->nzmax || nz < 0)printf("nz: %d\n", __LINE__); Cj[nz++] = k; /* add i to pattern of C(:,j) */ if (x) { // C->j != oldj ? printf("B Changed old=%u new=%u\n", oldj, C->j):0; x[k] = Ax[jp] * Bx[kp]; /* x(i) = beta*A(i,j) */ // C->j != oldj ? printf("A Changed old=%u new=%u\n", oldj, C->j):0; (*nummult)++; } } else if (x) { x[k] += (Ax[jp] * Bx[kp]); /* i exists in C(:,j) already */ (*nummult)++; } } } if (values) for (p = Cp[i]; p < nz; p++) Cx[p] = x[Cj[p]]; } // printf("%s:%d nz=%d\n", __FILE__, __LINE__, nz); Cp[m] = nz; /* finalize the last row of C */ csr_sprealloc(C, 0); /* remove extra space from C */ /*#ifdef KADEBUG printf("%d:%s:%s():%d, A(%s,%lu,%lu,%lu), B(%s,%lu,%lu,%lu), trimmed \ C(%lu,%lu,%lu)\n", me, __FILE__, __FUNCTION__, __LINE__, A->name, A->m, A->n, A->nzmax, B->name, B->m, B->n, B->nzmax, C->m, C->n, C->nzmax); #endif*/ // cs_free(xb); xb = NULL; // cs_free(x); x = NULL; return C;//(csr_done(C, xb, x, 1)); /* success; free workspace, return C */ }/*}}}*/ void mkl_cpu_spmm(MKL_INT Am, MKL_INT An, MKL_INT Annz, double* Aval, MKL_INT* AJ, MKL_INT* AI, /*{{{*/ MKL_INT Bn, MKL_INT Bnnz, double* Bval, MKL_INT* BJ, MKL_INT* BI, double** pCval, MKL_INT** pCJ, MKL_INT** pCI) { MKL_INT* CJ = NULL; double* Cval = NULL; MKL_INT sort = 3; // sort everything MKL_INT* CI = (MKL_INT*)mkl_malloc( (Am+2) * sizeof( MKL_INT ), 64 ); MKL_INT nnzmax = 0; // nnzmax is zero in case of symbolic&numeric usage of mkl_?csrmultcsr MKL_INT ierr; MKL_INT request = 1; // symbolic multiplication mkl_dcsrmultcsr(&transa, &request, &sort, &Am, &An, &Bn, Aval, AJ, AI, Bval, BJ, BI, Cval, CJ, CI, &nnzmax, &ierr); request = 2; //numeric multiplication int Cnnz = CI[Am]-1; int Cval_size = Cnnz + 1; CJ = (MKL_INT*)mkl_malloc( Cval_size * sizeof( MKL_INT ), 64 ); Cval = (double*)mkl_malloc( Cval_size * sizeof( double ), 64 ); mkl_dcsrmultcsr(&transa, &request, &sort, &Am, &An, &Bn, Aval, AJ, AI, Bval, BJ, BI, Cval, CJ, CI, &nnzmax, &ierr); //printmm_one(Am, Cval, CJ, CI);//printf("Cnnz=%d\n", Cnnz); *pCval = Cval; *pCJ = CJ; *pCI = CI; //printf("Cnnz_host:%d\n", Cnnz); } /* ENDOF mkl_cpu_spmm }}}*/ void mkl_mic_spmm(MKL_INT Am, MKL_INT An, MKL_INT Annz, double* Aval, MKL_INT* AJ, MKL_INT* AI,/*{{{*/ MKL_INT Bn, MKL_INT Bnnz, double* Bval, MKL_INT* BJ, MKL_INT* BI, double** pCval, MKL_INT** pCJ, MKL_INT** pCI) { int i; #pragma offload target(mic:micdev) inout(max_threads) { max_threads = mkl_get_max_threads(); mkl_set_num_threads(nworker); } // printf("MIC started \n"); fflush(stdout); #pragma offload target(mic:micdev) \ in(transa) \ in(Am) \ in(An) \ in(Bn) \ in(Aval:length(Annz) free_if(0)) \ in(AI:length(Am+1) free_if(0)) \ in(AJ:length(Annz) free_if(0)) \ in(Bval:length(Bnnz) free_if(0)) \ in(BI:length(An+1) free_if(0)) \ in(BJ:length(Bnnz) free_if(0)) {} //void mkl_dcsrmultcsr (char *transa, MKL_INT *job, MKL_INT *sort, MKL_INT *m, MKL_INT *n, MKL_INT *k, double *a, MKL_INT *ja, MKL_INT *ia, double *b, MKL_INT *jb, MKL_INT *ib, double *c, MKL_INT *jc, MKL_INT *ic, MKL_INT *nnzmax, MKL_INT *ierr); // double s_initial = dsecnd(); // double s_elapsed = dsecnd() - s_initial; // seconds MKL_INT Cnnz_host = -1; MKL_INT* CI = NULL; MKL_INT* CJ = NULL; double* Cval = NULL; MKL_INT nnzmax = 0; // nnzmax is zero in case of symbolic&numeric usage of mkl_?csrmultcsr MKL_INT ierr; MKL_INT request = 2; //numeric multiplication MKL_INT sort = 3; // sort everything time_mic_symbolic_mm = 0.0; #pragma offload target(mic:micdev) in(i) in(ierr) in(nnzmax) in(request) in(sort) in(transa) in(Am) in(An) in(Bn) \ out(time_mic_symbolic_mm) out(Cnnz_host)\ in(Aval:length(Annz) alloc_if(0) free_if(0)) \ in(AI:length(Am+1) alloc_if(0) free_if(0)) \ in(AJ:length(Annz) alloc_if(0) free_if(0)) \ in(Bval:length(Bnnz) alloc_if(0) free_if(0)) \ in(BI:length(An+1) alloc_if(0) free_if(0)) \ in(BJ:length(Bnnz) alloc_if(0) free_if(0)) \ nocopy(CI: alloc_if(0) free_if(0)) \ nocopy(CJ: alloc_if(0) free_if(0)) \ nocopy(Cval: alloc_if(0) free_if(0)) { CI = (MKL_INT*)mkl_malloc( (Am+2) * sizeof( MKL_INT ), 64 ); MKL_INT nnzmax = 0; // nnzmax is zero in case of symbolic&numeric usage of mkl_?csrmultcsr MKL_INT ierr; MKL_INT request = 1; // symbolic multiplication //MKL_INT sort = 7; // do not sort anything for(i = 0; i < 10; i++) { mkl_dcsrmultcsr(&transa, &request, &sort, &Am, &An, &Bn, Aval, AJ, AI, Bval, BJ, BI, Cval, CJ, CI, &nnzmax, &ierr); } double s_initial = dsecnd(); for(i = 0; i < num_repeat; i++) { mkl_dcsrmultcsr(&transa, &request, &sort, &Am, &An, &Bn, Aval, AJ, AI, Bval, BJ, BI, Cval, CJ, CI, &nnzmax, &ierr); } time_mic_symbolic_mm = (dsecnd() - s_initial) / num_repeat; // seconds request = 2; //numeric multiplication int Cnnz = CI[Am]-1; int Cval_size = Cnnz + 1; Cnnz_host = Cnnz; CJ = (MKL_INT*)mkl_malloc( Cval_size * sizeof( MKL_INT ), 64 ); Cval = (double*)mkl_malloc( Cval_size * sizeof( double ), 64 ); mkl_dcsrmultcsr(&transa, &request, &sort, &Am, &An, &An, Aval, AJ, AI, Bval, BJ, BI, Cval, CJ, CI, &nnzmax, &ierr); printmm_one(Am, Cval, CJ, CI);//printf("Cnnz=%d\n", Cnnz); sort = 7; // do not sort anything request = 2; // numeric multiplication }//printf("Cnnz_mic: %d\n", Cnnz_host);exit(-1); time_mic_numeric_mm = 0.0; #pragma offload target(mic:micdev) nocopy(request) nocopy(sort) nocopy(i) nocopy(ierr) nocopy(nnzmax) nocopy(transa) nocopy(Am) nocopy(An) nocopy(Bn) \ out(time_mic_numeric_mm) \ nocopy(Aval:length(Annz) alloc_if(0) free_if(0)) \ nocopy(AI:length(Am+1) alloc_if(0) free_if(0)) \ nocopy(AJ:length(Annz) alloc_if(0) free_if(0)) \ nocopy(Bval:length(Bnnz) alloc_if(0) free_if(0)) \ nocopy(BI:length(An+1) alloc_if(0) free_if(0)) \ nocopy(BJ:length(Bnnz) alloc_if(0) free_if(0)) \ nocopy(CI: alloc_if(0) free_if(0)) \ nocopy(CJ: alloc_if(0) free_if(0)) \ nocopy(Cval: alloc_if(0) free_if(0)) { //sort = 7 //request = 1; 2'ye gore sure iki katina cikiyor //request = 0; 2'ye gore sure uc katina cikiyor //request = 2 //sort = 3; 7'ye gore %30 daha yavas //printf("sort:%d request:%d\n", sort, request); // prints sort:7 request:2 for(i = 0; i < 10; i++) { mkl_dcsrmultcsr(&transa, &request, &sort, &Am, &An, &An, Aval, AJ, AI, Bval, BJ, BI, Cval, CJ, CI, &nnzmax, &ierr); } double s_initial = dsecnd(); for(i = 0; i < num_repeat; i++) { mkl_dcsrmultcsr(&transa, &request, &sort, &Am, &An, &An, Aval, AJ, AI, Bval, BJ, BI, Cval, CJ, CI, &nnzmax, &ierr); } time_mic_numeric_mm = (dsecnd() - s_initial) / num_repeat; // seconds } if (option_printfile_matrices == OPTION_PRINTFILE_MATRICES) { int nelm_CI = Am + 2; int nelm_CJ = Cnnz_host + 1; int nelm_Cval = Cnnz_host + 1; __declspec(target(mic)) MKL_INT* CI_host = (MKL_INT*)mkl_malloc( (nelm_CI) * sizeof( MKL_INT ), 64 ); __declspec(target(mic)) MKL_INT* CJ_host = (MKL_INT*)mkl_malloc( (nelm_CJ) * sizeof( MKL_INT ), 64 ); __declspec(target(mic)) double* Cval_host = (double*)mkl_malloc( (nelm_Cval) * sizeof( double ), 64 ); #pragma offload target(mic:micdev) in(nelm_CI)\ inout(CI_host:length(nelm_CI) alloc_if(1) free_if(0)) \ inout(CJ_host:length(nelm_CJ) alloc_if(1) free_if(0)) \ inout(Cval_host:length(nelm_Cval) alloc_if(1) free_if(0)) \ nocopy(CI: alloc_if(0) free_if(0)) \ nocopy(CJ: alloc_if(0) free_if(0)) \ nocopy(Cval: alloc_if(0) free_if(0)) { int i; for(i = 0; i < nelm_CI; i++) CI_host[i] = CI[i]; for(i = 0; i < nelm_CJ; i++) {CJ_host[i] = CJ[i]; Cval_host[i] = Cval[i];} } *pCval = Cval_host; *pCJ = CJ_host; *pCI = CI_host; //printf("Cnnz_host:%d\n", Cnnz_host); } } /* ENDOF mkl_mic_spmm }}}*/ void read_mm(char* strpath, int* pM, int* pN, int* prealnnz, int** pI, int** pJ, double** pval){ /*{{{*/ int i, M, N, nz, *I, *J; double* val; int ret_code; MM_typecode matcode; FILE* f; if ((f = fopen(strpath, "r")) == NULL) {fprintf(stderr, "Input matrix file %s cannot be opened to read.", strpath);exit(1);} /* READ MATRIX */ if (mm_read_banner(f, &matcode) != 0) { printf("Could not process Matrix Market banner.\n"); exit(1); } /* This is how one can screen matrix types if their application */ /* only supports a subset of the Matrix Market data types. */ if (mm_is_complex(matcode) && mm_is_matrix(matcode) && mm_is_sparse(matcode) ) { printf("Sorry, this application does not support "); printf("Market Market type: [%s]\n", mm_typecode_to_str(matcode));exit(1); } /* find out size of sparse matrix .... */ if ((ret_code = mm_read_mtx_crd_size(f, &M, &N, &nz)) !=0) exit(1); /* reseve memory for matrices */ I = (int *) malloc((mm_is_symmetric(matcode) ? 2*nz : nz) * sizeof(int)); J = (int *) malloc((mm_is_symmetric(matcode) ? 2*nz : nz) * sizeof(int)); val = (double *) malloc((mm_is_symmetric(matcode) ? 2*nz : nz) * sizeof(double)); *pI = I; *pJ = J; *pval = val; /* NOTE: when reading in doubles, ANSI C requires the use of the "l" */ /* specifier as in "%lg", "%lf", "%le", otherwise errors will occur */ /* (ANSI C X3.159-1989, Sec. 4.9.6.2, p. 136 lines 13-15) */ int realnnz = 0; for (i=0; imaxdiff){ printf("%g\t%g\t%g\n", diff, vecout[i], vecoutcpu[i]); } } printf("\n"); *///}}}