diff --git a/common/csr.h b/common/csr.h
new file mode 100644
index 0000000000000000000000000000000000000000..6c919f8a7622fff6c19fa74bf93d2938393097b3
--- /dev/null
+++ b/common/csr.h
@@ -0,0 +1,173 @@
+#ifndef CSR_H
+#define CSR_H
+
+typedef int csi;
+typedef double csv;
+csv zero = 0.0;
+#define CS_MAX(a,b) (((a) > (b)) ? (a) : (b))
+
+typedef struct csr_t {
+    csi m;
+    csi n;
+    csi nzmax;
+    csi nr;
+    csi* r;
+    csi* p;
+    csi* j;
+    csv* x;
+} csr;
+/* method declarations {{{*/
+csr *csr_spfree(csr *A);
+/*}}}*/
+
+/*csr util{{{*/
+
+/* free workspace and return a sparse matrix result */
+csr *csr_done(csr *C, void *w, void *x, csi ok) {
+    return (ok ? C : csr_spfree(C)); /* return result if OK, else free it */
+}
+
+/* wrapper for free */
+void *cs_free(void *p) {
+    if (p)
+        free(p); /* free p if it is not already NULL */
+    return (NULL); /* return NULL to simplify the use of cs_free */
+}
+
+/* wrapper for realloc */
+void *csr_realloc(void *p, csi n, size_t size, csi *ok) {
+    void *pnew = NULL;
+    pnew = realloc(p, CS_MAX(n, 1) * size); /* realloc the block */
+    *ok = (pnew != NULL); /* realloc fails if pnew is NULL */
+    if (pnew == NULL) {
+        printf("%d:reallocation failed, pnew is NULL\n", __LINE__);
+    }
+    return ((*ok) ? pnew : p); /* return original p if failure */
+}
+
+/* wrapper for realloc */
+void *cs_realloc(void *p, csi n, size_t size, csi *ok) {
+    void *pnew = NULL;
+    pnew = realloc(p, CS_MAX(n, 1) * size); /* realloc the block */
+    *ok = (pnew != NULL); /* realloc fails if pnew is NULL */
+    if (pnew == NULL) {
+        printf("reallocation failed\n");
+    }
+    return ((*ok) ? pnew : p); /* return original p if failure */
+}
+
+/* change the max # of entries sparse matrix */
+csi csr_sprealloc(csr *A, csi nzmax) {
+    csi ok, oki = 0, okj = 1, okx = 1;
+    if (!A)
+        return (0);
+    if (nzmax <= 0)
+        nzmax = A->p[A->m];
+    A->j = (int*) csr_realloc(A->j, nzmax, sizeof (csi), &oki);
+    if (A->x)
+        A->x = (csv*) csr_realloc(A->x, nzmax, sizeof (csv), &okx);
+    ok = (oki && okj && okx);
+    if (ok)
+        A->nzmax = nzmax;
+    return (ok);
+}
+
+/* free a sparse matrix */
+csr *csr_spfree(csr *A) {
+    if (!A)
+        return (NULL); /* do nothing if A already NULL */
+    cs_free(A->p);
+    A->p = NULL;
+    cs_free(A->j);
+    A->j = NULL;
+    cs_free(A->x);
+    A->x = NULL;
+    cs_free(A->r);
+    A->r = NULL;
+    cs_free(A); /* free the cs struct and return NULL */
+    return NULL;
+}
+
+/* allocate a sparse matrix (triplet form or compressed-ROW form) */
+csr *csr_spalloc(csi m, csi n, csi nzmax, int values, int triplet, csv f) {
+    csr* A = (csr*) calloc(1, sizeof (csr)); /* allocate the cs struct */
+    if (!A) {
+        perror("sparse allocation failed");
+        return (NULL); /* out of memory */
+    }
+    A->m = m; /* define dimensions and nzmax */
+    A->n = n;
+    A->nzmax = nzmax = CS_MAX(nzmax, 0);
+    A->nr = 0; // number of nonzero rows
+    A->p = (csi*) calloc(m + 2, sizeof (csi));
+    A->j = (csi*) calloc(CS_MAX(nzmax, 1), sizeof (csi));
+    A->x = (csv*) calloc(CS_MAX(nzmax, 1), sizeof (csv));
+    return ((!A->p || !A->j || !A->x) ? csr_spfree(A) : A);
+}/*}}}*/
+
+/** Multiply two sparse matrices which are stored in CSR format. MKL is used */
+csr *csr_multiply(csi Am, csi An, csi Anzmax, const csi* Ap, const csi* Aj, const csv* Ax, csi Bm, csi Bn, csi Bnzmax, const csi* Bp, const csi* Bj, const csv* Bx, long* nummult, csi* xb, csv* x) { /*{{{*/
+    csv tf = 0;
+    csi p, jp, j, kp, k, i, nz = 0, anz, *Cp, *Cj, m, n,
+            bnz, values = 1;
+    csv *Cx;
+    csr *C;
+    if (An != Bm)
+        return (NULL);
+    if (Anzmax == 0 || Bnzmax == 0) {
+        C = csr_spalloc(Am, Bn, 0, values, 0, tf);
+        return C;
+    }
+    m = Am;
+    anz = Ap[Am];
+    n = Bn;
+    bnz = Bp[Bm];
+    for (i = 0; i < n; i++) xb[i] = 0;
+    for (i = 0; i < n; i++)
+        xb[i] = 0;
+    values = (Ax != NULL) && (Bx != NULL);
+    csi tnz = (anz + bnz) * 2;
+    C = csr_spalloc(m, n, tnz, values, 0, tf); /* allocate result */
+    if (!C || !xb || (values && !x))
+        return (csr_done(C, xb, x, 0));
+    Cp = C->p;
+    for (i = 0; i < m; i++) {
+        if (((nz + n) > C->nzmax)) {
+            if (!csr_sprealloc(C, (2 * (C->nzmax) + n))) {
+                return (csr_done(C, xb, x, 0)); // out of memory
+            } else {
+            }
+        }
+        Cj = C->j;
+        Cx = C->x; /* C->j and C->x may be reallocated */
+        Cp[i] = nz; /* row i of C starts here */
+        for (jp = Ap[i]; jp < Ap[i + 1]; jp++) {
+            j = Aj[jp];
+            for (kp = Bp[j]; kp < Bp[j + 1]; kp++) {
+                k = Bj[kp]; /* B(i,j) is nonzero */
+                if (xb[k] != i + 1) {
+                    xb[k] = i + 1; /* i is new entry in column j */
+                    Cj[nz++] = k; /* add i to pattern of C(:,j) */
+                    if (x) {
+                        x[k] = Ax[jp] * Bx[kp]; /* x(i) = beta*A(i,j) */
+                        (*nummult)++;
+                    }
+                } else if (x) {
+                    x[k] += (Ax[jp] * Bx[kp]); /* i exists in C(:,j) already */
+                    (*nummult)++;
+                }
+            }
+        }
+        if (values)
+            for (p = Cp[i]; p < nz; p++)
+                Cx[p] = x[Cj[p]];
+    }
+    Cp[m] = nz; /* finalize the last row of C */
+    csr_sprealloc(C, 0); /* remove extra space from C */
+    xb = NULL;
+    x = NULL;
+    return C;
+}/*}}}*/
+
+#endif
+
diff --git a/common/mmio.h b/common/mmio.h
new file mode 100644
index 0000000000000000000000000000000000000000..196240e2559c8b74c15939ed26366defcb797c83
--- /dev/null
+++ b/common/mmio.h
@@ -0,0 +1,650 @@
+/*
+ *   Matrix Market I/O library for ANSI C
+ *
+ *   See http://math.nist.gov/MatrixMarket for details.
+ *
+ *
+ */
+
+#ifndef MM_IO_H
+#define MM_IO_H
+
+#define MM_MAX_LINE_LENGTH 1025
+#define MatrixMarketBanner "%%MatrixMarket"
+#define MM_MAX_TOKEN_LENGTH 64
+
+typedef char MM_typecode[4];
+
+char *mm_typecode_to_str(MM_typecode matcode);
+
+int mm_read_banner(FILE *f, MM_typecode *matcode);
+int mm_read_mtx_crd_size(FILE *f, int *M, int *N, int *nz);
+int mm_read_mtx_array_size(FILE *f, int *M, int *N);
+
+int mm_write_banner(FILE *f, MM_typecode matcode);
+int mm_write_mtx_crd_size(FILE *f, int M, int N, int nz);
+int mm_write_mtx_array_size(FILE *f, int M, int N);
+
+
+/********************* MM_typecode query fucntions ***************************/
+
+#define mm_is_matrix(typecode)	((typecode)[0]=='M')
+
+#define mm_is_sparse(typecode)	((typecode)[1]=='C')
+#define mm_is_coordinate(typecode)((typecode)[1]=='C')
+#define mm_is_dense(typecode)	((typecode)[1]=='A')
+#define mm_is_array(typecode)	((typecode)[1]=='A')
+
+#define mm_is_complex(typecode)	((typecode)[2]=='C')
+#define mm_is_real(typecode)		((typecode)[2]=='R')
+#define mm_is_pattern(typecode)	((typecode)[2]=='P')
+#define mm_is_integer(typecode) ((typecode)[2]=='I')
+
+#define mm_is_symmetric(typecode)((typecode)[3]=='S')
+#define mm_is_general(typecode)	((typecode)[3]=='G')
+#define mm_is_skew(typecode)	((typecode)[3]=='K')
+#define mm_is_hermitian(typecode)((typecode)[3]=='H')
+
+int mm_is_valid(MM_typecode matcode); /* too complex for a macro */
+
+
+/********************* MM_typecode modify fucntions ***************************/
+
+#define mm_set_matrix(typecode)	((*typecode)[0]='M')
+#define mm_set_coordinate(typecode)	((*typecode)[1]='C')
+#define mm_set_array(typecode)	((*typecode)[1]='A')
+#define mm_set_dense(typecode)	mm_set_array(typecode)
+#define mm_set_sparse(typecode)	mm_set_coordinate(typecode)
+
+#define mm_set_complex(typecode)((*typecode)[2]='C')
+#define mm_set_real(typecode)	((*typecode)[2]='R')
+#define mm_set_pattern(typecode)((*typecode)[2]='P')
+#define mm_set_integer(typecode)((*typecode)[2]='I')
+
+
+#define mm_set_symmetric(typecode)((*typecode)[3]='S')
+#define mm_set_general(typecode)((*typecode)[3]='G')
+#define mm_set_skew(typecode)	((*typecode)[3]='K')
+#define mm_set_hermitian(typecode)((*typecode)[3]='H')
+
+#define mm_clear_typecode(typecode) ((*typecode)[0]=(*typecode)[1]= \
+									(*typecode)[2]=' ',(*typecode)[3]='G')
+
+#define mm_initialize_typecode(typecode) mm_clear_typecode(typecode)
+
+
+/********************* Matrix Market error codes ***************************/
+
+
+#define MM_COULD_NOT_READ_FILE	11
+#define MM_PREMATURE_EOF		12
+#define MM_NOT_MTX				13
+#define MM_NO_HEADER			14
+#define MM_UNSUPPORTED_TYPE		15
+#define MM_LINE_TOO_LONG		16
+#define MM_COULD_NOT_WRITE_FILE	17
+
+
+/******************** Matrix Market internal definitions ********************
+
+   MM_matrix_typecode: 4-character sequence
+
+                                    ojbect 		sparse/   	data        storage
+                                                                dense     	type        scheme
+
+   string position:	 [0]        [1]			[2]         [3]
+
+   Matrix typecode:  M(atrix)  C(oord)		R(eal)   	G(eneral)
+                                                        A(array)	C(omplex)   H(ermitian)
+                                                                                        P(attern)   S(ymmetric)
+                                                                                I(nteger)	K(kew)
+
+ ***********************************************************************/
+
+#define MM_MTX_STR		"matrix"
+#define MM_ARRAY_STR	"array"
+#define MM_DENSE_STR	"array"
+#define MM_COORDINATE_STR "coordinate"
+#define MM_SPARSE_STR	"coordinate"
+#define MM_COMPLEX_STR	"complex"
+#define MM_REAL_STR		"real"
+#define MM_INT_STR		"integer"
+#define MM_GENERAL_STR  "general"
+#define MM_SYMM_STR		"symmetric"
+#define MM_HERM_STR		"hermitian"
+#define MM_SKEW_STR		"skew-symmetric"
+#define MM_PATTERN_STR  "pattern"
+
+
+/*  high level routines */
+
+int mm_write_mtx_crd(char fname[], int M, int N, int nz, int I[], int J[],
+        double val[], MM_typecode matcode);
+int mm_read_mtx_crd_data(FILE *f, int M, int N, int nz, int I[], int J[],
+        double val[], MM_typecode matcode);
+int mm_read_mtx_crd_entry(FILE *f, int *I, int *J, double *real, double *img,
+        MM_typecode matcode);
+
+int mm_read_unsymmetric_sparse(const char *fname, int *M_, int *N_, int *nz_,
+        double **val_, int **I_, int **J_);
+
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <ctype.h>
+
+int mm_read_unsymmetric_sparse(const char *fname, int *M_, int *N_, int *nz_,
+        double **val_, int **I_, int **J_) {
+    FILE *f;
+    MM_typecode matcode;
+    int M, N, nz;
+    int i;
+    double *val;
+    int *I, *J;
+
+    if ((f = fopen(fname, "r")) == NULL)
+        return -1;
+
+
+    if (mm_read_banner(f, &matcode) != 0) {
+        printf("mm_read_unsymetric: Could not process Matrix Market banner ");
+        printf(" in file [%s]\n", fname);
+        return -1;
+    }
+
+
+
+    if (!(mm_is_real(matcode) && mm_is_matrix(matcode) &&
+            mm_is_sparse(matcode))) {
+        fprintf(stderr, "Sorry, this application does not support ");
+        fprintf(stderr, "Market Market type: [%s]\n",
+                mm_typecode_to_str(matcode));
+        return -1;
+    }
+
+    /* find out size of sparse matrix: M, N, nz .... */
+
+    if (mm_read_mtx_crd_size(f, &M, &N, &nz) != 0) {
+        fprintf(stderr, "read_unsymmetric_sparse(): could not parse matrix size.\n");
+        return -1;
+    }
+
+    *M_ = M;
+    *N_ = N;
+    *nz_ = nz;
+
+    /* reseve memory for matrices */
+
+    I = (int *) malloc(nz * sizeof (int));
+    J = (int *) malloc(nz * sizeof (int));
+    val = (double *) malloc(nz * sizeof (double));
+
+    *val_ = val;
+    *I_ = I;
+    *J_ = J;
+
+    /* NOTE: when reading in doubles, ANSI C requires the use of the "l"  */
+    /*   specifier as in "%lg", "%lf", "%le", otherwise errors will occur */
+    /*  (ANSI C X3.159-1989, Sec. 4.9.6.2, p. 136 lines 13-15)            */
+
+    for (i = 0; i < nz; i++) {
+        fscanf(f, "%d %d %lg\n", &I[i], &J[i], &val[i]);
+        I[i]--; /* adjust from 1-based to 0-based */
+        J[i]--;
+    }
+    fclose(f);
+
+    return 0;
+}
+
+int mm_is_valid(MM_typecode matcode) {
+    if (!mm_is_matrix(matcode)) return 0;
+    if (mm_is_dense(matcode) && mm_is_pattern(matcode)) return 0;
+    if (mm_is_real(matcode) && mm_is_hermitian(matcode)) return 0;
+    if (mm_is_pattern(matcode) && (mm_is_hermitian(matcode) ||
+            mm_is_skew(matcode))) return 0;
+    return 1;
+}
+
+int mm_read_banner(FILE *f, MM_typecode *matcode) {
+    char line[MM_MAX_LINE_LENGTH];
+    char banner[MM_MAX_TOKEN_LENGTH];
+    char mtx[MM_MAX_TOKEN_LENGTH];
+    char crd[MM_MAX_TOKEN_LENGTH];
+    char data_type[MM_MAX_TOKEN_LENGTH];
+    char storage_scheme[MM_MAX_TOKEN_LENGTH];
+    char *p;
+
+
+    mm_clear_typecode(matcode);
+
+    if (fgets(line, MM_MAX_LINE_LENGTH, f) == NULL)
+        return MM_PREMATURE_EOF;
+
+    if (sscanf(line, "%s %s %s %s %s", banner, mtx, crd, data_type,
+            storage_scheme) != 5)
+        return MM_PREMATURE_EOF;
+
+    for (p = mtx; *p != '\0'; *p = tolower(*p), p++); /* convert to lower case */
+    for (p = crd; *p != '\0'; *p = tolower(*p), p++);
+    for (p = data_type; *p != '\0'; *p = tolower(*p), p++);
+    for (p = storage_scheme; *p != '\0'; *p = tolower(*p), p++);
+
+    /* check for banner */
+    if (strncmp(banner, MatrixMarketBanner, strlen(MatrixMarketBanner)) != 0)
+        return MM_NO_HEADER;
+
+    /* first field should be "mtx" */
+    if (strcmp(mtx, MM_MTX_STR) != 0)
+        return MM_UNSUPPORTED_TYPE;
+    mm_set_matrix(matcode);
+
+
+    /* second field describes whether this is a sparse matrix (in coordinate
+            storgae) or a dense array */
+
+
+    if (strcmp(crd, MM_SPARSE_STR) == 0)
+        mm_set_sparse(matcode);
+    else
+        if (strcmp(crd, MM_DENSE_STR) == 0)
+        mm_set_dense(matcode);
+    else
+        return MM_UNSUPPORTED_TYPE;
+
+
+    /* third field */
+
+    if (strcmp(data_type, MM_REAL_STR) == 0)
+        mm_set_real(matcode);
+    else
+        if (strcmp(data_type, MM_COMPLEX_STR) == 0)
+        mm_set_complex(matcode);
+    else
+        if (strcmp(data_type, MM_PATTERN_STR) == 0)
+        mm_set_pattern(matcode);
+    else
+        if (strcmp(data_type, MM_INT_STR) == 0)
+        mm_set_integer(matcode);
+    else
+        return MM_UNSUPPORTED_TYPE;
+
+
+    /* fourth field */
+
+    if (strcmp(storage_scheme, MM_GENERAL_STR) == 0)
+        mm_set_general(matcode);
+    else
+        if (strcmp(storage_scheme, MM_SYMM_STR) == 0)
+        mm_set_symmetric(matcode);
+    else
+        if (strcmp(storage_scheme, MM_HERM_STR) == 0)
+        mm_set_hermitian(matcode);
+    else
+        if (strcmp(storage_scheme, MM_SKEW_STR) == 0)
+        mm_set_skew(matcode);
+    else
+        return MM_UNSUPPORTED_TYPE;
+
+
+    return 0;
+}
+
+int mm_write_mtx_crd_size(FILE *f, int M, int N, int nz) {
+    if (fprintf(f, "%d %d %d\n", M, N, nz) != 3)
+        return MM_COULD_NOT_WRITE_FILE;
+    else
+        return 0;
+}
+
+int mm_read_mtx_crd_size(FILE *f, int *M, int *N, int *nz) {
+    char line[MM_MAX_LINE_LENGTH];
+    int num_items_read;
+
+    /* set return null parameter values, in case we exit with errors */
+    *M = *N = *nz = 0;
+
+    /* now continue scanning until you reach the end-of-comments */
+    do {
+        if (fgets(line, MM_MAX_LINE_LENGTH, f) == NULL)
+            return MM_PREMATURE_EOF;
+    } while (line[0] == '%');
+
+    /* line[] is either blank or has M,N, nz */
+    if (sscanf(line, "%d %d %d", M, N, nz) == 3)
+        return 0;
+
+    else
+        do {
+            num_items_read = fscanf(f, "%d %d %d", M, N, nz);
+            if (num_items_read == EOF) return MM_PREMATURE_EOF;
+        } while (num_items_read != 3);
+
+    return 0;
+}
+
+int mm_read_mtx_array_size(FILE *f, int *M, int *N) {
+    char line[MM_MAX_LINE_LENGTH];
+    int num_items_read;
+    /* set return null parameter values, in case we exit with errors */
+    *M = *N = 0;
+
+    /* now continue scanning until you reach the end-of-comments */
+    do {
+        if (fgets(line, MM_MAX_LINE_LENGTH, f) == NULL)
+            return MM_PREMATURE_EOF;
+    } while (line[0] == '%');
+
+    /* line[] is either blank or has M,N, nz */
+    if (sscanf(line, "%d %d", M, N) == 2)
+        return 0;
+
+    else /* we have a blank line */
+        do {
+            num_items_read = fscanf(f, "%d %d", M, N);
+            if (num_items_read == EOF) return MM_PREMATURE_EOF;
+        } while (num_items_read != 2);
+
+    return 0;
+}
+
+int mm_write_mtx_array_size(FILE *f, int M, int N) {
+    if (fprintf(f, "%d %d\n", M, N) != 2)
+        return MM_COULD_NOT_WRITE_FILE;
+    else
+        return 0;
+}
+
+
+
+/*-------------------------------------------------------------------------*/
+
+/******************************************************************/
+/* use when I[], J[], and val[]J, and val[] are already allocated */
+
+/******************************************************************/
+
+int mm_read_mtx_crd_data(FILE *f, int M, int N, int nz, int I[], int J[],
+        double val[], MM_typecode matcode) {
+    int i;
+    if (mm_is_complex(matcode)) {
+        for (i = 0; i < nz; i++)
+            if (fscanf(f, "%d %d %lg %lg", &I[i], &J[i], &val[2 * i], &val[2 * i + 1])
+                    != 4) return MM_PREMATURE_EOF;
+    } else if (mm_is_real(matcode)) {
+        for (i = 0; i < nz; i++) {
+            if (fscanf(f, "%d %d %lg\n", &I[i], &J[i], &val[i])
+                    != 3) return MM_PREMATURE_EOF;
+
+        }
+    } else if (mm_is_pattern(matcode)) {
+        for (i = 0; i < nz; i++)
+            if (fscanf(f, "%d %d", &I[i], &J[i])
+                    != 2) return MM_PREMATURE_EOF;
+    } else
+        return MM_UNSUPPORTED_TYPE;
+
+    return 0;
+
+}
+
+int mm_read_mtx_crd_entry(FILE *f, int *I, int *J,
+        double *real, double *imag, MM_typecode matcode) {
+    if (mm_is_complex(matcode)) {
+        if (fscanf(f, "%d %d %lg %lg", I, J, real, imag)
+                != 4) return MM_PREMATURE_EOF;
+    } else if (mm_is_real(matcode)) {
+        if (fscanf(f, "%d %d %lg\n", I, J, real)
+                != 3) return MM_PREMATURE_EOF;
+
+    } else if (mm_is_pattern(matcode)) {
+        if (fscanf(f, "%d %d", I, J) != 2) return MM_PREMATURE_EOF;
+    } else
+        return MM_UNSUPPORTED_TYPE;
+
+    return 0;
+
+}
+
+/************************************************************************
+    mm_read_mtx_crd()  fills M, N, nz, array of values, and return
+                        type code, e.g. 'MCRS'
+
+                        if matrix is complex, values[] is of size 2*nz,
+                            (nz pairs of real/imaginary values)
+ ************************************************************************/
+
+int mm_read_mtx_crd(char *fname, int *M, int *N, int *nz, int **I, int **J,
+        double **val, MM_typecode *matcode) {
+    int ret_code;
+    FILE *f;
+
+    if (strcmp(fname, "stdin") == 0) f = stdin;
+    else
+        if ((f = fopen(fname, "r")) == NULL)
+        return MM_COULD_NOT_READ_FILE;
+
+
+    if ((ret_code = mm_read_banner(f, matcode)) != 0)
+        return ret_code;
+
+    if (!(mm_is_valid(*matcode) && mm_is_sparse(*matcode) &&
+            mm_is_matrix(*matcode)))
+        return MM_UNSUPPORTED_TYPE;
+
+    if ((ret_code = mm_read_mtx_crd_size(f, M, N, nz)) != 0)
+        return ret_code;
+
+
+    *I = (int *) malloc(*nz * sizeof (int));
+    *J = (int *) malloc(*nz * sizeof (int));
+    *val = NULL;
+
+    if (mm_is_complex(*matcode)) {
+        *val = (double *) malloc(*nz * 2 * sizeof (double));
+        ret_code = mm_read_mtx_crd_data(f, *M, *N, *nz, *I, *J, *val,
+                *matcode);
+        if (ret_code != 0) return ret_code;
+    } else if (mm_is_real(*matcode)) {
+        *val = (double *) malloc(*nz * sizeof (double));
+        ret_code = mm_read_mtx_crd_data(f, *M, *N, *nz, *I, *J, *val,
+                *matcode);
+        if (ret_code != 0) return ret_code;
+    } else if (mm_is_pattern(*matcode)) {
+        ret_code = mm_read_mtx_crd_data(f, *M, *N, *nz, *I, *J, *val,
+                *matcode);
+        if (ret_code != 0) return ret_code;
+    }
+
+    if (f != stdin) fclose(f);
+    return 0;
+}
+
+int mm_write_banner(FILE *f, MM_typecode matcode) {
+    char *str = mm_typecode_to_str(matcode);
+    int ret_code;
+
+    ret_code = fprintf(f, "%s %s\n", MatrixMarketBanner, str);
+    free(str);
+    if (ret_code != 2)
+        return MM_COULD_NOT_WRITE_FILE;
+    else
+        return 0;
+}
+
+int mm_write_mtx_crd(char fname[], int M, int N, int nz, int I[], int J[],
+        double val[], MM_typecode matcode) {
+    FILE *f;
+    int i;
+
+    if (strcmp(fname, "stdout") == 0)
+        f = stdout;
+    else
+        if ((f = fopen(fname, "w")) == NULL)
+        return MM_COULD_NOT_WRITE_FILE;
+
+    /* print banner followed by typecode */
+    fprintf(f, "%s ", MatrixMarketBanner);
+    fprintf(f, "%s\n", mm_typecode_to_str(matcode));
+
+    /* print matrix sizes and nonzeros */
+    fprintf(f, "%d %d %d\n", M, N, nz);
+
+    /* print values */
+    if (mm_is_pattern(matcode))
+        for (i = 0; i < nz; i++)
+            fprintf(f, "%d %d\n", I[i], J[i]);
+    else
+        if (mm_is_real(matcode))
+        for (i = 0; i < nz; i++)
+            fprintf(f, "%d %d %20.16g\n", I[i], J[i], val[i]);
+    else
+        if (mm_is_complex(matcode))
+        for (i = 0; i < nz; i++)
+            fprintf(f, "%d %d %20.16g %20.16g\n", I[i], J[i], val[2 * i],
+                val[2 * i + 1]);
+    else {
+        if (f != stdout) fclose(f);
+        return MM_UNSUPPORTED_TYPE;
+    }
+
+    if (f != stdout) fclose(f);
+
+    return 0;
+}
+
+/**
+ *  Create a new copy of a string s.  mm_strdup() is a common routine, but
+ *  not part of ANSI C, so it is included here.  Used by mm_typecode_to_str().
+ *
+ */
+char *mm_strdup(const char *s) {
+    int len = strlen(s);
+    char *s2 = (char *) malloc((len + 1) * sizeof (char));
+    return strcpy(s2, s);
+}
+
+char *mm_typecode_to_str(MM_typecode matcode) {
+    char buffer[MM_MAX_LINE_LENGTH];
+    char *types[4];
+    char *mm_strdup(const char *);
+    int error = 0;
+
+    /* check for MTX type */
+    if (mm_is_matrix(matcode))
+        types[0] = MM_MTX_STR;
+    else
+        error = 1;
+
+    /* check for CRD or ARR matrix */
+    if (mm_is_sparse(matcode))
+        types[1] = MM_SPARSE_STR;
+    else
+        if (mm_is_dense(matcode))
+        types[1] = MM_DENSE_STR;
+    else
+        return NULL;
+
+    /* check for element data type */
+    if (mm_is_real(matcode))
+        types[2] = MM_REAL_STR;
+    else
+        if (mm_is_complex(matcode))
+        types[2] = MM_COMPLEX_STR;
+    else
+        if (mm_is_pattern(matcode))
+        types[2] = MM_PATTERN_STR;
+    else
+        if (mm_is_integer(matcode))
+        types[2] = MM_INT_STR;
+    else
+        return NULL;
+
+
+    /* check for symmetry type */
+    if (mm_is_general(matcode))
+        types[3] = MM_GENERAL_STR;
+    else
+        if (mm_is_symmetric(matcode))
+        types[3] = MM_SYMM_STR;
+    else
+        if (mm_is_hermitian(matcode))
+        types[3] = MM_HERM_STR;
+    else
+        if (mm_is_skew(matcode))
+        types[3] = MM_SKEW_STR;
+    else
+        return NULL;
+
+    sprintf(buffer, "%s %s %s %s", types[0], types[1], types[2], types[3]);
+    return mm_strdup(buffer);
+
+}
+
+/** Read Matrix Market file into COO matrix */
+void read_mm(char* strpath, int* pM, int* pN, int* prealnnz, int** pI, int** pJ, double** pval) { /*{{{*/
+    /*
+     *   taken from Matrix Market I/O library for ANSI C
+     *
+     *   See http://math.nist.gov/MatrixMarket for details.
+     *
+     *
+     */
+
+    int i, M, N, nz, *I, *J;
+    double* val;
+    int ret_code;
+    MM_typecode matcode;
+    FILE* f;
+    if ((f = fopen(strpath, "r")) == NULL) {
+        fprintf(stderr, "Input matrix file %s cannot be opened to read.", strpath);
+        exit(1);
+    }
+    /* READ MATRIX */
+    if (mm_read_banner(f, &matcode) != 0) {
+        printf("Could not process Matrix Market banner.\n");
+        exit(1);
+    }
+    /*  This is how one can screen matrix types if their application */
+    /*  only supports a subset of the Matrix Market data types.      */
+    if (mm_is_complex(matcode) && mm_is_matrix(matcode) && mm_is_sparse(matcode)) {
+        printf("Sorry, this application does not support ");
+        printf("Market Market type: [%s]\n", mm_typecode_to_str(matcode));
+        exit(1);
+    }
+    /* find out size of sparse matrix .... */
+    if ((ret_code = mm_read_mtx_crd_size(f, &M, &N, &nz)) != 0) exit(1);
+    /* reseve memory for matrices */
+    I = (int *) malloc((mm_is_symmetric(matcode) ? 2 * nz : nz) * sizeof (int));
+    J = (int *) malloc((mm_is_symmetric(matcode) ? 2 * nz : nz) * sizeof (int));
+    val = (double *) malloc((mm_is_symmetric(matcode) ? 2 * nz : nz) * sizeof (double));
+    *pI = I;
+    *pJ = J;
+    *pval = val;
+    /* NOTE: when reading in doubles, ANSI C requires the use of the "l"  */
+    /*   specifier as in "%lg", "%lf", "%le", otherwise errors will occur */
+    /*  (ANSI C X3.159-1989, Sec. 4.9.6.2, p. 136 lines 13-15)            */
+    int realnnz = 0;
+    for (i = 0; i < nz; i++) {
+        if (mm_is_pattern(matcode)) {
+            fscanf(f, "%d %d\n", &I[realnnz], &J[realnnz]);
+            val[realnnz] = 1.0;
+        } else
+            fscanf(f, "%d %d %lg\n", &I[realnnz], &J[realnnz], &val[realnnz]);
+        I[realnnz]--; /* adjust from 1-based to 0-based */
+        J[realnnz]--;
+        if (mm_is_symmetric(matcode) && I[realnnz] != J[realnnz]) {
+            I[realnnz + 1] = J[realnnz];
+            J[realnnz + 1] = I[realnnz];
+            val[realnnz + 1] = val[realnnz];
+            realnnz++;
+        }
+        realnnz++;
+    }
+    if (f != stdin) fclose(f);
+    *pM = M;
+    *pN = N;
+    *prealnnz = realnnz;
+} /* ENDOF read_mm }}}*/
+
+#endif
diff --git a/common/util.h b/common/util.h
new file mode 100644
index 0000000000000000000000000000000000000000..93095188dfcdd2637110f338ecfc2bcccd3ebe44
--- /dev/null
+++ b/common/util.h
@@ -0,0 +1,59 @@
+#ifndef UTIL_H
+#define UTIL_H
+
+#define OPTION_NOPRINT_MATRICES 0
+#define OPTION_PRINT_MATRICES 1
+MIC_TARGET int option_print_matrices = 0;
+
+/** Converts COO matrix to CSR matrix */
+void coo_to_csr(int m, int nnz, int* I, int* J, double* val, MKL_INT* AI, MKL_INT* AJ, double* Aval) { /*{{{*/
+    MKL_INT info = 0;
+    MKL_INT job[8];
+    job[1] = 1; // one based indexing in csr
+
+    job[2] = 0; // zero based indexing in coo
+    job[3] = 2; // I don't know
+    job[4] = nnz; // nnz
+
+    job[0] = 1; // coo to csr
+    job[5] = 0; // Acsr and AJR allocated by user
+    mkl_dcsrcoo(job, &m, Aval, AJ, AI, &nnz, val, I, J, &info);
+} /* ENDOF coo_to_csr }}}*/
+
+/** Prints matrix in CSR format */
+void MIC_TARGET printmm_one(int m, double* Aval, int* AJ, int* AI) { //{{{
+
+    if (option_print_matrices == OPTION_NOPRINT_MATRICES)
+        return;
+    int i;
+    for (i = 0; i < m; i++) {
+        printf("%d: ", i + 1);
+        int j;
+        for (j = AI[i] - 1; j < AI[i + 1] - 1; j++) {
+            printf("%d:%g  ", AJ[j], Aval[j]);
+        }
+        printf("\n");
+    }
+    printf("\n");
+}//}}}
+
+/** Writes matrix in CSR format in to a file using Matrix Market format */
+void MIC_TARGET printfilemm_one(char* file, int m, int n, double* Aval, int* AJ, int* AI) {//{{{
+
+    FILE* f = fopen(file, "w");
+    if (f == NULL) {
+        printf("%s %s %d: %s cannot be opened to write matrix\n", __FILE__, __PRETTY_FUNCTION__, __LINE__, file);
+        exit(1);
+    }
+    int i;
+    fprintf(f, "%%%%MatrixMarket matrix coordinate real general\n");
+    fprintf(f, "%d %d %d\n", m, n, AI[m] - 1);
+    for (i = 0; i < m; i++) {
+        int j;
+        for (j = AI[i] - 1; j < AI[i + 1] - 1; j++) {
+            fprintf(f, "%d %d %g\n", i + 1, AJ[j], Aval[j]);
+        }
+    }
+    fclose(f);
+}//}}}
+#endif
diff --git a/krylov/ksp_solver_multi_rhs.c b/krylov/ksp_solver_multi_rhs.c
index 8b6569580035f5f7150272de1cdef562740f5211..7da70b7d30117aefc385d4fe27d88fe7bbe74f39 100644
--- a/krylov/ksp_solver_multi_rhs.c
+++ b/krylov/ksp_solver_multi_rhs.c
@@ -7,9 +7,9 @@ All rights reserved.
 Redistribution and use in source and binary forms, with or without modification,
 are permitted provided that the following conditions are met:
 
-* Redistributions of source code must retain the above copyright notice, this
+ * Redistributions of source code must retain the above copyright notice, this
   list of conditions and the following disclaimer.
-* Redistributions in binary form must reproduce the above copyright notice, this
+ * Redistributions in binary form must reproduce the above copyright notice, this
   list of conditions and the following disclaimer in the documentation and/or
   other materials provided with the distribution.
 
@@ -23,7 +23,7 @@ are permitted provided that the following conditions are met:
   ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*/
+ */
 
 static char help[] = "Solves a sequence of linear systems with different right-hand-side vectors.\n\
 Input parameters include:\n\
@@ -46,176 +46,232 @@ T*/
      petscmat.h - matrices
      petscis.h     - index sets            petscksp.h - Krylov subspace methods
      petscviewer.h - viewers               petscpc.h  - preconditioners
-*/
+ */
 #include <petscksp.h>
 
 #undef __FUNCT__
 #define __FUNCT__ "main"
-int main(int argc,char **args)
-{
-  Vec            x,b,u;  /* approx solution, RHS, exact solution */
-  Mat            A;        /* linear system matrix */
-  KSP            ksp;     /* linear solver context */
-  PetscReal      norm;     /* norm of solution error */
-  PetscErrorCode ierr;
-  PetscInt       ntimes,i,j,k,Ii,J,Istart,Iend;
-  PetscInt       m   = 8,n = 7,its;
-  PetscBool      flg = PETSC_FALSE;
-  PetscScalar    v,one = 1.0,neg_one = -1.0,rhs;
-
-  PetscInitialize(&argc,&args,(char*)0,help);
-  ierr = PetscOptionsGetInt(NULL,"-m",&m,NULL);CHKERRQ(ierr);
-  ierr = PetscOptionsGetInt(NULL,"-n",&n,NULL);CHKERRQ(ierr);
-
-  /* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-         Compute the matrix for use in solving a series of
-         linear systems of the form, A x_i = b_i, for i=1,2,...
-     - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
-  /*
-     Create parallel matrix, specifying only its global dimensions.
-     When using MatCreate(), the matrix format can be specified at
-     runtime. Also, the parallel partitioning of the matrix is
-     determined by PETSc at runtime.
-  */
-  ierr = MatCreate(PETSC_COMM_WORLD,&A);CHKERRQ(ierr);
-  ierr = MatSetSizes(A,PETSC_DECIDE,PETSC_DECIDE,m*n,m*n);CHKERRQ(ierr);
-  ierr = MatSetFromOptions(A);CHKERRQ(ierr);
-  ierr = MatSetUp(A);CHKERRQ(ierr);
-
-  /*
-     Currently, all PETSc parallel matrix formats are partitioned by
-     contiguous chunks of rows across the processors.  Determine which
-     rows of the matrix are locally owned.
-  */
-  ierr = MatGetOwnershipRange(A,&Istart,&Iend);CHKERRQ(ierr);
-
-  /*
-     Set matrix elements for the 2-D, five-point stencil in parallel.
-      - Each processor needs to insert only elements that it owns
-        locally (but any non-local elements will be sent to the
-        appropriate processor during matrix assembly).
-      - Always specify global rows and columns of matrix entries.
-   */
-  for (Ii=Istart; Ii<Iend; Ii++) {
-    v = -1.0; i = Ii/n; j = Ii - i*n;
-    if (i>0)   {J = Ii - n; ierr = MatSetValues(A,1,&Ii,1,&J,&v,INSERT_VALUES);CHKERRQ(ierr);}
-    if (i<m-1) {J = Ii + n; ierr = MatSetValues(A,1,&Ii,1,&J,&v,INSERT_VALUES);CHKERRQ(ierr);}
-    if (j>0)   {J = Ii - 1; ierr = MatSetValues(A,1,&Ii,1,&J,&v,INSERT_VALUES);CHKERRQ(ierr);}
-    if (j<n-1) {J = Ii + 1; ierr = MatSetValues(A,1,&Ii,1,&J,&v,INSERT_VALUES);CHKERRQ(ierr);}
-    v = 4.0; ierr = MatSetValues(A,1,&Ii,1,&Ii,&v,INSERT_VALUES);CHKERRQ(ierr);
-  }
-
-  /*
-     Assemble matrix, using the 2-step process:
-       MatAssemblyBegin(), MatAssemblyEnd()
-     Computations can be done while messages are in transition
-     by placing code between these two statements.
-  */
-  ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
-  ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
-
-  /*
-     Create parallel vectors.
-      - When using VecCreate(), VecSetSizes() and VecSetFromOptions(),
-        we specify only the vector's global
-        dimension; the parallel partitioning is determined at runtime.
-      - When solving a linear system, the vectors and matrices MUST
-        be partitioned accordingly.  PETSc automatically generates
-        appropriately partitioned matrices and vectors when MatCreate()
-        and VecCreate() are used with the same communicator.
-      - Note: We form 1 vector from scratch and then duplicate as needed.
-  */
-  ierr = VecCreate(PETSC_COMM_WORLD,&u);CHKERRQ(ierr);
-  ierr = VecSetSizes(u,PETSC_DECIDE,m*n);CHKERRQ(ierr);
-  ierr = VecSetFromOptions(u);CHKERRQ(ierr);
-  ierr = VecDuplicate(u,&b);CHKERRQ(ierr);
-  ierr = VecDuplicate(b,&x);CHKERRQ(ierr);
-
-  /* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-                Create the linear solver and set various options
-     - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
-
-  /*
-     Create linear solver context
-  */
-  ierr = KSPCreate(PETSC_COMM_WORLD,&ksp);CHKERRQ(ierr);
-
-  /*
-     Set operators. Here the matrix that defines the linear system
-     also serves as the preconditioning matrix.
-  */
-  ierr = KSPSetOperators(ksp,A,A);CHKERRQ(ierr);
-
-  /*
-    Set runtime options, e.g.,
-        -ksp_type <type> -pc_type <type> -ksp_monitor -ksp_rtol <rtol>
-    These options will override those specified above as long as
-    KSPSetFromOptions() is called _after_ any other customization
-    routines.
-  */
-  ierr = KSPSetFromOptions(ksp);CHKERRQ(ierr);
-
-  /* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-       Solve several linear systems of the form  A x_i = b_i
-       I.e., we retain the same matrix (A) for all systems, but
-       change the right-hand-side vector (b_i) at each step.
-
-       In this case, we simply call KSPSolve() multiple times.  The
-       preconditioner setup operations (e.g., factorization for ILU)
-       be done during the first call to KSPSolve() only; such operations
-       will NOT be repeated for successive solves.
-     - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
-
-  ntimes = 2;
-  ierr   = PetscOptionsGetInt(NULL,"-ntimes",&ntimes,NULL);CHKERRQ(ierr);
-  for (k=1; k<ntimes+1; k++) {
+
+int main(int argc, char **args) {
+    Vec x, b, u; /* approx solution, RHS, exact solution */
+    Mat A; /* linear system matrix */
+    KSP ksp; /* linear solver context */
+    PetscReal norm; /* norm of solution error */
+    PetscErrorCode ierr;
+    PetscInt ntimes, i, j, k, Ii, J, Istart, Iend;
+    PetscInt m = 8, n = 7, its;
+    PetscBool flg = PETSC_FALSE;
+    PetscScalar v, one = 1.0, neg_one = -1.0, rhs;
+
+    PetscInitialize(&argc, &args, (char*) 0, help);
+    ierr = PetscOptionsGetInt(NULL, "-m", &m, NULL);
+    CHKERRQ(ierr);
+    ierr = PetscOptionsGetInt(NULL, "-n", &n, NULL);
+    CHKERRQ(ierr);
+
+    /* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+           Compute the matrix for use in solving a series of
+           linear systems of the form, A x_i = b_i, for i=1,2,...
+       - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
+    /*
+       Create parallel matrix, specifying only its global dimensions.
+       When using MatCreate(), the matrix format can be specified at
+       runtime. Also, the parallel partitioning of the matrix is
+       determined by PETSc at runtime.
+     */
+    ierr = MatCreate(PETSC_COMM_WORLD, &A);
+    CHKERRQ(ierr);
+    ierr = MatSetSizes(A, PETSC_DECIDE, PETSC_DECIDE, m*n, m * n);
+    CHKERRQ(ierr);
+    ierr = MatSetFromOptions(A);
+    CHKERRQ(ierr);
+    ierr = MatSetUp(A);
+    CHKERRQ(ierr);
+
+    /*
+       Currently, all PETSc parallel matrix formats are partitioned by
+       contiguous chunks of rows across the processors.  Determine which
+       rows of the matrix are locally owned.
+     */
+    ierr = MatGetOwnershipRange(A, &Istart, &Iend);
+    CHKERRQ(ierr);
+
+    /*
+       Set matrix elements for the 2-D, five-point stencil in parallel.
+        - Each processor needs to insert only elements that it owns
+          locally (but any non-local elements will be sent to the
+          appropriate processor during matrix assembly).
+        - Always specify global rows and columns of matrix entries.
+     */
+    for (Ii = Istart; Ii < Iend; Ii++) {
+        v = -1.0;
+        i = Ii / n;
+        j = Ii - i*n;
+        if (i > 0) {
+            J = Ii - n;
+            ierr = MatSetValues(A, 1, &Ii, 1, &J, &v, INSERT_VALUES);
+            CHKERRQ(ierr);
+        }
+        if (i < m - 1) {
+            J = Ii + n;
+            ierr = MatSetValues(A, 1, &Ii, 1, &J, &v, INSERT_VALUES);
+            CHKERRQ(ierr);
+        }
+        if (j > 0) {
+            J = Ii - 1;
+            ierr = MatSetValues(A, 1, &Ii, 1, &J, &v, INSERT_VALUES);
+            CHKERRQ(ierr);
+        }
+        if (j < n - 1) {
+            J = Ii + 1;
+            ierr = MatSetValues(A, 1, &Ii, 1, &J, &v, INSERT_VALUES);
+            CHKERRQ(ierr);
+        }
+        v = 4.0;
+        ierr = MatSetValues(A, 1, &Ii, 1, &Ii, &v, INSERT_VALUES);
+        CHKERRQ(ierr);
+    }
+
+    /*
+       Assemble matrix, using the 2-step process:
+         MatAssemblyBegin(), MatAssemblyEnd()
+       Computations can be done while messages are in transition
+       by placing code between these two statements.
+     */
+    ierr = MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY);
+    CHKERRQ(ierr);
+    ierr = MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY);
+    CHKERRQ(ierr);
+
+    /*
+       Create parallel vectors.
+        - When using VecCreate(), VecSetSizes() and VecSetFromOptions(),
+          we specify only the vector's global
+          dimension; the parallel partitioning is determined at runtime.
+        - When solving a linear system, the vectors and matrices MUST
+          be partitioned accordingly.  PETSc automatically generates
+          appropriately partitioned matrices and vectors when MatCreate()
+          and VecCreate() are used with the same communicator.
+        - Note: We form 1 vector from scratch and then duplicate as needed.
+     */
+    ierr = VecCreate(PETSC_COMM_WORLD, &u);
+    CHKERRQ(ierr);
+    ierr = VecSetSizes(u, PETSC_DECIDE, m * n);
+    CHKERRQ(ierr);
+    ierr = VecSetFromOptions(u);
+    CHKERRQ(ierr);
+    ierr = VecDuplicate(u, &b);
+    CHKERRQ(ierr);
+    ierr = VecDuplicate(b, &x);
+    CHKERRQ(ierr);
+
+    /* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+                  Create the linear solver and set various options
+       - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
+
+    /*
+       Create linear solver context
+     */
+    ierr = KSPCreate(PETSC_COMM_WORLD, &ksp);
+    CHKERRQ(ierr);
 
     /*
-       Set exact solution; then compute right-hand-side vector.  We use
-       an exact solution of a vector with all elements equal to 1.0*k.
-    */
-    rhs  = one * (PetscReal)k;
-    ierr = VecSet(u,rhs);CHKERRQ(ierr);
-    ierr = MatMult(A,u,b);CHKERRQ(ierr);
+       Set operators. Here the matrix that defines the linear system
+       also serves as the preconditioning matrix.
+     */
+    ierr = KSPSetOperators(ksp, A, A);
+    CHKERRQ(ierr);
 
     /*
-       View the exact solution vector if desired
-    */
-    ierr = PetscOptionsGetBool(NULL,"-view_exact_sol",&flg,NULL);CHKERRQ(ierr);
-    if (flg) {ierr = VecView(u,PETSC_VIEWER_STDOUT_WORLD);CHKERRQ(ierr);}
+      Set runtime options, e.g.,
+          -ksp_type <type> -pc_type <type> -ksp_monitor -ksp_rtol <rtol>
+      These options will override those specified above as long as
+      KSPSetFromOptions() is called _after_ any other customization
+      routines.
+     */
+    ierr = KSPSetFromOptions(ksp);
+    CHKERRQ(ierr);
+
+    /* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+         Solve several linear systems of the form  A x_i = b_i
+         I.e., we retain the same matrix (A) for all systems, but
+         change the right-hand-side vector (b_i) at each step.
+
+         In this case, we simply call KSPSolve() multiple times.  The
+         preconditioner setup operations (e.g., factorization for ILU)
+         be done during the first call to KSPSolve() only; such operations
+         will NOT be repeated for successive solves.
+       - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
 
-    ierr = KSPSolve(ksp,b,x);CHKERRQ(ierr);
+    ntimes = 2;
+    ierr = PetscOptionsGetInt(NULL, "-ntimes", &ntimes, NULL);
+    CHKERRQ(ierr);
+    for (k = 1; k < ntimes + 1; k++) {
 
+        /*
+           Set exact solution; then compute right-hand-side vector.  We use
+           an exact solution of a vector with all elements equal to 1.0*k.
+         */
+        rhs = one * (PetscReal) k;
+        ierr = VecSet(u, rhs);
+        CHKERRQ(ierr);
+        ierr = MatMult(A, u, b);
+        CHKERRQ(ierr);
+
+        /*
+           View the exact solution vector if desired
+         */
+        ierr = PetscOptionsGetBool(NULL, "-view_exact_sol", &flg, NULL);
+        CHKERRQ(ierr);
+        if (flg) {
+            ierr = VecView(u, PETSC_VIEWER_STDOUT_WORLD);
+            CHKERRQ(ierr);
+        }
+
+        ierr = KSPSolve(ksp, b, x);
+        CHKERRQ(ierr);
+
+        /*
+           Check the error
+         */
+        ierr = VecAXPY(x, neg_one, u);
+        CHKERRQ(ierr);
+        ierr = VecNorm(x, NORM_2, &norm);
+        CHKERRQ(ierr);
+        ierr = KSPGetIterationNumber(ksp, &its);
+        CHKERRQ(ierr);
+        /*
+           Print convergence information.  PetscPrintf() produces a single
+           print statement from all processes that share a communicator.
+         */
+        ierr = PetscPrintf(PETSC_COMM_WORLD, "Norm of error %g System %D: iterations %D\n", (double) norm, k, its);
+        CHKERRQ(ierr);
+    }
+
+    /* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+                        Clean up
+       - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
     /*
-       Check the error
-    */
-    ierr = VecAXPY(x,neg_one,u);CHKERRQ(ierr);
-    ierr = VecNorm(x,NORM_2,&norm);CHKERRQ(ierr);
-    ierr = KSPGetIterationNumber(ksp,&its);CHKERRQ(ierr);
+       Free work space.  All PETSc objects should be destroyed when they
+       are no longer needed.
+     */
+    ierr = KSPDestroy(&ksp);
+    CHKERRQ(ierr);
+    ierr = VecDestroy(&u);
+    CHKERRQ(ierr);
+    ierr = VecDestroy(&x);
+    CHKERRQ(ierr);
+    ierr = VecDestroy(&b);
+    CHKERRQ(ierr);
+    ierr = MatDestroy(&A);
+    CHKERRQ(ierr);
+
     /*
-       Print convergence information.  PetscPrintf() produces a single
-       print statement from all processes that share a communicator.
-    */
-    ierr = PetscPrintf(PETSC_COMM_WORLD,"Norm of error %g System %D: iterations %D\n",(double)norm,k,its);CHKERRQ(ierr);
-  }
-
-  /* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-                      Clean up
-     - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
-  /*
-     Free work space.  All PETSc objects should be destroyed when they
-     are no longer needed.
-  */
-  ierr = KSPDestroy(&ksp);CHKERRQ(ierr);
-  ierr = VecDestroy(&u);CHKERRQ(ierr);  ierr = VecDestroy(&x);CHKERRQ(ierr);
-  ierr = VecDestroy(&b);CHKERRQ(ierr);  ierr = MatDestroy(&A);CHKERRQ(ierr);
-
-  /*
-     Always call PetscFinalize() before exiting a program.  This routine
-       - finalizes the PETSc libraries as well as MPI
-       - provides summary and diagnostic information if certain runtime
-         options are chosen (e.g., -log_summary).
-  */
-  ierr = PetscFinalize();
-  return 0;
+       Always call PetscFinalize() before exiting a program.  This routine
+         - finalizes the PETSc libraries as well as MPI
+         - provides summary and diagnostic information if certain runtime
+           options are chosen (e.g., -log_summary).
+     */
+    ierr = PetscFinalize();
+    return 0;
 }
diff --git a/krylov/ksp_solver_simple.c b/krylov/ksp_solver_simple.c
index e8815f16a2d5679296bdea1b9d0c28757506cdcb..d1595c2d7a431549e27d32ccc47b9e5cb8a4e230 100644
--- a/krylov/ksp_solver_simple.c
+++ b/krylov/ksp_solver_simple.c
@@ -7,9 +7,9 @@ All rights reserved.
 Redistribution and use in source and binary forms, with or without modification,
 are permitted provided that the following conditions are met:
 
-* Redistributions of source code must retain the above copyright notice, this
+ * Redistributions of source code must retain the above copyright notice, this
   list of conditions and the following disclaimer.
-* Redistributions in binary form must reproduce the above copyright notice, this
+ * Redistributions in binary form must reproduce the above copyright notice, this
   list of conditions and the following disclaimer in the documentation and/or
   other materials provided with the distribution.
 
@@ -23,7 +23,7 @@ are permitted provided that the following conditions are met:
   ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*/
+ */
 
 static char help[] = "Solves a linear system in parallel with KSP.\n\
 Input parameters include:\n\
@@ -46,214 +46,280 @@ T*/
      petscmat.h - matrices
      petscis.h     - index sets            petscksp.h - Krylov subspace methods
      petscviewer.h - viewers               petscpc.h  - preconditioners
-*/
+ */
 #include <petscksp.h>
 
 #undef __FUNCT__
 #define __FUNCT__ "main"
-int main(int argc,char **args)
-{
-  Vec            x,b,u;  /* approx solution, RHS, exact solution */
-  Mat            A;        /* linear system matrix */
-  KSP            ksp;     /* linear solver context */
-  PetscRandom    rctx;     /* random number generator context */
-  PetscReal      norm;     /* norm of solution error */
-  PetscInt       i,j,Ii,J,Istart,Iend,m = 8,n = 7,its;
-  PetscErrorCode ierr;
-  PetscBool      flg = PETSC_FALSE;
-  PetscScalar    v;
+
+int main(int argc, char **args) {
+    Vec x, b, u; /* approx solution, RHS, exact solution */
+    Mat A; /* linear system matrix */
+    KSP ksp; /* linear solver context */
+    PetscRandom rctx; /* random number generator context */
+    PetscReal norm; /* norm of solution error */
+    PetscInt i, j, Ii, J, Istart, Iend, m = 8, n = 7, its;
+    PetscErrorCode ierr;
+    PetscBool flg = PETSC_FALSE;
+    PetscScalar v;
 #if defined(PETSC_USE_LOG)
-  PetscLogStage stage;
+    PetscLogStage stage;
 #endif
 
-  PetscInitialize(&argc,&args,(char*)0,help);
-  ierr = PetscOptionsGetInt(NULL,"-m",&m,NULL);CHKERRQ(ierr);
-  ierr = PetscOptionsGetInt(NULL,"-n",&n,NULL);CHKERRQ(ierr);
-  /* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-         Compute the matrix and right-hand-side vector that define
-         the linear system, Ax = b.
-     - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
-  /*
-     Create parallel matrix, specifying only its global dimensions.
-     When using MatCreate(), the matrix format can be specified at
-     runtime. Also, the parallel partitioning of the matrix is
-     determined by PETSc at runtime.
-
-     Performance tuning note:  For problems of substantial size,
-     preallocation of matrix memory is crucial for attaining good
-     performance. See the matrix chapter of the users manual for details.
-  */
-  ierr = MatCreate(PETSC_COMM_WORLD,&A);CHKERRQ(ierr);
-  ierr = MatSetSizes(A,PETSC_DECIDE,PETSC_DECIDE,m*n,m*n);CHKERRQ(ierr);
-  ierr = MatSetFromOptions(A);CHKERRQ(ierr);
-  ierr = MatMPIAIJSetPreallocation(A,5,NULL,5,NULL);CHKERRQ(ierr);
-  ierr = MatSeqAIJSetPreallocation(A,5,NULL);CHKERRQ(ierr);
-
-  /*
-     Currently, all PETSc parallel matrix formats are partitioned by
-     contiguous chunks of rows across the processors.  Determine which
-     rows of the matrix are locally owned.
-  */
-  ierr = MatGetOwnershipRange(A,&Istart,&Iend);CHKERRQ(ierr);
-
-  /*
-     Set matrix elements for the 2-D, five-point stencil in parallel.
-      - Each processor needs to insert only elements that it owns
-        locally (but any non-local elements will be sent to the
-        appropriate processor during matrix assembly).
-      - Always specify global rows and columns of matrix entries.
-
-     Note: this uses the less common natural ordering that orders first
-     all the unknowns for x = h then for x = 2h etc; Hence you see J = Ii +- n
-     instead of J = I +- m as you might expect. The more standard ordering
-     would first do all variables for y = h, then y = 2h etc.
-
-   */
-  ierr = PetscLogStageRegister("Assembly", &stage);CHKERRQ(ierr);
-  ierr = PetscLogStagePush(stage);CHKERRQ(ierr);
-  for (Ii=Istart; Ii<Iend; Ii++) {
-    v = -1.0; i = Ii/n; j = Ii - i*n;
-    if (i>0)   {J = Ii - n; ierr = MatSetValues(A,1,&Ii,1,&J,&v,ADD_VALUES);CHKERRQ(ierr);}
-    if (i<m-1) {J = Ii + n; ierr = MatSetValues(A,1,&Ii,1,&J,&v,ADD_VALUES);CHKERRQ(ierr);}
-    if (j>0)   {J = Ii - 1; ierr = MatSetValues(A,1,&Ii,1,&J,&v,ADD_VALUES);CHKERRQ(ierr);}
-    if (j<n-1) {J = Ii + 1; ierr = MatSetValues(A,1,&Ii,1,&J,&v,ADD_VALUES);CHKERRQ(ierr);}
-    v = 4.0; ierr = MatSetValues(A,1,&Ii,1,&Ii,&v,ADD_VALUES);CHKERRQ(ierr);
-  }
-
-  /*
-     Assemble matrix, using the 2-step process:
-       MatAssemblyBegin(), MatAssemblyEnd()
-     Computations can be done while messages are in transition
-     by placing code between these two statements.
-  */
-  ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
-  ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
-  ierr = PetscLogStagePop();CHKERRQ(ierr);
-
-  /* A is symmetric. Set symmetric flag to enable ICC/Cholesky preconditioner */
-  ierr = MatSetOption(A,MAT_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
-
-  /*
-     Create parallel vectors.
-      - We form 1 vector from scratch and then duplicate as needed.
-      - When using VecCreate(), VecSetSizes and VecSetFromOptions()
-        in this example, we specify only the
-        vector's global dimension; the parallel partitioning is determined
-        at runtime.
-      - When solving a linear system, the vectors and matrices MUST
-        be partitioned accordingly.  PETSc automatically generates
-        appropriately partitioned matrices and vectors when MatCreate()
-        and VecCreate() are used with the same communicator.
-      - The user can alternatively specify the local vector and matrix
-        dimensions when more sophisticated partitioning is needed
-        (replacing the PETSC_DECIDE argument in the VecSetSizes() statement
-        below).
-  */
-  ierr = VecCreate(PETSC_COMM_WORLD,&u);CHKERRQ(ierr);
-  ierr = VecSetSizes(u,PETSC_DECIDE,m*n);CHKERRQ(ierr);
-  ierr = VecSetFromOptions(u);CHKERRQ(ierr);
-  ierr = VecDuplicate(u,&b);CHKERRQ(ierr);
-  ierr = VecDuplicate(b,&x);CHKERRQ(ierr);
-
-  /*
-     Set exact solution; then compute right-hand-side vector.
-     By default we use an exact solution of a vector with all
-     elements of 1.0;  Alternatively, using the runtime option
-     -random_sol forms a solution vector with random components.
-  */
-  ierr = PetscOptionsGetBool(NULL,"-random_exact_sol",&flg,NULL);CHKERRQ(ierr);
-  if (flg) {
-    ierr = PetscRandomCreate(PETSC_COMM_WORLD,&rctx);CHKERRQ(ierr);
-    ierr = PetscRandomSetFromOptions(rctx);CHKERRQ(ierr);
-    ierr = VecSetRandom(u,rctx);CHKERRQ(ierr);
-    ierr = PetscRandomDestroy(&rctx);CHKERRQ(ierr);
-  } else {
-    ierr = VecSet(u,1.0);CHKERRQ(ierr);
-  }
-  ierr = MatMult(A,u,b);CHKERRQ(ierr);
-
-  /*
-     View the exact solution vector if desired
-  */
-  flg  = PETSC_FALSE;
-  ierr = PetscOptionsGetBool(NULL,"-view_exact_sol",&flg,NULL);CHKERRQ(ierr);
-  if (flg) {ierr = VecView(u,PETSC_VIEWER_STDOUT_WORLD);CHKERRQ(ierr);}
-
-  /* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-                Create the linear solver and set various options
-     - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
-
-  /*
-     Create linear solver context
-  */
-  ierr = KSPCreate(PETSC_COMM_WORLD,&ksp);CHKERRQ(ierr);
-
-  /*
-     Set operators. Here the matrix that defines the linear system
-     also serves as the preconditioning matrix.
-  */
-  ierr = KSPSetOperators(ksp,A,A);CHKERRQ(ierr);
-
-  /*
-     Set linear solver defaults for this problem (optional).
-     - By extracting the KSP and PC contexts from the KSP context,
-       we can then directly call any KSP and PC routines to set
-       various options.
-     - The following two statements are optional; all of these
-       parameters could alternatively be specified at runtime via
-       KSPSetFromOptions().  All of these defaults can be
-       overridden at runtime, as indicated below.
-  */
-  ierr = KSPSetTolerances(ksp,1.e-2/((m+1)*(n+1)),1.e-50,PETSC_DEFAULT,
-                          PETSC_DEFAULT);CHKERRQ(ierr);
-
-  /*
-    Set runtime options, e.g.,
-        -ksp_type <type> -pc_type <type> -ksp_monitor -ksp_rtol <rtol>
-    These options will override those specified above as long as
-    KSPSetFromOptions() is called _after_ any other customization
-    routines.
-  */
-  ierr = KSPSetFromOptions(ksp);CHKERRQ(ierr);
-
-  /* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-                      Solve the linear system
-     - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
-
-  ierr = KSPSolve(ksp,b,x);CHKERRQ(ierr);
-
-  /* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-                      Check solution and clean up
-     - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
-
-  /*
-     Check the error
-  */
-  ierr = VecAXPY(x,-1.0,u);CHKERRQ(ierr);
-  ierr = VecNorm(x,NORM_2,&norm);CHKERRQ(ierr);
-  ierr = KSPGetIterationNumber(ksp,&its);CHKERRQ(ierr);
-
-  /*
-     Print convergence information.  PetscPrintf() produces a single
-     print statement from all processes that share a communicator.
-     An alternative is PetscFPrintf(), which prints to a file.
-  */
-  ierr = PetscPrintf(PETSC_COMM_WORLD,"Norm of error %g iterations %D\n",(double)norm,its);CHKERRQ(ierr);
-
-  /*
-     Free work space.  All PETSc objects should be destroyed when they
-     are no longer needed.
-  */
-  ierr = KSPDestroy(&ksp);CHKERRQ(ierr);
-  ierr = VecDestroy(&u);CHKERRQ(ierr);  ierr = VecDestroy(&x);CHKERRQ(ierr);
-  ierr = VecDestroy(&b);CHKERRQ(ierr);  ierr = MatDestroy(&A);CHKERRQ(ierr);
-
-  /*
-     Always call PetscFinalize() before exiting a program.  This routine
-       - finalizes the PETSc libraries as well as MPI
-       - provides summary and diagnostic information if certain runtime
-         options are chosen (e.g., -log_summary).
-  */
-  ierr = PetscFinalize();
-  return 0;
+    PetscInitialize(&argc, &args, (char*) 0, help);
+    ierr = PetscOptionsGetInt(NULL, "-m", &m, NULL);
+    CHKERRQ(ierr);
+    ierr = PetscOptionsGetInt(NULL, "-n", &n, NULL);
+    CHKERRQ(ierr);
+    /* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+           Compute the matrix and right-hand-side vector that define
+           the linear system, Ax = b.
+       - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
+    /*
+       Create parallel matrix, specifying only its global dimensions.
+       When using MatCreate(), the matrix format can be specified at
+       runtime. Also, the parallel partitioning of the matrix is
+       determined by PETSc at runtime.
+
+       Performance tuning note:  For problems of substantial size,
+       preallocation of matrix memory is crucial for attaining good
+       performance. See the matrix chapter of the users manual for details.
+     */
+    ierr = MatCreate(PETSC_COMM_WORLD, &A);
+    CHKERRQ(ierr);
+    ierr = MatSetSizes(A, PETSC_DECIDE, PETSC_DECIDE, m*n, m * n);
+    CHKERRQ(ierr);
+    ierr = MatSetFromOptions(A);
+    CHKERRQ(ierr);
+    ierr = MatMPIAIJSetPreallocation(A, 5, NULL, 5, NULL);
+    CHKERRQ(ierr);
+    ierr = MatSeqAIJSetPreallocation(A, 5, NULL);
+    CHKERRQ(ierr);
+
+    /*
+       Currently, all PETSc parallel matrix formats are partitioned by
+       contiguous chunks of rows across the processors.  Determine which
+       rows of the matrix are locally owned.
+     */
+    ierr = MatGetOwnershipRange(A, &Istart, &Iend);
+    CHKERRQ(ierr);
+
+    /*
+       Set matrix elements for the 2-D, five-point stencil in parallel.
+        - Each processor needs to insert only elements that it owns
+          locally (but any non-local elements will be sent to the
+          appropriate processor during matrix assembly).
+        - Always specify global rows and columns of matrix entries.
+
+       Note: this uses the less common natural ordering that orders first
+       all the unknowns for x = h then for x = 2h etc; Hence you see J = Ii +- n
+       instead of J = I +- m as you might expect. The more standard ordering
+       would first do all variables for y = h, then y = 2h etc.
+
+     */
+    ierr = PetscLogStageRegister("Assembly", &stage);
+    CHKERRQ(ierr);
+    ierr = PetscLogStagePush(stage);
+    CHKERRQ(ierr);
+    for (Ii = Istart; Ii < Iend; Ii++) {
+        v = -1.0;
+        i = Ii / n;
+        j = Ii - i*n;
+        if (i > 0) {
+            J = Ii - n;
+            ierr = MatSetValues(A, 1, &Ii, 1, &J, &v, ADD_VALUES);
+            CHKERRQ(ierr);
+        }
+        if (i < m - 1) {
+            J = Ii + n;
+            ierr = MatSetValues(A, 1, &Ii, 1, &J, &v, ADD_VALUES);
+            CHKERRQ(ierr);
+        }
+        if (j > 0) {
+            J = Ii - 1;
+            ierr = MatSetValues(A, 1, &Ii, 1, &J, &v, ADD_VALUES);
+            CHKERRQ(ierr);
+        }
+        if (j < n - 1) {
+            J = Ii + 1;
+            ierr = MatSetValues(A, 1, &Ii, 1, &J, &v, ADD_VALUES);
+            CHKERRQ(ierr);
+        }
+        v = 4.0;
+        ierr = MatSetValues(A, 1, &Ii, 1, &Ii, &v, ADD_VALUES);
+        CHKERRQ(ierr);
+    }
+
+    /*
+       Assemble matrix, using the 2-step process:
+         MatAssemblyBegin(), MatAssemblyEnd()
+       Computations can be done while messages are in transition
+       by placing code between these two statements.
+     */
+    ierr = MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY);
+    CHKERRQ(ierr);
+    ierr = MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY);
+    CHKERRQ(ierr);
+    ierr = PetscLogStagePop();
+    CHKERRQ(ierr);
+
+    /* A is symmetric. Set symmetric flag to enable ICC/Cholesky preconditioner */
+    ierr = MatSetOption(A, MAT_SYMMETRIC, PETSC_TRUE);
+    CHKERRQ(ierr);
+
+    /*
+       Create parallel vectors.
+        - We form 1 vector from scratch and then duplicate as needed.
+        - When using VecCreate(), VecSetSizes and VecSetFromOptions()
+          in this example, we specify only the
+          vector's global dimension; the parallel partitioning is determined
+          at runtime.
+        - When solving a linear system, the vectors and matrices MUST
+          be partitioned accordingly.  PETSc automatically generates
+          appropriately partitioned matrices and vectors when MatCreate()
+          and VecCreate() are used with the same communicator.
+        - The user can alternatively specify the local vector and matrix
+          dimensions when more sophisticated partitioning is needed
+          (replacing the PETSC_DECIDE argument in the VecSetSizes() statement
+          below).
+     */
+    ierr = VecCreate(PETSC_COMM_WORLD, &u);
+    CHKERRQ(ierr);
+    ierr = VecSetSizes(u, PETSC_DECIDE, m * n);
+    CHKERRQ(ierr);
+    ierr = VecSetFromOptions(u);
+    CHKERRQ(ierr);
+    ierr = VecDuplicate(u, &b);
+    CHKERRQ(ierr);
+    ierr = VecDuplicate(b, &x);
+    CHKERRQ(ierr);
+
+    /*
+       Set exact solution; then compute right-hand-side vector.
+       By default we use an exact solution of a vector with all
+       elements of 1.0;  Alternatively, using the runtime option
+       -random_sol forms a solution vector with random components.
+     */
+    ierr = PetscOptionsGetBool(NULL, "-random_exact_sol", &flg, NULL);
+    CHKERRQ(ierr);
+    if (flg) {
+        ierr = PetscRandomCreate(PETSC_COMM_WORLD, &rctx);
+        CHKERRQ(ierr);
+        ierr = PetscRandomSetFromOptions(rctx);
+        CHKERRQ(ierr);
+        ierr = VecSetRandom(u, rctx);
+        CHKERRQ(ierr);
+        ierr = PetscRandomDestroy(&rctx);
+        CHKERRQ(ierr);
+    } else {
+        ierr = VecSet(u, 1.0);
+        CHKERRQ(ierr);
+    }
+    ierr = MatMult(A, u, b);
+    CHKERRQ(ierr);
+
+    /*
+       View the exact solution vector if desired
+     */
+    flg = PETSC_FALSE;
+    ierr = PetscOptionsGetBool(NULL, "-view_exact_sol", &flg, NULL);
+    CHKERRQ(ierr);
+    if (flg) {
+        ierr = VecView(u, PETSC_VIEWER_STDOUT_WORLD);
+        CHKERRQ(ierr);
+    }
+
+    /* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+                  Create the linear solver and set various options
+       - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
+
+    /*
+       Create linear solver context
+     */
+    ierr = KSPCreate(PETSC_COMM_WORLD, &ksp);
+    CHKERRQ(ierr);
+
+    /*
+       Set operators. Here the matrix that defines the linear system
+       also serves as the preconditioning matrix.
+     */
+    ierr = KSPSetOperators(ksp, A, A);
+    CHKERRQ(ierr);
+
+    /*
+       Set linear solver defaults for this problem (optional).
+       - By extracting the KSP and PC contexts from the KSP context,
+         we can then directly call any KSP and PC routines to set
+         various options.
+       - The following two statements are optional; all of these
+         parameters could alternatively be specified at runtime via
+         KSPSetFromOptions().  All of these defaults can be
+         overridden at runtime, as indicated below.
+     */
+    ierr = KSPSetTolerances(ksp, 1.e-2 / ((m + 1)*(n + 1)), 1.e-50, PETSC_DEFAULT,
+            PETSC_DEFAULT);
+    CHKERRQ(ierr);
+
+    /*
+      Set runtime options, e.g.,
+          -ksp_type <type> -pc_type <type> -ksp_monitor -ksp_rtol <rtol>
+      These options will override those specified above as long as
+      KSPSetFromOptions() is called _after_ any other customization
+      routines.
+     */
+    ierr = KSPSetFromOptions(ksp);
+    CHKERRQ(ierr);
+
+    /* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+                        Solve the linear system
+       - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
+
+    ierr = KSPSolve(ksp, b, x);
+    CHKERRQ(ierr);
+
+    /* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+                        Check solution and clean up
+       - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
+
+    /*
+       Check the error
+     */
+    ierr = VecAXPY(x, -1.0, u);
+    CHKERRQ(ierr);
+    ierr = VecNorm(x, NORM_2, &norm);
+    CHKERRQ(ierr);
+    ierr = KSPGetIterationNumber(ksp, &its);
+    CHKERRQ(ierr);
+
+    /*
+       Print convergence information.  PetscPrintf() produces a single
+       print statement from all processes that share a communicator.
+       An alternative is PetscFPrintf(), which prints to a file.
+     */
+    ierr = PetscPrintf(PETSC_COMM_WORLD, "Norm of error %g iterations %D\n", (double) norm, its);
+    CHKERRQ(ierr);
+
+    /*
+       Free work space.  All PETSc objects should be destroyed when they
+       are no longer needed.
+     */
+    ierr = KSPDestroy(&ksp);
+    CHKERRQ(ierr);
+    ierr = VecDestroy(&u);
+    CHKERRQ(ierr);
+    ierr = VecDestroy(&x);
+    CHKERRQ(ierr);
+    ierr = VecDestroy(&b);
+    CHKERRQ(ierr);
+    ierr = MatDestroy(&A);
+    CHKERRQ(ierr);
+
+    /*
+       Always call PetscFinalize() before exiting a program.  This routine
+         - finalizes the PETSc libraries as well as MPI
+         - provides summary and diagnostic information if certain runtime
+           options are chosen (e.g., -log_summary).
+     */
+    ierr = PetscFinalize();
+    return 0;
 }
diff --git a/krylov/ksp_solver_two_sys.c b/krylov/ksp_solver_two_sys.c
index de5edd688d7b45c68fd03b52c2d954544aab7b11..1feb7bec31fbcbfe2b9d1011bd868c0ef5ecc913 100644
--- a/krylov/ksp_solver_two_sys.c
+++ b/krylov/ksp_solver_two_sys.c
@@ -7,9 +7,9 @@ All rights reserved.
 Redistribution and use in source and binary forms, with or without modification,
 are permitted provided that the following conditions are met:
 
-* Redistributions of source code must retain the above copyright notice, this
+ * Redistributions of source code must retain the above copyright notice, this
   list of conditions and the following disclaimer.
-* Redistributions in binary form must reproduce the above copyright notice, this
+ * Redistributions in binary form must reproduce the above copyright notice, this
   list of conditions and the following disclaimer in the documentation and/or
   other materials provided with the distribution.
 
@@ -23,7 +23,7 @@ are permitted provided that the following conditions are met:
   ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*/
+ */
 
 static char help[] = "Solves two linear systems in parallel with KSP.  The code\n\
 illustrates repeated solution of linear systems with the same preconditioner\n\
@@ -45,291 +45,399 @@ T*/
      petscmat.h - matrices
      petscis.h     - index sets            petscksp.h - Krylov subspace methods
      petscviewer.h - viewers               petscpc.h  - preconditioners
-*/
+ */
 #include <petscksp.h>
 
 #undef __FUNCT__
 #define __FUNCT__ "main"
-int main(int argc,char **args)
-{
-  KSP            ksp;              /* linear solver context */
-  Mat            C;                /* matrix */
-  Vec            x,u,b;            /* approx solution, RHS, exact solution */
-  PetscReal      norm;             /* norm of solution error */
-  PetscScalar    v,none = -1.0;
-  PetscInt       Ii,J,ldim,low,high,iglobal,Istart,Iend;
-  PetscErrorCode ierr;
-  PetscInt       i,j,m = 3,n = 2,its;
-  PetscMPIInt    size,rank;
-  PetscBool      mat_nonsymmetric = PETSC_FALSE;
-  PetscBool      testnewC         = PETSC_FALSE;
+
+int main(int argc, char **args) {
+    KSP ksp; /* linear solver context */
+    Mat C; /* matrix */
+    Vec x, u, b; /* approx solution, RHS, exact solution */
+    PetscReal norm; /* norm of solution error */
+    PetscScalar v, none = -1.0;
+    PetscInt Ii, J, ldim, low, high, iglobal, Istart, Iend;
+    PetscErrorCode ierr;
+    PetscInt i, j, m = 3, n = 2, its;
+    PetscMPIInt size, rank;
+    PetscBool mat_nonsymmetric = PETSC_FALSE;
+    PetscBool testnewC = PETSC_FALSE;
 #if defined(PETSC_USE_LOG)
-  PetscLogStage stages[2];
+    PetscLogStage stages[2];
 #endif
 
-  PetscInitialize(&argc,&args,(char*)0,help);
-  ierr = PetscOptionsGetInt(NULL,"-m",&m,NULL);CHKERRQ(ierr);
-  ierr = MPI_Comm_rank(PETSC_COMM_WORLD,&rank);CHKERRQ(ierr);
-  ierr = MPI_Comm_size(PETSC_COMM_WORLD,&size);CHKERRQ(ierr);
-  n    = 2*size;
-
-  /*
-     Set flag if we are doing a nonsymmetric problem; the default is symmetric.
-  */
-  ierr = PetscOptionsGetBool(NULL,"-mat_nonsym",&mat_nonsymmetric,NULL);CHKERRQ(ierr);
-
-  /*
-     Register two stages for separate profiling of the two linear solves.
-     Use the runtime option -log_summary for a printout of performance
-     statistics at the program's conlusion.
-  */
-  ierr = PetscLogStageRegister("Original Solve",&stages[0]);CHKERRQ(ierr);
-  ierr = PetscLogStageRegister("Second Solve",&stages[1]);CHKERRQ(ierr);
-
-  /* -------------- Stage 0: Solve Original System ---------------------- */
-  /*
-     Indicate to PETSc profiling that we're beginning the first stage
-  */
-  ierr = PetscLogStagePush(stages[0]);CHKERRQ(ierr);
-
-  /*
-     Create parallel matrix, specifying only its global dimensions.
-     When using MatCreate(), the matrix format can be specified at
-     runtime. Also, the parallel partitioning of the matrix is
-     determined by PETSc at runtime.
-  */
-  ierr = MatCreate(PETSC_COMM_WORLD,&C);CHKERRQ(ierr);
-  ierr = MatSetSizes(C,PETSC_DECIDE,PETSC_DECIDE,m*n,m*n);CHKERRQ(ierr);
-  ierr = MatSetFromOptions(C);CHKERRQ(ierr);
-  ierr = MatSetUp(C);CHKERRQ(ierr);
-
-  /*
-     Currently, all PETSc parallel matrix formats are partitioned by
-     contiguous chunks of rows across the processors.  Determine which
-     rows of the matrix are locally owned.
-  */
-  ierr = MatGetOwnershipRange(C,&Istart,&Iend);CHKERRQ(ierr);
-
-  /*
-     Set matrix entries matrix in parallel.
-      - Each processor needs to insert only elements that it owns
-        locally (but any non-local elements will be sent to the
-        appropriate processor during matrix assembly).
-      - Always specify global row and columns of matrix entries.
-  */
-  for (Ii=Istart; Ii<Iend; Ii++) {
-    v = -1.0; i = Ii/n; j = Ii - i*n;
-    if (i>0)   {J = Ii - n; ierr = MatSetValues(C,1,&Ii,1,&J,&v,ADD_VALUES);CHKERRQ(ierr);}
-    if (i<m-1) {J = Ii + n; ierr = MatSetValues(C,1,&Ii,1,&J,&v,ADD_VALUES);CHKERRQ(ierr);}
-    if (j>0)   {J = Ii - 1; ierr = MatSetValues(C,1,&Ii,1,&J,&v,ADD_VALUES);CHKERRQ(ierr);}
-    if (j<n-1) {J = Ii + 1; ierr = MatSetValues(C,1,&Ii,1,&J,&v,ADD_VALUES);CHKERRQ(ierr);}
-    v = 4.0; ierr = MatSetValues(C,1,&Ii,1,&Ii,&v,ADD_VALUES);
-  }
-
-  /*
-     Make the matrix nonsymmetric if desired
-  */
-  if (mat_nonsymmetric) {
-    for (Ii=Istart; Ii<Iend; Ii++) {
-      v = -1.5; i = Ii/n;
-      if (i>1)   {J = Ii-n-1; ierr = MatSetValues(C,1,&Ii,1,&J,&v,ADD_VALUES);CHKERRQ(ierr);}
+    PetscInitialize(&argc, &args, (char*) 0, help);
+    ierr = PetscOptionsGetInt(NULL, "-m", &m, NULL);
+    CHKERRQ(ierr);
+    ierr = MPI_Comm_rank(PETSC_COMM_WORLD, &rank);
+    CHKERRQ(ierr);
+    ierr = MPI_Comm_size(PETSC_COMM_WORLD, &size);
+    CHKERRQ(ierr);
+    n = 2 * size;
+
+    /*
+       Set flag if we are doing a nonsymmetric problem; the default is symmetric.
+     */
+    ierr = PetscOptionsGetBool(NULL, "-mat_nonsym", &mat_nonsymmetric, NULL);
+    CHKERRQ(ierr);
+
+    /*
+       Register two stages for separate profiling of the two linear solves.
+       Use the runtime option -log_summary for a printout of performance
+       statistics at the program's conlusion.
+     */
+    ierr = PetscLogStageRegister("Original Solve", &stages[0]);
+    CHKERRQ(ierr);
+    ierr = PetscLogStageRegister("Second Solve", &stages[1]);
+    CHKERRQ(ierr);
+
+    /* -------------- Stage 0: Solve Original System ---------------------- */
+    /*
+       Indicate to PETSc profiling that we're beginning the first stage
+     */
+    ierr = PetscLogStagePush(stages[0]);
+    CHKERRQ(ierr);
+
+    /*
+       Create parallel matrix, specifying only its global dimensions.
+       When using MatCreate(), the matrix format can be specified at
+       runtime. Also, the parallel partitioning of the matrix is
+       determined by PETSc at runtime.
+     */
+    ierr = MatCreate(PETSC_COMM_WORLD, &C);
+    CHKERRQ(ierr);
+    ierr = MatSetSizes(C, PETSC_DECIDE, PETSC_DECIDE, m*n, m * n);
+    CHKERRQ(ierr);
+    ierr = MatSetFromOptions(C);
+    CHKERRQ(ierr);
+    ierr = MatSetUp(C);
+    CHKERRQ(ierr);
+
+    /*
+       Currently, all PETSc parallel matrix formats are partitioned by
+       contiguous chunks of rows across the processors.  Determine which
+       rows of the matrix are locally owned.
+     */
+    ierr = MatGetOwnershipRange(C, &Istart, &Iend);
+    CHKERRQ(ierr);
+
+    /*
+       Set matrix entries matrix in parallel.
+        - Each processor needs to insert only elements that it owns
+          locally (but any non-local elements will be sent to the
+          appropriate processor during matrix assembly).
+        - Always specify global row and columns of matrix entries.
+     */
+    for (Ii = Istart; Ii < Iend; Ii++) {
+        v = -1.0;
+        i = Ii / n;
+        j = Ii - i*n;
+        if (i > 0) {
+            J = Ii - n;
+            ierr = MatSetValues(C, 1, &Ii, 1, &J, &v, ADD_VALUES);
+            CHKERRQ(ierr);
+        }
+        if (i < m - 1) {
+            J = Ii + n;
+            ierr = MatSetValues(C, 1, &Ii, 1, &J, &v, ADD_VALUES);
+            CHKERRQ(ierr);
+        }
+        if (j > 0) {
+            J = Ii - 1;
+            ierr = MatSetValues(C, 1, &Ii, 1, &J, &v, ADD_VALUES);
+            CHKERRQ(ierr);
+        }
+        if (j < n - 1) {
+            J = Ii + 1;
+            ierr = MatSetValues(C, 1, &Ii, 1, &J, &v, ADD_VALUES);
+            CHKERRQ(ierr);
+        }
+        v = 4.0;
+        ierr = MatSetValues(C, 1, &Ii, 1, &Ii, &v, ADD_VALUES);
     }
-  } else {
-    ierr = MatSetOption(C,MAT_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
-    ierr = MatSetOption(C,MAT_SYMMETRY_ETERNAL,PETSC_TRUE);CHKERRQ(ierr);
-  }
-
-  /*
-     Assemble matrix, using the 2-step process:
-       MatAssemblyBegin(), MatAssemblyEnd()
-     Computations can be done while messages are in transition
-     by placing code between these two statements.
-  */
-  ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
-  ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
-
-  /*
-     Create parallel vectors.
-      - When using VecSetSizes(), we specify only the vector's global
-        dimension; the parallel partitioning is determined at runtime.
-      - Note: We form 1 vector from scratch and then duplicate as needed.
-  */
-  ierr = VecCreate(PETSC_COMM_WORLD,&u);CHKERRQ(ierr);
-  ierr = VecSetSizes(u,PETSC_DECIDE,m*n);CHKERRQ(ierr);
-  ierr = VecSetFromOptions(u);CHKERRQ(ierr);
-  ierr = VecDuplicate(u,&b);CHKERRQ(ierr);
-  ierr = VecDuplicate(b,&x);CHKERRQ(ierr);
-
-  /*
-     Currently, all parallel PETSc vectors are partitioned by
-     contiguous chunks across the processors.  Determine which
-     range of entries are locally owned.
-  */
-  ierr = VecGetOwnershipRange(x,&low,&high);CHKERRQ(ierr);
-
-  /*
-    Set elements within the exact solution vector in parallel.
-     - Each processor needs to insert only elements that it owns
-       locally (but any non-local entries will be sent to the
-       appropriate processor during vector assembly).
-     - Always specify global locations of vector entries.
-  */
-  ierr = VecGetLocalSize(x,&ldim);CHKERRQ(ierr);
-  for (i=0; i<ldim; i++) {
-    iglobal = i + low;
-    v       = (PetscScalar)(i + 100*rank);
-    ierr    = VecSetValues(u,1,&iglobal,&v,INSERT_VALUES);CHKERRQ(ierr);
-  }
-
-  /*
-     Assemble vector, using the 2-step process:
-       VecAssemblyBegin(), VecAssemblyEnd()
-     Computations can be done while messages are in transition,
-     by placing code between these two statements.
-  */
-  ierr = VecAssemblyBegin(u);CHKERRQ(ierr);
-  ierr = VecAssemblyEnd(u);CHKERRQ(ierr);
-
-  /*
-     Compute right-hand-side vector
-  */
-  ierr = MatMult(C,u,b);CHKERRQ(ierr);
-
-  /*
-    Create linear solver context
-  */
-  ierr = KSPCreate(PETSC_COMM_WORLD,&ksp);CHKERRQ(ierr);
-
-  /*
-     Set operators. Here the matrix that defines the linear system
-     also serves as the preconditioning matrix.
-  */
-  ierr = KSPSetOperators(ksp,C,C);CHKERRQ(ierr);
-
-  /*
-     Set runtime options (e.g., -ksp_type <type> -pc_type <type>)
-  */
-  ierr = KSPSetFromOptions(ksp);CHKERRQ(ierr);
-
-  /*
-     Solve linear system.  Here we explicitly call KSPSetUp() for more
-     detailed performance monitoring of certain preconditioners, such
-     as ICC and ILU.  This call is optional, as KSPSetUp() will
-     automatically be called within KSPSolve() if it hasn't been
-     called already.
-  */
-  ierr = KSPSetUp(ksp);CHKERRQ(ierr);
-  ierr = KSPSolve(ksp,b,x);CHKERRQ(ierr);
-
-  /*
-     Check the error
-  */
-  ierr = VecAXPY(x,none,u);CHKERRQ(ierr);
-  ierr = VecNorm(x,NORM_2,&norm);CHKERRQ(ierr);
-  ierr = KSPGetIterationNumber(ksp,&its);CHKERRQ(ierr);
-  if (norm > 1.e-13) {
-    ierr = PetscPrintf(PETSC_COMM_WORLD,"Norm of error %g, Iterations %D\n",(double)norm,its);CHKERRQ(ierr);
-  }
-
-  /* -------------- Stage 1: Solve Second System ---------------------- */
-  /*
-     Solve another linear system with the same method.  We reuse the KSP
-     context, matrix and vector data structures, and hence save the
-     overhead of creating new ones.
-
-     Indicate to PETSc profiling that we're concluding the first
-     stage with PetscLogStagePop(), and beginning the second stage with
-     PetscLogStagePush().
-  */
-  ierr = PetscLogStagePop();CHKERRQ(ierr);
-  ierr = PetscLogStagePush(stages[1]);CHKERRQ(ierr);
-
-  /*
-     Initialize all matrix entries to zero.  MatZeroEntries() retains the
-     nonzero structure of the matrix for sparse formats.
-  */
-  ierr = MatZeroEntries(C);CHKERRQ(ierr);
-
-  /*
-     Assemble matrix again.  Note that we retain the same matrix data
-     structure and the same nonzero pattern; we just change the values
-     of the matrix entries.
-  */
-  for (i=0; i<m; i++) {
-    for (j=2*rank; j<2*rank+2; j++) {
-      v = -1.0;  Ii = j + n*i;
-      if (i>0)   {J = Ii - n; ierr = MatSetValues(C,1,&Ii,1,&J,&v,ADD_VALUES);CHKERRQ(ierr);}
-      if (i<m-1) {J = Ii + n; ierr = MatSetValues(C,1,&Ii,1,&J,&v,ADD_VALUES);CHKERRQ(ierr);}
-      if (j>0)   {J = Ii - 1; ierr = MatSetValues(C,1,&Ii,1,&J,&v,ADD_VALUES);CHKERRQ(ierr);}
-      if (j<n-1) {J = Ii + 1; ierr = MatSetValues(C,1,&Ii,1,&J,&v,ADD_VALUES);CHKERRQ(ierr);}
-      v = 6.0; ierr = MatSetValues(C,1,&Ii,1,&Ii,&v,ADD_VALUES);CHKERRQ(ierr);
+
+    /*
+       Make the matrix nonsymmetric if desired
+     */
+    if (mat_nonsymmetric) {
+        for (Ii = Istart; Ii < Iend; Ii++) {
+            v = -1.5;
+            i = Ii / n;
+            if (i > 1) {
+                J = Ii - n - 1;
+                ierr = MatSetValues(C, 1, &Ii, 1, &J, &v, ADD_VALUES);
+                CHKERRQ(ierr);
+            }
+        }
+    } else {
+        ierr = MatSetOption(C, MAT_SYMMETRIC, PETSC_TRUE);
+        CHKERRQ(ierr);
+        ierr = MatSetOption(C, MAT_SYMMETRY_ETERNAL, PETSC_TRUE);
+        CHKERRQ(ierr);
     }
-  }
-  if (mat_nonsymmetric) {
-    for (Ii=Istart; Ii<Iend; Ii++) {
-      v = -1.5; i = Ii/n;
-      if (i>1)   {J = Ii-n-1; ierr = MatSetValues(C,1,&Ii,1,&J,&v,ADD_VALUES);CHKERRQ(ierr);}
+
+    /*
+       Assemble matrix, using the 2-step process:
+         MatAssemblyBegin(), MatAssemblyEnd()
+       Computations can be done while messages are in transition
+       by placing code between these two statements.
+     */
+    ierr = MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY);
+    CHKERRQ(ierr);
+    ierr = MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY);
+    CHKERRQ(ierr);
+
+    /*
+       Create parallel vectors.
+        - When using VecSetSizes(), we specify only the vector's global
+          dimension; the parallel partitioning is determined at runtime.
+        - Note: We form 1 vector from scratch and then duplicate as needed.
+     */
+    ierr = VecCreate(PETSC_COMM_WORLD, &u);
+    CHKERRQ(ierr);
+    ierr = VecSetSizes(u, PETSC_DECIDE, m * n);
+    CHKERRQ(ierr);
+    ierr = VecSetFromOptions(u);
+    CHKERRQ(ierr);
+    ierr = VecDuplicate(u, &b);
+    CHKERRQ(ierr);
+    ierr = VecDuplicate(b, &x);
+    CHKERRQ(ierr);
+
+    /*
+       Currently, all parallel PETSc vectors are partitioned by
+       contiguous chunks across the processors.  Determine which
+       range of entries are locally owned.
+     */
+    ierr = VecGetOwnershipRange(x, &low, &high);
+    CHKERRQ(ierr);
+
+    /*
+      Set elements within the exact solution vector in parallel.
+       - Each processor needs to insert only elements that it owns
+         locally (but any non-local entries will be sent to the
+         appropriate processor during vector assembly).
+       - Always specify global locations of vector entries.
+     */
+    ierr = VecGetLocalSize(x, &ldim);
+    CHKERRQ(ierr);
+    for (i = 0; i < ldim; i++) {
+        iglobal = i + low;
+        v = (PetscScalar) (i + 100 * rank);
+        ierr = VecSetValues(u, 1, &iglobal, &v, INSERT_VALUES);
+        CHKERRQ(ierr);
     }
-  }
-  ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
-  ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
 
-  ierr = PetscOptionsGetBool(NULL,"-test_newMat",&testnewC,NULL);CHKERRQ(ierr);
-  if (testnewC) {
     /*
-     User may use a new matrix C with same nonzero pattern, e.g.
-      ./ex5 -ksp_monitor -mat_type sbaij -pc_type cholesky -pc_factor_mat_solver_package mumps -test_newMat
-    */
-    Mat Ctmp;
-    ierr = MatDuplicate(C,MAT_COPY_VALUES,&Ctmp);CHKERRQ(ierr);
-    ierr = MatDestroy(&C);CHKERRQ(ierr);
-    ierr = MatDuplicate(Ctmp,MAT_COPY_VALUES,&C);CHKERRQ(ierr);
-    ierr = MatDestroy(&Ctmp);CHKERRQ(ierr);
-  }
-  /*
-     Compute another right-hand-side vector
-  */
-  ierr = MatMult(C,u,b);CHKERRQ(ierr);
-
-  /*
-     Set operators. Here the matrix that defines the linear system
-     also serves as the preconditioning matrix.
-  */
-  ierr = KSPSetOperators(ksp,C,C);CHKERRQ(ierr);
-
-  /*
-     Solve linear system
-  */
-  ierr = KSPSetUp(ksp);CHKERRQ(ierr);
-  ierr = KSPSolve(ksp,b,x);CHKERRQ(ierr);
-
-  /*
-     Check the error
-  */
-  ierr = VecAXPY(x,none,u);CHKERRQ(ierr);
-  ierr = VecNorm(x,NORM_2,&norm);CHKERRQ(ierr);
-  ierr = KSPGetIterationNumber(ksp,&its);CHKERRQ(ierr);
-  if (norm > 1.e-4) {
-    ierr = PetscPrintf(PETSC_COMM_WORLD,"Norm of error %g, Iterations %D\n",(double)norm,its);CHKERRQ(ierr);
-  }
-
-  /*
-     Free work space.  All PETSc objects should be destroyed when they
-     are no longer needed.
-  */
-  ierr = KSPDestroy(&ksp);CHKERRQ(ierr);
-  ierr = VecDestroy(&u);CHKERRQ(ierr);
-  ierr = VecDestroy(&x);CHKERRQ(ierr);
-  ierr = VecDestroy(&b);CHKERRQ(ierr);
-  ierr = MatDestroy(&C);CHKERRQ(ierr);
-
-  /*
-     Indicate to PETSc profiling that we're concluding the second stage
-  */
-  ierr = PetscLogStagePop();CHKERRQ(ierr);
-
-  ierr = PetscFinalize();
-  return 0;
+       Assemble vector, using the 2-step process:
+         VecAssemblyBegin(), VecAssemblyEnd()
+       Computations can be done while messages are in transition,
+       by placing code between these two statements.
+     */
+    ierr = VecAssemblyBegin(u);
+    CHKERRQ(ierr);
+    ierr = VecAssemblyEnd(u);
+    CHKERRQ(ierr);
+
+    /*
+       Compute right-hand-side vector
+     */
+    ierr = MatMult(C, u, b);
+    CHKERRQ(ierr);
+
+    /*
+      Create linear solver context
+     */
+    ierr = KSPCreate(PETSC_COMM_WORLD, &ksp);
+    CHKERRQ(ierr);
+
+    /*
+       Set operators. Here the matrix that defines the linear system
+       also serves as the preconditioning matrix.
+     */
+    ierr = KSPSetOperators(ksp, C, C);
+    CHKERRQ(ierr);
+
+    /*
+       Set runtime options (e.g., -ksp_type <type> -pc_type <type>)
+     */
+    ierr = KSPSetFromOptions(ksp);
+    CHKERRQ(ierr);
+
+    /*
+       Solve linear system.  Here we explicitly call KSPSetUp() for more
+       detailed performance monitoring of certain preconditioners, such
+       as ICC and ILU.  This call is optional, as KSPSetUp() will
+       automatically be called within KSPSolve() if it hasn't been
+       called already.
+     */
+    ierr = KSPSetUp(ksp);
+    CHKERRQ(ierr);
+    ierr = KSPSolve(ksp, b, x);
+    CHKERRQ(ierr);
+
+    /*
+       Check the error
+     */
+    ierr = VecAXPY(x, none, u);
+    CHKERRQ(ierr);
+    ierr = VecNorm(x, NORM_2, &norm);
+    CHKERRQ(ierr);
+    ierr = KSPGetIterationNumber(ksp, &its);
+    CHKERRQ(ierr);
+    if (norm > 1.e-13) {
+        ierr = PetscPrintf(PETSC_COMM_WORLD, "Norm of error %g, Iterations %D\n", (double) norm, its);
+        CHKERRQ(ierr);
+    }
+
+    /* -------------- Stage 1: Solve Second System ---------------------- */
+    /*
+       Solve another linear system with the same method.  We reuse the KSP
+       context, matrix and vector data structures, and hence save the
+       overhead of creating new ones.
+
+       Indicate to PETSc profiling that we're concluding the first
+       stage with PetscLogStagePop(), and beginning the second stage with
+       PetscLogStagePush().
+     */
+    ierr = PetscLogStagePop();
+    CHKERRQ(ierr);
+    ierr = PetscLogStagePush(stages[1]);
+    CHKERRQ(ierr);
+
+    /*
+       Initialize all matrix entries to zero.  MatZeroEntries() retains the
+       nonzero structure of the matrix for sparse formats.
+     */
+    ierr = MatZeroEntries(C);
+    CHKERRQ(ierr);
+
+    /*
+       Assemble matrix again.  Note that we retain the same matrix data
+       structure and the same nonzero pattern; we just change the values
+       of the matrix entries.
+     */
+    for (i = 0; i < m; i++) {
+        for (j = 2 * rank; j < 2 * rank + 2; j++) {
+            v = -1.0;
+            Ii = j + n*i;
+            if (i > 0) {
+                J = Ii - n;
+                ierr = MatSetValues(C, 1, &Ii, 1, &J, &v, ADD_VALUES);
+                CHKERRQ(ierr);
+            }
+            if (i < m - 1) {
+                J = Ii + n;
+                ierr = MatSetValues(C, 1, &Ii, 1, &J, &v, ADD_VALUES);
+                CHKERRQ(ierr);
+            }
+            if (j > 0) {
+                J = Ii - 1;
+                ierr = MatSetValues(C, 1, &Ii, 1, &J, &v, ADD_VALUES);
+                CHKERRQ(ierr);
+            }
+            if (j < n - 1) {
+                J = Ii + 1;
+                ierr = MatSetValues(C, 1, &Ii, 1, &J, &v, ADD_VALUES);
+                CHKERRQ(ierr);
+            }
+            v = 6.0;
+            ierr = MatSetValues(C, 1, &Ii, 1, &Ii, &v, ADD_VALUES);
+            CHKERRQ(ierr);
+        }
+    }
+    if (mat_nonsymmetric) {
+        for (Ii = Istart; Ii < Iend; Ii++) {
+            v = -1.5;
+            i = Ii / n;
+            if (i > 1) {
+                J = Ii - n - 1;
+                ierr = MatSetValues(C, 1, &Ii, 1, &J, &v, ADD_VALUES);
+                CHKERRQ(ierr);
+            }
+        }
+    }
+    ierr = MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY);
+    CHKERRQ(ierr);
+    ierr = MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY);
+    CHKERRQ(ierr);
+
+    ierr = PetscOptionsGetBool(NULL, "-test_newMat", &testnewC, NULL);
+    CHKERRQ(ierr);
+    if (testnewC) {
+        /*
+         User may use a new matrix C with same nonzero pattern, e.g.
+          ./ex5 -ksp_monitor -mat_type sbaij -pc_type cholesky -pc_factor_mat_solver_package mumps -test_newMat
+         */
+        Mat Ctmp;
+        ierr = MatDuplicate(C, MAT_COPY_VALUES, &Ctmp);
+        CHKERRQ(ierr);
+        ierr = MatDestroy(&C);
+        CHKERRQ(ierr);
+        ierr = MatDuplicate(Ctmp, MAT_COPY_VALUES, &C);
+        CHKERRQ(ierr);
+        ierr = MatDestroy(&Ctmp);
+        CHKERRQ(ierr);
+    }
+    /*
+       Compute another right-hand-side vector
+     */
+    ierr = MatMult(C, u, b);
+    CHKERRQ(ierr);
+
+    /*
+       Set operators. Here the matrix that defines the linear system
+       also serves as the preconditioning matrix.
+     */
+    ierr = KSPSetOperators(ksp, C, C);
+    CHKERRQ(ierr);
+
+    /*
+       Solve linear system
+     */
+    ierr = KSPSetUp(ksp);
+    CHKERRQ(ierr);
+    ierr = KSPSolve(ksp, b, x);
+    CHKERRQ(ierr);
+
+    /*
+       Check the error
+     */
+    ierr = VecAXPY(x, none, u);
+    CHKERRQ(ierr);
+    ierr = VecNorm(x, NORM_2, &norm);
+    CHKERRQ(ierr);
+    ierr = KSPGetIterationNumber(ksp, &its);
+    CHKERRQ(ierr);
+    if (norm > 1.e-4) {
+        ierr = PetscPrintf(PETSC_COMM_WORLD, "Norm of error %g, Iterations %D\n", (double) norm, its);
+        CHKERRQ(ierr);
+    }
+
+    /*
+       Free work space.  All PETSc objects should be destroyed when they
+       are no longer needed.
+     */
+    ierr = KSPDestroy(&ksp);
+    CHKERRQ(ierr);
+    ierr = VecDestroy(&u);
+    CHKERRQ(ierr);
+    ierr = VecDestroy(&x);
+    CHKERRQ(ierr);
+    ierr = VecDestroy(&b);
+    CHKERRQ(ierr);
+    ierr = MatDestroy(&C);
+    CHKERRQ(ierr);
+
+    /*
+       Indicate to PETSc profiling that we're concluding the second stage
+     */
+    ierr = PetscLogStagePop();
+    CHKERRQ(ierr);
+
+    ierr = PetscFinalize();
+    return 0;
 }
 
 
diff --git a/spgemm/mkl_shmem/CMakeLists.txt b/spgemm/mkl_shmem/CMakeLists.txt
index dcf8ae4e2c83c16097a9882353f2fee267d4a5ad..721ee15cfff00f3f2e9f7a0a9b6c1b405f492aea 100644
--- a/spgemm/mkl_shmem/CMakeLists.txt
+++ b/spgemm/mkl_shmem/CMakeLists.txt
@@ -47,7 +47,7 @@ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CXX_FLAGS}")
 if (MKL_FOUND)
 	include_directories(${MKL_INCLUDE_DIR})
 	link_directories(${MKL_LIBRARY_DIR})
-	add_executable(${NAME} mklspgemm.c mmio.c)
+	add_executable(${NAME} mklspgemm.c)
 	target_link_libraries(${NAME} mkl_intel_lp64 mkl_sequential mkl_core pthread m)
 	install(TARGETS ${NAME} DESTINATION bin)
 else ()
diff --git a/spgemm/mkl_shmem/mklspgemm.c b/spgemm/mkl_shmem/mklspgemm.c
index d3d006be40ff553b7d48de7369a9c96fb8205fba..08e34e04b72c5c34af05876c05907217b1cb1208 100644
--- a/spgemm/mkl_shmem/mklspgemm.c
+++ b/spgemm/mkl_shmem/mklspgemm.c
@@ -3,424 +3,132 @@
 #include <assert.h>
 #include <math.h>
 #include <mkl.h>
-#include "mmio.h"
+#define MIC_TARGET
+#include "../../common/mmio.h"
+#include "../../common/util.h"
+#include "../../common/csr.h"
 /*}}}*/
 
 /* global varibles and defines {{{*/
 int nrepeat = 100;
-int option_print_matrices = 0;
-#define OPTION_NOPRINT_MATRICES 0
-#define OPTION_PRINT_MATRICES 1
 char transa = 'n';
-typedef int csi;
-typedef double csv;
-csv zero = 0.0;
-#define CS_MAX(a,b) (((a) > (b)) ? (a) : (b))
-
-typedef struct csr_t {
-  csi  m; 
-  csi  n; 
-  csi  nzmax;
-  csi  nr; 
-  csi* r; 
-  csi* p; 
-  csi* j; 
-  csv* x; 
-} csr;
-/*}}}*/
-
-/* method declarations {{{*/
-int  main(int argc, char* argv[]);
-void printmm_one(int m, double* Aval, int* AJ, int* AI);
-void printfilemm_one(char* file, int m, int n, double* Aval, int* AJ, int* AI);
-void printmm_zero(int m, double* Aval, int* AJ, int* AI);
-csr *csr_spfree(csr *A);
 /*}}}*/
 
-/*csr util{{{*/
-
-/* free workspace and return a sparse matrix result */
-csr *csr_done(csr *C, void *w, void *x, csi ok) {
-        return(ok ? C : csr_spfree(C)); /* return result if OK, else free it */
-}
-
-/* wrapper for free */
-void *cs_free(void *p) {
-        if (p)
-                free(p); /* free p if it is not already NULL */
-        return(NULL); /* return NULL to simplify the use of cs_free */
-}
-
-/* wrapper for realloc */
-void *csr_realloc(void *p, csi n, size_t size, csi *ok) {
-        void *pnew = NULL;
-        pnew = realloc(p, CS_MAX(n, 1) * size); /* realloc the block */
-        *ok = (pnew != NULL); /* realloc fails if pnew is NULL */
-        if (pnew == NULL) {
-                printf("%d:reallocation failed, pnew is NULL\n", __LINE__);
-        }
-//printf("%s:%d: n=%d ok=%d\n", __FUNCTION__, __LINE__, n, *ok);
-        return((*ok) ? pnew : p); /* return original p if failure */
-}
-
-/* wrapper for realloc */
-void *cs_realloc(void *p, csi n, size_t size, csi *ok) {
-        void *pnew = NULL;
-        pnew = realloc(p, CS_MAX(n, 1) * size); /* realloc the block */
-        *ok = (pnew != NULL); /* realloc fails if pnew is NULL */
-        if (pnew == NULL) {
-                printf("reallocation failed\n");
-        }
-        return((*ok) ? pnew : p); /* return original p if failure */
-}
-
-/* change the max # of entries sparse matrix */
-csi csr_sprealloc(csr *A, csi nzmax) {
-        csi ok, oki = 0, okj = 1, okx = 1;
-        if (!A)
-                return(0);
-        if (nzmax <= 0)
-                nzmax = A->p[A->m];
-        A->j = (int*)csr_realloc(A->j, nzmax, sizeof(csi), &oki);
-        if (A->x)
-                A->x = (csv*)csr_realloc(A->x, nzmax, sizeof(csv), &okx);
-        ok = (oki && okj && okx);
-        if (ok)
-                A->nzmax = nzmax;
-        return(ok);
-}
-
-/* free a sparse matrix */
-csr *csr_spfree(csr *A) {
-        if (!A)
-                return(NULL); /* do nothing if A already NULL */
-        cs_free(A->p);
-        A->p = NULL;
-        cs_free(A->j);
-        A->j = NULL;
-        cs_free(A->x);
-        A->x = NULL;
-        cs_free(A->r);
-        A->r = NULL;
-        cs_free(A); /* free the cs struct and return NULL */
-    return NULL;
-}
-
-/* allocate a sparse matrix (triplet form or compressed-ROW form) */
-csr *csr_spalloc(csi m, csi n, csi nzmax, int values, int triplet, csv f) {
-        csr* A = (csr*)calloc(1, sizeof(csr)); /* allocate the cs struct */
-        if (!A) {
-                perror("sparse allocation failed");
-                return(NULL); /* out of memory */
-        }
-        A->m = m; /* define dimensions and nzmax */
-        A->n = n;
-        A->nzmax = nzmax = CS_MAX(nzmax, 0);
-        A->nr = 0;  // number of nonzero rows
-        A->p = (csi*)calloc(m + 2, sizeof(csi));
-        A->j = (csi*)calloc(CS_MAX(nzmax,1), sizeof(csi));
-        A->x = (csv*)calloc(CS_MAX(nzmax,1), sizeof(csv));
-        return((!A->p || !A->j || !A->x) ? csr_spfree(A) : A);
-
-}/*}}}*/
-
-/** Multiply two sparse matrices which are stored in CSR format. MKL is used */
-csr *csr_multiply(csi Am, csi An, csi Anzmax, const csi* Ap, const csi* Aj, const csv* Ax, csi Bm, csi Bn, csi Bnzmax, const csi* Bp, const csi* Bj, const csv* Bx, long* nummult, csi* xb, csv* x) { /*{{{*/
-    csv tf = 0;
-    csi p, jp, j, kp, k, i, nz = 0, anz, *Cp, *Cj, m, n,
-        bnz, values = 1;
-    csv *Cx;
-    csr *C;
-    if (An != Bm)
-        return(NULL);
-    if (Anzmax == 0 || Bnzmax == 0) {
-        C = csr_spalloc(Am, Bn, 0, values, 0, tf);
-        return C;
-    }
-    m = Am;
-    anz = Ap[Am];
-    n = Bn;
-    bnz = Bp[Bm];
-    for(i = 0; i < n; i++) xb[i] = 0;
-    for(i = 0; i < n; i++)
-        xb[i] = 0;
-    values = (Ax != NULL) && (Bx != NULL);
-    csi tnz = (anz + bnz) * 2;
-    C = csr_spalloc(m, n, tnz, values, 0, tf); /* allocate result */
-    if (!C || !xb || (values && !x))
-        return (csr_done(C, xb, x, 0));
-    Cp = C->p;
-    for (i = 0; i < m; i++) {
-        if ( ( (nz + n) > C->nzmax ) ) {
-            if(!csr_sprealloc(C, (2 * (C->nzmax) + n) ) ) {
-                return (csr_done(C, xb, x, 0)); // out of memory
-            } else {
-            }
-        }
-        Cj = C->j;
-        Cx = C->x; /* C->j and C->x may be reallocated */
-        Cp[i] = nz; /* row i of C starts here */
-        for (jp = Ap[i]; jp < Ap[i + 1]; jp++) {
-            j = Aj[jp];
-            for (kp = Bp[j]; kp < Bp[j + 1]; kp++) {
-                k = Bj[kp]; /* B(i,j) is nonzero */
-                if (xb[k] != i + 1) {
-                    xb[k] = i + 1; /* i is new entry in column j */
-                    Cj[nz++] = k; /* add i to pattern of C(:,j) */
-                    if (x) {
-                        x[k] = Ax[jp] * Bx[kp]; /* x(i) = beta*A(i,j) */
-                        (*nummult)++;
-                    }
-                } else if (x) {
-                    x[k] += (Ax[jp] * Bx[kp]); /* i exists in C(:,j) already */
-                    (*nummult)++;
-                }
-            }
-        }
-        if (values)
-            for (p = Cp[i]; p < nz; p++)
-                Cx[p] = x[Cj[p]];
-    }
-    Cp[m] = nz; /* finalize the last row of C */
-    csr_sprealloc(C, 0); /* remove extra space from C */
-    xb = NULL;
-    x = NULL;
-    return C;
-}/*}}}*/
-
 /** Multiply two sparse matrices which are stored in CSR format. MKL is used */
 void mkl_cpu_spgemm(MKL_INT Am, MKL_INT An, MKL_INT Annz, double* Aval, MKL_INT* AJ, MKL_INT* AI, MKL_INT Bn, MKL_INT Bnnz, double* Bval, MKL_INT* BJ, MKL_INT* BI, double** pCval, MKL_INT** pCJ, MKL_INT** pCI, double* time) {  /*{{{*/
-
-MKL_INT* CJ = NULL;
-double* Cval = NULL;
-MKL_INT sort = 3;    // sort everything
-MKL_INT* CI = (MKL_INT*)mkl_malloc( (Am+2) * sizeof( MKL_INT ), 64 );
-MKL_INT nnzmax = 0;    // nnzmax is zero in case of symbolic&numeric usage of mkl_?csrmultcsr
-MKL_INT ierr;
-MKL_INT request = 1;    // symbolic multiplication
-mkl_dcsrmultcsr(&transa, &request, &sort, &Am, &An, &Bn, Aval, AJ, AI, Bval, BJ, BI, Cval, CJ, CI, &nnzmax, &ierr);
-
-request = 2;        //numeric multiplication
-int Cnnz = CI[Am]-1;
-int Cval_size = Cnnz + 1;
-CJ = (MKL_INT*)mkl_malloc( Cval_size    *    sizeof( MKL_INT ), 64 );
-Cval = (double*)mkl_malloc( Cval_size    *    sizeof( double ), 64 );
-double time_st = dsecnd();
-int i;
-for(i = 0; i < nrepeat; i++) {
+    MKL_INT* CJ = NULL;
+    double* Cval = NULL;
+    MKL_INT sort = 3;    // sort everything
+    MKL_INT* CI = (MKL_INT*)mkl_malloc( (Am+2) * sizeof( MKL_INT ), 64 );
+    MKL_INT nnzmax = 0;    // nnzmax is zero in case of symbolic&numeric usage of mkl_?csrmultcsr
+    MKL_INT ierr;
+    MKL_INT request = 1;    // symbolic multiplication
     mkl_dcsrmultcsr(&transa, &request, &sort, &Am, &An, &Bn, Aval, AJ, AI, Bval, BJ, BI, Cval, CJ, CI, &nnzmax, &ierr);
-}
-double time_end = dsecnd();
-*time = (time_end - time_st)/nrepeat;
-*pCval = Cval; *pCJ = CJ; *pCI = CI;
-} /*}}}*/ 
 
-/** Read Matrix Market file into COO matrix */
-void read_mm(char* strpath, int* pM, int* pN, int* prealnnz, int** pI, int** pJ, double** pval){ /*{{{*/
-/* 
-*   taken from Matrix Market I/O library for ANSI C
-*
-*   See http://math.nist.gov/MatrixMarket for details.
-*
-*
-*/
-
-int i, M, N, nz, *I, *J;
-double* val;
-int ret_code;
-MM_typecode matcode;
-FILE* f;
-if ((f = fopen(strpath, "r")) == NULL) {fprintf(stderr, "Input matrix file %s cannot be opened to read.", strpath);exit(1);}
-/* READ MATRIX */
-if (mm_read_banner(f, &matcode) != 0) {
-    printf("Could not process Matrix Market banner.\n");    exit(1);
-}
-/*  This is how one can screen matrix types if their application */
-/*  only supports a subset of the Matrix Market data types.      */
-if (mm_is_complex(matcode) && mm_is_matrix(matcode) && mm_is_sparse(matcode) ) {
-    printf("Sorry, this application does not support ");
-    printf("Market Market type: [%s]\n", mm_typecode_to_str(matcode));exit(1);
-}
-/* find out size of sparse matrix .... */
-if ((ret_code = mm_read_mtx_crd_size(f, &M, &N, &nz)) !=0) exit(1);
-/* reseve memory for matrices */
-I = (int *) malloc((mm_is_symmetric(matcode) ? 2*nz : nz) * sizeof(int));
-J = (int *) malloc((mm_is_symmetric(matcode) ? 2*nz : nz) * sizeof(int));
-val = (double *) malloc((mm_is_symmetric(matcode) ? 2*nz : nz) * sizeof(double));
-*pI = I;
-*pJ = J;
-*pval = val;
-/* NOTE: when reading in doubles, ANSI C requires the use of the "l"  */
-/*   specifier as in "%lg", "%lf", "%le", otherwise errors will occur */
-/*  (ANSI C X3.159-1989, Sec. 4.9.6.2, p. 136 lines 13-15)            */
-int realnnz = 0;
-for (i=0; i<nz; i++) {
-    if(mm_is_pattern(matcode)) {
-        fscanf(f, "%d %d\n", &I[realnnz], &J[realnnz]);
-        val[realnnz] = 1.0;
-    }
-    else
-        fscanf(f, "%d %d %lg\n", &I[realnnz], &J[realnnz], &val[realnnz]);
-    I[realnnz]--;  /* adjust from 1-based to 0-based */
-        J[realnnz]--;
-    if(mm_is_symmetric(matcode) && I[realnnz] != J[realnnz]) {
-        I[realnnz+1] = J[realnnz];
-        J[realnnz+1] = I[realnnz];
-        val[realnnz+1] = val[realnnz];
-        realnnz++;
+    request = 2;        //numeric multiplication
+    int Cnnz = CI[Am]-1;
+    int Cval_size = Cnnz + 1;
+    CJ = (MKL_INT*)mkl_malloc( Cval_size    *    sizeof( MKL_INT ), 64 );
+    Cval = (double*)mkl_malloc( Cval_size    *    sizeof( double ), 64 );
+    double time_st = dsecnd();
+    int i;
+    for(i = 0; i < nrepeat; i++) {
+        mkl_dcsrmultcsr(&transa, &request, &sort, &Am, &An, &Bn, Aval, AJ, AI, Bval, BJ, BI, Cval, CJ, CI, &nnzmax, &ierr);
     }
-    realnnz++;
-}
-if (f !=stdin) fclose(f);
-*pM = M;
-*pN = N;
-*prealnnz = realnnz;
-} /* ENDOF read_mm }}}*/
-
-/** Converts COO matrix to CSR matrix */
-void coo_to_csr(int m, int nnz, int* I, int* J, double* val, MKL_INT* AI, MKL_INT* AJ, double* Aval) { /*{{{*/
-
-MKL_INT info = 0;
-MKL_INT job[8];   
-//job[1]=0; // zero based indexing in csr
-job[1]=1; // one based indexing in csr
-
-job[2]=0; // zero based indexing in coo
-job[3]=2; // I don't know
-job[4]=nnz; // nnz
-
-job[0]=1;  // coo to csr
-job[5]=0;  // Acsr and AJR allocated by user
-//void mkl_dcsrcoo (MKL_INT * job, MKL_INT * n, double *Acsr, MKL_INT * AJR, MKL_INT *AIR, MKL_INT * nnz, double *Acoo, MKL_INT * ir, MKL_INT * jc, MKL_INT * info);
-mkl_dcsrcoo (job,&m, Aval, AJ, AI, &nnz, val, I, J, &info);
-} /* ENDOF coo_to_csr }}}*/
+    double time_end = dsecnd();
+    *time = (time_end - time_st)/nrepeat;
+    *pCval = Cval; *pCJ = CJ; *pCI = CI;
+} /*}}}*/
 
 int main(int argc, char* argv[]) { /*{{{*/
-/** usage */
-int nrequired_args = 6;
-if (argc != nrequired_args){
-    fprintf(stderr, "NAME:\n\tmkl_spgemm - multiply two sparse matrices\n");
-    fprintf(stderr, "\nSYNOPSIS:\n");
-    fprintf(stderr, "\tmkl_spgemm MATRIX_A MATRIX_B MATRIX_C NUMBER_OF_THREADS PRINT_MATRICES\n");
-    fprintf(stderr, "\nDESCRIPTION:\n");
-    fprintf(stderr, "\tNUMBER_OF_THREADS: {0,1,2,...}\n");
-    fprintf(stderr, "\t\t0: Use number of threads determined by MKL\n");
-    fprintf(stderr, "\tPRINT_MATRICES: PRINT_YES, PRINT_NO\n");
-    fprintf(stderr, "\nSAMPLE EXECUTION:\n");
-    fprintf(stderr, "\t%s test.mtx test.mtx out.mtx 2 PRINT_YES\n", argv[0]);
-    exit(1);
-}
-/** parse arguments */
-int iarg = 1;
-char* strpathA = argv[iarg];    iarg++;
-char* strpathB = argv[iarg];    iarg++;
-char* strpathC = argv[iarg];    iarg++;
-int nthreads = atoi(argv[iarg]);    iarg++;
-if (nthreads > 0) {
-    mkl_set_num_threads(nthreads);
-} else {
-    nthreads = mkl_get_max_threads();
-}
-option_print_matrices = strcmp(argv[iarg], "PRINT_YES")==0?OPTION_PRINT_MATRICES:OPTION_NOPRINT_MATRICES;    iarg++;
-assert(nrequired_args == iarg);
-
-/** read matrix market file for A matrix */
-MKL_INT Am, An, Annz;
-MKL_INT *Ax, *Ay;  
-double *Anz;
-read_mm(strpathA, &Am, &An, &Annz, &Ax, &Ay, &Anz);
-
-/** construct csr storage for A matrix */
-MKL_INT* AJ = (MKL_INT*) mkl_malloc( Annz * sizeof( MKL_INT ), 64 );
-MKL_INT* AI = (MKL_INT*) mkl_malloc( (Am+1) * sizeof( MKL_INT ), 64 );
-double* Aval = (double*) mkl_malloc( Annz * sizeof( double ),  64 );
-coo_to_csr(Am, Annz, Ax, Ay, Anz, AI, AJ, Aval);    
-
-/** read matrix market file for B matrix */
-MKL_INT Bm, Bn, Bnnz;   
-MKL_INT *Bx, *By;  
-double *Bnz;
-read_mm(strpathB, &Bm, &Bn, &Bnnz, &Bx, &By, &Bnz);
-
-/** construct csr storage for B matrix */
-MKL_INT* BJ = (MKL_INT*) mkl_malloc( Bnnz * sizeof( MKL_INT ), 64 );
-MKL_INT* BI = (MKL_INT*) mkl_malloc( (Bm+1) * sizeof( MKL_INT ), 64 );
-double* Bval = (double*) mkl_malloc( Bnnz * sizeof( double ),  64 );
-coo_to_csr(Bm, Bnnz, Bx, By, Bnz, BI, BJ, Bval);    
-
-/** multiply two matrices */
-double* Cval, time; 
-MKL_INT* CJ; MKL_INT* CI;
-mkl_cpu_spgemm(Am, An, Annz, Aval, AJ, AI, Bn, Bnnz, Bval, BJ, BI, &Cval, &CJ, &CI, &time);
-
-int i;
-for(i=0;i<=Am;i++)AI[i]--;    for(i=0;i<Annz;i++)AJ[i]--;
-for(i=0;i<=Bm;i++)BI[i]--;    for(i=0;i<Bnnz;i++)BJ[i]--;
-printmm_one(Am, Cval, CJ, CI);
-
-/** In order to write the output C matrix to file, uncomment the following line */
-printfilemm_one(strpathC, Am, Bn, Cval, CJ, CI);
-
-/** run my SpGEMM routine in order to find number of multiply-and-add operations */
-long nummult = 0;
-csi* xb = (csi*)calloc(Bn, sizeof(csi));
-csv* x = (csv*)calloc(Bn, sizeof(csv));
-csr* C = csr_multiply(Am, An, Annz, AI, AJ, Aval, Bm, Bn, Bnnz, BI, BJ, Bval, &nummult, xb, x);
-double gflop = 2 * (double) nummult / 1e9;
-
-/** print gflop per second and time */
-printf("%d\t", nthreads);
-printf("%g\t", (gflop/time));
-printf("%g\t", time);
-printf("%s\t", strpathA);
-printf("%s\t", strpathB);
-printf("%s\t", strpathC);
-printf("\n");
-
-/** free allocated space */
-mkl_free(AI);
-mkl_free(AJ);
-mkl_free(Aval);
-mkl_free(BI);
-mkl_free(BJ);
-mkl_free(Bval);
-return 0;
-} /* ENDOF main }}}*/
-
-/** Prints matrix in CSR format */
-void printmm_one(int m, double* Aval, int* AJ, int* AI){ //{{{
+    /** usage */
+    int nrequired_args = 6;
+    if (argc != nrequired_args){
+        fprintf(stderr, "NAME:\n\tmkl_spgemm - multiply two sparse matrices\n");
+        fprintf(stderr, "\nSYNOPSIS:\n");
+        fprintf(stderr, "\tmkl_spgemm MATRIX_A MATRIX_B MATRIX_C NUMBER_OF_THREADS PRINT_MATRICES\n");
+        fprintf(stderr, "\nDESCRIPTION:\n");
+        fprintf(stderr, "\tNUMBER_OF_THREADS: {0,1,2,...}\n");
+        fprintf(stderr, "\t\t0: Use number of threads determined by MKL\n");
+        fprintf(stderr, "\tPRINT_MATRICES: PRINT_YES, PRINT_NO\n");
+        fprintf(stderr, "\nSAMPLE EXECUTION:\n");
+        fprintf(stderr, "\t%s test.mtx test.mtx out.mtx 2 PRINT_YES\n", argv[0]);
+        exit(1);
+    }
+    /** parse arguments */
+    int iarg = 1;
+    char* strpathA = argv[iarg];    iarg++;
+    char* strpathB = argv[iarg];    iarg++;
+    char* strpathC = argv[iarg];    iarg++;
+    int nthreads = atoi(argv[iarg]);    iarg++;
+    if (nthreads > 0) {
+        mkl_set_num_threads(nthreads);
+    } else {
+        nthreads = mkl_get_max_threads();
+    }
+    option_print_matrices = strcmp(argv[iarg], "PRINT_YES")==0?OPTION_PRINT_MATRICES:OPTION_NOPRINT_MATRICES;    iarg++;
+    assert(nrequired_args == iarg);
+
+    /** read matrix market file for A matrix */
+    MKL_INT Am, An, Annz;
+    MKL_INT *Ax, *Ay;
+    double *Anz;
+    read_mm(strpathA, &Am, &An, &Annz, &Ax, &Ay, &Anz);
+
+    /** construct csr storage for A matrix */
+    MKL_INT* AJ = (MKL_INT*) mkl_malloc( Annz * sizeof( MKL_INT ), 64 );
+    MKL_INT* AI = (MKL_INT*) mkl_malloc( (Am+1) * sizeof( MKL_INT ), 64 );
+    double* Aval = (double*) mkl_malloc( Annz * sizeof( double ),  64 );
+    coo_to_csr(Am, Annz, Ax, Ay, Anz, AI, AJ, Aval);
+
+    /** read matrix market file for B matrix */
+    MKL_INT Bm, Bn, Bnnz;
+    MKL_INT *Bx, *By;
+    double *Bnz;
+    read_mm(strpathB, &Bm, &Bn, &Bnnz, &Bx, &By, &Bnz);
+
+    /** construct csr storage for B matrix */
+    MKL_INT* BJ = (MKL_INT*) mkl_malloc( Bnnz * sizeof( MKL_INT ), 64 );
+    MKL_INT* BI = (MKL_INT*) mkl_malloc( (Bm+1) * sizeof( MKL_INT ), 64 );
+    double* Bval = (double*) mkl_malloc( Bnnz * sizeof( double ),  64 );
+    coo_to_csr(Bm, Bnnz, Bx, By, Bnz, BI, BJ, Bval);
+
+    /** multiply two matrices */
+    double* Cval, time;
+    MKL_INT* CJ; MKL_INT* CI;
+    mkl_cpu_spgemm(Am, An, Annz, Aval, AJ, AI, Bn, Bnnz, Bval, BJ, BI, &Cval, &CJ, &CI, &time);
 
-    if (option_print_matrices == OPTION_NOPRINT_MATRICES)
-        return;
     int i;
-    for(i = 0; i < m; i++) {
-        printf("%d: ", i+1);
-        int j;
-        for(j = AI[i]-1; j < AI[i+1]-1; j++) {
-            printf("%d:%g  ", AJ[j], Aval[j]);
-        }
-        printf("\n");
-    }
+    for(i=0;i<=Am;i++)AI[i]--;    for(i=0;i<Annz;i++)AJ[i]--;
+    for(i=0;i<=Bm;i++)BI[i]--;    for(i=0;i<Bnnz;i++)BJ[i]--;
+    printmm_one(Am, Cval, CJ, CI);
+
+    /** In order to write the output C matrix to file, uncomment the following line */
+    printfilemm_one(strpathC, Am, Bn, Cval, CJ, CI);
+
+    /** run my SpGEMM routine in order to find number of multiply-and-add operations */
+    long nummult = 0;
+    csi* xb = (csi*)calloc(Bn, sizeof(csi));
+    csv* x = (csv*)calloc(Bn, sizeof(csv));
+    csr* C = csr_multiply(Am, An, Annz, AI, AJ, Aval, Bm, Bn, Bnnz, BI, BJ, Bval, &nummult, xb, x);
+    double gflop = 2 * (double) nummult / 1e9;
+
+    /** print gflop per second and time */
+    printf("%d\t", nthreads);
+    printf("%g\t", (gflop/time));
+    printf("%g\t", time);
+    printf("%s\t", strpathA);
+    printf("%s\t", strpathB);
+    printf("%s\t", strpathC);
     printf("\n");
-}//}}}
 
-/** Writes matrix in CSR format in to a file using Matrix Market format */
-void printfilemm_one(char* file, int m, int n, double* Aval, int* AJ, int* AI){//{{{
-
-    FILE* f = fopen(file, "w");
-    if(f == NULL){
-        printf("%s %s %d: %s cannot be opened to write matrix\n", __FILE__, __PRETTY_FUNCTION__, __LINE__, file);
-        exit(1); 
-    }
-    int i;
-    fprintf(f, "%%%%MatrixMarket matrix coordinate real general\n");
-    fprintf(f, "%d %d %d\n", m, n, AI[m]-1);
-    for(i = 0; i < m; i++) {
-        int j;
-        for(j = AI[i]-1; j < AI[i+1]-1; j++) {
-            fprintf(f, "%d %d %g\n", i+1, AJ[j], Aval[j]);
-        }
-    }
-    fclose(f);
-}//}}}
+    /** free allocated space */
+    mkl_free(AI);
+    mkl_free(AJ);
+    mkl_free(Aval);
+    mkl_free(BI);
+    mkl_free(BJ);
+    mkl_free(Bval);
+    return 0;
+} /* ENDOF main }}}*/
 
diff --git a/spgemm/mkl_shmem/mmio.c b/spgemm/mkl_shmem/mmio.c
deleted file mode 100644
index c250ff2aed998fe65248537a8b19a359206187ce..0000000000000000000000000000000000000000
--- a/spgemm/mkl_shmem/mmio.c
+++ /dev/null
@@ -1,511 +0,0 @@
-/* 
-*   Matrix Market I/O library for ANSI C
-*
-*   See http://math.nist.gov/MatrixMarket for details.
-*
-*
-*/
-
-
-#include <stdio.h>
-#include <string.h>
-#include <stdlib.h>
-#include <ctype.h>
-
-#include "mmio.h"
-
-int mm_read_unsymmetric_sparse(const char *fname, int *M_, int *N_, int *nz_,
-                double **val_, int **I_, int **J_)
-{
-    FILE *f;
-    MM_typecode matcode;
-    int M, N, nz;
-    int i;
-    double *val;
-    int *I, *J;
- 
-    if ((f = fopen(fname, "r")) == NULL)
-            return -1;
- 
- 
-    if (mm_read_banner(f, &matcode) != 0)
-    {
-        printf("mm_read_unsymetric: Could not process Matrix Market banner ");
-        printf(" in file [%s]\n", fname);
-        return -1;
-    }
- 
- 
- 
-    if ( !(mm_is_real(matcode) && mm_is_matrix(matcode) &&
-            mm_is_sparse(matcode)))
-    {
-        fprintf(stderr, "Sorry, this application does not support ");
-        fprintf(stderr, "Market Market type: [%s]\n",
-                mm_typecode_to_str(matcode));
-        return -1;
-    }
- 
-    /* find out size of sparse matrix: M, N, nz .... */
- 
-    if (mm_read_mtx_crd_size(f, &M, &N, &nz) !=0)
-    {
-        fprintf(stderr, "read_unsymmetric_sparse(): could not parse matrix size.\n");
-        return -1;
-    }
- 
-    *M_ = M;
-    *N_ = N;
-    *nz_ = nz;
- 
-    /* reseve memory for matrices */
- 
-    I = (int *) malloc(nz * sizeof(int));
-    J = (int *) malloc(nz * sizeof(int));
-    val = (double *) malloc(nz * sizeof(double));
- 
-    *val_ = val;
-    *I_ = I;
-    *J_ = J;
- 
-    /* NOTE: when reading in doubles, ANSI C requires the use of the "l"  */
-    /*   specifier as in "%lg", "%lf", "%le", otherwise errors will occur */
-    /*  (ANSI C X3.159-1989, Sec. 4.9.6.2, p. 136 lines 13-15)            */
- 
-    for (i=0; i<nz; i++)
-    {
-        fscanf(f, "%d %d %lg\n", &I[i], &J[i], &val[i]);
-        I[i]--;  /* adjust from 1-based to 0-based */
-        J[i]--;
-    }
-    fclose(f);
- 
-    return 0;
-}
-
-int mm_is_valid(MM_typecode matcode)
-{
-    if (!mm_is_matrix(matcode)) return 0;
-    if (mm_is_dense(matcode) && mm_is_pattern(matcode)) return 0;
-    if (mm_is_real(matcode) && mm_is_hermitian(matcode)) return 0;
-    if (mm_is_pattern(matcode) && (mm_is_hermitian(matcode) || 
-                mm_is_skew(matcode))) return 0;
-    return 1;
-}
-
-int mm_read_banner(FILE *f, MM_typecode *matcode)
-{
-    char line[MM_MAX_LINE_LENGTH];
-    char banner[MM_MAX_TOKEN_LENGTH];
-    char mtx[MM_MAX_TOKEN_LENGTH]; 
-    char crd[MM_MAX_TOKEN_LENGTH];
-    char data_type[MM_MAX_TOKEN_LENGTH];
-    char storage_scheme[MM_MAX_TOKEN_LENGTH];
-    char *p;
-
-
-    mm_clear_typecode(matcode);  
-
-    if (fgets(line, MM_MAX_LINE_LENGTH, f) == NULL) 
-        return MM_PREMATURE_EOF;
-
-    if (sscanf(line, "%s %s %s %s %s", banner, mtx, crd, data_type, 
-        storage_scheme) != 5)
-        return MM_PREMATURE_EOF;
-
-    for (p=mtx; *p!='\0'; *p=tolower(*p),p++);  /* convert to lower case */
-    for (p=crd; *p!='\0'; *p=tolower(*p),p++);  
-    for (p=data_type; *p!='\0'; *p=tolower(*p),p++);
-    for (p=storage_scheme; *p!='\0'; *p=tolower(*p),p++);
-
-    /* check for banner */
-    if (strncmp(banner, MatrixMarketBanner, strlen(MatrixMarketBanner)) != 0)
-        return MM_NO_HEADER;
-
-    /* first field should be "mtx" */
-    if (strcmp(mtx, MM_MTX_STR) != 0)
-        return  MM_UNSUPPORTED_TYPE;
-    mm_set_matrix(matcode);
-
-
-    /* second field describes whether this is a sparse matrix (in coordinate
-            storgae) or a dense array */
-
-
-    if (strcmp(crd, MM_SPARSE_STR) == 0)
-        mm_set_sparse(matcode);
-    else
-    if (strcmp(crd, MM_DENSE_STR) == 0)
-            mm_set_dense(matcode);
-    else
-        return MM_UNSUPPORTED_TYPE;
-    
-
-    /* third field */
-
-    if (strcmp(data_type, MM_REAL_STR) == 0)
-        mm_set_real(matcode);
-    else
-    if (strcmp(data_type, MM_COMPLEX_STR) == 0)
-        mm_set_complex(matcode);
-    else
-    if (strcmp(data_type, MM_PATTERN_STR) == 0)
-        mm_set_pattern(matcode);
-    else
-    if (strcmp(data_type, MM_INT_STR) == 0)
-        mm_set_integer(matcode);
-    else
-        return MM_UNSUPPORTED_TYPE;
-    
-
-    /* fourth field */
-
-    if (strcmp(storage_scheme, MM_GENERAL_STR) == 0)
-        mm_set_general(matcode);
-    else
-    if (strcmp(storage_scheme, MM_SYMM_STR) == 0)
-        mm_set_symmetric(matcode);
-    else
-    if (strcmp(storage_scheme, MM_HERM_STR) == 0)
-        mm_set_hermitian(matcode);
-    else
-    if (strcmp(storage_scheme, MM_SKEW_STR) == 0)
-        mm_set_skew(matcode);
-    else
-        return MM_UNSUPPORTED_TYPE;
-        
-
-    return 0;
-}
-
-int mm_write_mtx_crd_size(FILE *f, int M, int N, int nz)
-{
-    if (fprintf(f, "%d %d %d\n", M, N, nz) != 3)
-        return MM_COULD_NOT_WRITE_FILE;
-    else 
-        return 0;
-}
-
-int mm_read_mtx_crd_size(FILE *f, int *M, int *N, int *nz )
-{
-    char line[MM_MAX_LINE_LENGTH];
-    int num_items_read;
-
-    /* set return null parameter values, in case we exit with errors */
-    *M = *N = *nz = 0;
-
-    /* now continue scanning until you reach the end-of-comments */
-    do 
-    {
-        if (fgets(line,MM_MAX_LINE_LENGTH,f) == NULL) 
-            return MM_PREMATURE_EOF;
-    }while (line[0] == '%');
-
-    /* line[] is either blank or has M,N, nz */
-    if (sscanf(line, "%d %d %d", M, N, nz) == 3)
-        return 0;
-        
-    else
-    do
-    { 
-        num_items_read = fscanf(f, "%d %d %d", M, N, nz); 
-        if (num_items_read == EOF) return MM_PREMATURE_EOF;
-    }
-    while (num_items_read != 3);
-
-    return 0;
-}
-
-
-int mm_read_mtx_array_size(FILE *f, int *M, int *N)
-{
-    char line[MM_MAX_LINE_LENGTH];
-    int num_items_read;
-    /* set return null parameter values, in case we exit with errors */
-    *M = *N = 0;
-	
-    /* now continue scanning until you reach the end-of-comments */
-    do 
-    {
-        if (fgets(line,MM_MAX_LINE_LENGTH,f) == NULL) 
-            return MM_PREMATURE_EOF;
-    }while (line[0] == '%');
-
-    /* line[] is either blank or has M,N, nz */
-    if (sscanf(line, "%d %d", M, N) == 2)
-        return 0;
-        
-    else /* we have a blank line */
-    do
-    { 
-        num_items_read = fscanf(f, "%d %d", M, N); 
-        if (num_items_read == EOF) return MM_PREMATURE_EOF;
-    }
-    while (num_items_read != 2);
-
-    return 0;
-}
-
-int mm_write_mtx_array_size(FILE *f, int M, int N)
-{
-    if (fprintf(f, "%d %d\n", M, N) != 2)
-        return MM_COULD_NOT_WRITE_FILE;
-    else 
-        return 0;
-}
-
-
-
-/*-------------------------------------------------------------------------*/
-
-/******************************************************************/
-/* use when I[], J[], and val[]J, and val[] are already allocated */
-/******************************************************************/
-
-int mm_read_mtx_crd_data(FILE *f, int M, int N, int nz, int I[], int J[],
-        double val[], MM_typecode matcode)
-{
-    int i;
-    if (mm_is_complex(matcode))
-    {
-        for (i=0; i<nz; i++)
-            if (fscanf(f, "%d %d %lg %lg", &I[i], &J[i], &val[2*i], &val[2*i+1])
-                != 4) return MM_PREMATURE_EOF;
-    }
-    else if (mm_is_real(matcode))
-    {
-        for (i=0; i<nz; i++)
-        {
-            if (fscanf(f, "%d %d %lg\n", &I[i], &J[i], &val[i])
-                != 3) return MM_PREMATURE_EOF;
-
-        }
-    }
-
-    else if (mm_is_pattern(matcode))
-    {
-        for (i=0; i<nz; i++)
-            if (fscanf(f, "%d %d", &I[i], &J[i])
-                != 2) return MM_PREMATURE_EOF;
-    }
-    else
-        return MM_UNSUPPORTED_TYPE;
-
-    return 0;
-        
-}
-
-int mm_read_mtx_crd_entry(FILE *f, int *I, int *J,
-        double *real, double *imag, MM_typecode matcode)
-{
-    if (mm_is_complex(matcode))
-    {
-            if (fscanf(f, "%d %d %lg %lg", I, J, real, imag)
-                != 4) return MM_PREMATURE_EOF;
-    }
-    else if (mm_is_real(matcode))
-    {
-            if (fscanf(f, "%d %d %lg\n", I, J, real)
-                != 3) return MM_PREMATURE_EOF;
-
-    }
-
-    else if (mm_is_pattern(matcode))
-    {
-            if (fscanf(f, "%d %d", I, J) != 2) return MM_PREMATURE_EOF;
-    }
-    else
-        return MM_UNSUPPORTED_TYPE;
-
-    return 0;
-        
-}
-
-
-/************************************************************************
-    mm_read_mtx_crd()  fills M, N, nz, array of values, and return
-                        type code, e.g. 'MCRS'
-
-                        if matrix is complex, values[] is of size 2*nz,
-                            (nz pairs of real/imaginary values)
-************************************************************************/
-
-int mm_read_mtx_crd(char *fname, int *M, int *N, int *nz, int **I, int **J, 
-        double **val, MM_typecode *matcode)
-{
-    int ret_code;
-    FILE *f;
-
-    if (strcmp(fname, "stdin") == 0) f=stdin;
-    else
-    if ((f = fopen(fname, "r")) == NULL)
-        return MM_COULD_NOT_READ_FILE;
-
-
-    if ((ret_code = mm_read_banner(f, matcode)) != 0)
-        return ret_code;
-
-    if (!(mm_is_valid(*matcode) && mm_is_sparse(*matcode) && 
-            mm_is_matrix(*matcode)))
-        return MM_UNSUPPORTED_TYPE;
-
-    if ((ret_code = mm_read_mtx_crd_size(f, M, N, nz)) != 0)
-        return ret_code;
-
-
-    *I = (int *)  malloc(*nz * sizeof(int));
-    *J = (int *)  malloc(*nz * sizeof(int));
-    *val = NULL;
-
-    if (mm_is_complex(*matcode))
-    {
-        *val = (double *) malloc(*nz * 2 * sizeof(double));
-        ret_code = mm_read_mtx_crd_data(f, *M, *N, *nz, *I, *J, *val, 
-                *matcode);
-        if (ret_code != 0) return ret_code;
-    }
-    else if (mm_is_real(*matcode))
-    {
-        *val = (double *) malloc(*nz * sizeof(double));
-        ret_code = mm_read_mtx_crd_data(f, *M, *N, *nz, *I, *J, *val, 
-                *matcode);
-        if (ret_code != 0) return ret_code;
-    }
-
-    else if (mm_is_pattern(*matcode))
-    {
-        ret_code = mm_read_mtx_crd_data(f, *M, *N, *nz, *I, *J, *val, 
-                *matcode);
-        if (ret_code != 0) return ret_code;
-    }
-
-    if (f != stdin) fclose(f);
-    return 0;
-}
-
-int mm_write_banner(FILE *f, MM_typecode matcode)
-{
-    char *str = mm_typecode_to_str(matcode);
-    int ret_code;
-
-    ret_code = fprintf(f, "%s %s\n", MatrixMarketBanner, str);
-    free(str);
-    if (ret_code !=2 )
-        return MM_COULD_NOT_WRITE_FILE;
-    else
-        return 0;
-}
-
-int mm_write_mtx_crd(char fname[], int M, int N, int nz, int I[], int J[],
-        double val[], MM_typecode matcode)
-{
-    FILE *f;
-    int i;
-
-    if (strcmp(fname, "stdout") == 0) 
-        f = stdout;
-    else
-    if ((f = fopen(fname, "w")) == NULL)
-        return MM_COULD_NOT_WRITE_FILE;
-    
-    /* print banner followed by typecode */
-    fprintf(f, "%s ", MatrixMarketBanner);
-    fprintf(f, "%s\n", mm_typecode_to_str(matcode));
-
-    /* print matrix sizes and nonzeros */
-    fprintf(f, "%d %d %d\n", M, N, nz);
-
-    /* print values */
-    if (mm_is_pattern(matcode))
-        for (i=0; i<nz; i++)
-            fprintf(f, "%d %d\n", I[i], J[i]);
-    else
-    if (mm_is_real(matcode))
-        for (i=0; i<nz; i++)
-            fprintf(f, "%d %d %20.16g\n", I[i], J[i], val[i]);
-    else
-    if (mm_is_complex(matcode))
-        for (i=0; i<nz; i++)
-            fprintf(f, "%d %d %20.16g %20.16g\n", I[i], J[i], val[2*i], 
-                        val[2*i+1]);
-    else
-    {
-        if (f != stdout) fclose(f);
-        return MM_UNSUPPORTED_TYPE;
-    }
-
-    if (f !=stdout) fclose(f);
-
-    return 0;
-}
-  
-
-/**
-*  Create a new copy of a string s.  mm_strdup() is a common routine, but
-*  not part of ANSI C, so it is included here.  Used by mm_typecode_to_str().
-*
-*/
-char *mm_strdup(const char *s)
-{
-	int len = strlen(s);
-	char *s2 = (char *) malloc((len+1)*sizeof(char));
-	return strcpy(s2, s);
-}
-
-char  *mm_typecode_to_str(MM_typecode matcode)
-{
-    char buffer[MM_MAX_LINE_LENGTH];
-    char *types[4];
-	char *mm_strdup(const char *);
-    int error =0;
-
-    /* check for MTX type */
-    if (mm_is_matrix(matcode)) 
-        types[0] = MM_MTX_STR;
-    else
-        error=1;
-
-    /* check for CRD or ARR matrix */
-    if (mm_is_sparse(matcode))
-        types[1] = MM_SPARSE_STR;
-    else
-    if (mm_is_dense(matcode))
-        types[1] = MM_DENSE_STR;
-    else
-        return NULL;
-
-    /* check for element data type */
-    if (mm_is_real(matcode))
-        types[2] = MM_REAL_STR;
-    else
-    if (mm_is_complex(matcode))
-        types[2] = MM_COMPLEX_STR;
-    else
-    if (mm_is_pattern(matcode))
-        types[2] = MM_PATTERN_STR;
-    else
-    if (mm_is_integer(matcode))
-        types[2] = MM_INT_STR;
-    else
-        return NULL;
-
-
-    /* check for symmetry type */
-    if (mm_is_general(matcode))
-        types[3] = MM_GENERAL_STR;
-    else
-    if (mm_is_symmetric(matcode))
-        types[3] = MM_SYMM_STR;
-    else 
-    if (mm_is_hermitian(matcode))
-        types[3] = MM_HERM_STR;
-    else 
-    if (mm_is_skew(matcode))
-        types[3] = MM_SKEW_STR;
-    else
-        return NULL;
-
-    sprintf(buffer,"%s %s %s %s", types[0], types[1], types[2], types[3]);
-    return mm_strdup(buffer);
-
-}
diff --git a/spgemm/mkl_shmem/mmio.h b/spgemm/mkl_shmem/mmio.h
deleted file mode 100644
index 7cfd0a1b7ae7572e85b8c10bcb2fd0b3333ad0b6..0000000000000000000000000000000000000000
--- a/spgemm/mkl_shmem/mmio.h
+++ /dev/null
@@ -1,133 +0,0 @@
-/* 
-*   Matrix Market I/O library for ANSI C
-*
-*   See http://math.nist.gov/MatrixMarket for details.
-*
-*
-*/
-
-#ifndef MM_IO_H
-#define MM_IO_H
-
-#define MM_MAX_LINE_LENGTH 1025
-#define MatrixMarketBanner "%%MatrixMarket"
-#define MM_MAX_TOKEN_LENGTH 64
-
-typedef char MM_typecode[4];
-
-char *mm_typecode_to_str(MM_typecode matcode);
-
-int mm_read_banner(FILE *f, MM_typecode *matcode);
-int mm_read_mtx_crd_size(FILE *f, int *M, int *N, int *nz);
-int mm_read_mtx_array_size(FILE *f, int *M, int *N);
-
-int mm_write_banner(FILE *f, MM_typecode matcode);
-int mm_write_mtx_crd_size(FILE *f, int M, int N, int nz);
-int mm_write_mtx_array_size(FILE *f, int M, int N);
-
-
-/********************* MM_typecode query fucntions ***************************/
-
-#define mm_is_matrix(typecode)	((typecode)[0]=='M')
-
-#define mm_is_sparse(typecode)	((typecode)[1]=='C')
-#define mm_is_coordinate(typecode)((typecode)[1]=='C')
-#define mm_is_dense(typecode)	((typecode)[1]=='A')
-#define mm_is_array(typecode)	((typecode)[1]=='A')
-
-#define mm_is_complex(typecode)	((typecode)[2]=='C')
-#define mm_is_real(typecode)		((typecode)[2]=='R')
-#define mm_is_pattern(typecode)	((typecode)[2]=='P')
-#define mm_is_integer(typecode) ((typecode)[2]=='I')
-
-#define mm_is_symmetric(typecode)((typecode)[3]=='S')
-#define mm_is_general(typecode)	((typecode)[3]=='G')
-#define mm_is_skew(typecode)	((typecode)[3]=='K')
-#define mm_is_hermitian(typecode)((typecode)[3]=='H')
-
-int mm_is_valid(MM_typecode matcode);		/* too complex for a macro */
-
-
-/********************* MM_typecode modify fucntions ***************************/
-
-#define mm_set_matrix(typecode)	((*typecode)[0]='M')
-#define mm_set_coordinate(typecode)	((*typecode)[1]='C')
-#define mm_set_array(typecode)	((*typecode)[1]='A')
-#define mm_set_dense(typecode)	mm_set_array(typecode)
-#define mm_set_sparse(typecode)	mm_set_coordinate(typecode)
-
-#define mm_set_complex(typecode)((*typecode)[2]='C')
-#define mm_set_real(typecode)	((*typecode)[2]='R')
-#define mm_set_pattern(typecode)((*typecode)[2]='P')
-#define mm_set_integer(typecode)((*typecode)[2]='I')
-
-
-#define mm_set_symmetric(typecode)((*typecode)[3]='S')
-#define mm_set_general(typecode)((*typecode)[3]='G')
-#define mm_set_skew(typecode)	((*typecode)[3]='K')
-#define mm_set_hermitian(typecode)((*typecode)[3]='H')
-
-#define mm_clear_typecode(typecode) ((*typecode)[0]=(*typecode)[1]= \
-									(*typecode)[2]=' ',(*typecode)[3]='G')
-
-#define mm_initialize_typecode(typecode) mm_clear_typecode(typecode)
-
-
-/********************* Matrix Market error codes ***************************/
-
-
-#define MM_COULD_NOT_READ_FILE	11
-#define MM_PREMATURE_EOF		12
-#define MM_NOT_MTX				13
-#define MM_NO_HEADER			14
-#define MM_UNSUPPORTED_TYPE		15
-#define MM_LINE_TOO_LONG		16
-#define MM_COULD_NOT_WRITE_FILE	17
-
-
-/******************** Matrix Market internal definitions ********************
-
-   MM_matrix_typecode: 4-character sequence
-
-				    ojbect 		sparse/   	data        storage 
-						  		dense     	type        scheme
-
-   string position:	 [0]        [1]			[2]         [3]
-
-   Matrix typecode:  M(atrix)  C(oord)		R(eal)   	G(eneral)
-						        A(array)	C(omplex)   H(ermitian)
-											P(attern)   S(ymmetric)
-								    		I(nteger)	K(kew)
-
- ***********************************************************************/
-
-#define MM_MTX_STR		"matrix"
-#define MM_ARRAY_STR	"array"
-#define MM_DENSE_STR	"array"
-#define MM_COORDINATE_STR "coordinate" 
-#define MM_SPARSE_STR	"coordinate"
-#define MM_COMPLEX_STR	"complex"
-#define MM_REAL_STR		"real"
-#define MM_INT_STR		"integer"
-#define MM_GENERAL_STR  "general"
-#define MM_SYMM_STR		"symmetric"
-#define MM_HERM_STR		"hermitian"
-#define MM_SKEW_STR		"skew-symmetric"
-#define MM_PATTERN_STR  "pattern"
-
-
-/*  high level routines */
-
-int mm_write_mtx_crd(char fname[], int M, int N, int nz, int I[], int J[],
-		 double val[], MM_typecode matcode);
-int mm_read_mtx_crd_data(FILE *f, int M, int N, int nz, int I[], int J[],
-		double val[], MM_typecode matcode);
-int mm_read_mtx_crd_entry(FILE *f, int *I, int *J, double *real, double *img,
-			MM_typecode matcode);
-
-int mm_read_unsymmetric_sparse(const char *fname, int *M_, int *N_, int *nz_,
-                double **val_, int **I_, int **J_);
-
-
-
-#endif
diff --git a/spgemm/mkl_xphi/CMakeLists.txt b/spgemm/mkl_xphi/CMakeLists.txt
index 3763f0fc15fc11544b3007c38c2eb268eb2ab420..17f601e7ec099d355baadb328aa6a312c769ed90 100644
--- a/spgemm/mkl_xphi/CMakeLists.txt
+++ b/spgemm/mkl_xphi/CMakeLists.txt
@@ -52,7 +52,7 @@ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CXX_FLAGS}")
 #if (MKL_FOUND)
 	include_directories(${MKL_INCLUDE_DIR})
 	link_directories(${MKL_LIBRARY_DIR})
-	add_executable(${NAME} mklspgemm.c mmio.c)
+	add_executable(${NAME} mklspgemm.c) 
 	#	target_link_libraries(${NAME} mkl_intel_lp64 mkl_sequential mkl_core pthread m)
 	install(TARGETS ${NAME} DESTINATION bin)
 	#else ()
diff --git a/spgemm/mkl_xphi/mklspgemm.c b/spgemm/mkl_xphi/mklspgemm.c
index 2b4873249c623bd0524010e338afff07a8c53c42..12661eed01dcc90b6f7cccf430759196e903c3ed 100644
--- a/spgemm/mkl_xphi/mklspgemm.c
+++ b/spgemm/mkl_xphi/mklspgemm.c
@@ -3,518 +3,227 @@
 #include <assert.h>
 #include <math.h>
 #include <mkl.h>
-#include "mmio.h"
+#define MIC_TARGET __declspec(target(mic))
+#include "../../common/mmio.h"
+#include "../../common/util.h"
+#include "../../common/csr.h"
 /*}}}*/
 
 /* global varibles and defines {{{*/
 int micdev = 0;
 __declspec(target(mic)) int nrepeat = 100;
-__declspec(target(mic)) int option_print_matrices = 0;
-#define OPTION_NOPRINT_MATRICES 0
-#define OPTION_PRINT_MATRICES 1
 __declspec(target(mic)) char transa = 'n';
-typedef int csi;
-typedef double csv;
-csv zero = 0.0;
-#define CS_MAX(a,b) (((a) > (b)) ? (a) : (b))
 __declspec(target(mic)) int nthreads;
-typedef struct csr_t {
-  csi  m; 
-  csi  n; 
-  csi  nzmax;
-  csi  nr; 
-  csi* r; 
-  csi* p; 
-  csi* j; 
-  csv* x; 
-} csr;
 /*}}}*/
 
-/* method declarations {{{*/
-int  main(int argc, char* argv[]);
-__declspec(target(mic)) void printmm_one(int m, double* Aval, int* AJ, int* AI);
-__declspec(target(mic)) void printfilemm_one(char* file, int m, int n, double* Aval, int* AJ, int* AI);
-__declspec(target(mic)) void printmm_zero(int m, double* Aval, int* AJ, int* AI);
-csr *csr_spfree(csr *A);
-/*}}}*/
-
-/*csr util{{{*/
-
-/* free workspace and return a sparse matrix result */
-csr *csr_done(csr *C, void *w, void *x, csi ok) {
-        return(ok ? C : csr_spfree(C)); /* return result if OK, else free it */
-}
-
-/* wrapper for free */
-void *cs_free(void *p) {
-        if (p)
-                free(p); /* free p if it is not already NULL */
-        return(NULL); /* return NULL to simplify the use of cs_free */
-}
-
-/* wrapper for realloc */
-void *csr_realloc(void *p, csi n, size_t size, csi *ok) {
-        void *pnew = NULL;
-        pnew = realloc(p, CS_MAX(n, 1) * size); /* realloc the block */
-        *ok = (pnew != NULL); /* realloc fails if pnew is NULL */
-        if (pnew == NULL) {
-                printf("%d:reallocation failed, pnew is NULL\n", __LINE__);
+/** Multiply two sparse matrices which are stored in CSR format. MKL is used */
+void mkl_mic_spgemm(MKL_INT Am, MKL_INT An, MKL_INT Annz, double* Aval, MKL_INT* AJ, MKL_INT* AI, MKL_INT Bn, MKL_INT Bnnz, double* Bval, MKL_INT* BJ, MKL_INT* BI, double** pCval, MKL_INT** pCJ, MKL_INT** pCI, double* time) {/*{{{*/
+    int i;
+    #pragma offload target(mic:micdev) inout(nthreads)
+    {
+        if (nthreads > 0) {
+            mkl_set_num_threads(nthreads);
+        } else {
+            nthreads = mkl_get_max_threads();
         }
-//printf("%s:%d: n=%d ok=%d\n", __FUNCTION__, __LINE__, n, *ok);
-        return((*ok) ? pnew : p); /* return original p if failure */
-}
-
-/* wrapper for realloc */
-void *cs_realloc(void *p, csi n, size_t size, csi *ok) {
-        void *pnew = NULL;
-        pnew = realloc(p, CS_MAX(n, 1) * size); /* realloc the block */
-        *ok = (pnew != NULL); /* realloc fails if pnew is NULL */
-        if (pnew == NULL) {
-                printf("reallocation failed\n");
+    }
+    #pragma offload target(mic:micdev) \
+        in(transa) \
+        in(Am) \
+        in(An) \
+        in(Bn) \
+        in(Aval:length(Annz)    free_if(0)) \
+        in(AI:length(Am+1)    free_if(0)) \
+        in(AJ:length(Annz)    free_if(0)) \
+        in(Bval:length(Bnnz)    free_if(0)) \
+        in(BI:length(An+1)    free_if(0)) \
+        in(BJ:length(Bnnz)    free_if(0))
+    {}
+
+    MKL_INT    Cnnz_host = -1;
+    MKL_INT* CI = NULL;
+    MKL_INT* CJ = NULL;
+    double* Cval = NULL;
+    MKL_INT nnzmax = 0;    // nnzmax is zero in case of symbolic&numeric usage of mkl_?csrmultcsr
+    MKL_INT ierr;
+    MKL_INT request = 2;    //numeric multiplication
+    MKL_INT sort = 3;    // sort everything
+    double time_mic_symbolic_mm = 0.0;
+    #pragma offload target(mic:micdev) in(i) in(ierr) in(nnzmax) in(request) in(sort) in(transa) in(Am)  in(An) in(Bn) \
+        out(time_mic_symbolic_mm) out(Cnnz_host)\
+        in(Aval:length(Annz)    alloc_if(0)    free_if(0)) \
+        in(AI:length(Am+1)    alloc_if(0)    free_if(0)) \
+        in(AJ:length(Annz)    alloc_if(0)    free_if(0)) \
+        in(Bval:length(Bnnz)    alloc_if(0)    free_if(0)) \
+        in(BI:length(An+1)    alloc_if(0)    free_if(0)) \
+        in(BJ:length(Bnnz)    alloc_if(0)    free_if(0)) \
+        nocopy(CI:    alloc_if(0)    free_if(0)) \
+        nocopy(CJ:    alloc_if(0)    free_if(0)) \
+        nocopy(Cval:    alloc_if(0)    free_if(0))
+    {
+        CI = (MKL_INT*)mkl_malloc( (Am+2) * sizeof( MKL_INT ), 64 );
+        MKL_INT nnzmax = 0;    // nnzmax is zero in case of symbolic&numeric usage of mkl_?csrmultcsr
+        MKL_INT ierr;
+        MKL_INT request = 1;    // symbolic multiplication
+
+        for(i = 0; i < 10; i++) {
+            mkl_dcsrmultcsr(&transa, &request, &sort, &Am, &An, &Bn, Aval, AJ, AI, Bval, BJ, BI, Cval, CJ, CI, &nnzmax, &ierr);
         }
-        return((*ok) ? pnew : p); /* return original p if failure */
-}
-
-/* change the max # of entries sparse matrix */
-csi csr_sprealloc(csr *A, csi nzmax) {
-        csi ok, oki = 0, okj = 1, okx = 1;
-        if (!A)
-                return(0);
-        if (nzmax <= 0)
-                nzmax = A->p[A->m];
-        A->j = (int*)csr_realloc(A->j, nzmax, sizeof(csi), &oki);
-        if (A->x)
-                A->x = (csv*)csr_realloc(A->x, nzmax, sizeof(csv), &okx);
-        ok = (oki && okj && okx);
-        if (ok)
-                A->nzmax = nzmax;
-        return(ok);
-}
-
-/* free a sparse matrix */
-csr *csr_spfree(csr *A) {
-        if (!A)
-                return(NULL); /* do nothing if A already NULL */
-        cs_free(A->p);
-        A->p = NULL;
-        cs_free(A->j);
-        A->j = NULL;
-        cs_free(A->x);
-        A->x = NULL;
-        cs_free(A->r);
-        A->r = NULL;
-        cs_free(A); /* free the cs struct and return NULL */
-    return NULL;
-}
-
-/* allocate a sparse matrix (triplet form or compressed-ROW form) */
-csr *csr_spalloc(csi m, csi n, csi nzmax, int values, int triplet, csv f) {
-        csr* A = (csr*)calloc(1, sizeof(csr)); /* allocate the cs struct */
-        if (!A) {
-                perror("sparse allocation failed");
-                return(NULL); /* out of memory */
+        double s_initial = dsecnd();
+        for(i = 0; i < nrepeat; i++) {
+            mkl_dcsrmultcsr(&transa, &request, &sort, &Am, &An, &Bn, Aval, AJ, AI, Bval, BJ, BI, Cval, CJ, CI, &nnzmax, &ierr);
         }
-        A->m = m; /* define dimensions and nzmax */
-        A->n = n;
-        A->nzmax = nzmax = CS_MAX(nzmax, 0);
-        A->nr = 0;  // number of nonzero rows
-        A->p = (csi*)calloc(m + 2, sizeof(csi));
-        A->j = (csi*)calloc(CS_MAX(nzmax,1), sizeof(csi));
-        A->x = (csv*)calloc(CS_MAX(nzmax,1), sizeof(csv));
-        return((!A->p || !A->j || !A->x) ? csr_spfree(A) : A);
-
-}/*}}}*/
-
-/** Multiply two sparse matrices which are stored in CSR format. MKL is used */
-csr *csr_multiply(csi Am, csi An, csi Anzmax, const csi* Ap, const csi* Aj, const csv* Ax, csi Bm, csi Bn, csi Bnzmax, const csi* Bp, const csi* Bj, const csv* Bx, long* nummult, csi* xb, csv* x) { /*{{{*/
-    csv tf = 0;
-    csi p, jp, j, kp, k, i, nz = 0, anz, *Cp, *Cj, m, n,
-        bnz, values = 1;
-    csv *Cx;
-    csr *C;
-    if (An != Bm)
-        return(NULL);
-    if (Anzmax == 0 || Bnzmax == 0) {
-        C = csr_spalloc(Am, Bn, 0, values, 0, tf);
-        return C;
+        time_mic_symbolic_mm = (dsecnd() - s_initial) / nrepeat; // seconds
+
+        request = 2;        //numeric multiplication
+        int Cnnz = CI[Am]-1;
+        int Cval_size = Cnnz + 1; Cnnz_host = Cnnz;
+        CJ        = (MKL_INT*)mkl_malloc( Cval_size    *    sizeof( MKL_INT ), 64 );
+        Cval        = (double*)mkl_malloc( Cval_size    *    sizeof( double ), 64 );
+        mkl_dcsrmultcsr(&transa, &request, &sort, &Am, &An, &An, Aval, AJ, AI, Bval, BJ, BI, Cval, CJ, CI, &nnzmax, &ierr);
+        printmm_one(Am, Cval, CJ, CI);
+        sort = 7;        // do not sort anything
+        request = 2;        // numeric multiplication
     }
-    m = Am;
-    anz = Ap[Am];
-    n = Bn;
-    bnz = Bp[Bm];
-    for(i = 0; i < n; i++) xb[i] = 0;
-    for(i = 0; i < n; i++)
-        xb[i] = 0;
-    values = (Ax != NULL) && (Bx != NULL);
-    csi tnz = (anz + bnz) * 2;
-    C = csr_spalloc(m, n, tnz, values, 0, tf); /* allocate result */
-    if (!C || !xb || (values && !x))
-        return (csr_done(C, xb, x, 0));
-    Cp = C->p;
-    for (i = 0; i < m; i++) {
-        if ( ( (nz + n) > C->nzmax ) ) {
-            if(!csr_sprealloc(C, (2 * (C->nzmax) + n) ) ) {
-                return (csr_done(C, xb, x, 0)); // out of memory
-            } else {
-            }
+    double time_mic_numeric_mm = 0.0;
+    #pragma offload target(mic:micdev) nocopy(request) nocopy(sort) nocopy(i) nocopy(ierr) nocopy(nnzmax)  nocopy(transa) nocopy(Am) nocopy(An) nocopy(Bn) \
+        out(time_mic_numeric_mm) \
+        nocopy(Aval:length(Annz)    alloc_if(0)    free_if(0)) \
+        nocopy(AI:length(Am+1)    alloc_if(0)    free_if(0)) \
+        nocopy(AJ:length(Annz)    alloc_if(0)    free_if(0)) \
+        nocopy(Bval:length(Bnnz)    alloc_if(0)    free_if(0)) \
+        nocopy(BI:length(An+1)    alloc_if(0)    free_if(0)) \
+        nocopy(BJ:length(Bnnz)    alloc_if(0)    free_if(0)) \
+        nocopy(CI:    alloc_if(0)    free_if(0)) \
+        nocopy(CJ:    alloc_if(0)    free_if(0)) \
+        nocopy(Cval:    alloc_if(0)    free_if(0))
+    {
+        for(i = 0; i < 10; i++) {
+            mkl_dcsrmultcsr(&transa, &request, &sort, &Am, &An, &An, Aval, AJ, AI, Bval, BJ, BI, Cval, CJ, CI, &nnzmax, &ierr);
         }
-        Cj = C->j;
-        Cx = C->x; /* C->j and C->x may be reallocated */
-        Cp[i] = nz; /* row i of C starts here */
-        for (jp = Ap[i]; jp < Ap[i + 1]; jp++) {
-            j = Aj[jp];
-            for (kp = Bp[j]; kp < Bp[j + 1]; kp++) {
-                k = Bj[kp]; /* B(i,j) is nonzero */
-                if (xb[k] != i + 1) {
-                    xb[k] = i + 1; /* i is new entry in column j */
-                    Cj[nz++] = k; /* add i to pattern of C(:,j) */
-                    if (x) {
-                        x[k] = Ax[jp] * Bx[kp]; /* x(i) = beta*A(i,j) */
-                        (*nummult)++;
-                    }
-                } else if (x) {
-                    x[k] += (Ax[jp] * Bx[kp]); /* i exists in C(:,j) already */
-                    (*nummult)++;
-                }
-            }
+        double s_initial = dsecnd();
+        for(i = 0; i < nrepeat; i++) {
+            mkl_dcsrmultcsr(&transa, &request, &sort, &Am, &An, &An, Aval, AJ, AI, Bval, BJ, BI, Cval, CJ, CI, &nnzmax, &ierr);
         }
-        if (values)
-            for (p = Cp[i]; p < nz; p++)
-                Cx[p] = x[Cj[p]];
+        time_mic_numeric_mm = (dsecnd() - s_initial) / nrepeat; // seconds
     }
-    Cp[m] = nz; /* finalize the last row of C */
-    csr_sprealloc(C, 0); /* remove extra space from C */
-    xb = NULL;
-    x = NULL;
-    return C;
-}/*}}}*/
-
-/** Multiply two sparse matrices which are stored in CSR format. MKL is used */
-void mkl_mic_spgemm(MKL_INT Am, MKL_INT An, MKL_INT Annz, double* Aval, MKL_INT* AJ, MKL_INT* AI, MKL_INT Bn, MKL_INT Bnnz, double* Bval, MKL_INT* BJ, MKL_INT* BI, double** pCval, MKL_INT** pCJ, MKL_INT** pCI, double* time) {/*{{{*/
-
-	int i;
-	#pragma offload target(mic:micdev) inout(nthreads)
-	{
-		if (nthreads > 0) {
-			mkl_set_num_threads(nthreads);
-		} else {
-			nthreads = mkl_get_max_threads();
-		}
-	} 
-	#pragma offload target(mic:micdev) \
-		in(transa) \
-		in(Am) \
-		in(An) \
-		in(Bn) \
-		in(Aval:length(Annz)	free_if(0)) \
-		in(AI:length(Am+1)	free_if(0)) \
-		in(AJ:length(Annz)	free_if(0)) \
-		in(Bval:length(Bnnz)	free_if(0)) \
-		in(BI:length(An+1)	free_if(0)) \
-		in(BJ:length(Bnnz)	free_if(0)) 
-	{}
-
-	MKL_INT	Cnnz_host = -1;
-	MKL_INT* CI = NULL;
-	MKL_INT* CJ = NULL;
-	double* Cval = NULL;
-	MKL_INT nnzmax = 0;	// nnzmax is zero in case of symbolic&numeric usage of mkl_?csrmultcsr
-	MKL_INT ierr;
-	MKL_INT request = 2;	//numeric multiplication
-	MKL_INT sort = 3;	// sort everything
-	double time_mic_symbolic_mm = 0.0;
-	#pragma offload target(mic:micdev) in(i) in(ierr) in(nnzmax) in(request) in(sort) in(transa) in(Am)  in(An) in(Bn) \
-		out(time_mic_symbolic_mm) out(Cnnz_host)\
-		in(Aval:length(Annz)	alloc_if(0)	free_if(0)) \
-		in(AI:length(Am+1)	alloc_if(0)	free_if(0)) \
-		in(AJ:length(Annz)	alloc_if(0)	free_if(0)) \
-		in(Bval:length(Bnnz)	alloc_if(0)	free_if(0)) \
-		in(BI:length(An+1)	alloc_if(0)	free_if(0)) \
-		in(BJ:length(Bnnz)	alloc_if(0)	free_if(0)) \
-		nocopy(CI:	alloc_if(0)	free_if(0)) \
-		nocopy(CJ:	alloc_if(0)	free_if(0)) \
-		nocopy(Cval:	alloc_if(0)	free_if(0))
-	{
-		CI = (MKL_INT*)mkl_malloc( (Am+2) * sizeof( MKL_INT ), 64 );
-		MKL_INT nnzmax = 0;	// nnzmax is zero in case of symbolic&numeric usage of mkl_?csrmultcsr
-		MKL_INT ierr;
-		MKL_INT request = 1;	// symbolic multiplication
-		//MKL_INT sort = 7;	// do not sort anything
-
-		for(i = 0; i < 10; i++) {
-			mkl_dcsrmultcsr(&transa, &request, &sort, &Am, &An, &Bn, Aval, AJ, AI, Bval, BJ, BI, Cval, CJ, CI, &nnzmax, &ierr);
-		}	
-		double s_initial = dsecnd();
-		for(i = 0; i < nrepeat; i++) {
-			mkl_dcsrmultcsr(&transa, &request, &sort, &Am, &An, &Bn, Aval, AJ, AI, Bval, BJ, BI, Cval, CJ, CI, &nnzmax, &ierr);
-		}	
-		time_mic_symbolic_mm = (dsecnd() - s_initial) / nrepeat; // seconds
-
-		request = 2;		//numeric multiplication
-		int Cnnz = CI[Am]-1;
-		int Cval_size = Cnnz + 1; Cnnz_host = Cnnz;
-		CJ		= (MKL_INT*)mkl_malloc( Cval_size	*	sizeof( MKL_INT ), 64 );
-		Cval		= (double*)mkl_malloc( Cval_size	*	sizeof( double ), 64 );
-		mkl_dcsrmultcsr(&transa, &request, &sort, &Am, &An, &An, Aval, AJ, AI, Bval, BJ, BI, Cval, CJ, CI, &nnzmax, &ierr);
-		printmm_one(Am, Cval, CJ, CI);
-		sort = 7;		// do not sort anything
-		request = 2;		// numeric multiplication
-	}
-	double time_mic_numeric_mm = 0.0;
-	#pragma offload target(mic:micdev) nocopy(request) nocopy(sort) nocopy(i) nocopy(ierr) nocopy(nnzmax)  nocopy(transa) nocopy(Am) nocopy(An) nocopy(Bn) \
-		out(time_mic_numeric_mm) \
-		nocopy(Aval:length(Annz)	alloc_if(0)	free_if(0)) \
-		nocopy(AI:length(Am+1)	alloc_if(0)	free_if(0)) \
-		nocopy(AJ:length(Annz)	alloc_if(0)	free_if(0)) \
-		nocopy(Bval:length(Bnnz)	alloc_if(0)	free_if(0)) \
-		nocopy(BI:length(An+1)	alloc_if(0)	free_if(0)) \
-		nocopy(BJ:length(Bnnz)	alloc_if(0)	free_if(0)) \
-		nocopy(CI:	alloc_if(0)	free_if(0)) \
-		nocopy(CJ:	alloc_if(0)	free_if(0)) \
-		nocopy(Cval:	alloc_if(0)	free_if(0))
-	{
-		for(i = 0; i < 10; i++) {
-			mkl_dcsrmultcsr(&transa, &request, &sort, &Am, &An, &An, Aval, AJ, AI, Bval, BJ, BI, Cval, CJ, CI, &nnzmax, &ierr);
-		}
-		double s_initial = dsecnd();
-		for(i = 0; i < nrepeat; i++) {
-			mkl_dcsrmultcsr(&transa, &request, &sort, &Am, &An, &An, Aval, AJ, AI, Bval, BJ, BI, Cval, CJ, CI, &nnzmax, &ierr);
-		}
-		time_mic_numeric_mm = (dsecnd() - s_initial) / nrepeat; // seconds
-	}
-	*time = time_mic_numeric_mm;
-	if (1) { // Copy result matrix from MIC to Host
-	int nelm_CI = Am + 2; 
-	int nelm_CJ = Cnnz_host + 1; 
-	int nelm_Cval = Cnnz_host + 1; 
-	__declspec(target(mic)) MKL_INT* CI_host = (MKL_INT*)mkl_malloc( (nelm_CI) * sizeof( MKL_INT ), 64 );
-	__declspec(target(mic)) MKL_INT* CJ_host = (MKL_INT*)mkl_malloc( (nelm_CJ) * sizeof( MKL_INT ), 64 );
-	__declspec(target(mic)) double* Cval_host = (double*)mkl_malloc( (nelm_Cval) * sizeof( double ), 64 );
-	#pragma offload target(mic:micdev) in(nelm_CI)\ 
-		inout(CI_host:length(nelm_CI)	alloc_if(1)	free_if(0)) \
-		inout(CJ_host:length(nelm_CJ)	alloc_if(1)	free_if(0)) \
-		inout(Cval_host:length(nelm_Cval)	alloc_if(1)	free_if(0)) \
-		nocopy(CI:	alloc_if(0)	free_if(0)) \
-		nocopy(CJ:	alloc_if(0)	free_if(0)) \
-		nocopy(Cval:	alloc_if(0)	free_if(0))
-	{
-		int i;
-		for(i = 0; i < nelm_CI; i++) CI_host[i] = CI[i];
-		for(i = 0; i < nelm_CJ; i++) {CJ_host[i] = CJ[i]; Cval_host[i] = Cval[i];}
-	}
-	*pCval = Cval_host; *pCJ = CJ_host; *pCI = CI_host; 
-	}
-} /* ENDOF mkl_mic_spmm }}}*/
-
-/** Read Matrix Market file into COO matrix */
-void read_mm(char* strpath, int* pM, int* pN, int* prealnnz, int** pI, int** pJ, double** pval){ /*{{{*/
-/* 
-*   taken from Matrix Market I/O library for ANSI C
-*
-*   See http://math.nist.gov/MatrixMarket for details.
-*
-*
-*/
-
-int i, M, N, nz, *I, *J;
-double* val;
-int ret_code;
-MM_typecode matcode;
-FILE* f;
-if ((f = fopen(strpath, "r")) == NULL) {fprintf(stderr, "Input matrix file %s cannot be opened to read.", strpath);exit(1);}
-/* READ MATRIX */
-if (mm_read_banner(f, &matcode) != 0) {
-    printf("Could not process Matrix Market banner.\n");    exit(1);
-}
-/*  This is how one can screen matrix types if their application */
-/*  only supports a subset of the Matrix Market data types.      */
-if (mm_is_complex(matcode) && mm_is_matrix(matcode) && mm_is_sparse(matcode) ) {
-    printf("Sorry, this application does not support ");
-    printf("Market Market type: [%s]\n", mm_typecode_to_str(matcode));exit(1);
-}
-/* find out size of sparse matrix .... */
-if ((ret_code = mm_read_mtx_crd_size(f, &M, &N, &nz)) !=0) exit(1);
-/* reseve memory for matrices */
-I = (int *) malloc((mm_is_symmetric(matcode) ? 2*nz : nz) * sizeof(int));
-J = (int *) malloc((mm_is_symmetric(matcode) ? 2*nz : nz) * sizeof(int));
-val = (double *) malloc((mm_is_symmetric(matcode) ? 2*nz : nz) * sizeof(double));
-*pI = I;
-*pJ = J;
-*pval = val;
-/* NOTE: when reading in doubles, ANSI C requires the use of the "l"  */
-/*   specifier as in "%lg", "%lf", "%le", otherwise errors will occur */
-/*  (ANSI C X3.159-1989, Sec. 4.9.6.2, p. 136 lines 13-15)            */
-int realnnz = 0;
-for (i=0; i<nz; i++) {
-    if(mm_is_pattern(matcode)) {
-        fscanf(f, "%d %d\n", &I[realnnz], &J[realnnz]);
-        val[realnnz] = 1.0;
+    *time = time_mic_numeric_mm;
+    if (1) { // Copy result matrix from MIC to Host
+    int nelm_CI = Am + 2;
+    int nelm_CJ = Cnnz_host + 1;
+    int nelm_Cval = Cnnz_host + 1;
+    __declspec(target(mic)) MKL_INT* CI_host = (MKL_INT*)mkl_malloc( (nelm_CI) * sizeof( MKL_INT ), 64 );
+    __declspec(target(mic)) MKL_INT* CJ_host = (MKL_INT*)mkl_malloc( (nelm_CJ) * sizeof( MKL_INT ), 64 );
+    __declspec(target(mic)) double* Cval_host = (double*)mkl_malloc( (nelm_Cval) * sizeof( double ), 64 );
+    #pragma offload target(mic:micdev) in(nelm_CI)\
+        inout(CI_host:length(nelm_CI)    alloc_if(1)    free_if(0)) \
+        inout(CJ_host:length(nelm_CJ)    alloc_if(1)    free_if(0)) \
+        inout(Cval_host:length(nelm_Cval)    alloc_if(1)    free_if(0)) \
+        nocopy(CI:    alloc_if(0)    free_if(0)) \
+        nocopy(CJ:    alloc_if(0)    free_if(0)) \
+        nocopy(Cval:    alloc_if(0)    free_if(0))
+    {
+        int i;
+        for(i = 0; i < nelm_CI; i++) CI_host[i] = CI[i];
+        for(i = 0; i < nelm_CJ; i++) {CJ_host[i] = CJ[i]; Cval_host[i] = Cval[i];}
     }
-    else
-        fscanf(f, "%d %d %lg\n", &I[realnnz], &J[realnnz], &val[realnnz]);
-    I[realnnz]--;  /* adjust from 1-based to 0-based */
-        J[realnnz]--;
-    if(mm_is_symmetric(matcode) && I[realnnz] != J[realnnz]) {
-        I[realnnz+1] = J[realnnz];
-        J[realnnz+1] = I[realnnz];
-        val[realnnz+1] = val[realnnz];
-        realnnz++;
+    *pCval = Cval_host; *pCJ = CJ_host; *pCI = CI_host;
     }
-    realnnz++;
-}
-if (f !=stdin) fclose(f);
-*pM = M;
-*pN = N;
-*prealnnz = realnnz;
-} /* ENDOF read_mm }}}*/
-
-/** Converts COO matrix to CSR matrix */
-void coo_to_csr(int m, int nnz, int* I, int* J, double* val, MKL_INT* AI, MKL_INT* AJ, double* Aval) { /*{{{*/
-
-MKL_INT info = 0;
-MKL_INT job[8];   
-//job[1]=0; // zero based indexing in csr
-job[1]=1; // one based indexing in csr
-
-job[2]=0; // zero based indexing in coo
-job[3]=2; // I don't know
-job[4]=nnz; // nnz
-
-job[0]=1;  // coo to csr
-job[5]=0;  // Acsr and AJR allocated by user
-//void mkl_dcsrcoo (MKL_INT * job, MKL_INT * n, double *Acsr, MKL_INT * AJR, MKL_INT *AIR, MKL_INT * nnz, double *Acoo, MKL_INT * ir, MKL_INT * jc, MKL_INT * info);
-mkl_dcsrcoo (job,&m, Aval, AJ, AI, &nnz, val, I, J, &info);
-} /* ENDOF coo_to_csr }}}*/
+} /* ENDOF mkl_mic_spmm }}}*/
 
 int main(int argc, char* argv[]) { /*{{{*/
-/** usage */
-int nrequired_args = 7;
-if (argc != nrequired_args){
-    fprintf(stderr, "NAME:\n\tmkl_spgemm - multiply two sparse matrices\n");
-    fprintf(stderr, "\nSYNOPSIS:\n");
-    fprintf(stderr, "\tmkl_spgemm MATRIX_A MATRIX_B MATRIX_C NUMBER_OF_THREADS MIC_ID PRINT_MATRICES\n");
-    fprintf(stderr, "\nDESCRIPTION:\n");
-    fprintf(stderr, "\tNUMBER_OF_THREADS: {0,1,2,...}\n");
-    fprintf(stderr, "\t\t0: Use number of threads determined by MKL\n");
-    fprintf(stderr, "\tPRINT_MATRICES: PRINT_YES, PRINT_NO\n");
-    fprintf(stderr, "\tMIC_ID: {0,1,2,...}\n");
-    fprintf(stderr, "\t\t0: The device ID\n");
-    fprintf(stderr, "\nSAMPLE EXECUTION:\n");
-    fprintf(stderr, "\texport OFFLOAD_INIT=on_offload;%s test.mtx test.mtx out.mtx 2 0 PRINT_YES\n", argv[0]);
-    exit(1);
-}
-/** parse arguments */
-int iarg = 1;
-char* strpathA = argv[iarg];    iarg++;
-char* strpathB = argv[iarg];    iarg++;
-char* strpathC = argv[iarg];    iarg++;
-nthreads = atoi(argv[iarg]);    iarg++;
-if (nthreads > 0) {
-    mkl_set_num_threads(nthreads);
-} else {
-    nthreads = mkl_get_max_threads();
-}
-micdev = atoi(argv[iarg]);    iarg++;
-option_print_matrices = strcmp(argv[iarg], "PRINT_YES")==0?OPTION_PRINT_MATRICES:OPTION_NOPRINT_MATRICES;    iarg++;
-assert(nrequired_args == iarg);
-
-/** read matrix market file for A matrix */
-MKL_INT Am, An, Annz;
-MKL_INT *Ax, *Ay;  
-double *Anz;
-read_mm(strpathA, &Am, &An, &Annz, &Ax, &Ay, &Anz);
-
-/** construct csr storage for A matrix */
-MKL_INT* AJ = (MKL_INT*) mkl_malloc( Annz * sizeof( MKL_INT ), 64 );
-MKL_INT* AI = (MKL_INT*) mkl_malloc( (Am+1) * sizeof( MKL_INT ), 64 );
-double* Aval = (double*) mkl_malloc( Annz * sizeof( double ),  64 );
-coo_to_csr(Am, Annz, Ax, Ay, Anz, AI, AJ, Aval);    
-
-/** read matrix market file for B matrix */
-MKL_INT Bm, Bn, Bnnz;   
-MKL_INT *Bx, *By;  
-double *Bnz;
-read_mm(strpathB, &Bm, &Bn, &Bnnz, &Bx, &By, &Bnz);
-
-/** construct csr storage for B matrix */
-MKL_INT* BJ = (MKL_INT*) mkl_malloc( Bnnz * sizeof( MKL_INT ), 64 );
-MKL_INT* BI = (MKL_INT*) mkl_malloc( (Bm+1) * sizeof( MKL_INT ), 64 );
-double* Bval = (double*) mkl_malloc( Bnnz * sizeof( double ),  64 );
-coo_to_csr(Bm, Bnnz, Bx, By, Bnz, BI, BJ, Bval);    
-
-/** multiply two matrices */
-double* Cval, time; 
-MKL_INT* CJ; MKL_INT* CI;
-mkl_mic_spgemm(Am, An, Annz, Aval, AJ, AI, Bn, Bnnz, Bval, BJ, BI, &Cval, &CJ, &CI, &time);
-
-int i;
-for(i=0;i<=Am;i++)AI[i]--;    for(i=0;i<Annz;i++)AJ[i]--;
-for(i=0;i<=Bm;i++)BI[i]--;    for(i=0;i<Bnnz;i++)BJ[i]--;
-printmm_one(Am, Cval, CJ, CI);
-
-/** In order to write the output C matrix to file, uncomment the following line */
-printfilemm_one(strpathC, Am, Bn, Cval, CJ, CI);
-
-/** run my SpGEMM routine in order to find number of multiply-and-add operations */
-long nummult = 0;
-csi* xb = (csi*)calloc(Bn, sizeof(csi));
-csv* x = (csv*)calloc(Bn, sizeof(csv));
-csr* C = csr_multiply(Am, An, Annz, AI, AJ, Aval, Bm, Bn, Bnnz, BI, BJ, Bval, &nummult, xb, x);
-double gflop = 2 * (double) nummult / 1e9;
-
-/** print gflop per second and time */
-printf("%d\t", nthreads);
-printf("%g\t", (gflop/time));
-printf("%g\t", time);
-printf("%s\t", strpathA);
-printf("%s\t", strpathB);
-printf("%s\t", strpathC);
-printf("\n");
-
-/** free allocated space */
-mkl_free(AI);
-mkl_free(AJ);
-mkl_free(Aval);
-mkl_free(BI);
-mkl_free(BJ);
-mkl_free(Bval);
-return 0;
-} /* ENDOF main }}}*/
-
-/** Prints matrix in CSR format */
-void printmm_one(int m, double* Aval, int* AJ, int* AI){ //{{{
+    /** usage */
+    int nrequired_args = 7;
+    if (argc != nrequired_args){
+        fprintf(stderr, "NAME:\n\tmkl_spgemm - multiply two sparse matrices\n");
+        fprintf(stderr, "\nSYNOPSIS:\n");
+        fprintf(stderr, "\tmkl_spgemm MATRIX_A MATRIX_B MATRIX_C NUMBER_OF_THREADS MIC_ID PRINT_MATRICES\n");
+        fprintf(stderr, "\nDESCRIPTION:\n");
+        fprintf(stderr, "\tNUMBER_OF_THREADS: {0,1,2,...}\n");
+        fprintf(stderr, "\t\t0: Use number of threads determined by MKL\n");
+        fprintf(stderr, "\tPRINT_MATRICES: PRINT_YES, PRINT_NO\n");
+        fprintf(stderr, "\tMIC_ID: {0,1,2,...}\n");
+        fprintf(stderr, "\t\t0: The device ID\n");
+        fprintf(stderr, "\nSAMPLE EXECUTION:\n");
+        fprintf(stderr, "\texport OFFLOAD_INIT=on_offload;%s test.mtx test.mtx out.mtx 2 0 PRINT_YES\n", argv[0]);
+        exit(1);
+    }
+    /** parse arguments */
+    int iarg = 1;
+    char* strpathA = argv[iarg];    iarg++;
+    char* strpathB = argv[iarg];    iarg++;
+    char* strpathC = argv[iarg];    iarg++;
+    nthreads = atoi(argv[iarg]);    iarg++;
+    if (nthreads > 0) {
+        mkl_set_num_threads(nthreads);
+    } else {
+        nthreads = mkl_get_max_threads();
+    }
+    micdev = atoi(argv[iarg]);    iarg++;
+    option_print_matrices = strcmp(argv[iarg], "PRINT_YES")==0?OPTION_PRINT_MATRICES:OPTION_NOPRINT_MATRICES;    iarg++;
+    assert(nrequired_args == iarg);
+
+    /** read matrix market file for A matrix */
+    MKL_INT Am, An, Annz;
+    MKL_INT *Ax, *Ay;
+    double *Anz;
+    read_mm(strpathA, &Am, &An, &Annz, &Ax, &Ay, &Anz);
+
+    /** construct csr storage for A matrix */
+    MKL_INT* AJ = (MKL_INT*) mkl_malloc( Annz * sizeof( MKL_INT ), 64 );
+    MKL_INT* AI = (MKL_INT*) mkl_malloc( (Am+1) * sizeof( MKL_INT ), 64 );
+    double* Aval = (double*) mkl_malloc( Annz * sizeof( double ),  64 );
+    coo_to_csr(Am, Annz, Ax, Ay, Anz, AI, AJ, Aval);
+
+    /** read matrix market file for B matrix */
+    MKL_INT Bm, Bn, Bnnz;
+    MKL_INT *Bx, *By;
+    double *Bnz;
+    read_mm(strpathB, &Bm, &Bn, &Bnnz, &Bx, &By, &Bnz);
+
+    /** construct csr storage for B matrix */
+    MKL_INT* BJ = (MKL_INT*) mkl_malloc( Bnnz * sizeof( MKL_INT ), 64 );
+    MKL_INT* BI = (MKL_INT*) mkl_malloc( (Bm+1) * sizeof( MKL_INT ), 64 );
+    double* Bval = (double*) mkl_malloc( Bnnz * sizeof( double ),  64 );
+    coo_to_csr(Bm, Bnnz, Bx, By, Bnz, BI, BJ, Bval);
+
+    /** multiply two matrices */
+    double* Cval, time;
+    MKL_INT* CJ; MKL_INT* CI;
+    mkl_mic_spgemm(Am, An, Annz, Aval, AJ, AI, Bn, Bnnz, Bval, BJ, BI, &Cval, &CJ, &CI, &time);
 
-    if (option_print_matrices == OPTION_NOPRINT_MATRICES)
-        return;
     int i;
-    for(i = 0; i < m; i++) {
-        printf("%d: ", i+1);
-        int j;
-        for(j = AI[i]-1; j < AI[i+1]-1; j++) {
-            printf("%d:%g  ", AJ[j], Aval[j]);
-        }
-        printf("\n");
-    }
+    for(i=0;i<=Am;i++)AI[i]--;    for(i=0;i<Annz;i++)AJ[i]--;
+    for(i=0;i<=Bm;i++)BI[i]--;    for(i=0;i<Bnnz;i++)BJ[i]--;
+    printmm_one(Am, Cval, CJ, CI);
+
+    /** Write the output C matrix to file */
+    printfilemm_one(strpathC, Am, Bn, Cval, CJ, CI);
+
+    /** run my SpGEMM routine in order to find number of multiply-and-add operations */
+    long nummult = 0;
+    csi* xb = (csi*)calloc(Bn, sizeof(csi));
+    csv* x = (csv*)calloc(Bn, sizeof(csv));
+    csr* C = csr_multiply(Am, An, Annz, AI, AJ, Aval, Bm, Bn, Bnnz, BI, BJ, Bval, &nummult, xb, x);
+    double gflop = 2 * (double) nummult / 1e9;
+
+    /** print gflop per second and time */
+    printf("%d\t", nthreads);
+    printf("%g\t", (gflop/time));
+    printf("%g\t", time);
+    printf("%s\t", strpathA);
+    printf("%s\t", strpathB);
+    printf("%s\t", strpathC);
     printf("\n");
-}//}}}
 
-/** Writes matrix in CSR format in to a file using Matrix Market format */
-void printfilemm_one(char* file, int m, int n, double* Aval, int* AJ, int* AI){//{{{
+    /** free allocated space */
+    mkl_free(AI);
+    mkl_free(AJ);
+    mkl_free(Aval);
+    mkl_free(BI);
+    mkl_free(BJ);
+    mkl_free(Bval);
+    return 0;
+} /* ENDOF main }}}*/
 
-    FILE* f = fopen(file, "w");
-    if(f == NULL){
-        printf("%s %s %d: %s cannot be opened to write matrix\n", __FILE__, __PRETTY_FUNCTION__, __LINE__, file);
-        exit(1); 
-    }
-    int i;
-    fprintf(f, "%%%%MatrixMarket matrix coordinate real general\n");
-    fprintf(f, "%d %d %d\n", m, n, AI[m]-1);
-    for(i = 0; i < m; i++) {
-        int j;
-        for(j = AI[i]-1; j < AI[i+1]-1; j++) {
-            fprintf(f, "%d %d %g\n", i+1, AJ[j], Aval[j]);
-        }
-    }
-    fclose(f);
-}//}}}
 
diff --git a/spgemm/mkl_xphi/mmio.c b/spgemm/mkl_xphi/mmio.c
deleted file mode 100644
index c250ff2aed998fe65248537a8b19a359206187ce..0000000000000000000000000000000000000000
--- a/spgemm/mkl_xphi/mmio.c
+++ /dev/null
@@ -1,511 +0,0 @@
-/* 
-*   Matrix Market I/O library for ANSI C
-*
-*   See http://math.nist.gov/MatrixMarket for details.
-*
-*
-*/
-
-
-#include <stdio.h>
-#include <string.h>
-#include <stdlib.h>
-#include <ctype.h>
-
-#include "mmio.h"
-
-int mm_read_unsymmetric_sparse(const char *fname, int *M_, int *N_, int *nz_,
-                double **val_, int **I_, int **J_)
-{
-    FILE *f;
-    MM_typecode matcode;
-    int M, N, nz;
-    int i;
-    double *val;
-    int *I, *J;
- 
-    if ((f = fopen(fname, "r")) == NULL)
-            return -1;
- 
- 
-    if (mm_read_banner(f, &matcode) != 0)
-    {
-        printf("mm_read_unsymetric: Could not process Matrix Market banner ");
-        printf(" in file [%s]\n", fname);
-        return -1;
-    }
- 
- 
- 
-    if ( !(mm_is_real(matcode) && mm_is_matrix(matcode) &&
-            mm_is_sparse(matcode)))
-    {
-        fprintf(stderr, "Sorry, this application does not support ");
-        fprintf(stderr, "Market Market type: [%s]\n",
-                mm_typecode_to_str(matcode));
-        return -1;
-    }
- 
-    /* find out size of sparse matrix: M, N, nz .... */
- 
-    if (mm_read_mtx_crd_size(f, &M, &N, &nz) !=0)
-    {
-        fprintf(stderr, "read_unsymmetric_sparse(): could not parse matrix size.\n");
-        return -1;
-    }
- 
-    *M_ = M;
-    *N_ = N;
-    *nz_ = nz;
- 
-    /* reseve memory for matrices */
- 
-    I = (int *) malloc(nz * sizeof(int));
-    J = (int *) malloc(nz * sizeof(int));
-    val = (double *) malloc(nz * sizeof(double));
- 
-    *val_ = val;
-    *I_ = I;
-    *J_ = J;
- 
-    /* NOTE: when reading in doubles, ANSI C requires the use of the "l"  */
-    /*   specifier as in "%lg", "%lf", "%le", otherwise errors will occur */
-    /*  (ANSI C X3.159-1989, Sec. 4.9.6.2, p. 136 lines 13-15)            */
- 
-    for (i=0; i<nz; i++)
-    {
-        fscanf(f, "%d %d %lg\n", &I[i], &J[i], &val[i]);
-        I[i]--;  /* adjust from 1-based to 0-based */
-        J[i]--;
-    }
-    fclose(f);
- 
-    return 0;
-}
-
-int mm_is_valid(MM_typecode matcode)
-{
-    if (!mm_is_matrix(matcode)) return 0;
-    if (mm_is_dense(matcode) && mm_is_pattern(matcode)) return 0;
-    if (mm_is_real(matcode) && mm_is_hermitian(matcode)) return 0;
-    if (mm_is_pattern(matcode) && (mm_is_hermitian(matcode) || 
-                mm_is_skew(matcode))) return 0;
-    return 1;
-}
-
-int mm_read_banner(FILE *f, MM_typecode *matcode)
-{
-    char line[MM_MAX_LINE_LENGTH];
-    char banner[MM_MAX_TOKEN_LENGTH];
-    char mtx[MM_MAX_TOKEN_LENGTH]; 
-    char crd[MM_MAX_TOKEN_LENGTH];
-    char data_type[MM_MAX_TOKEN_LENGTH];
-    char storage_scheme[MM_MAX_TOKEN_LENGTH];
-    char *p;
-
-
-    mm_clear_typecode(matcode);  
-
-    if (fgets(line, MM_MAX_LINE_LENGTH, f) == NULL) 
-        return MM_PREMATURE_EOF;
-
-    if (sscanf(line, "%s %s %s %s %s", banner, mtx, crd, data_type, 
-        storage_scheme) != 5)
-        return MM_PREMATURE_EOF;
-
-    for (p=mtx; *p!='\0'; *p=tolower(*p),p++);  /* convert to lower case */
-    for (p=crd; *p!='\0'; *p=tolower(*p),p++);  
-    for (p=data_type; *p!='\0'; *p=tolower(*p),p++);
-    for (p=storage_scheme; *p!='\0'; *p=tolower(*p),p++);
-
-    /* check for banner */
-    if (strncmp(banner, MatrixMarketBanner, strlen(MatrixMarketBanner)) != 0)
-        return MM_NO_HEADER;
-
-    /* first field should be "mtx" */
-    if (strcmp(mtx, MM_MTX_STR) != 0)
-        return  MM_UNSUPPORTED_TYPE;
-    mm_set_matrix(matcode);
-
-
-    /* second field describes whether this is a sparse matrix (in coordinate
-            storgae) or a dense array */
-
-
-    if (strcmp(crd, MM_SPARSE_STR) == 0)
-        mm_set_sparse(matcode);
-    else
-    if (strcmp(crd, MM_DENSE_STR) == 0)
-            mm_set_dense(matcode);
-    else
-        return MM_UNSUPPORTED_TYPE;
-    
-
-    /* third field */
-
-    if (strcmp(data_type, MM_REAL_STR) == 0)
-        mm_set_real(matcode);
-    else
-    if (strcmp(data_type, MM_COMPLEX_STR) == 0)
-        mm_set_complex(matcode);
-    else
-    if (strcmp(data_type, MM_PATTERN_STR) == 0)
-        mm_set_pattern(matcode);
-    else
-    if (strcmp(data_type, MM_INT_STR) == 0)
-        mm_set_integer(matcode);
-    else
-        return MM_UNSUPPORTED_TYPE;
-    
-
-    /* fourth field */
-
-    if (strcmp(storage_scheme, MM_GENERAL_STR) == 0)
-        mm_set_general(matcode);
-    else
-    if (strcmp(storage_scheme, MM_SYMM_STR) == 0)
-        mm_set_symmetric(matcode);
-    else
-    if (strcmp(storage_scheme, MM_HERM_STR) == 0)
-        mm_set_hermitian(matcode);
-    else
-    if (strcmp(storage_scheme, MM_SKEW_STR) == 0)
-        mm_set_skew(matcode);
-    else
-        return MM_UNSUPPORTED_TYPE;
-        
-
-    return 0;
-}
-
-int mm_write_mtx_crd_size(FILE *f, int M, int N, int nz)
-{
-    if (fprintf(f, "%d %d %d\n", M, N, nz) != 3)
-        return MM_COULD_NOT_WRITE_FILE;
-    else 
-        return 0;
-}
-
-int mm_read_mtx_crd_size(FILE *f, int *M, int *N, int *nz )
-{
-    char line[MM_MAX_LINE_LENGTH];
-    int num_items_read;
-
-    /* set return null parameter values, in case we exit with errors */
-    *M = *N = *nz = 0;
-
-    /* now continue scanning until you reach the end-of-comments */
-    do 
-    {
-        if (fgets(line,MM_MAX_LINE_LENGTH,f) == NULL) 
-            return MM_PREMATURE_EOF;
-    }while (line[0] == '%');
-
-    /* line[] is either blank or has M,N, nz */
-    if (sscanf(line, "%d %d %d", M, N, nz) == 3)
-        return 0;
-        
-    else
-    do
-    { 
-        num_items_read = fscanf(f, "%d %d %d", M, N, nz); 
-        if (num_items_read == EOF) return MM_PREMATURE_EOF;
-    }
-    while (num_items_read != 3);
-
-    return 0;
-}
-
-
-int mm_read_mtx_array_size(FILE *f, int *M, int *N)
-{
-    char line[MM_MAX_LINE_LENGTH];
-    int num_items_read;
-    /* set return null parameter values, in case we exit with errors */
-    *M = *N = 0;
-	
-    /* now continue scanning until you reach the end-of-comments */
-    do 
-    {
-        if (fgets(line,MM_MAX_LINE_LENGTH,f) == NULL) 
-            return MM_PREMATURE_EOF;
-    }while (line[0] == '%');
-
-    /* line[] is either blank or has M,N, nz */
-    if (sscanf(line, "%d %d", M, N) == 2)
-        return 0;
-        
-    else /* we have a blank line */
-    do
-    { 
-        num_items_read = fscanf(f, "%d %d", M, N); 
-        if (num_items_read == EOF) return MM_PREMATURE_EOF;
-    }
-    while (num_items_read != 2);
-
-    return 0;
-}
-
-int mm_write_mtx_array_size(FILE *f, int M, int N)
-{
-    if (fprintf(f, "%d %d\n", M, N) != 2)
-        return MM_COULD_NOT_WRITE_FILE;
-    else 
-        return 0;
-}
-
-
-
-/*-------------------------------------------------------------------------*/
-
-/******************************************************************/
-/* use when I[], J[], and val[]J, and val[] are already allocated */
-/******************************************************************/
-
-int mm_read_mtx_crd_data(FILE *f, int M, int N, int nz, int I[], int J[],
-        double val[], MM_typecode matcode)
-{
-    int i;
-    if (mm_is_complex(matcode))
-    {
-        for (i=0; i<nz; i++)
-            if (fscanf(f, "%d %d %lg %lg", &I[i], &J[i], &val[2*i], &val[2*i+1])
-                != 4) return MM_PREMATURE_EOF;
-    }
-    else if (mm_is_real(matcode))
-    {
-        for (i=0; i<nz; i++)
-        {
-            if (fscanf(f, "%d %d %lg\n", &I[i], &J[i], &val[i])
-                != 3) return MM_PREMATURE_EOF;
-
-        }
-    }
-
-    else if (mm_is_pattern(matcode))
-    {
-        for (i=0; i<nz; i++)
-            if (fscanf(f, "%d %d", &I[i], &J[i])
-                != 2) return MM_PREMATURE_EOF;
-    }
-    else
-        return MM_UNSUPPORTED_TYPE;
-
-    return 0;
-        
-}
-
-int mm_read_mtx_crd_entry(FILE *f, int *I, int *J,
-        double *real, double *imag, MM_typecode matcode)
-{
-    if (mm_is_complex(matcode))
-    {
-            if (fscanf(f, "%d %d %lg %lg", I, J, real, imag)
-                != 4) return MM_PREMATURE_EOF;
-    }
-    else if (mm_is_real(matcode))
-    {
-            if (fscanf(f, "%d %d %lg\n", I, J, real)
-                != 3) return MM_PREMATURE_EOF;
-
-    }
-
-    else if (mm_is_pattern(matcode))
-    {
-            if (fscanf(f, "%d %d", I, J) != 2) return MM_PREMATURE_EOF;
-    }
-    else
-        return MM_UNSUPPORTED_TYPE;
-
-    return 0;
-        
-}
-
-
-/************************************************************************
-    mm_read_mtx_crd()  fills M, N, nz, array of values, and return
-                        type code, e.g. 'MCRS'
-
-                        if matrix is complex, values[] is of size 2*nz,
-                            (nz pairs of real/imaginary values)
-************************************************************************/
-
-int mm_read_mtx_crd(char *fname, int *M, int *N, int *nz, int **I, int **J, 
-        double **val, MM_typecode *matcode)
-{
-    int ret_code;
-    FILE *f;
-
-    if (strcmp(fname, "stdin") == 0) f=stdin;
-    else
-    if ((f = fopen(fname, "r")) == NULL)
-        return MM_COULD_NOT_READ_FILE;
-
-
-    if ((ret_code = mm_read_banner(f, matcode)) != 0)
-        return ret_code;
-
-    if (!(mm_is_valid(*matcode) && mm_is_sparse(*matcode) && 
-            mm_is_matrix(*matcode)))
-        return MM_UNSUPPORTED_TYPE;
-
-    if ((ret_code = mm_read_mtx_crd_size(f, M, N, nz)) != 0)
-        return ret_code;
-
-
-    *I = (int *)  malloc(*nz * sizeof(int));
-    *J = (int *)  malloc(*nz * sizeof(int));
-    *val = NULL;
-
-    if (mm_is_complex(*matcode))
-    {
-        *val = (double *) malloc(*nz * 2 * sizeof(double));
-        ret_code = mm_read_mtx_crd_data(f, *M, *N, *nz, *I, *J, *val, 
-                *matcode);
-        if (ret_code != 0) return ret_code;
-    }
-    else if (mm_is_real(*matcode))
-    {
-        *val = (double *) malloc(*nz * sizeof(double));
-        ret_code = mm_read_mtx_crd_data(f, *M, *N, *nz, *I, *J, *val, 
-                *matcode);
-        if (ret_code != 0) return ret_code;
-    }
-
-    else if (mm_is_pattern(*matcode))
-    {
-        ret_code = mm_read_mtx_crd_data(f, *M, *N, *nz, *I, *J, *val, 
-                *matcode);
-        if (ret_code != 0) return ret_code;
-    }
-
-    if (f != stdin) fclose(f);
-    return 0;
-}
-
-int mm_write_banner(FILE *f, MM_typecode matcode)
-{
-    char *str = mm_typecode_to_str(matcode);
-    int ret_code;
-
-    ret_code = fprintf(f, "%s %s\n", MatrixMarketBanner, str);
-    free(str);
-    if (ret_code !=2 )
-        return MM_COULD_NOT_WRITE_FILE;
-    else
-        return 0;
-}
-
-int mm_write_mtx_crd(char fname[], int M, int N, int nz, int I[], int J[],
-        double val[], MM_typecode matcode)
-{
-    FILE *f;
-    int i;
-
-    if (strcmp(fname, "stdout") == 0) 
-        f = stdout;
-    else
-    if ((f = fopen(fname, "w")) == NULL)
-        return MM_COULD_NOT_WRITE_FILE;
-    
-    /* print banner followed by typecode */
-    fprintf(f, "%s ", MatrixMarketBanner);
-    fprintf(f, "%s\n", mm_typecode_to_str(matcode));
-
-    /* print matrix sizes and nonzeros */
-    fprintf(f, "%d %d %d\n", M, N, nz);
-
-    /* print values */
-    if (mm_is_pattern(matcode))
-        for (i=0; i<nz; i++)
-            fprintf(f, "%d %d\n", I[i], J[i]);
-    else
-    if (mm_is_real(matcode))
-        for (i=0; i<nz; i++)
-            fprintf(f, "%d %d %20.16g\n", I[i], J[i], val[i]);
-    else
-    if (mm_is_complex(matcode))
-        for (i=0; i<nz; i++)
-            fprintf(f, "%d %d %20.16g %20.16g\n", I[i], J[i], val[2*i], 
-                        val[2*i+1]);
-    else
-    {
-        if (f != stdout) fclose(f);
-        return MM_UNSUPPORTED_TYPE;
-    }
-
-    if (f !=stdout) fclose(f);
-
-    return 0;
-}
-  
-
-/**
-*  Create a new copy of a string s.  mm_strdup() is a common routine, but
-*  not part of ANSI C, so it is included here.  Used by mm_typecode_to_str().
-*
-*/
-char *mm_strdup(const char *s)
-{
-	int len = strlen(s);
-	char *s2 = (char *) malloc((len+1)*sizeof(char));
-	return strcpy(s2, s);
-}
-
-char  *mm_typecode_to_str(MM_typecode matcode)
-{
-    char buffer[MM_MAX_LINE_LENGTH];
-    char *types[4];
-	char *mm_strdup(const char *);
-    int error =0;
-
-    /* check for MTX type */
-    if (mm_is_matrix(matcode)) 
-        types[0] = MM_MTX_STR;
-    else
-        error=1;
-
-    /* check for CRD or ARR matrix */
-    if (mm_is_sparse(matcode))
-        types[1] = MM_SPARSE_STR;
-    else
-    if (mm_is_dense(matcode))
-        types[1] = MM_DENSE_STR;
-    else
-        return NULL;
-
-    /* check for element data type */
-    if (mm_is_real(matcode))
-        types[2] = MM_REAL_STR;
-    else
-    if (mm_is_complex(matcode))
-        types[2] = MM_COMPLEX_STR;
-    else
-    if (mm_is_pattern(matcode))
-        types[2] = MM_PATTERN_STR;
-    else
-    if (mm_is_integer(matcode))
-        types[2] = MM_INT_STR;
-    else
-        return NULL;
-
-
-    /* check for symmetry type */
-    if (mm_is_general(matcode))
-        types[3] = MM_GENERAL_STR;
-    else
-    if (mm_is_symmetric(matcode))
-        types[3] = MM_SYMM_STR;
-    else 
-    if (mm_is_hermitian(matcode))
-        types[3] = MM_HERM_STR;
-    else 
-    if (mm_is_skew(matcode))
-        types[3] = MM_SKEW_STR;
-    else
-        return NULL;
-
-    sprintf(buffer,"%s %s %s %s", types[0], types[1], types[2], types[3]);
-    return mm_strdup(buffer);
-
-}
diff --git a/spgemm/mkl_xphi/mmio.h b/spgemm/mkl_xphi/mmio.h
deleted file mode 100644
index 7cfd0a1b7ae7572e85b8c10bcb2fd0b3333ad0b6..0000000000000000000000000000000000000000
--- a/spgemm/mkl_xphi/mmio.h
+++ /dev/null
@@ -1,133 +0,0 @@
-/* 
-*   Matrix Market I/O library for ANSI C
-*
-*   See http://math.nist.gov/MatrixMarket for details.
-*
-*
-*/
-
-#ifndef MM_IO_H
-#define MM_IO_H
-
-#define MM_MAX_LINE_LENGTH 1025
-#define MatrixMarketBanner "%%MatrixMarket"
-#define MM_MAX_TOKEN_LENGTH 64
-
-typedef char MM_typecode[4];
-
-char *mm_typecode_to_str(MM_typecode matcode);
-
-int mm_read_banner(FILE *f, MM_typecode *matcode);
-int mm_read_mtx_crd_size(FILE *f, int *M, int *N, int *nz);
-int mm_read_mtx_array_size(FILE *f, int *M, int *N);
-
-int mm_write_banner(FILE *f, MM_typecode matcode);
-int mm_write_mtx_crd_size(FILE *f, int M, int N, int nz);
-int mm_write_mtx_array_size(FILE *f, int M, int N);
-
-
-/********************* MM_typecode query fucntions ***************************/
-
-#define mm_is_matrix(typecode)	((typecode)[0]=='M')
-
-#define mm_is_sparse(typecode)	((typecode)[1]=='C')
-#define mm_is_coordinate(typecode)((typecode)[1]=='C')
-#define mm_is_dense(typecode)	((typecode)[1]=='A')
-#define mm_is_array(typecode)	((typecode)[1]=='A')
-
-#define mm_is_complex(typecode)	((typecode)[2]=='C')
-#define mm_is_real(typecode)		((typecode)[2]=='R')
-#define mm_is_pattern(typecode)	((typecode)[2]=='P')
-#define mm_is_integer(typecode) ((typecode)[2]=='I')
-
-#define mm_is_symmetric(typecode)((typecode)[3]=='S')
-#define mm_is_general(typecode)	((typecode)[3]=='G')
-#define mm_is_skew(typecode)	((typecode)[3]=='K')
-#define mm_is_hermitian(typecode)((typecode)[3]=='H')
-
-int mm_is_valid(MM_typecode matcode);		/* too complex for a macro */
-
-
-/********************* MM_typecode modify fucntions ***************************/
-
-#define mm_set_matrix(typecode)	((*typecode)[0]='M')
-#define mm_set_coordinate(typecode)	((*typecode)[1]='C')
-#define mm_set_array(typecode)	((*typecode)[1]='A')
-#define mm_set_dense(typecode)	mm_set_array(typecode)
-#define mm_set_sparse(typecode)	mm_set_coordinate(typecode)
-
-#define mm_set_complex(typecode)((*typecode)[2]='C')
-#define mm_set_real(typecode)	((*typecode)[2]='R')
-#define mm_set_pattern(typecode)((*typecode)[2]='P')
-#define mm_set_integer(typecode)((*typecode)[2]='I')
-
-
-#define mm_set_symmetric(typecode)((*typecode)[3]='S')
-#define mm_set_general(typecode)((*typecode)[3]='G')
-#define mm_set_skew(typecode)	((*typecode)[3]='K')
-#define mm_set_hermitian(typecode)((*typecode)[3]='H')
-
-#define mm_clear_typecode(typecode) ((*typecode)[0]=(*typecode)[1]= \
-									(*typecode)[2]=' ',(*typecode)[3]='G')
-
-#define mm_initialize_typecode(typecode) mm_clear_typecode(typecode)
-
-
-/********************* Matrix Market error codes ***************************/
-
-
-#define MM_COULD_NOT_READ_FILE	11
-#define MM_PREMATURE_EOF		12
-#define MM_NOT_MTX				13
-#define MM_NO_HEADER			14
-#define MM_UNSUPPORTED_TYPE		15
-#define MM_LINE_TOO_LONG		16
-#define MM_COULD_NOT_WRITE_FILE	17
-
-
-/******************** Matrix Market internal definitions ********************
-
-   MM_matrix_typecode: 4-character sequence
-
-				    ojbect 		sparse/   	data        storage 
-						  		dense     	type        scheme
-
-   string position:	 [0]        [1]			[2]         [3]
-
-   Matrix typecode:  M(atrix)  C(oord)		R(eal)   	G(eneral)
-						        A(array)	C(omplex)   H(ermitian)
-											P(attern)   S(ymmetric)
-								    		I(nteger)	K(kew)
-
- ***********************************************************************/
-
-#define MM_MTX_STR		"matrix"
-#define MM_ARRAY_STR	"array"
-#define MM_DENSE_STR	"array"
-#define MM_COORDINATE_STR "coordinate" 
-#define MM_SPARSE_STR	"coordinate"
-#define MM_COMPLEX_STR	"complex"
-#define MM_REAL_STR		"real"
-#define MM_INT_STR		"integer"
-#define MM_GENERAL_STR  "general"
-#define MM_SYMM_STR		"symmetric"
-#define MM_HERM_STR		"hermitian"
-#define MM_SKEW_STR		"skew-symmetric"
-#define MM_PATTERN_STR  "pattern"
-
-
-/*  high level routines */
-
-int mm_write_mtx_crd(char fname[], int M, int N, int nz, int I[], int J[],
-		 double val[], MM_typecode matcode);
-int mm_read_mtx_crd_data(FILE *f, int M, int N, int nz, int I[], int J[],
-		double val[], MM_typecode matcode);
-int mm_read_mtx_crd_entry(FILE *f, int *I, int *J, double *real, double *img,
-			MM_typecode matcode);
-
-int mm_read_unsymmetric_sparse(const char *fname, int *M_, int *N_, int *nz_,
-                double **val_, int **I_, int **J_);
-
-
-
-#endif
diff --git a/spmv/mkl_shmem/CMakeLists.txt b/spmv/mkl_shmem/CMakeLists.txt
index 535317497621bbfdfd885d41109af28444fbe4a8..f51bf5b6a50fa576cf9e4e911e80a44f1cb00be2 100644
--- a/spmv/mkl_shmem/CMakeLists.txt
+++ b/spmv/mkl_shmem/CMakeLists.txt
@@ -47,7 +47,7 @@ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CXX_FLAGS}")
 if (MKL_FOUND)
 	include_directories(${MKL_INCLUDE_DIR})
 	link_directories(${MKL_LIBRARY_DIR})
-	add_executable(${NAME} mklspmv.c mmio.c)
+	add_executable(${NAME} mklspmv.c)
 	target_link_libraries(${NAME} mkl_intel_lp64 mkl_sequential mkl_core pthread m)
 	install(TARGETS ${NAME} DESTINATION bin)
 else ()
diff --git a/spmv/mkl_shmem/mklspmv.c b/spmv/mkl_shmem/mklspmv.c
index 0c26b868f87994fb71b2738fc5e85910adacf6ba..bdec50ffa107d77faad5139b80e4759d352a623b 100644
--- a/spmv/mkl_shmem/mklspmv.c
+++ b/spmv/mkl_shmem/mklspmv.c
@@ -3,227 +3,102 @@
 #include <assert.h>
 #include <math.h>
 #include <mkl.h>
-#include "mmio.h"
+#define MIC_TARGET
+#include "../../common/mmio.h"
+#include "../../common/util.h"
 /*}}}*/
 
 /* global varibles and defines {{{*/
 int nrepeat = 100;
-int option_print_matrices = 0;
-#define OPTION_NOPRINT_MATRICES 0
-#define OPTION_PRINT_MATRICES 1
 char transa = 'n';
 /*}}}*/
 
-/* method declarations {{{*/
-int  main(int argc, char* argv[]);
-void printmm_one(int m, double* Aval, int* AJ, int* AI);
-void printfilemm_one(char* file, int m, int n, double* Aval, int* AJ, int* AI);
-void printmm_zero(int m, double* Aval, int* AJ, int* AI);
-/*}}}*/
-
 /** Multiply two sparse matrices which are stored in CSR format. MKL is used */
 void mkl_cpu_spmv(const MKL_INT Am, const MKL_INT An,  double* Aval, MKL_INT* AJ, MKL_INT* AI, double* xvec, double* yvec, double* time) {  /*{{{*/
+    MKL_INT ierr;
+    MKL_INT request = 1;    // symbolic multiplication
+    double alpha = 1.0;
+    double beta = 0.0;
+    char* matdescra = "G NF";
 
-MKL_INT ierr;
-MKL_INT request = 1;    // symbolic multiplication
-double alpha = 1.0;
-double beta = 0.0;
-char* matdescra = "G NF";
-
-double time_st = dsecnd();
-int i;
-for(i = 0; i < nrepeat; i++) {
-    /** y := alpha*A*x + beta*y */
-    mkl_dcsrmv(&transa,  &Am, &An, &alpha, matdescra, Aval, AJ, AI, (AI+1), xvec, &beta, yvec);
-}
-double time_end = dsecnd();
-*time = (time_end - time_st)/nrepeat;
-} /*}}}*/ 
-
-/** Read Matrix Market file into COO matrix */
-void read_mm(char* strpath, int* pM, int* pN, int* prealnnz, int** pI, int** pJ, double** pval){ /*{{{*/
-/* 
-*   taken from Matrix Market I/O library for ANSI C
-*
-*   See http://math.nist.gov/MatrixMarket for details.
-*
-*
-*/
-
-int i, M, N, nz, *I, *J;
-double* val;
-int ret_code;
-MM_typecode matcode;
-FILE* f;
-if ((f = fopen(strpath, "r")) == NULL) {fprintf(stderr, "Input matrix file %s cannot be opened to read.", strpath);exit(1);}
-/* READ MATRIX */
-if (mm_read_banner(f, &matcode) != 0) {
-    printf("Could not process Matrix Market banner.\n");    exit(1);
-}
-/*  This is how one can screen matrix types if their application */
-/*  only supports a subset of the Matrix Market data types.      */
-if (mm_is_complex(matcode) && mm_is_matrix(matcode) && mm_is_sparse(matcode) ) {
-    printf("Sorry, this application does not support ");
-    printf("Market Market type: [%s]\n", mm_typecode_to_str(matcode));exit(1);
-}
-/* find out size of sparse matrix .... */
-if ((ret_code = mm_read_mtx_crd_size(f, &M, &N, &nz)) !=0) exit(1);
-/* reseve memory for matrices */
-I = (int *) malloc((mm_is_symmetric(matcode) ? 2*nz : nz) * sizeof(int));
-J = (int *) malloc((mm_is_symmetric(matcode) ? 2*nz : nz) * sizeof(int));
-val = (double *) malloc((mm_is_symmetric(matcode) ? 2*nz : nz) * sizeof(double));
-*pI = I;
-*pJ = J;
-*pval = val;
-/* NOTE: when reading in doubles, ANSI C requires the use of the "l"  */
-/*   specifier as in "%lg", "%lf", "%le", otherwise errors will occur */
-/*  (ANSI C X3.159-1989, Sec. 4.9.6.2, p. 136 lines 13-15)            */
-int realnnz = 0;
-for (i=0; i<nz; i++) {
-    if(mm_is_pattern(matcode)) {
-        fscanf(f, "%d %d\n", &I[realnnz], &J[realnnz]);
-        val[realnnz] = 1.0;
-    }
-    else
-        fscanf(f, "%d %d %lg\n", &I[realnnz], &J[realnnz], &val[realnnz]);
-    I[realnnz]--;  /* adjust from 1-based to 0-based */
-        J[realnnz]--;
-    if(mm_is_symmetric(matcode) && I[realnnz] != J[realnnz]) {
-        I[realnnz+1] = J[realnnz];
-        J[realnnz+1] = I[realnnz];
-        val[realnnz+1] = val[realnnz];
-        realnnz++;
+    double time_st = dsecnd();
+    int i;
+    for(i = 0; i < nrepeat; i++) {
+        /** y := alpha*A*x + beta*y */
+        mkl_dcsrmv(&transa,  &Am, &An, &alpha, matdescra, Aval, AJ, AI, (AI+1), xvec, &beta, yvec);
     }
-    realnnz++;
-}
-if (f !=stdin) fclose(f);
-*pM = M;
-*pN = N;
-*prealnnz = realnnz;
-} /* ENDOF read_mm }}}*/
-
-/** Converts COO matrix to CSR matrix */
-void coo_to_csr(int m, int nnz, int* I, int* J, double* val, MKL_INT* AI, MKL_INT* AJ, double* Aval) { /*{{{*/
-
-MKL_INT info = 0;
-MKL_INT job[8];   
-//job[1]=0; // zero based indexing in csr
-job[1]=1; // one based indexing in csr
-
-job[2]=0; // zero based indexing in coo
-job[3]=2; // I don't know
-job[4]=nnz; // nnz
-
-job[0]=1;  // coo to csr
-job[5]=0;  // Acsr and AJR allocated by user
-//void mkl_dcsrcoo (MKL_INT * job, MKL_INT * n, double *Acsr, MKL_INT * AJR, MKL_INT *AIR, MKL_INT * nnz, double *Acoo, MKL_INT * ir, MKL_INT * jc, MKL_INT * info);
-mkl_dcsrcoo (job,&m, Aval, AJ, AI, &nnz, val, I, J, &info);
-} /* ENDOF coo_to_csr }}}*/
+    double time_end = dsecnd();
+    *time = (time_end - time_st)/nrepeat;
+} /*}}}*/
 
 int main(int argc, char* argv[]) { /*{{{*/
-/** usage */
-int nrequired_args = 4;
-if (argc != nrequired_args){
-    fprintf(stderr, "NAME:\n\tmkl_spmv - multiply a sparse matrix with a dense vector\n");
-    fprintf(stderr, "\nSYNOPSIS:\n");
-    fprintf(stderr, "\tmkl_spmv MATRIX_A NUMBER_OF_THREADS PRINT_MATRIX\n");
-    fprintf(stderr, "\nDESCRIPTION:\n");
-    fprintf(stderr, "\tNUMBER_OF_THREADS: {0,1,2,...}\n");
-    fprintf(stderr, "\t\t0: Use number of threads determined by MKL\n");
-    fprintf(stderr, "\tPRINT_MATRIX: PRINT_YES, PRINT_NO\n");
-    fprintf(stderr, "\nSAMPLE EXECUTION:\n");
-    fprintf(stderr, "\t%s test.mtx 2 PRINT_YES\n", argv[0]);
-    exit(1);
-}
-/** parse arguments */
-int iarg = 1;
-char* strpathA = argv[iarg];    iarg++;
-int nthreads = atoi(argv[iarg]);    iarg++;
-if (nthreads > 0) {
-    mkl_set_num_threads(nthreads);
-} else {
-    nthreads = mkl_get_max_threads();
-}
-option_print_matrices = strcmp(argv[iarg], "PRINT_YES")==0?OPTION_PRINT_MATRICES:OPTION_NOPRINT_MATRICES;    iarg++;
-assert(nrequired_args == iarg);
-
-/** read matrix market file for A matrix */
-MKL_INT Am, An, Annz;
-MKL_INT *Ax, *Ay;  
-double *Anz;
-read_mm(strpathA, &Am, &An, &Annz, &Ax, &Ay, &Anz);
-
-/** construct csr storage for A matrix */
-MKL_INT* AJ = (MKL_INT*) mkl_malloc( Annz * sizeof( MKL_INT ), 64 );
-MKL_INT* AI = (MKL_INT*) mkl_malloc( (Am+1) * sizeof( MKL_INT ), 64 );
-double* Aval = (double*) mkl_malloc( Annz * sizeof( double ),  64 );
-coo_to_csr(Am, Annz, Ax, Ay, Anz, AI, AJ, Aval);    
-
-double* xvec = (double*) mkl_malloc( An * sizeof( double ), 64 ); 
-double* yvec = (double*) mkl_malloc( Am * sizeof( double ), 64 );
-int i;
-for(i=0;i<An;i++)xvec[i]=1.0;
-for(i=0;i<Am;i++)yvec[i]=0.0;
-
-double time; 
-mkl_cpu_spmv(Am, An, Aval, AJ, AI, xvec, yvec, &time);
-
-printmm_one(Am, Aval, AJ, AI);
-
-/** number of multiply-and-add operations in terms of giga flops*/
-double gflop = 2 * (double) Annz / 1e9;
+    /** usage */
+    int nrequired_args = 4;
+    if (argc != nrequired_args){
+        fprintf(stderr, "NAME:\n\tmkl_spmv - multiply a sparse matrix with a dense vector\n");
+        fprintf(stderr, "\nSYNOPSIS:\n");
+        fprintf(stderr, "\tmkl_spmv MATRIX_A NUMBER_OF_THREADS PRINT_MATRIX\n");
+        fprintf(stderr, "\nDESCRIPTION:\n");
+        fprintf(stderr, "\tNUMBER_OF_THREADS: {0,1,2,...}\n");
+        fprintf(stderr, "\t\t0: Use number of threads determined by MKL\n");
+        fprintf(stderr, "\tPRINT_MATRIX: PRINT_YES, PRINT_NO\n");
+        fprintf(stderr, "\nSAMPLE EXECUTION:\n");
+        fprintf(stderr, "\t%s test.mtx 2 PRINT_YES\n", argv[0]);
+        exit(1);
+    }
+    /** parse arguments */
+    int iarg = 1;
+    char* strpathA = argv[iarg];    iarg++;
+    int nthreads = atoi(argv[iarg]);    iarg++;
+    if (nthreads > 0) {
+        mkl_set_num_threads(nthreads);
+    } else {
+        nthreads = mkl_get_max_threads();
+    }
+    option_print_matrices = strcmp(argv[iarg], "PRINT_YES")==0?OPTION_PRINT_MATRICES:OPTION_NOPRINT_MATRICES;    iarg++;
+    assert(nrequired_args == iarg);
+
+    /** read matrix market file for A matrix */
+    MKL_INT Am, An, Annz;
+    MKL_INT *Ax, *Ay;
+    double *Anz;
+    read_mm(strpathA, &Am, &An, &Annz, &Ax, &Ay, &Anz);
+
+    /** construct csr storage for A matrix */
+    MKL_INT* AJ = (MKL_INT*) mkl_malloc( Annz * sizeof( MKL_INT ), 64 );
+    MKL_INT* AI = (MKL_INT*) mkl_malloc( (Am+1) * sizeof( MKL_INT ), 64 );
+    double* Aval = (double*) mkl_malloc( Annz * sizeof( double ),  64 );
+    coo_to_csr(Am, Annz, Ax, Ay, Anz, AI, AJ, Aval);
+
+    double* xvec = (double*) mkl_malloc( An * sizeof( double ), 64 );
+    double* yvec = (double*) mkl_malloc( Am * sizeof( double ), 64 );
+    int i;
+    for(i=0;i<An;i++)xvec[i]=1.0;
+    for(i=0;i<Am;i++)yvec[i]=0.0;
 
-/** print gflop per second and time */
-printf("%d\t", nthreads);
-printf("%g\t", (gflop/time));
-printf("%g\t", time);
-printf("%s\t", strpathA);
-printf("\n");
+    double time;
+    mkl_cpu_spmv(Am, An, Aval, AJ, AI, xvec, yvec, &time);
 
-printf("Output vector:\n");
-for(i=0;i<Am;i++)printf("%g ", yvec[i]);
-printf("\n");
-/** free allocated space */
-mkl_free(AI);
-mkl_free(AJ);
-mkl_free(Aval);
-return 0;
-} /* ENDOF main }}}*/
+    printmm_one(Am, Aval, AJ, AI);
 
-/** Prints matrix in CSR format */
-void printmm_one(int m, double* Aval, int* AJ, int* AI){ //{{{
+    /** number of multiply-and-add operations in terms of giga flops*/
+    double gflop = 2 * (double) Annz / 1e9;
 
-    if (option_print_matrices == OPTION_NOPRINT_MATRICES)
-        return;
-    int i;
-    for(i = 0; i < m; i++) {
-        printf("%d: ", i+1);
-        int j;
-        for(j = AI[i]-1; j < AI[i+1]-1; j++) {
-            printf("%d:%g  ", AJ[j], Aval[j]);
-        }
-        printf("\n");
-    }
+    /** print gflop per second and time */
+    printf("%d\t", nthreads);
+    printf("%g\t", (gflop/time));
+    printf("%g\t", time);
+    printf("%s\t", strpathA);
     printf("\n");
-}//}}}
 
-/** Writes matrix in CSR format in to a file using Matrix Market format */
-void printfilemm_one(char* file, int m, int n, double* Aval, int* AJ, int* AI){//{{{
+    printf("Output vector:\n");
+    for(i=0;i<Am;i++)printf("%g ", yvec[i]);
+    printf("\n");
+    /** free allocated space */
+    mkl_free(AI);
+    mkl_free(AJ);
+    mkl_free(Aval);
+    return 0;
+} /* ENDOF main }}}*/
 
-    FILE* f = fopen(file, "w");
-    if(f == NULL){
-        printf("%s %s %d: %s cannot be opened to write matrix\n", __FILE__, __PRETTY_FUNCTION__, __LINE__, file);
-        exit(1); 
-    }
-    int i;
-    fprintf(f, "%%%%MatrixMarket matrix coordinate real general\n");
-    fprintf(f, "%d %d %d\n", m, n, AI[m]-1);
-    for(i = 0; i < m; i++) {
-        int j;
-        for(j = AI[i]-1; j < AI[i+1]-1; j++) {
-            fprintf(f, "%d %d %g\n", i+1, AJ[j], Aval[j]);
-        }
-    }
-    fclose(f);
-}//}}}
 
diff --git a/spmv/mkl_shmem/mmio.c b/spmv/mkl_shmem/mmio.c
deleted file mode 100644
index c250ff2aed998fe65248537a8b19a359206187ce..0000000000000000000000000000000000000000
--- a/spmv/mkl_shmem/mmio.c
+++ /dev/null
@@ -1,511 +0,0 @@
-/* 
-*   Matrix Market I/O library for ANSI C
-*
-*   See http://math.nist.gov/MatrixMarket for details.
-*
-*
-*/
-
-
-#include <stdio.h>
-#include <string.h>
-#include <stdlib.h>
-#include <ctype.h>
-
-#include "mmio.h"
-
-int mm_read_unsymmetric_sparse(const char *fname, int *M_, int *N_, int *nz_,
-                double **val_, int **I_, int **J_)
-{
-    FILE *f;
-    MM_typecode matcode;
-    int M, N, nz;
-    int i;
-    double *val;
-    int *I, *J;
- 
-    if ((f = fopen(fname, "r")) == NULL)
-            return -1;
- 
- 
-    if (mm_read_banner(f, &matcode) != 0)
-    {
-        printf("mm_read_unsymetric: Could not process Matrix Market banner ");
-        printf(" in file [%s]\n", fname);
-        return -1;
-    }
- 
- 
- 
-    if ( !(mm_is_real(matcode) && mm_is_matrix(matcode) &&
-            mm_is_sparse(matcode)))
-    {
-        fprintf(stderr, "Sorry, this application does not support ");
-        fprintf(stderr, "Market Market type: [%s]\n",
-                mm_typecode_to_str(matcode));
-        return -1;
-    }
- 
-    /* find out size of sparse matrix: M, N, nz .... */
- 
-    if (mm_read_mtx_crd_size(f, &M, &N, &nz) !=0)
-    {
-        fprintf(stderr, "read_unsymmetric_sparse(): could not parse matrix size.\n");
-        return -1;
-    }
- 
-    *M_ = M;
-    *N_ = N;
-    *nz_ = nz;
- 
-    /* reseve memory for matrices */
- 
-    I = (int *) malloc(nz * sizeof(int));
-    J = (int *) malloc(nz * sizeof(int));
-    val = (double *) malloc(nz * sizeof(double));
- 
-    *val_ = val;
-    *I_ = I;
-    *J_ = J;
- 
-    /* NOTE: when reading in doubles, ANSI C requires the use of the "l"  */
-    /*   specifier as in "%lg", "%lf", "%le", otherwise errors will occur */
-    /*  (ANSI C X3.159-1989, Sec. 4.9.6.2, p. 136 lines 13-15)            */
- 
-    for (i=0; i<nz; i++)
-    {
-        fscanf(f, "%d %d %lg\n", &I[i], &J[i], &val[i]);
-        I[i]--;  /* adjust from 1-based to 0-based */
-        J[i]--;
-    }
-    fclose(f);
- 
-    return 0;
-}
-
-int mm_is_valid(MM_typecode matcode)
-{
-    if (!mm_is_matrix(matcode)) return 0;
-    if (mm_is_dense(matcode) && mm_is_pattern(matcode)) return 0;
-    if (mm_is_real(matcode) && mm_is_hermitian(matcode)) return 0;
-    if (mm_is_pattern(matcode) && (mm_is_hermitian(matcode) || 
-                mm_is_skew(matcode))) return 0;
-    return 1;
-}
-
-int mm_read_banner(FILE *f, MM_typecode *matcode)
-{
-    char line[MM_MAX_LINE_LENGTH];
-    char banner[MM_MAX_TOKEN_LENGTH];
-    char mtx[MM_MAX_TOKEN_LENGTH]; 
-    char crd[MM_MAX_TOKEN_LENGTH];
-    char data_type[MM_MAX_TOKEN_LENGTH];
-    char storage_scheme[MM_MAX_TOKEN_LENGTH];
-    char *p;
-
-
-    mm_clear_typecode(matcode);  
-
-    if (fgets(line, MM_MAX_LINE_LENGTH, f) == NULL) 
-        return MM_PREMATURE_EOF;
-
-    if (sscanf(line, "%s %s %s %s %s", banner, mtx, crd, data_type, 
-        storage_scheme) != 5)
-        return MM_PREMATURE_EOF;
-
-    for (p=mtx; *p!='\0'; *p=tolower(*p),p++);  /* convert to lower case */
-    for (p=crd; *p!='\0'; *p=tolower(*p),p++);  
-    for (p=data_type; *p!='\0'; *p=tolower(*p),p++);
-    for (p=storage_scheme; *p!='\0'; *p=tolower(*p),p++);
-
-    /* check for banner */
-    if (strncmp(banner, MatrixMarketBanner, strlen(MatrixMarketBanner)) != 0)
-        return MM_NO_HEADER;
-
-    /* first field should be "mtx" */
-    if (strcmp(mtx, MM_MTX_STR) != 0)
-        return  MM_UNSUPPORTED_TYPE;
-    mm_set_matrix(matcode);
-
-
-    /* second field describes whether this is a sparse matrix (in coordinate
-            storgae) or a dense array */
-
-
-    if (strcmp(crd, MM_SPARSE_STR) == 0)
-        mm_set_sparse(matcode);
-    else
-    if (strcmp(crd, MM_DENSE_STR) == 0)
-            mm_set_dense(matcode);
-    else
-        return MM_UNSUPPORTED_TYPE;
-    
-
-    /* third field */
-
-    if (strcmp(data_type, MM_REAL_STR) == 0)
-        mm_set_real(matcode);
-    else
-    if (strcmp(data_type, MM_COMPLEX_STR) == 0)
-        mm_set_complex(matcode);
-    else
-    if (strcmp(data_type, MM_PATTERN_STR) == 0)
-        mm_set_pattern(matcode);
-    else
-    if (strcmp(data_type, MM_INT_STR) == 0)
-        mm_set_integer(matcode);
-    else
-        return MM_UNSUPPORTED_TYPE;
-    
-
-    /* fourth field */
-
-    if (strcmp(storage_scheme, MM_GENERAL_STR) == 0)
-        mm_set_general(matcode);
-    else
-    if (strcmp(storage_scheme, MM_SYMM_STR) == 0)
-        mm_set_symmetric(matcode);
-    else
-    if (strcmp(storage_scheme, MM_HERM_STR) == 0)
-        mm_set_hermitian(matcode);
-    else
-    if (strcmp(storage_scheme, MM_SKEW_STR) == 0)
-        mm_set_skew(matcode);
-    else
-        return MM_UNSUPPORTED_TYPE;
-        
-
-    return 0;
-}
-
-int mm_write_mtx_crd_size(FILE *f, int M, int N, int nz)
-{
-    if (fprintf(f, "%d %d %d\n", M, N, nz) != 3)
-        return MM_COULD_NOT_WRITE_FILE;
-    else 
-        return 0;
-}
-
-int mm_read_mtx_crd_size(FILE *f, int *M, int *N, int *nz )
-{
-    char line[MM_MAX_LINE_LENGTH];
-    int num_items_read;
-
-    /* set return null parameter values, in case we exit with errors */
-    *M = *N = *nz = 0;
-
-    /* now continue scanning until you reach the end-of-comments */
-    do 
-    {
-        if (fgets(line,MM_MAX_LINE_LENGTH,f) == NULL) 
-            return MM_PREMATURE_EOF;
-    }while (line[0] == '%');
-
-    /* line[] is either blank or has M,N, nz */
-    if (sscanf(line, "%d %d %d", M, N, nz) == 3)
-        return 0;
-        
-    else
-    do
-    { 
-        num_items_read = fscanf(f, "%d %d %d", M, N, nz); 
-        if (num_items_read == EOF) return MM_PREMATURE_EOF;
-    }
-    while (num_items_read != 3);
-
-    return 0;
-}
-
-
-int mm_read_mtx_array_size(FILE *f, int *M, int *N)
-{
-    char line[MM_MAX_LINE_LENGTH];
-    int num_items_read;
-    /* set return null parameter values, in case we exit with errors */
-    *M = *N = 0;
-	
-    /* now continue scanning until you reach the end-of-comments */
-    do 
-    {
-        if (fgets(line,MM_MAX_LINE_LENGTH,f) == NULL) 
-            return MM_PREMATURE_EOF;
-    }while (line[0] == '%');
-
-    /* line[] is either blank or has M,N, nz */
-    if (sscanf(line, "%d %d", M, N) == 2)
-        return 0;
-        
-    else /* we have a blank line */
-    do
-    { 
-        num_items_read = fscanf(f, "%d %d", M, N); 
-        if (num_items_read == EOF) return MM_PREMATURE_EOF;
-    }
-    while (num_items_read != 2);
-
-    return 0;
-}
-
-int mm_write_mtx_array_size(FILE *f, int M, int N)
-{
-    if (fprintf(f, "%d %d\n", M, N) != 2)
-        return MM_COULD_NOT_WRITE_FILE;
-    else 
-        return 0;
-}
-
-
-
-/*-------------------------------------------------------------------------*/
-
-/******************************************************************/
-/* use when I[], J[], and val[]J, and val[] are already allocated */
-/******************************************************************/
-
-int mm_read_mtx_crd_data(FILE *f, int M, int N, int nz, int I[], int J[],
-        double val[], MM_typecode matcode)
-{
-    int i;
-    if (mm_is_complex(matcode))
-    {
-        for (i=0; i<nz; i++)
-            if (fscanf(f, "%d %d %lg %lg", &I[i], &J[i], &val[2*i], &val[2*i+1])
-                != 4) return MM_PREMATURE_EOF;
-    }
-    else if (mm_is_real(matcode))
-    {
-        for (i=0; i<nz; i++)
-        {
-            if (fscanf(f, "%d %d %lg\n", &I[i], &J[i], &val[i])
-                != 3) return MM_PREMATURE_EOF;
-
-        }
-    }
-
-    else if (mm_is_pattern(matcode))
-    {
-        for (i=0; i<nz; i++)
-            if (fscanf(f, "%d %d", &I[i], &J[i])
-                != 2) return MM_PREMATURE_EOF;
-    }
-    else
-        return MM_UNSUPPORTED_TYPE;
-
-    return 0;
-        
-}
-
-int mm_read_mtx_crd_entry(FILE *f, int *I, int *J,
-        double *real, double *imag, MM_typecode matcode)
-{
-    if (mm_is_complex(matcode))
-    {
-            if (fscanf(f, "%d %d %lg %lg", I, J, real, imag)
-                != 4) return MM_PREMATURE_EOF;
-    }
-    else if (mm_is_real(matcode))
-    {
-            if (fscanf(f, "%d %d %lg\n", I, J, real)
-                != 3) return MM_PREMATURE_EOF;
-
-    }
-
-    else if (mm_is_pattern(matcode))
-    {
-            if (fscanf(f, "%d %d", I, J) != 2) return MM_PREMATURE_EOF;
-    }
-    else
-        return MM_UNSUPPORTED_TYPE;
-
-    return 0;
-        
-}
-
-
-/************************************************************************
-    mm_read_mtx_crd()  fills M, N, nz, array of values, and return
-                        type code, e.g. 'MCRS'
-
-                        if matrix is complex, values[] is of size 2*nz,
-                            (nz pairs of real/imaginary values)
-************************************************************************/
-
-int mm_read_mtx_crd(char *fname, int *M, int *N, int *nz, int **I, int **J, 
-        double **val, MM_typecode *matcode)
-{
-    int ret_code;
-    FILE *f;
-
-    if (strcmp(fname, "stdin") == 0) f=stdin;
-    else
-    if ((f = fopen(fname, "r")) == NULL)
-        return MM_COULD_NOT_READ_FILE;
-
-
-    if ((ret_code = mm_read_banner(f, matcode)) != 0)
-        return ret_code;
-
-    if (!(mm_is_valid(*matcode) && mm_is_sparse(*matcode) && 
-            mm_is_matrix(*matcode)))
-        return MM_UNSUPPORTED_TYPE;
-
-    if ((ret_code = mm_read_mtx_crd_size(f, M, N, nz)) != 0)
-        return ret_code;
-
-
-    *I = (int *)  malloc(*nz * sizeof(int));
-    *J = (int *)  malloc(*nz * sizeof(int));
-    *val = NULL;
-
-    if (mm_is_complex(*matcode))
-    {
-        *val = (double *) malloc(*nz * 2 * sizeof(double));
-        ret_code = mm_read_mtx_crd_data(f, *M, *N, *nz, *I, *J, *val, 
-                *matcode);
-        if (ret_code != 0) return ret_code;
-    }
-    else if (mm_is_real(*matcode))
-    {
-        *val = (double *) malloc(*nz * sizeof(double));
-        ret_code = mm_read_mtx_crd_data(f, *M, *N, *nz, *I, *J, *val, 
-                *matcode);
-        if (ret_code != 0) return ret_code;
-    }
-
-    else if (mm_is_pattern(*matcode))
-    {
-        ret_code = mm_read_mtx_crd_data(f, *M, *N, *nz, *I, *J, *val, 
-                *matcode);
-        if (ret_code != 0) return ret_code;
-    }
-
-    if (f != stdin) fclose(f);
-    return 0;
-}
-
-int mm_write_banner(FILE *f, MM_typecode matcode)
-{
-    char *str = mm_typecode_to_str(matcode);
-    int ret_code;
-
-    ret_code = fprintf(f, "%s %s\n", MatrixMarketBanner, str);
-    free(str);
-    if (ret_code !=2 )
-        return MM_COULD_NOT_WRITE_FILE;
-    else
-        return 0;
-}
-
-int mm_write_mtx_crd(char fname[], int M, int N, int nz, int I[], int J[],
-        double val[], MM_typecode matcode)
-{
-    FILE *f;
-    int i;
-
-    if (strcmp(fname, "stdout") == 0) 
-        f = stdout;
-    else
-    if ((f = fopen(fname, "w")) == NULL)
-        return MM_COULD_NOT_WRITE_FILE;
-    
-    /* print banner followed by typecode */
-    fprintf(f, "%s ", MatrixMarketBanner);
-    fprintf(f, "%s\n", mm_typecode_to_str(matcode));
-
-    /* print matrix sizes and nonzeros */
-    fprintf(f, "%d %d %d\n", M, N, nz);
-
-    /* print values */
-    if (mm_is_pattern(matcode))
-        for (i=0; i<nz; i++)
-            fprintf(f, "%d %d\n", I[i], J[i]);
-    else
-    if (mm_is_real(matcode))
-        for (i=0; i<nz; i++)
-            fprintf(f, "%d %d %20.16g\n", I[i], J[i], val[i]);
-    else
-    if (mm_is_complex(matcode))
-        for (i=0; i<nz; i++)
-            fprintf(f, "%d %d %20.16g %20.16g\n", I[i], J[i], val[2*i], 
-                        val[2*i+1]);
-    else
-    {
-        if (f != stdout) fclose(f);
-        return MM_UNSUPPORTED_TYPE;
-    }
-
-    if (f !=stdout) fclose(f);
-
-    return 0;
-}
-  
-
-/**
-*  Create a new copy of a string s.  mm_strdup() is a common routine, but
-*  not part of ANSI C, so it is included here.  Used by mm_typecode_to_str().
-*
-*/
-char *mm_strdup(const char *s)
-{
-	int len = strlen(s);
-	char *s2 = (char *) malloc((len+1)*sizeof(char));
-	return strcpy(s2, s);
-}
-
-char  *mm_typecode_to_str(MM_typecode matcode)
-{
-    char buffer[MM_MAX_LINE_LENGTH];
-    char *types[4];
-	char *mm_strdup(const char *);
-    int error =0;
-
-    /* check for MTX type */
-    if (mm_is_matrix(matcode)) 
-        types[0] = MM_MTX_STR;
-    else
-        error=1;
-
-    /* check for CRD or ARR matrix */
-    if (mm_is_sparse(matcode))
-        types[1] = MM_SPARSE_STR;
-    else
-    if (mm_is_dense(matcode))
-        types[1] = MM_DENSE_STR;
-    else
-        return NULL;
-
-    /* check for element data type */
-    if (mm_is_real(matcode))
-        types[2] = MM_REAL_STR;
-    else
-    if (mm_is_complex(matcode))
-        types[2] = MM_COMPLEX_STR;
-    else
-    if (mm_is_pattern(matcode))
-        types[2] = MM_PATTERN_STR;
-    else
-    if (mm_is_integer(matcode))
-        types[2] = MM_INT_STR;
-    else
-        return NULL;
-
-
-    /* check for symmetry type */
-    if (mm_is_general(matcode))
-        types[3] = MM_GENERAL_STR;
-    else
-    if (mm_is_symmetric(matcode))
-        types[3] = MM_SYMM_STR;
-    else 
-    if (mm_is_hermitian(matcode))
-        types[3] = MM_HERM_STR;
-    else 
-    if (mm_is_skew(matcode))
-        types[3] = MM_SKEW_STR;
-    else
-        return NULL;
-
-    sprintf(buffer,"%s %s %s %s", types[0], types[1], types[2], types[3]);
-    return mm_strdup(buffer);
-
-}
diff --git a/spmv/mkl_shmem/mmio.h b/spmv/mkl_shmem/mmio.h
deleted file mode 100644
index 7cfd0a1b7ae7572e85b8c10bcb2fd0b3333ad0b6..0000000000000000000000000000000000000000
--- a/spmv/mkl_shmem/mmio.h
+++ /dev/null
@@ -1,133 +0,0 @@
-/* 
-*   Matrix Market I/O library for ANSI C
-*
-*   See http://math.nist.gov/MatrixMarket for details.
-*
-*
-*/
-
-#ifndef MM_IO_H
-#define MM_IO_H
-
-#define MM_MAX_LINE_LENGTH 1025
-#define MatrixMarketBanner "%%MatrixMarket"
-#define MM_MAX_TOKEN_LENGTH 64
-
-typedef char MM_typecode[4];
-
-char *mm_typecode_to_str(MM_typecode matcode);
-
-int mm_read_banner(FILE *f, MM_typecode *matcode);
-int mm_read_mtx_crd_size(FILE *f, int *M, int *N, int *nz);
-int mm_read_mtx_array_size(FILE *f, int *M, int *N);
-
-int mm_write_banner(FILE *f, MM_typecode matcode);
-int mm_write_mtx_crd_size(FILE *f, int M, int N, int nz);
-int mm_write_mtx_array_size(FILE *f, int M, int N);
-
-
-/********************* MM_typecode query fucntions ***************************/
-
-#define mm_is_matrix(typecode)	((typecode)[0]=='M')
-
-#define mm_is_sparse(typecode)	((typecode)[1]=='C')
-#define mm_is_coordinate(typecode)((typecode)[1]=='C')
-#define mm_is_dense(typecode)	((typecode)[1]=='A')
-#define mm_is_array(typecode)	((typecode)[1]=='A')
-
-#define mm_is_complex(typecode)	((typecode)[2]=='C')
-#define mm_is_real(typecode)		((typecode)[2]=='R')
-#define mm_is_pattern(typecode)	((typecode)[2]=='P')
-#define mm_is_integer(typecode) ((typecode)[2]=='I')
-
-#define mm_is_symmetric(typecode)((typecode)[3]=='S')
-#define mm_is_general(typecode)	((typecode)[3]=='G')
-#define mm_is_skew(typecode)	((typecode)[3]=='K')
-#define mm_is_hermitian(typecode)((typecode)[3]=='H')
-
-int mm_is_valid(MM_typecode matcode);		/* too complex for a macro */
-
-
-/********************* MM_typecode modify fucntions ***************************/
-
-#define mm_set_matrix(typecode)	((*typecode)[0]='M')
-#define mm_set_coordinate(typecode)	((*typecode)[1]='C')
-#define mm_set_array(typecode)	((*typecode)[1]='A')
-#define mm_set_dense(typecode)	mm_set_array(typecode)
-#define mm_set_sparse(typecode)	mm_set_coordinate(typecode)
-
-#define mm_set_complex(typecode)((*typecode)[2]='C')
-#define mm_set_real(typecode)	((*typecode)[2]='R')
-#define mm_set_pattern(typecode)((*typecode)[2]='P')
-#define mm_set_integer(typecode)((*typecode)[2]='I')
-
-
-#define mm_set_symmetric(typecode)((*typecode)[3]='S')
-#define mm_set_general(typecode)((*typecode)[3]='G')
-#define mm_set_skew(typecode)	((*typecode)[3]='K')
-#define mm_set_hermitian(typecode)((*typecode)[3]='H')
-
-#define mm_clear_typecode(typecode) ((*typecode)[0]=(*typecode)[1]= \
-									(*typecode)[2]=' ',(*typecode)[3]='G')
-
-#define mm_initialize_typecode(typecode) mm_clear_typecode(typecode)
-
-
-/********************* Matrix Market error codes ***************************/
-
-
-#define MM_COULD_NOT_READ_FILE	11
-#define MM_PREMATURE_EOF		12
-#define MM_NOT_MTX				13
-#define MM_NO_HEADER			14
-#define MM_UNSUPPORTED_TYPE		15
-#define MM_LINE_TOO_LONG		16
-#define MM_COULD_NOT_WRITE_FILE	17
-
-
-/******************** Matrix Market internal definitions ********************
-
-   MM_matrix_typecode: 4-character sequence
-
-				    ojbect 		sparse/   	data        storage 
-						  		dense     	type        scheme
-
-   string position:	 [0]        [1]			[2]         [3]
-
-   Matrix typecode:  M(atrix)  C(oord)		R(eal)   	G(eneral)
-						        A(array)	C(omplex)   H(ermitian)
-											P(attern)   S(ymmetric)
-								    		I(nteger)	K(kew)
-
- ***********************************************************************/
-
-#define MM_MTX_STR		"matrix"
-#define MM_ARRAY_STR	"array"
-#define MM_DENSE_STR	"array"
-#define MM_COORDINATE_STR "coordinate" 
-#define MM_SPARSE_STR	"coordinate"
-#define MM_COMPLEX_STR	"complex"
-#define MM_REAL_STR		"real"
-#define MM_INT_STR		"integer"
-#define MM_GENERAL_STR  "general"
-#define MM_SYMM_STR		"symmetric"
-#define MM_HERM_STR		"hermitian"
-#define MM_SKEW_STR		"skew-symmetric"
-#define MM_PATTERN_STR  "pattern"
-
-
-/*  high level routines */
-
-int mm_write_mtx_crd(char fname[], int M, int N, int nz, int I[], int J[],
-		 double val[], MM_typecode matcode);
-int mm_read_mtx_crd_data(FILE *f, int M, int N, int nz, int I[], int J[],
-		double val[], MM_typecode matcode);
-int mm_read_mtx_crd_entry(FILE *f, int *I, int *J, double *real, double *img,
-			MM_typecode matcode);
-
-int mm_read_unsymmetric_sparse(const char *fname, int *M_, int *N_, int *nz_,
-                double **val_, int **I_, int **J_);
-
-
-
-#endif