/*********************************************************************************************** *cr University of Patras, Greece *cr Copyright (c) 2015 University of Patras *cr All rights reserved *cr *cr Developed by: HPClab *cr Computer Engineering and Informatics Department *cr University of Patras *cr ***********************************************************************************************/ #include "pddp_2means.h" /******************************************************************************/ #pragma offload_attribute (push, target(mic)) /******************************************************************************/ volatile unsigned long leaves = 1, active_data_points; unsigned long num_of_cores; /******************************************************************************/ void print_elapsed_time(struct timeval start, struct timeval end, char *msg) { printf("%s: %f sec\n", msg, ((end.tv_sec - start.tv_sec) * 1000000.0 + (end.tv_usec - start.tv_usec)) / 1000000.0 ); fflush(NULL); } /******************************************************************************/ Node *allocate_node(unsigned long data_points) { Node *temp; int error; error = posix_memalign((void **)(&temp), 64, sizeof(Node)); if (error != 0) { printf("ERROR: Could not allocate memory for tree node.\n"); exit(0); } return(temp); } /******************************************************************************/ void init_node(Node *node, Node *sibling, double *M, unsigned long data_points, unsigned long *indices, double *centroid, unsigned long attributes, unsigned long padded_attributes, unsigned long threads) { node->num_of_indices = data_points; node->indices = indices; node->centroid = centroid; node->leftchild = NULL; node->rightchild = NULL; node->sibling = sibling; node->keep_in_tree = 0; node->is_splittable = 1; node->scat = calc_scatter(node, M, attributes, padded_attributes, threads); } /******************************************************************************/ void process_node(Node *node, double *M, unsigned long attributes, unsigned long padded_attributes, unsigned long current_level, unsigned long max_level) { unsigned long l_data_points, r_data_points, *l_indices, *r_indices, threads; double *l_centroid, *r_centroid, *v; #if defined(_OPENMP) omp_set_nested(1); #endif threads = lround(((double)(node->num_of_indices * num_of_cores)) / ((double)active_data_points)); if (threads < 1) { threads = 1; } /* * Calculate leading principal component using the power iteration method. */ power_iteration(node, M, attributes, padded_attributes, &v, threads); /* * Split cluster into 2 new clusters using the 2-means algorithm. */ vector_2_means(node, M, v, attributes, padded_attributes, &l_indices, &r_indices, &l_centroid, &r_centroid, &l_data_points, &r_data_points, threads); munmap(v, node->num_of_indices * sizeof(double)); /* * Check that both new clusters will contain enough data points. */ if ((l_data_points > 1) && (r_data_points > 1)) { node->leftchild = allocate_node(l_data_points); node->rightchild = allocate_node(r_data_points); DBG("Left node allocated is %p\n", node->leftchild); DBG("Right node allocated is %p\n", node->rightchild); init_node(node->leftchild, node->rightchild, M, l_data_points, l_indices, l_centroid, attributes, padded_attributes, threads); init_node(node->rightchild, node->leftchild, M, r_data_points, r_indices, r_centroid, attributes, padded_attributes, threads); DBG("Left child has %ld indices\n", node->leftchild->num_of_indices); DBG("Right child has %ld indices\n", node->rightchild->num_of_indices); #pragma omp atomic leaves++; if (current_level < max_level - 1) { #pragma omp task process_node(node->leftchild, M, attributes, padded_attributes, current_level + 1, max_level); #pragma omp task process_node(node->rightchild, M, attributes, padded_attributes, current_level + 1, max_level); } else { #pragma omp atomic active_data_points -= node->num_of_indices; } } else { #pragma omp atomic active_data_points -= node->num_of_indices; node->is_splittable = 0; if (l_indices != NULL) { munmap(l_indices, l_data_points * sizeof(unsigned long)); } if (r_indices != NULL) { munmap(r_indices, r_data_points * sizeof(unsigned long)); } free(l_centroid); free(r_centroid); } } /******************************************************************************/ double calc_scatter(Node *node, double *M, unsigned long attributes, unsigned long padded_attributes, unsigned long threads) { unsigned long i, j; double scat = 0.0, *M_line; #pragma omp parallel for default(none) private(i, j, M_line) shared(node, M, attributes, padded_attributes) reduction(+:scat) num_threads(threads) for (i = 0; i < node->num_of_indices; i++) { M_line = &M[node->indices[i] * padded_attributes]; #pragma vector aligned #pragma ivdep for (j = 0; j < attributes; j++) { scat += M_line[j] * M_line[j]; } } DBG("Scatter value for node %p is %f\n", node, scat); return(scat); } /******************************************************************************/ void vector_2_means(Node *node, double *M, double *v, unsigned long attributes, unsigned long padded_attributes, unsigned long **l_indices, unsigned long **r_indices, double **l_centroid, double **r_centroid, unsigned long *l_data_points, unsigned long *r_data_points, unsigned long threads) { double *c_l, *c_r, *M_line; double prev_diff, diff = INFINITY; unsigned long i, j, iter, l = 0, r = 0, error, *i_l, *i_r; char *clusters; /* * Allocate memory for centroids of the two clusters to be created. */ error = posix_memalign((void **)(&c_l), 64, attributes * sizeof(double)); if (error != 0) { printf("ERROR: Could not allocate memory for vector c_l.\n"); exit(0); } error = posix_memalign((void **)(&c_r), 64, attributes * sizeof(double)); if (error != 0) { printf("ERROR: Could not allocate memory for vector c_r.\n"); exit(0); } clusters = (char *)alloca(node->num_of_indices * sizeof(char)); __assume_aligned(node, 64); __assume_aligned(v, 64); __assume_aligned(*l_indices, 64); __assume_aligned(*r_indices, 64); __assume_aligned(*l_centroid, 64); __assume_aligned(*r_centroid, 64); __assume_aligned(l_data_points, 64); __assume_aligned(r_data_points, 64); // __assume_aligned(clusters, 64); __assume_aligned(M_line, 64); __assume_aligned(c_l, 64); __assume_aligned(c_r, 64); __assume_aligned(i_l, 64); __assume_aligned(i_r, 64); /* * Initialize centroids. */ memset(c_l, 0, attributes * sizeof(double)); memset(c_r, 0, attributes * sizeof(double)); /* * Make an initial assignment of each data point to a cluster and * calculate the initial centroids of the two clusters. */ #pragma omp parallel default(none) private(i, j, M_line) shared(l, r, node, clusters, c_l, c_r, v, M, attributes, padded_attributes) num_threads(threads) { __attribute__((aligned(64))) double c_l_local[attributes], c_r_local[attributes]; memset(c_l_local, 0, attributes * sizeof(double)); memset(c_r_local, 0, attributes * sizeof(double)); #pragma omp for reduction(+:l, r) for (i = 0; i < node->num_of_indices; i++) { M_line = &M[node->indices[i] * padded_attributes]; if (v[i] <= 0.0) { #pragma vector aligned #pragma ivdep for (j = 0; j < attributes; j++) { c_l_local[j] += M_line[j]; } clusters[i] = 0; l++; } else { #pragma vector aligned #pragma ivdep for (j = 0; j < attributes; j++) { c_r_local[j] += M_line[j]; } clusters[i] = 1; r++; } } #pragma vector aligned #pragma ivdep for (j = 0; j < attributes; j++) { #pragma omp atomic c_l[j] += c_l_local[j]; #pragma omp atomic c_r[j] += c_r_local[j]; } } #pragma vector aligned #pragma ivdep for (i = 0; i < attributes; i++) { c_l[i] /= l; c_r[i] /= r; } /* * Apply the 2-means algorithm to refine the assignment of data points to the two clusters. */ iter = 0; do { prev_diff = diff; diff = 0.0; #pragma omp parallel default(none) private(i, j, M_line) shared(l, r, node, clusters, c_l, c_r, M, attributes, padded_attributes) reduction(+:diff) num_threads(threads) { __attribute__((aligned(64))) double dist1, dist2; __attribute__((aligned(64))) double c_l_local[attributes], c_r_local[attributes]; memset(c_l_local, 0, attributes * sizeof(double)); memset(c_r_local, 0, attributes * sizeof(double)); /* * For every data point of the node we are processing. */ #pragma omp for schedule(static) reduction(+:l, r) nowait for (i = 0; i < node->num_of_indices; i++) { dist1 = 0.0; dist2 = 0.0; /* * Calculate the Euclidean distance from the data point to each of the two cluster centers. */ M_line = &M[node->indices[i] * padded_attributes]; #pragma vector aligned #pragma ivdep for (j = 0; j < attributes; j++) { dist1 += (c_l[j] - M_line[j]) * (c_l[j] - M_line[j]); dist2 += (c_r[j] - M_line[j]) * (c_r[j] - M_line[j]); } /* * If the data point is closer to the left-side cluster center, then assign it to that cluster. * Furthermore, check whether the data point is changing cluster (it previously belonged to the right-side cluster) * and update accordingly the number of data points that belong to each cluster. */ if (dist1 < dist2) { if (clusters[i] == 1) { l++; r--; } clusters[i] = 0; diff += dist1; } else { if (clusters[i] == 0) { l--; r++; } clusters[i] = 1; diff += dist2; } } /* * Calculate new centroids. */ #pragma omp for schedule(static) for (i = 0; i < node->num_of_indices; i++) { M_line = &M[node->indices[i] * padded_attributes]; if (clusters[i] == 0) { #pragma vector aligned #pragma ivdep for (j = 0; j < attributes; j++) { c_l_local[j] += M_line[j]; } } else { #pragma vector aligned #pragma ivdep for (j = 0; j < attributes; j++) { c_r_local[j] += M_line[j]; } } } #pragma omp single { memset(c_l, 0, attributes * sizeof(double)); memset(c_r, 0, attributes * sizeof(double)); } #pragma vector aligned #pragma ivdep for (j = 0; j < attributes; j++) { #pragma omp atomic c_l[j] += c_l_local[j]; #pragma omp atomic c_r[j] += c_r_local[j]; } } #pragma vector aligned #pragma ivdep for (i = 0; i < attributes; i++) { c_l[i] /= l; c_r[i] /= r; } iter++; // printf("%p prev_diff = %.20f, diff = %.20f, fabs(diff - prev_diff) = %.20f\n", node, prev_diff, diff, fabs(diff - prev_diff)); } while ((fabs(diff - prev_diff) > TWO_MEANS_TOLERANCE) && (iter < TWO_MEANS_MAX_ITERATIONS)); // printf("2-means ended after %ld iterations.\n", iter); /* * Return to the caller the calculated size of each cluster. */ *l_data_points = l; *r_data_points = r; /* * Return to the caller the calculated centroid of each cluster. */ *l_centroid = c_l; *r_centroid = c_r; if ((l > 0) && (r > 0)) { i_l = (unsigned long *)mmap(NULL, l * sizeof(unsigned long), PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, -1, 0); i_r = (unsigned long *)mmap(NULL, r * sizeof(unsigned long), PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, -1, 0); if ((i_l == MAP_FAILED) || (i_r == MAP_FAILED)) { printf("ERROR: Could not allocate memory for a vector in 2-means.\n"); exit(0); } /* * Find to which cluster each data point belongs. */ l = 0; r = 0; #pragma vector aligned #pragma ivdep for (i = 0; i < node->num_of_indices; i++) { if (clusters[i] == 0) { i_l[l] = node->indices[i]; l++; } else { i_r[r] = node->indices[i]; r++; } } /* * Return to the caller the indices that belong to each cluster. */ *l_indices = i_l; *r_indices = i_r; } else if (l == 0) { i_r = (unsigned long *)mmap(NULL, r * sizeof(unsigned long), PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, -1, 0); if (i_r == MAP_FAILED) { printf("ERROR: Could not allocate memory for right vector in 2-means.\n"); exit(0); } /* * All data points belong to the right cluster. */ #pragma vector aligned #pragma ivdep for (i = 0; i < node->num_of_indices; i++) { i_r[i] = node->indices[i]; } /* * Return to the caller the indices that belong to each cluster. */ *l_indices = NULL; *r_indices = i_r; } else { i_l = (unsigned long *)mmap(NULL, l * sizeof(unsigned long), PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, -1, 0); if (i_l == MAP_FAILED) { printf("ERROR: Could not allocate memory for left vector in 2-means.\n"); exit(0); } /* * All data points belong to the left cluster. */ #pragma vector aligned #pragma ivdep for (i = 0; i < node->num_of_indices; i++) { i_l[i] = node->indices[i]; } /* * Return to the caller the indices that belong to each cluster. */ *l_indices = i_l; *r_indices = NULL; } } /******************************************************************************/ void power_iteration(Node *node, double *M, unsigned long attributes, unsigned long padded_attributes, double **v, unsigned long threads) { double *x_prev, *x_curr, *x_temp, max, norm = INFINITY; unsigned long i, j; x_prev = (double *)mmap(NULL, node->num_of_indices * sizeof(double), PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, -1, 0); x_curr = (double *)mmap(NULL, node->num_of_indices * sizeof(double), PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, -1, 0); x_temp = (double *)mmap(NULL, attributes * sizeof(double), PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, -1, 0); if ((x_prev == MAP_FAILED) || (x_curr == MAP_FAILED) || (x_temp == MAP_FAILED)) { printf("ERROR: Could not allocate a vector in power iteration.\n"); exit(0); } __assume_aligned(x_prev, 64); __assume_aligned(x_curr, 64); __assume_aligned(x_temp, 64); __assume_aligned(node, 64); __assume_aligned(*v, 64); #pragma vector aligned #pragma ivdep for (i = 0; i < node->num_of_indices; i++) { x_curr[i] = 1.0; } #pragma omp parallel default(none) private(i, j) shared(x_temp, x_curr, x_prev, node, max, norm, M, attributes, padded_attributes) num_threads(threads) if(threads > 1) { __attribute__((aligned(64))) double x_temp_local[attributes]; double *temp, *M_line; unsigned long iter = 0; __assume_aligned(M_line, 64); do { #pragma omp single { max = 0.0; temp = x_prev; x_prev = x_curr; x_curr = temp; memset(x_temp, 0, attributes * sizeof(double)); } memset(x_temp_local, 0, attributes * sizeof(double)); #pragma omp for nowait for (i = 0; i < node->num_of_indices; i++) { M_line = &M[node->indices[i] * padded_attributes]; #pragma vector aligned nontemporal #pragma ivdep for (j = 0; j < attributes; j++) { x_temp_local[j] += (M_line[j] - node->centroid[j]) * x_prev[i]; } } #pragma vector aligned #pragma ivdep for (j = 0; j < attributes; j++) { #pragma omp atomic x_temp[j] += x_temp_local[j]; } norm = 0.0; #pragma omp barrier #pragma omp for reduction(max:max) for (i = 0; i < node->num_of_indices; i++) { x_curr[i] = 0.0; M_line = &M[node->indices[i] * padded_attributes]; #pragma vector aligned nontemporal #pragma ivdep for (j = 0; j < attributes; j++) { x_curr[i] += (M_line[j] - node->centroid[j]) * x_temp[j]; } if (fabs(x_curr[i]) > max) { max = fabs(x_curr[i]); } } #pragma omp for reduction(+:norm) for (i = 0; i < node->num_of_indices; i++) { x_curr[i] /= max; norm += (x_curr[i] - x_prev[i]) * (x_curr[i] - x_prev[i]); } iter++; } while (((norm / node->num_of_indices) > POWER_ITERATION_TOLERANCE) && (iter < POWER_ITERATION_MAX_ITERATIONS)); } munmap(x_prev, node->num_of_indices * sizeof(double)); munmap(x_temp, attributes * sizeof(double)); *v = x_curr; } /******************************************************************************/ void reset_nodes_to_keep(Node *node) { if (node != NULL) { reset_nodes_to_keep(node->leftchild); reset_nodes_to_keep(node->rightchild); node->keep_in_tree = 0; } } /******************************************************************************/ void keep_all_nodes(Node *node) { if (node != NULL) { keep_all_nodes(node->leftchild); keep_all_nodes(node->rightchild); node->keep_in_tree = 2; } } /******************************************************************************/ void find_max_scatter_value(Node *node, Node **K) { if (node != NULL) { find_max_scatter_value(node->leftchild, K); find_max_scatter_value(node->rightchild, K); if ((node->scat > (*K)->scat) && (node->keep_in_tree < 2)) { (*K) = node; } } } /******************************************************************************/ unsigned long find_nodes_to_keep(Node *node, unsigned long clusters) { unsigned long c = 1; Node *K, dummy; dummy.scat = -INFINITY; /* * The meaning of the values for 'keep_in_tree' is as follows: * - 0 means that the node is not in the tree. * - 1 means that the node is in the tree because its parent had at some point * the largest scatter value among the nodes that had not been added to the * tree yet (hence it was split and its children must belong to the tree). * However, there is a possibility that it's scatter value will be checked later on * and if it is the largest among the remaining nodes further processing will happen * for the node itself and its children (if it has children). * - 2 means that the node is in the tree and no further checks will be done. */ node->keep_in_tree = 1; do { K = &dummy; find_max_scatter_value(node, &K); K->keep_in_tree = 2; if (K->leftchild != K->rightchild) { K->leftchild->keep_in_tree = 1; K->rightchild->keep_in_tree = 1; c += 2; } else if (K->is_splittable == 1) { reset_nodes_to_keep(node); return(0L); } } while (c < (2 * clusters - 1)); return(1L); } /******************************************************************************/ double max_cluster_diameter(Node *node, double *M, unsigned long attributes, unsigned long padded_attributes) { double l_max_dist = 0.0, r_max_dist = 0.0, max_dist = 0.0, *M_line_i, *M_line_j; __assume_aligned(node, 64); __assume_aligned(M, 64); __assume_aligned(M_line_i, 64); __assume_aligned(M_line_j, 64); if (node != NULL) { l_max_dist = max_cluster_diameter(node->leftchild, M, attributes, padded_attributes); r_max_dist = max_cluster_diameter(node->rightchild, M, attributes, padded_attributes); if (l_max_dist > r_max_dist) { max_dist = l_max_dist; } else { max_dist = r_max_dist; } /* * The PDDP algorithm ensures that every node has either 2 children or is a leaf node. * Hence, a leaf node is found when both pointers to the children are equal (both are NULL). */ if (((node->keep_in_tree == 1) || (node->keep_in_tree == 2)) && ((node->leftchild == node->rightchild) || (node->leftchild->keep_in_tree == 0) || (node->rightchild->keep_in_tree == 0))) { unsigned long i, j, k; double dist; #pragma omp parallel for default(none) private(i, j, k, dist, M_line_i, M_line_j) shared(node, M, attributes, padded_attributes) reduction(max:max_dist) num_threads(omp_get_num_procs()) for (i = 0; i < node->num_of_indices; i++) { M_line_i = &M[node->indices[i] * padded_attributes]; for (j = i + 1; j < node->num_of_indices; j++) { M_line_j = &M[node->indices[j] * padded_attributes]; dist = 0.0; #pragma vector aligned #pragma ivdep for (k = 0; k < attributes; k++) { dist += (M_line_i[k] - M_line_j[k]) * (M_line_i[k] - M_line_j[k]); } if (dist > max_dist) { max_dist = dist; } } } } } return(max_dist); } /******************************************************************************/ void find_all_leaves(Node *node, Node **cluster_nodes, unsigned long *cluster_num) { if (node != NULL) { find_all_leaves(node->leftchild, cluster_nodes, cluster_num); find_all_leaves(node->rightchild, cluster_nodes, cluster_num); /* * The PDDP algorithm ensures that every node has either 2 children or is a leaf node. * Hence, a leaf node is found when both pointers to the children are equal (both are NULL). */ if (((node->keep_in_tree == 1) || (node->keep_in_tree == 2)) && ((node->leftchild == node->rightchild) || (node->leftchild->keep_in_tree == 0) || (node->rightchild->keep_in_tree == 0))) { cluster_nodes[*cluster_num] = node; (*cluster_num)++; } } } /******************************************************************************/ __attribute__ ((target(mic))) void find_all_splittable_leaves(Node *node, Node **cluster_nodes, unsigned long *cluster_num) { if (node != NULL) { find_all_splittable_leaves(node->leftchild, cluster_nodes, cluster_num); find_all_splittable_leaves(node->rightchild, cluster_nodes, cluster_num); /* * The PDDP algorithm ensures that every node has either 2 children or is a leaf node. * Hence, a leaf node is found when both pointers to the children are equal (both are NULL). */ if ((node->leftchild == node->rightchild) && (node->is_splittable == 1)) { cluster_nodes[*cluster_num] = node; (*cluster_num)++; } } } /******************************************************************************/ double min_cluster_distance(Node *node, unsigned long clusters, unsigned long attributes) { Node **cluster_nodes; unsigned long i, j, k, cluster_num = 0; int error; double dist, min_dist = INFINITY; error = posix_memalign((void **)(&cluster_nodes), 64, clusters * sizeof(Node *)); if (error != 0) { printf("ERROR: Could not allocate memory for vector of cluster nodes.\n"); exit(0); } find_all_leaves(node, cluster_nodes, &cluster_num); printf("Found %ld leaf nodes to keep\n", cluster_num); for (i = 0; i < clusters; i++) { for (j = i + 1; j < clusters; j++) { dist = 0.0; for (k = 0; k < attributes; k++) { dist += (cluster_nodes[i]->centroid[k] - cluster_nodes[j]->centroid[k]) * (cluster_nodes[i]->centroid[k] - cluster_nodes[j]->centroid[k]); } if (dist < min_dist) { min_dist = dist; } } } free(cluster_nodes); return(min_dist); } /******************************************************************************/ void post_order_traversal(Node *node, unsigned long *indices, unsigned long *cluster) { if (node != NULL) { post_order_traversal(node->leftchild, indices, cluster); post_order_traversal(node->rightchild, indices, cluster); /* * The PDDP algorithm ensures that every node has either 2 children or is a leaf node. * Hence, a leaf node is found when both pointers to the children are equal (both are NULL). */ if (((node->keep_in_tree == 1) || (node->keep_in_tree == 2)) && ((node->leftchild == node->rightchild) || (node->leftchild->keep_in_tree == 0) || (node->rightchild->keep_in_tree == 0))) { unsigned long i; for (i = 0; i < node->num_of_indices; i++) { indices[node->indices[i]] = *cluster; } (*cluster)++; } } } /******************************************************************************/ void assign_cluster_numbers(Node *node, unsigned long *result, unsigned long data_points) { unsigned long cluster = 1; post_order_traversal(node, result, &cluster); } /******************************************************************************/ #pragma offload_attribute (pop) /******************************************************************************/ void write_results(unsigned long *result, unsigned long data_points, char *filename) { unsigned long i; FILE *file; file = fopen(filename, "w"); if (file == NULL) { printf("ERROR: Could not open output file.\n"); exit(0); } fprintf(file, "["); for (i = 0; i < data_points - 1; i++) { fprintf(file, "%ld, ", result[i]); } fprintf(file, "%ld]", result[data_points - 1]); fclose(file); } /******************************************************************************/ int main(int argc, char *argv[]) { unsigned long i, j, data_points, attributes, padded_attributes, clusters, *result; int error; FILE *file; char buffer[BUFFER_SIZE], *line = buffer, *token, *endptr; double *M; struct timeval start, end; __assume_aligned(M, 64); if (argc < 4) { printf("ERROR: Insufficient number of arguments.\n"); exit(0); } clusters = strtol(argv[3], &endptr, 10); if (*endptr != '\0' || clusters < 2) { printf("ERROR: Invalid number of clusters to build.\n"); exit(0); } gettimeofday(&start, NULL); file = fopen(argv[1], "r"); if (file == NULL) { printf("ERROR: Could not open input file.\n"); exit(0); } /* * Count how many lines the input data file contains. */ data_points = 0; while (fgets(buffer, BUFFER_SIZE, file) != NULL) { data_points++; } printf("Lines in input file: %ld\n", data_points); if (data_points < 2) { printf("ERROR: The number of data points to be clustered must be greater than 1.\n"); exit(0); } active_data_points = data_points; /* * Allocate memory to store output data. * We use mmap() since it is faster than malloc() and aligns the starting address on a page boundary. */ result = (unsigned long *)mmap(NULL, data_points * sizeof(unsigned long), PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, -1, 0); if (result == MAP_FAILED) { printf("ERROR: Could not allocate memory for output vector.\n"); exit(0); } /* * Count how many attributes a line consists of. * Attributes in the input data file are seperated by a comma (','), except the last attribute. * We assume that each line contains at least one attribute. */ attributes = 1; for (; *line; attributes += *line == ',', line++); printf("Attributes in each line: %ld\n", attributes); /* * Pad each line to make its size a multiple of 64 bytes. * Useful for performance reasons, as it allows better vectorization on the Intel Xeon Phi. */ padded_attributes = (((attributes * sizeof(double) - 1) | (ALIGNMENT - 1)) + 1) / sizeof(double); printf("Padded attributes in each line: %ld\n", padded_attributes); /* * Allocate memory to store input data. * We use mmap() since it is faster than malloc() and aligns the starting address on a page boundary. */ M = (double *)mmap(NULL, data_points * padded_attributes * sizeof(double), PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, -1, 0); if (M == MAP_FAILED) { printf("ERROR: Could not allocate memory for input array.\n"); exit(0); } /* * Return to the beginning of the file to start reading the input data. */ fseek(file, 0, SEEK_SET); i = 0; j = 0; while((line = fgets(buffer, BUFFER_SIZE, file)) != NULL) { token = strtok(line, ","); while (token != NULL) { M[i * padded_attributes + j] = atof(token); token = strtok(NULL, ","); j++; if (j == attributes) { j = 0; i++; } } } fclose(file); gettimeofday(&end, NULL); print_elapsed_time(start, end, "Time to read input data"); #pragma offload target(mic) in(leaves, data_points, active_data_points, attributes, padded_attributes) \ in(M : length(data_points * padded_attributes) align(64)) \ out(result : length(data_points) align(64)) \ inout(clusters)\ nocopy(num_of_cores) { __attribute__((aligned(64))) unsigned long i, j, current_level, max_level, stop, cluster_num, *indices; __attribute__((aligned(64))) int error; __attribute__((aligned(64))) double *M_line, *centroid; __attribute__((aligned(64))) Node *root, **cluster_nodes; __attribute__((aligned(64))) double max_diam, min_dist; __attribute__((aligned(64))) struct timeval start, end; __assume_aligned(root, 64); __assume_aligned(M_line, 64); __assume_aligned(indices, 64); __assume_aligned(centroid, 64); __assume_aligned(cluster_nodes, 64); #if defined(_OPENMP) #pragma omp parallel { #pragma omp single { num_of_cores = omp_get_num_threads(); } } #else num_of_cores = 1; #endif printf("Detected %lu processors.\n", num_of_cores); gettimeofday(&start, NULL); /* * Make an initial guess about the depth of the tree to be constructed. * We will construct a perfect binary tree in which the number of leaves is greater or equal to the number of requested clusters. */ i = clusters; max_level = -1; while (i > 0) { i >>= 1; max_level++; } if ((clusters & (clusters - 1))) { max_level++; } /* * Allocate the root node of the tree. */ root = allocate_node(data_points); DBG("Root is %p\n", root); indices = (unsigned long *)mmap(NULL, data_points * sizeof(unsigned long), PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, -1, 0); centroid = (double *)mmap(NULL, attributes * sizeof(double), PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, -1, 0); if ((indices == MAP_FAILED) || (centroid == MAP_FAILED)) { printf("ERROR: Could not allocate memory for a vector in tree node.\n"); exit(0); } /* * All data points belong to the root node at the beginning. */ #pragma vector aligned #pragma ivdep for (i = 0; i < data_points; i++) { indices[i] = i; } /* * Initialize centroid. */ memset(centroid, 0, attributes * sizeof(double)); #pragma omp parallel default(none) private(i, j, M_line) shared(M, indices, centroid, data_points, attributes, padded_attributes) { __attribute__((aligned(64))) double centroid_local[attributes]; memset(centroid_local, 0, attributes * sizeof(double)); #pragma omp for for (i = 0; i < data_points; i++) { M_line = &M[indices[i] * padded_attributes]; #pragma vector aligned #pragma ivdep for (j = 0; j < attributes; j++) { centroid_local[j] += M_line[j]; } } #pragma vector aligned #pragma ivdep for (j = 0; j < attributes; j++) { #pragma omp atomic centroid[j] += centroid_local[j]; } } #pragma vector aligned #pragma ivdep for (i = 0; i < attributes; i++) { centroid[i] /= data_points; } init_node(root, root, M, data_points, indices, centroid, attributes, padded_attributes, num_of_cores); current_level = 0; stop = 0; #pragma omp parallel private(i) num_threads(num_of_cores < ((1 << max_level) / 4) ? num_of_cores : ((1 << max_level) / 4)) { #pragma omp single { do { cluster_num = 0; #pragma omp taskgroup { error = posix_memalign((void **)(&cluster_nodes), 64, (1 << current_level) * sizeof(Node *)); if (error != 0) { printf("ERROR: Could not allocate memory for vector of cluster nodes.\n"); exit(0); } find_all_splittable_leaves(root, cluster_nodes, &cluster_num); for (i = 0; i < cluster_num; i++) { #pragma omp task process_node(cluster_nodes[i], M, attributes, padded_attributes, current_level, max_level); } } // #pragma omp taskgroup free(cluster_nodes); /* * If there have not been created as many clusters as requested, * then we have to continue creating new clusters. * * Except if there are no more clusters that can be split. */ if (leaves < clusters) { if (cluster_num == 0) { stop = 1; clusters = leaves; keep_all_nodes(root); } } else { /* * The number of clusters created is equal or larger to the number of clusters requested. * Check which ones have to be kept and if they are the ones required for the proper solution. */ stop = find_nodes_to_keep(root, clusters); } current_level = max_level; max_level++; } while (stop == 0); } // #pragma omp single } // #pragma omp parallel printf("Created a total of %ld clusters.\n", clusters); gettimeofday(&end, NULL); print_elapsed_time(start, end, "Time for calculations"); gettimeofday(&start, NULL); assign_cluster_numbers(root, result, data_points); gettimeofday(&end, NULL); print_elapsed_time(start, end, "Time to assign cluster numbers"); gettimeofday(&start, NULL); max_diam = sqrt(max_cluster_diameter(root, M, attributes, padded_attributes)); gettimeofday(&end, NULL); print_elapsed_time(start, end, "Time to calculate maximum cluster diameter"); printf("Maximum cluster diameter is %f\n", max_diam); gettimeofday(&start, NULL); min_dist = sqrt(min_cluster_distance(root, clusters, attributes)); gettimeofday(&end, NULL); print_elapsed_time(start, end, "Time to calculate minimum cluster distance"); printf("Minimum cluster distance is %f\n", min_dist); printf("Dunn index = %.40f\n", min_dist / max_diam); } // #pragma offload gettimeofday(&start, NULL); write_results(result, data_points, argv[2]); gettimeofday(&end, NULL); print_elapsed_time(start, end, "Time to write output data"); return(0); }