#ifndef _KMEANS_CUDA_KERNEL_H_ #define _KMEANS_CUDA_KERNEL_H_ #include #include #include "kmeans.h" // FIXME: Make this a runtime selectable variable! #define ASSUMED_NR_CLUSTERS 32 #define SDATA( index) CUT_BANK_CHECKER(sdata, index) // t_features has the layout dim0[points 0-m-1]dim1[ points 0-m-1]... texture t_features; // t_features_flipped has the layout point0[dim 0-n-1]point1[dim 0-n-1] texture t_features_flipped; texture t_clusters; __constant__ float c_clusters[ASSUMED_NR_CLUSTERS*34]; /* constant memory for cluster centers */ /* ----------------- invert_mapping() --------------------- */ /* inverts data array from row-major to column-major. [p0,dim0][p0,dim1][p0,dim2] ... [p1,dim0][p1,dim1][p1,dim2] ... [p2,dim0][p2,dim1][p2,dim2] ... to [dim0,p0][dim0,p1][dim0,p2] ... [dim1,p0][dim1,p1][dim1,p2] ... [dim2,p0][dim2,p1][dim2,p2] ... */ __global__ void invert_mapping(float *input, /* original */ float *output, /* inverted */ int npoints, /* npoints */ int nfeatures) /* nfeatures */ { int point_id = threadIdx.x + blockDim.x*blockIdx.x; /* id of thread */ int i; if(point_id < npoints){ for(i=0;i 1; threadids_participating /= 2) { if(threadIdx.x < threadids_participating) { deltas[threadIdx.x] += deltas[threadIdx.x + threadids_participating]; } __syncthreads(); } if(threadIdx.x < 1) {deltas[threadIdx.x] += deltas[threadIdx.x + 1];} __syncthreads(); // propagate number of changes to global counter if(threadIdx.x == 0) { block_deltas[blockIdx.y * gridDim.x + blockIdx.x] = deltas[0]; //printf("original id: %d, modified: %d\n", blockIdx.y*gridDim.x+blockIdx.x, blockIdx.x); } #endif #ifdef GPU_NEW_CENTER_REDUCTION int center_id = threadIdx.x / nfeatures; int dim_id = threadIdx.x - nfeatures*center_id; __shared__ int new_center_ids[THREADS_PER_BLOCK]; new_center_ids[threadIdx.x] = index; __syncthreads(); /*** determine which dimension calculte the sum for mapping of threads is center0[dim0,dim1,dim2,...]center1[dim0,dim1,dim2,...]... ***/ int new_base_index = (point_id - threadIdx.x)*nfeatures + dim_id; float accumulator = 0.f; if(threadIdx.x < nfeatures * nclusters) { // accumulate over all the elements of this threadblock for(int i = 0; i< (THREADS_PER_BLOCK); i++) { float val = tex1Dfetch(t_features_flipped,new_base_index+i*nfeatures); if(new_center_ids[i] == center_id) accumulator += val; } // now store the sum for this threadblock /*** mapping to global array is block0[center0[dim0,dim1,dim2,...]center1[dim0,dim1,dim2,...]...]block1[...]... ***/ block_clusters[(blockIdx.y*gridDim.x + blockIdx.x) * nclusters * nfeatures + threadIdx.x] = accumulator; } #endif } #endif // #ifndef _KMEANS_CUDA_KERNEL_H_