Introduction to Parallel Computer Architecture Counting sort
parallel_scan_v2/Makefile
# A simple CUDA makefile # # Author: Naga Kandasamy # Date: 02/14/2017 # # CUDA depends on two things: # 1) The CUDA nvcc compiler, which needs to be on your path, # or called directly, which we do here # 2) The CUDA shared library being available at runtime, # which we make available by setting the LD_LIBRARY_PATH # variable for the duration of the makefile. # # Note that you can set your PATH and LD_LIBRARY_PATH variables as part of your # .bash_profile so that you can compile and run without using this makefile. NVCCFLAGS := -O3 -gencode arch=compute_60,code=sm_60 NVCC := /usr/local/cuda/bin/nvcc LD_LIBRARY_PATH := /usr/local/cuda/lib64 all: scan scan: scan.cu scan_gold.cpp $(NVCC) -o scan scan.cu scan_gold.cpp $(NVCCFLAGS) clean: rm scan
parallel_scan_v2/scan.cu
#include <stdlib.h> #include <stdio.h> #include <string.h> #include <math.h> #include <float.h> // includes, kernels #include "scan_naive_kernel.cu" void runTest( int argc, char** argv); extern "C" unsigned int compare( const float* reference, const float* data, const unsigned int len); extern "C" void computeGold( float* reference, float* idata, const unsigned int len); void checkCUDAError(const char *msg); int checkResults(float *, float *, int, float); int main( int argc, char** argv) { runTest( argc, argv); exit(0); } void runTest( int argc, char** argv) { unsigned int num_elements = 512; const unsigned int mem_size = sizeof( float) * num_elements; const unsigned int shared_mem_size = sizeof(float) * num_elements; // allocate host memory to store the input data float* h_data = (float*) malloc(mem_size); // initialize the input data on the host to be integer values // between 0 and 10 for( unsigned int i = 0; i < num_elements; ++i){ h_data[i] = floorf(10*(rand()/(float)RAND_MAX)); } // compute reference solution float* reference = (float*) malloc( mem_size); computeGold( reference, h_data, num_elements); // allocate device memory input and output arrays float* d_idata; float* d_odata; cudaMalloc( (void**) &d_idata, mem_size); cudaMalloc( (void**) &d_odata, mem_size); // copy host memory to device input array cudaMemcpy( d_idata, h_data, mem_size, cudaMemcpyHostToDevice); // setup execution parameters // Note that these scans only support a single thread-block worth of data, dim3 grid(1, 1, 1); dim3 threads(512, 1, 1); printf("Running parallel prefix sum (scan) of %d elements\n", num_elements); scan_naive<<< grid, threads, 2 * shared_mem_size >>>(d_odata, d_idata, num_elements); cudaThreadSynchronize(); // copy result from device to host cudaMemcpy( h_data, d_odata, sizeof(float) * num_elements, cudaMemcpyDeviceToHost); float epsilon = 0.0f; unsigned int result_regtest = checkResults( reference, h_data, num_elements, epsilon); printf( "Test %s\n", (1 == result_regtest) ? "PASSED" : "FAILED"); // cleanup memory free( h_data); free( reference); cudaFree(d_idata); cudaFree(d_odata); } void checkCUDAError(const char *msg) { cudaError_t err = cudaGetLastError(); if( cudaSuccess != err) { printf("CUDA ERROR: %s (%s).\n", msg, cudaGetErrorString(err)); exit(EXIT_FAILURE); } } int checkResults(float *reference, float *gpu_result, int num_elements, float threshold) { int checkMark = 1; for(int i = 0; i < num_elements; i++) if((reference[i] - gpu_result[i]) > threshold){ checkMark = 0; break; } return checkMark; }
parallel_scan_v2/scan_gold.cpp
#include <stdio.h> #include <math.h> #include <float.h> extern "C" void computeGold( float* reference, float* idata, const unsigned int len); void computeGold( float* reference, float* idata, const unsigned int len) { reference[0] = 0; double total_sum = 0; unsigned int i; for(i = 1; i < len; ++i){ total_sum += idata[i-1]; reference[i] = idata[i-1] + reference[i-1]; } // Here it should be okay to use != because we have integer values // in a range where float can be exactly represented if (total_sum != reference[i-1]) printf("Warning: exceeding single-precision accuracy. Scan will be inaccurate.\n"); }
parallel_scan_v2/scan_naive_kernel.cu
#ifndef _SCAN_NAIVE_KERNEL_H_ #define _SCAN_NAIVE_KERNEL_H_ __global__ void scan_naive(float *g_odata, float *g_idata, int n) { // Dynamically allocated shared memory for scan kernels extern __shared__ float temp[]; int thid = threadIdx.x; int pout = 0; int pin = 1; // Cache the computational window in shared memory temp[pout*n + thid] = (thid > 0) ? g_idata[thid - 1] : 0; for (int offset = 1; offset < n; offset *= 2) { pout = 1 - pout; pin = 1 - pout; __syncthreads(); temp[pout*n + thid] = temp[pin*n + thid]; if (thid >= offset) temp[pout*n + thid] += temp[pin*n + thid - offset]; } __syncthreads(); g_odata[thid] = temp[pout*n + thid]; } #endif // #ifndef _SCAN_NAIVE_KERNEL_H_