Introduction to Parallel Computer Architecture

profileRock1027
jacobi_solver.zip

jacobi_solver/jacobi_iteration.cu

/* Host side code for the Jacobi method of solving a system of linear equations by iteration. Author: Naga Kandasamy Date modified: 3/9/2018 Compile as follows: make clean && make */ #include <stdlib.h> #include <stdio.h> #include <sys/time.h> #include <string.h> #include <math.h> #include <cuda_runtime.h> #include "jacobi_iteration.h" // Include the kernel code #include "jacobi_iteration_kernel.cu" // Uncomment the line below if you want the code to spit out some debug information // #define DEBUG // Prototypes of external functions called in this file extern "C" void compute_gold (const Matrix, Matrix, const Matrix); extern "C" void display_jacobi_solution (const Matrix, const Matrix, const Matrix); // Prototypes of local functions used in this file Matrix allocate_matrix_on_gpu (const Matrix); Matrix allocate_matrix (int, int, int); int check_if_diagonal_dominant (const Matrix); Matrix create_diagonally_dominant_matrix (unsigned int, unsigned int); void copy_matrix_to_device (Matrix, const Matrix); void copy_matrix_from_device (Matrix, const Matrix); void compute_on_device (const Matrix, Matrix, const Matrix); int perform_simple_check (const Matrix); void print_matrix (const Matrix); float get_random_number (int, int); void checkCUDAError (const char *); int checkResults( float *, float *, int, float); int main(int argc, char** argv) { if(argc > 1){ printf("Error. This program accepts no arguments. \n"); exit(0); } Matrix A; // The NxN constant matrix Matrix B; // The Nx1 input matrix Matrix reference_x; // The reference solution Matrix gpu_solution_x; // The solution computed by the GPU // Initialize the random number generator with a seed value srand(time(NULL)); // Create the diagonally dominant matrix A = create_diagonally_dominant_matrix (MATRIX_SIZE, MATRIX_SIZE); if (A.elements == NULL){ printf ("Error creating matrix. \n"); exit (0); } B = allocate_matrix (MATRIX_SIZE, 1, 1); // Create a matrix B holding the constants reference_x = allocate_matrix (MATRIX_SIZE, 1, 0); // Create a matrix for the reference solution gpu_solution_x = allocate_matrix (MATRIX_SIZE, 1, 0); // Create a matrix for the GPU solution #ifdef DEBUG print_matrix (A); print_matrix (B); print_matrix (reference_x); #endif // Compute the Jacobi solution on the CPU printf("Performing Jacobi iteration on the CPU. \n"); compute_gold (A, reference_x, B); display_jacobi_solution(A, reference_x, B); // Display statistics // Compute the Jacobi solution on the GPU. The solution is returned in gpu_solution_x printf("\n Performing Jacobi iteration on the GPU. \n"); compute_on_device (A, gpu_solution_x, B); display_jacobi_solution(A, gpu_solution_x, B); // Display statistics free(A.elements); free(B.elements); free(reference_x.elements); free(gpu_solution_x.elements); exit(0); } // Complete this function to perform the Jacobi calculation on the GPU void compute_on_device(const Matrix A, Matrix gpu_solution_x, const Matrix B){ } // Allocate a device matrix of same size as M. Matrix allocate_matrix_on_gpu(const Matrix M){ Matrix Mdevice = M; int size = M.num_rows * M.num_columns * sizeof(float); cudaMalloc((void**)&Mdevice.elements, size); return Mdevice; } // Allocate a matrix of dimensions height*width // If init == 0, initialize to all zeroes. // If init == 1, perform random initialization. Matrix allocate_matrix(int num_rows, int num_columns, int init){ Matrix M; M.num_columns = M.pitch = num_columns; M.num_rows = num_rows; int size = M.num_rows * M.num_columns; M.elements = (float*) malloc(size*sizeof(float)); for(unsigned int i = 0; i < size; i++){ if(init == 0) M.elements[i] = 0; else M.elements[i] = get_random_number(MIN_NUMBER, MAX_NUMBER); } return M; } // Copy a host matrix to a device matrix. void copy_matrix_to_device(Matrix Mdevice, const Matrix Mhost) { int size = Mhost.num_rows * Mhost.num_columns * sizeof(float); Mdevice.num_rows = Mhost.num_rows; Mdevice.num_columns = Mhost.num_columns; Mdevice.pitch = Mhost.pitch; cudaMemcpy(Mdevice.elements, Mhost.elements, size, cudaMemcpyHostToDevice); } // Copy a device matrix to a host matrix. void copy_matrix_from_device(Matrix Mhost, const Matrix Mdevice){ int size = Mdevice.num_rows * Mdevice.num_columns * sizeof(float); cudaMemcpy(Mhost.elements, Mdevice.elements, size, cudaMemcpyDeviceToHost); } // Prints the matrix out to screen void print_matrix(const Matrix M){ for(unsigned int i = 0; i < M.num_rows; i++){ printf("Line number = %d ############## \n", i); for(unsigned int j = 0; j < M.num_columns; j++){ printf("%f ", M.elements[i*M.num_rows + j]); } printf("\n"); } printf("\n"); printf("####################################### \n"); } // Returns a random floating-point number between the specified min and max values float get_random_number(int min, int max){ return (float)floor((double)(min + (max - min + 1)*((float)rand()/(float)RAND_MAX))); } // Check for errors in kernel execution void checkCUDAError(const char *msg) { cudaError_t err = cudaGetLastError(); if( cudaSuccess != err) { printf("CUDA ERROR: %s (%s).\n", msg, cudaGetErrorString(err)); exit(EXIT_FAILURE); } } int checkResults(float *reference, float *gpu_result, int num_elements, float threshold) { int checkMark = 1; float epsilon = 0.0; for(int i = 0; i < num_elements; i++){ if(fabsf((reference[i] - gpu_result[i])/reference[i]) > threshold){ checkMark = 0; printf("error at %d \n",i); printf("element r %f and g %f \n",reference[i] ,gpu_result[i]); break; } } int maxEle; for(int i = 0; i < num_elements; i++){ if(fabsf((reference[i] - gpu_result[i])/reference[i]) > epsilon){ epsilon = fabsf((reference[i] - gpu_result[i])/reference[i]); maxEle=i; } } printf("Max epsilon = %f at i = %d value at cpu %f and gpu %f \n", epsilon,maxEle,reference[maxEle],gpu_result[maxEle]); return checkMark; } /* Function checks if the matrix is diagonally dominant. */ int check_if_diagonal_dominant(const Matrix M) { float diag_element; float sum; for(unsigned int i = 0; i < M.num_rows; i++){ sum = 0.0; diag_element = M.elements[i * M.num_rows + i]; for(unsigned int j = 0; j < M.num_columns; j++){ if(i != j) sum += abs(M.elements[i * M.num_rows + j]); } if(diag_element <= sum) return 0; } return 1; } /* Create a diagonally dominant matix. */ Matrix create_diagonally_dominant_matrix (unsigned int num_rows, unsigned int num_columns) { Matrix M; M.num_columns = M.pitch = num_columns; M.num_rows = num_rows; unsigned int size = M.num_rows * M.num_columns; M.elements = (float *) malloc (size * sizeof (float)); // Create a matrix with random numbers between [-.5 and .5] unsigned int i, j; printf ("Creating a %d x %d matrix with random numbers between [-.5, .5]...", num_rows, num_columns); for(i = 0; i < size; i++) // M.elements[i] = ((float)rand ()/(float)RAND_MAX) - 0.5; M.elements[i] = get_random_number(MIN_NUMBER, MAX_NUMBER); printf("done. \n"); // Make the diagonal entries large with respect to the entries on each row printf("Generating the positive definite matrix."); for (i = 0; i < num_rows; i++){ float row_sum = 0.0; for(j = 0; j < num_columns; j++){ row_sum += fabs (M.elements[i * M.num_rows + j]); } M.elements[i * M.num_rows + i] = 0.5 + row_sum; } if(!check_if_diagonal_dominant (M)){ free (M.elements); M.elements = NULL; } return M; }

jacobi_solver/jacobi_iteration.h

#ifndef _MATRIX_H_ #define _MATRIX_H_ #define THRESHOLD 1e-5 // Threshold for convergence #define MIN_NUMBER 2 // Min number in the A and b matrices #define MAX_NUMBER 10 // Max number in the A and b matrices #define THREAD_BLOCK_SIZE 128 // Size of a thread block #define NUM_BLOCKS 32 // Number of thread blocks // Dimension of the n x n matrix #define MATRIX_SIZE 1024 #define NUM_COLUMNS MATRIX_SIZE // Number of columns in Matrix A #define NUM_ROWS MATRIX_SIZE // Number of rows in Matrix A // Matrix Structure declaration typedef struct Matrix { //width of the matrix represented unsigned int num_columns; //height of the matrix represented unsigned int num_rows; //number of elements between the beginnings of adjacent // rows in the memory layout (useful for representing sub-matrices) unsigned int pitch; //Pointer to the first element of the matrix represented float* elements; unsigned int thread_id; } Matrix; #endif // _MATRIX_H_

jacobi_solver/jacobi_iteration_gold.cpp

jacobi_solver/jacobi_iteration_gold.cpp

/* Reference code for solving the equation by jacobi by iteration method. */
#include   < stdio . h >
#include   < stdlib . h >
#include   < math . h >
#include   "jacobi_iteration.h"

extern   Matrix  allocate_matrix ( int  num_rows ,   int  num_columns ,   int  init );
extern   "C"   void  display_jacobi_solution ( const   Matrix  A ,   const   Matrix  reference ,   const   Matrix  B );
extern   "C"   void  compute_gold  ( const   Matrix  A ,   const   Matrix  reference ,   const   Matrix  B );

void
compute_gold  ( const   Matrix  A ,   Matrix  ref_x ,   const   Matrix  B )
{
     unsigned   int  i ,  j ,  k ;
     unsigned   int  num_rows  =  A . num_rows ;   // Rows in matrix A
     unsigned   int  num_cols  =  A . num_columns ;   // Columns in matrix A
     Matrix  new_x  =  allocate_matrix  ( MATRIX_SIZE ,   1 ,   0 );    // Allocate n x 1 matrix to hold iteration values
    
     // Initialize the current jacobi solution
     for   ( =   0 ;  i  <  num_rows ;  i ++ )
        ref_x . elements [ i ]   =  B . elements [ i ];

     // Perform the Jacobi iteration 
     unsigned   int  done  =   0 ;
     double  ssd ,  mse ;
     unsigned   int  num_iter  =   0 ;
    
     while   ( ! done ){
         for   ( =   0 ;  i  <  num_rows ;  i ++ ){
             double  sum  =   0.0 ;
             for   ( =   0 ;  j  <  num_cols ;  j ++ ){
                 if   ( !=  j )
                    sum  +=  A . elements [ *  num_cols  +  j ]   *  ref_x . elements [ j ];
             }
           
             // Calculate the new values for the unkowns for the current row 
            new_x . elements [ i ]   =   ( B . elements [ i ]   -  sum ) / A . elements [ *  num_cols  +  i ];
         }

         /* Note: you can optimize the above nested loops by removing the branch statement within 
         * the j loop. The rewritten code is as follows: 
         *
         * for (i = 0; i < num_rows; i++){
         *      double sum = -A.elements[i * num_cols + i] * ref_x.elements[i];
         *      for (j = 0; j < num_cols; j++)
         *          sum += A.elements[i * num_cols + j] * ref_x.elements[j];
         * }
         *
         * new_x.elements[i] = (B.elements[i] - sum)/A.elements[i * num_cols + i];
         *
         * I recommend using this code snippet within your GPU kernel implementation.
         * 
         * */

         // Check for convergence and update the unknowns
        ssd  =   0.0 ;   // Sum of squared differences 
         for   ( =   0 ;  i  <  num_rows ;  i ++ ){
            ssd  +=   ( new_x . elements [ i ]   -  ref_x . elements [ i ])   *   ( new_x . elements [ i ]   -  ref_x . elements [ i ]);
            ref_x . elements [ i ]   =  new_x . elements [ i ];
         }
        num_iter ++ ;
        mse  =  sqrt ( ssd );   // Mean squared error
        printf  ( "MSE during teration %d is %f \n" ,  num_iter ,  mse );
         if   ( mse  <=  THRESHOLD )
            done  =   1 ;
     }

    printf  ( "Convergence achieved after %d iterations \n" ,  num_iter );
    free  ( new_x . elements );
}
    
/* Function to display statistic related to the Jacobi solution. */
void
display_jacobi_solution ( const   Matrix  A ,   const   Matrix  ref_x ,   const   Matrix  B )
{
     double  diff  =   0.0 ;
     unsigned   int  num_rows  =  A . num_rows ;
     unsigned   int  num_cols  =  A . num_columns ;
    
     for ( unsigned   int  i  =   0 ;  i  <  num_rows ;  i ++ ){
         double  line_sum  =   0.0 ;
         for ( unsigned   int  j  =   0 ;  j  <  num_cols ;  j ++ ){
            line_sum  +=  A . elements [ *  num_cols  +  j ]   *  ref_x . elements [ j ];
         }
        
        diff  +=  fabsf ( line_sum  -  B . elements [ i ]);
     }

    printf ( "Average diff between LHS and RHS: %f \n" ,  diff / ( float ) num_rows );
}

jacobi_solver/jacobi_iteration_kernel.cu

#include "jacobi_iteration.h" // Write the GPU kernel to solve the Jacobi iterations __global__ void jacobi_iteration_kernel () { }

jacobi_solver/Makefile

# A simple CUDA makefile # # Author: Naga Kandasamy # Date: 3/9/2018 # # CUDA depends on two things: # 1) The CUDA nvcc compiler, which needs to be on your path, # or called directly, which we do here # 2) The CUDA shared library being available at runtime, # which we make available by setting the LD_LIBRARY_PATH # variable for the duration of the makefile. # # Note that you can set your PATH and LD_LIBRARY_PATH variables as part of your # .bash_profile so that you can compile and run without using this makefile. NVCCFLAGS := -O3 -gencode arch=compute_60,code=sm_60 NVCC := /usr/local/cuda/bin/nvcc LD_LIBRARY_PATH := /usr/local/cuda/lib64 all: jacobi_iteration jacobi_iteration: jacobi_iteration.cu jacobi_iteration_gold.cpp $(NVCC) -o jacobi_iteration jacobi_iteration.cu jacobi_iteration_gold.cpp $(NVCCFLAGS) clean: rm jacobi_iteration # A simple CUDA makefilevim