Introduction to Parallel Computer Architecture
jacobi_solver/jacobi_iteration.cu
/* Host side code for the Jacobi method of solving a system of linear equations by iteration. Author: Naga Kandasamy Date modified: 3/9/2018 Compile as follows: make clean && make */ #include <stdlib.h> #include <stdio.h> #include <sys/time.h> #include <string.h> #include <math.h> #include <cuda_runtime.h> #include "jacobi_iteration.h" // Include the kernel code #include "jacobi_iteration_kernel.cu" // Uncomment the line below if you want the code to spit out some debug information // #define DEBUG // Prototypes of external functions called in this file extern "C" void compute_gold (const Matrix, Matrix, const Matrix); extern "C" void display_jacobi_solution (const Matrix, const Matrix, const Matrix); // Prototypes of local functions used in this file Matrix allocate_matrix_on_gpu (const Matrix); Matrix allocate_matrix (int, int, int); int check_if_diagonal_dominant (const Matrix); Matrix create_diagonally_dominant_matrix (unsigned int, unsigned int); void copy_matrix_to_device (Matrix, const Matrix); void copy_matrix_from_device (Matrix, const Matrix); void compute_on_device (const Matrix, Matrix, const Matrix); int perform_simple_check (const Matrix); void print_matrix (const Matrix); float get_random_number (int, int); void checkCUDAError (const char *); int checkResults( float *, float *, int, float); int main(int argc, char** argv) { if(argc > 1){ printf("Error. This program accepts no arguments. \n"); exit(0); } Matrix A; // The NxN constant matrix Matrix B; // The Nx1 input matrix Matrix reference_x; // The reference solution Matrix gpu_solution_x; // The solution computed by the GPU // Initialize the random number generator with a seed value srand(time(NULL)); // Create the diagonally dominant matrix A = create_diagonally_dominant_matrix (MATRIX_SIZE, MATRIX_SIZE); if (A.elements == NULL){ printf ("Error creating matrix. \n"); exit (0); } B = allocate_matrix (MATRIX_SIZE, 1, 1); // Create a matrix B holding the constants reference_x = allocate_matrix (MATRIX_SIZE, 1, 0); // Create a matrix for the reference solution gpu_solution_x = allocate_matrix (MATRIX_SIZE, 1, 0); // Create a matrix for the GPU solution #ifdef DEBUG print_matrix (A); print_matrix (B); print_matrix (reference_x); #endif // Compute the Jacobi solution on the CPU printf("Performing Jacobi iteration on the CPU. \n"); compute_gold (A, reference_x, B); display_jacobi_solution(A, reference_x, B); // Display statistics // Compute the Jacobi solution on the GPU. The solution is returned in gpu_solution_x printf("\n Performing Jacobi iteration on the GPU. \n"); compute_on_device (A, gpu_solution_x, B); display_jacobi_solution(A, gpu_solution_x, B); // Display statistics free(A.elements); free(B.elements); free(reference_x.elements); free(gpu_solution_x.elements); exit(0); } // Complete this function to perform the Jacobi calculation on the GPU void compute_on_device(const Matrix A, Matrix gpu_solution_x, const Matrix B){ } // Allocate a device matrix of same size as M. Matrix allocate_matrix_on_gpu(const Matrix M){ Matrix Mdevice = M; int size = M.num_rows * M.num_columns * sizeof(float); cudaMalloc((void**)&Mdevice.elements, size); return Mdevice; } // Allocate a matrix of dimensions height*width // If init == 0, initialize to all zeroes. // If init == 1, perform random initialization. Matrix allocate_matrix(int num_rows, int num_columns, int init){ Matrix M; M.num_columns = M.pitch = num_columns; M.num_rows = num_rows; int size = M.num_rows * M.num_columns; M.elements = (float*) malloc(size*sizeof(float)); for(unsigned int i = 0; i < size; i++){ if(init == 0) M.elements[i] = 0; else M.elements[i] = get_random_number(MIN_NUMBER, MAX_NUMBER); } return M; } // Copy a host matrix to a device matrix. void copy_matrix_to_device(Matrix Mdevice, const Matrix Mhost) { int size = Mhost.num_rows * Mhost.num_columns * sizeof(float); Mdevice.num_rows = Mhost.num_rows; Mdevice.num_columns = Mhost.num_columns; Mdevice.pitch = Mhost.pitch; cudaMemcpy(Mdevice.elements, Mhost.elements, size, cudaMemcpyHostToDevice); } // Copy a device matrix to a host matrix. void copy_matrix_from_device(Matrix Mhost, const Matrix Mdevice){ int size = Mdevice.num_rows * Mdevice.num_columns * sizeof(float); cudaMemcpy(Mhost.elements, Mdevice.elements, size, cudaMemcpyDeviceToHost); } // Prints the matrix out to screen void print_matrix(const Matrix M){ for(unsigned int i = 0; i < M.num_rows; i++){ printf("Line number = %d ############## \n", i); for(unsigned int j = 0; j < M.num_columns; j++){ printf("%f ", M.elements[i*M.num_rows + j]); } printf("\n"); } printf("\n"); printf("####################################### \n"); } // Returns a random floating-point number between the specified min and max values float get_random_number(int min, int max){ return (float)floor((double)(min + (max - min + 1)*((float)rand()/(float)RAND_MAX))); } // Check for errors in kernel execution void checkCUDAError(const char *msg) { cudaError_t err = cudaGetLastError(); if( cudaSuccess != err) { printf("CUDA ERROR: %s (%s).\n", msg, cudaGetErrorString(err)); exit(EXIT_FAILURE); } } int checkResults(float *reference, float *gpu_result, int num_elements, float threshold) { int checkMark = 1; float epsilon = 0.0; for(int i = 0; i < num_elements; i++){ if(fabsf((reference[i] - gpu_result[i])/reference[i]) > threshold){ checkMark = 0; printf("error at %d \n",i); printf("element r %f and g %f \n",reference[i] ,gpu_result[i]); break; } } int maxEle; for(int i = 0; i < num_elements; i++){ if(fabsf((reference[i] - gpu_result[i])/reference[i]) > epsilon){ epsilon = fabsf((reference[i] - gpu_result[i])/reference[i]); maxEle=i; } } printf("Max epsilon = %f at i = %d value at cpu %f and gpu %f \n", epsilon,maxEle,reference[maxEle],gpu_result[maxEle]); return checkMark; } /* Function checks if the matrix is diagonally dominant. */ int check_if_diagonal_dominant(const Matrix M) { float diag_element; float sum; for(unsigned int i = 0; i < M.num_rows; i++){ sum = 0.0; diag_element = M.elements[i * M.num_rows + i]; for(unsigned int j = 0; j < M.num_columns; j++){ if(i != j) sum += abs(M.elements[i * M.num_rows + j]); } if(diag_element <= sum) return 0; } return 1; } /* Create a diagonally dominant matix. */ Matrix create_diagonally_dominant_matrix (unsigned int num_rows, unsigned int num_columns) { Matrix M; M.num_columns = M.pitch = num_columns; M.num_rows = num_rows; unsigned int size = M.num_rows * M.num_columns; M.elements = (float *) malloc (size * sizeof (float)); // Create a matrix with random numbers between [-.5 and .5] unsigned int i, j; printf ("Creating a %d x %d matrix with random numbers between [-.5, .5]...", num_rows, num_columns); for(i = 0; i < size; i++) // M.elements[i] = ((float)rand ()/(float)RAND_MAX) - 0.5; M.elements[i] = get_random_number(MIN_NUMBER, MAX_NUMBER); printf("done. \n"); // Make the diagonal entries large with respect to the entries on each row printf("Generating the positive definite matrix."); for (i = 0; i < num_rows; i++){ float row_sum = 0.0; for(j = 0; j < num_columns; j++){ row_sum += fabs (M.elements[i * M.num_rows + j]); } M.elements[i * M.num_rows + i] = 0.5 + row_sum; } if(!check_if_diagonal_dominant (M)){ free (M.elements); M.elements = NULL; } return M; }
jacobi_solver/jacobi_iteration.h
#ifndef _MATRIX_H_ #define _MATRIX_H_ #define THRESHOLD 1e-5 // Threshold for convergence #define MIN_NUMBER 2 // Min number in the A and b matrices #define MAX_NUMBER 10 // Max number in the A and b matrices #define THREAD_BLOCK_SIZE 128 // Size of a thread block #define NUM_BLOCKS 32 // Number of thread blocks // Dimension of the n x n matrix #define MATRIX_SIZE 1024 #define NUM_COLUMNS MATRIX_SIZE // Number of columns in Matrix A #define NUM_ROWS MATRIX_SIZE // Number of rows in Matrix A // Matrix Structure declaration typedef struct Matrix { //width of the matrix represented unsigned int num_columns; //height of the matrix represented unsigned int num_rows; //number of elements between the beginnings of adjacent // rows in the memory layout (useful for representing sub-matrices) unsigned int pitch; //Pointer to the first element of the matrix represented float* elements; unsigned int thread_id; } Matrix; #endif // _MATRIX_H_
jacobi_solver/jacobi_iteration_gold.cpp
jacobi_solver/jacobi_iteration_gold.cpp
/* Reference code for solving the equation by jacobi by iteration method. */
#include
<
stdio
.
h
>
#include
<
stdlib
.
h
>
#include
<
math
.
h
>
#include
"jacobi_iteration.h"
extern
Matrix
allocate_matrix
(
int
num_rows
,
int
num_columns
,
int
init
);
extern
"C"
void
display_jacobi_solution
(
const
Matrix
A
,
const
Matrix
reference
,
const
Matrix
B
);
extern
"C"
void
compute_gold
(
const
Matrix
A
,
const
Matrix
reference
,
const
Matrix
B
);
void
compute_gold
(
const
Matrix
A
,
Matrix
ref_x
,
const
Matrix
B
)
{
unsigned
int
i
,
j
,
k
;
unsigned
int
num_rows
=
A
.
num_rows
;
// Rows in matrix A
unsigned
int
num_cols
=
A
.
num_columns
;
// Columns in matrix A
Matrix
new_x
=
allocate_matrix
(
MATRIX_SIZE
,
1
,
0
);
// Allocate n x 1 matrix to hold iteration values
// Initialize the current jacobi solution
for
(
i
=
0
;
i
<
num_rows
;
i
++
)
ref_x
.
elements
[
i
]
=
B
.
elements
[
i
];
// Perform the Jacobi iteration
unsigned
int
done
=
0
;
double
ssd
,
mse
;
unsigned
int
num_iter
=
0
;
while
(
!
done
){
for
(
i
=
0
;
i
<
num_rows
;
i
++
){
double
sum
=
0.0
;
for
(
j
=
0
;
j
<
num_cols
;
j
++
){
if
(
i
!=
j
)
sum
+=
A
.
elements
[
i
*
num_cols
+
j
]
*
ref_x
.
elements
[
j
];
}
// Calculate the new values for the unkowns for the current row
new_x
.
elements
[
i
]
=
(
B
.
elements
[
i
]
-
sum
)
/
A
.
elements
[
i
*
num_cols
+
i
];
}
/* Note: you can optimize the above nested loops by removing the branch statement within
* the j loop. The rewritten code is as follows:
*
* for (i = 0; i < num_rows; i++){
* double sum = -A.elements[i * num_cols + i] * ref_x.elements[i];
* for (j = 0; j < num_cols; j++)
* sum += A.elements[i * num_cols + j] * ref_x.elements[j];
* }
*
* new_x.elements[i] = (B.elements[i] - sum)/A.elements[i * num_cols + i];
*
* I recommend using this code snippet within your GPU kernel implementation.
*
* */
// Check for convergence and update the unknowns
ssd
=
0.0
;
// Sum of squared differences
for
(
i
=
0
;
i
<
num_rows
;
i
++
){
ssd
+=
(
new_x
.
elements
[
i
]
-
ref_x
.
elements
[
i
])
*
(
new_x
.
elements
[
i
]
-
ref_x
.
elements
[
i
]);
ref_x
.
elements
[
i
]
=
new_x
.
elements
[
i
];
}
num_iter
++
;
mse
=
sqrt
(
ssd
);
// Mean squared error
printf
(
"MSE during teration %d is %f \n"
,
num_iter
,
mse
);
if
(
mse
<=
THRESHOLD
)
done
=
1
;
}
printf
(
"Convergence achieved after %d iterations \n"
,
num_iter
);
free
(
new_x
.
elements
);
}
/* Function to display statistic related to the Jacobi solution. */
void
display_jacobi_solution
(
const
Matrix
A
,
const
Matrix
ref_x
,
const
Matrix
B
)
{
double
diff
=
0.0
;
unsigned
int
num_rows
=
A
.
num_rows
;
unsigned
int
num_cols
=
A
.
num_columns
;
for
(
unsigned
int
i
=
0
;
i
<
num_rows
;
i
++
){
double
line_sum
=
0.0
;
for
(
unsigned
int
j
=
0
;
j
<
num_cols
;
j
++
){
line_sum
+=
A
.
elements
[
i
*
num_cols
+
j
]
*
ref_x
.
elements
[
j
];
}
diff
+=
fabsf
(
line_sum
-
B
.
elements
[
i
]);
}
printf
(
"Average diff between LHS and RHS: %f \n"
,
diff
/
(
float
)
num_rows
);
}
jacobi_solver/jacobi_iteration_kernel.cu
#include "jacobi_iteration.h" // Write the GPU kernel to solve the Jacobi iterations __global__ void jacobi_iteration_kernel () { }
jacobi_solver/Makefile
# A simple CUDA makefile # # Author: Naga Kandasamy # Date: 3/9/2018 # # CUDA depends on two things: # 1) The CUDA nvcc compiler, which needs to be on your path, # or called directly, which we do here # 2) The CUDA shared library being available at runtime, # which we make available by setting the LD_LIBRARY_PATH # variable for the duration of the makefile. # # Note that you can set your PATH and LD_LIBRARY_PATH variables as part of your # .bash_profile so that you can compile and run without using this makefile. NVCCFLAGS := -O3 -gencode arch=compute_60,code=sm_60 NVCC := /usr/local/cuda/bin/nvcc LD_LIBRARY_PATH := /usr/local/cuda/lib64 all: jacobi_iteration jacobi_iteration: jacobi_iteration.cu jacobi_iteration_gold.cpp $(NVCC) -o jacobi_iteration jacobi_iteration.cu jacobi_iteration_gold.cpp $(NVCCFLAGS) clean: rm jacobi_iteration # A simple CUDA makefilevim