Use the MinoTauro User's Guide provided by the teacher to solve the tasks in this lab.

Example code of matrix multiplication using CUDA and MPI

Use the following skeleton of a matrix multiplication program using CUDA and MPI to solve the tasks of this lab (the code is also available HERE):


/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
/* MULTI-NODE AND PARALLEL MATRIX-MATRIX PRODUCT WITH MPI AND CUDA           */
/*                                                                           */
/* File:         mmpmpicuda.cu                                               */
/* Author:       Alberto Pou Quirós (Github: bertini36)                      */
/* Revisited by: Francesc Sastre Cabot (Github: xiscosc)                     */
/* Description:  This program performs a matrix product (A * B = C)          */
/*               distributing the computation between multiple nodes         */
/*               with MPI technology and parallelizing the computation in    */
/*               every node with Nvidia CUDA technology                      */
/* Compilation:  nvcc -I/opt/mpi/bullxmpi/1.2.9.1/include                    */
/*               -L/opt/mpi/bullxmpi/1.2.9.1/lib -lmpi -ldl -lm -lnuma       */
/*               -lrt -lnsl -lutil -lm -ldl mmpmpicuda.cu -o mmpmpicuda      */
/* Strategy:                                                                 */
/*                  Example 16x16 matrices with 4 nodes:                     */
/*                   _________________16________________                     */
/*                   |                                 |                     */
/*                   |               NODE 1            | 4                   */
/*                   |_________________________________|                     */
/*                   |                                 |                     */
/*                   |               NODE 2            | 4                   */
/*              C =  |_________________________________|    16               */
/*                   |                                 |                     */
/*                   |               NODE 3            | 4                   */
/*                   |_________________________________|                     */
/*                   |                                 |                     */
/*                   |               NODE 4            | 4                   */
/*                   |_________________________________|                     */
/*                                                                           */
/*                  Node 1 computes 4 rows of result matrix:                 */
/*                   __________________________________                      */
/*                   |                                 |                     */
/*                   |         4x16 CUDA block         |                     */
/*                   |_________________________________|                     */
/*                                                                           */
/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */

#include <sys/time.h>
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <assert.h>
#include <time.h>
#include <mpi.h>

#define N 1024 // Must be a multiple of 32; minimum 32 * number of nodes.

#define err(format, ...) do { fprintf(stderr, format, ##__VA_ARGS__); exit(1); } while (0)

struct timeval start_time, end_time;

inline void checkCuda(cudaError_t e) {
    if (e != cudaSuccess) {
        err("CUDA Error %d: %s\n", e, cudaGetErrorString(e));
    }
}

// Each thread computes one element of this node's block of C. "from" is the
// first global row assigned to the node, so A is read with the global row
// index while the node's block of C is indexed locally starting at row 0.
__global__ void matrixProduct(double *matrix_a, double *matrix_b, double *matrix_c, int width, int from, int my_rank) {
    int row = threadIdx.y + blockDim.y * blockIdx.y;    // local row within this node's block
    int col = threadIdx.x + blockDim.x * blockIdx.x;    // global column
    matrix_c[row * width + col] = 0;
    for (int k=0; k < width; k++) {
        matrix_c[row * width + col] += matrix_a[((row + from) * width) + k] * matrix_b[k * width + col];
    }
}

void initializeMatrices(double matrix_a[N][N], double matrix_b[N][N]) {
    int i, j;
    srand(time(NULL));
    for (i=0; i<N; i++) {
        for (j=0; j<N; j++) {
            matrix_a[i][j] = rand();
            matrix_b[i][j] = rand();
        }
    }
}

void showMatrices(double matrix_a[N][N], double matrix_b[N][N], double matrix_c[N][N]) {
    int i, j;
    printf("***** MATRIX A ***** \n");
    for (i=0; i<N; i++) {
        for (j=0; j<N; j++) {
            (j % N == N-1) ? printf("%.1f \n", matrix_a[i][j]) : printf("%.1f,", matrix_a[i][j]);
        }
    }
    printf("***** MATRIX B ***** \n");
    for (i=0; i<N; i++) {
        for (j=0; j<N; j++) {
            (j % N == N-1) ? printf("%.1f \n", matrix_b[i][j]) : printf("%.1f,", matrix_b[i][j]);
        }
    }
    printf("***** RESULT MATRIX ***** \n");
    for (int i=0; i<N; i++) {
        for (int j=0; j<N; j++) {
            (j % N == N-1) ? printf("%f \n", matrix_c[i][j]) : printf("%f,", matrix_c[i][j]);
        }
    }
}


int main(int argc, char *argv[]) {

    // Static storage: with N = 1024 these three matrices occupy 24 MB, too large for a default stack
    static double A[N][N], B[N][N], C[N][N];
    double *d_a, *d_b, *d_c;
    int my_rank, comm_sz, from, to, nrows;

    // MPI initialization
    ...

    if (N % comm_sz != 0) {
        if (my_rank == 0) printf("Matrix size not divisible by number of processors \n");
        MPI_Finalize();
        exit(-1);
    }

    // Calculate the interval of rows that this node computes
    from = ...
    to = ...
    nrows = to - from;

    if (my_rank == 0) { initializeMatrices(A, B); }

    // Broadcast A and B to every node
    MPI_Bcast(...); // Send A
    MPI_Bcast(...); // Send B

    // Allocate memory in the device
    checkCuda(cudaMalloc((void **) &d_a, N*N*sizeof(double))); // Allocate A
    checkCuda(...); // Allocate B
    checkCuda(...); // Allocate C

    // Copy A and B to the device
    checkCuda(...);
    checkCuda(...);

    // CUDA grid and block configuration
    dim3 dimGrid(...);
    dim3 dimBlock(32, 32);    // MAX BLOCK SIZE

    MPI_Barrier(MPI_COMM_WORLD);
    if (my_rank == 0) { gettimeofday(&start_time, NULL); }

    // Kernel launch
    matrixProduct<<<..., ...>>>(d_a, d_b, d_c, N, from, my_rank);
    checkCuda(cudaDeviceSynchronize());
    checkCuda(cudaGetLastError());

    // Calculate compute time
    MPI_Barrier(MPI_COMM_WORLD);
    if (my_rank == 0) {
        gettimeofday(&end_time, NULL);
        printf("Compute time: %.1f ms \n", (float) (end_time.tv_sec - start_time.tv_sec) * 1000 + (end_time.tv_usec - start_time.tv_usec) / 1000);
    }

    // Get results from device
    checkCuda(cudaMemcpy(C[from], d_c, (nrows)*N*sizeof(double), ...));

    // Unify results from nodes
    MPI_Gather(...);

    // if (my_rank == 0)  { showMatrices(A, B, C); }

    checkCuda(cudaFree(d_a));
    checkCuda(cudaFree(d_b));
    checkCuda(cudaFree(d_c));

    MPI_Finalize();
    return 0;

}
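
For orientation, the sketch below shows one common way to fill in the row interval and the CUDA launch configuration for the row-block decomposition drawn in the header comment. It reuses the variables declared in main, assumes an even split of the N rows over the comm_sz processes (which the skeleton already enforces), and is only one possible completion, not the reference solution:

    // Sketch only: even row-block split (assumes N divisible by comm_sz and by 32)
    nrows = N / comm_sz;          // rows computed by this node
    from  = my_rank * nrows;      // first global row of this node's block
    to    = from + nrows;         // one past the last global row of the block

    dim3 dimBlock(32, 32);               // 32x32 = 1024 threads, the maximum block size
    dim3 dimGrid(N / 32, nrows / 32);    // blocks along x cover the columns, along y the block's rows

This matches the kernel above, where threadIdx.x/blockIdx.x select the column and threadIdx.y/blockIdx.y select the local row of the node's block.
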
Task: Complete the program using the skeleton and run it on MinoTauro:
  • Complete the program.
  • Modify the global constant N to fit your requirements.
  • Compile on the MinoTauro login node. Hint:
    
    nvcc -I/opt/mpi/bullxmpi/1.2.9.1/include -L/opt/mpi/bullxmpi/1.2.9.1/lib -lmpi -ldl -lm -lnuma -lrt -lnsl -lutil -lm -ldl mmpmpicuda.cu -o mmpmpicuda       
                                
  • Create a task file with K80 nodes and ask for 2 GPUs (2 is the minimum allowed); a sketch of such a job script is given after this list.
  • Run with different numbers of nodes and answer the following questions:
    • Does the computation time decrease linearly as you increase the number of nodes?
    • Is there a point beyond which the computation time no longer decreases?
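
As a starting point for the task file, a minimal SLURM job script might look like the sketch below. The way K80 nodes are selected, the module names, and any accounting options are assumptions here: take the exact directives from the MinoTauro User's Guide.

    #!/bin/bash
    #SBATCH --job-name=mmpmpicuda
    #SBATCH --output=mmpmpicuda_%j.out
    #SBATCH --error=mmpmpicuda_%j.err
    #SBATCH --nodes=2                 # vary this value for the scaling experiments
    #SBATCH --ntasks-per-node=1       # one MPI process per node
    #SBATCH --gres=gpu:2              # 2 GPUs, the minimum allowed
    #SBATCH --constraint=k80          # assumption: K80 nodes are selected with a constraint; check the User's Guide
    #SBATCH --time=00:10:00

    # Load the CUDA and MPI modules (exact names in the MinoTauro User's Guide)
    # module load ...

    mpirun ./mmpmpicuda
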