Use the MinoTauro User's Guide provided by the teacher to solve the tasks in this lab.
There are 2 supported methods for submitting jobs. The first one is to use a wrapper maintained by the Operations Team at BSC that provides a standard syntax regardless of the underlying Batch system (mnsubmit). The other one is to use the SLURM sbatch directives directly. The second option is recommended for advanced users only.
Node | n = 131072 | n = 262144 | n = 524288 |
---|---|---|---|
1 | |||
2 | |||
4 |||
Hint: Example job file (2 Nodes)
#!/bin/bash
#SBATCH --job-name=cuda_k80
#SBATCH -D .
#SBATCH --output=k80_%j.out
#SBATCH --error=k80_%j.err
#SBATCH --ntasks=2
#SBATCH --ntasks-per-node=1
#SBATCH --nodes=2
#SBATCH --gres gpu:2
#SBATCH --cpus-per-task=8
#SBATCH --constraint=k80
#SBATCH --time=00:02:00
...
Hint: Add this to run a job using the reservation queue
#SBATCH --reservation=YOUR_RESERVATION
Hint: Add this to run a job using the debug queue
#SBATCH --partition=debug
#SBATCH --qos=debug
Hint: Ask for an interactive node using:
mnsh -k -g 1
#include <stdlib.h>
#include <time.h>
/* CPU reference implementation of element-wise vector addition.
 *
 * Computes C[i] = A[i] + B[i] for every i in [0, N). A and B are the
 * input arrays; C is the caller-allocated output array, and all three
 * must hold at least N floats. Serves as the correctness baseline for
 * the GPU version.
 */
void sumArraysOnHost(float *A, float *B, float *C, const int N)
{
    for (int i = 0; i < N; ++i)
        C[i] = A[i] + B[i];
}
/* Fill ip[0..size-1] with pseudo-random floats in the range [0.0, 25.5].
 *
 * Each value is (rand() & 0xFF) / 10.0f, i.e. an 8-bit value scaled down,
 * so results are small and easy to verify against the CPU reference.
 * The PRNG is reseeded from the current time on every call, so output
 * differs between runs (and calls in the same second repeat the seed).
 */
void initialData(float *ip, int size)
{
    /* generate a different seed for the random number generator */
    time_t t;
    srand((unsigned) time(&t));
    int i;  /* fixed: the original was missing this semicolon */
    for (i = 0; i < size; i++)
    {
        ip[i] = (float)(rand() & 0xFF) / 10.0f;
    }
    return;
}
/* Host-only driver: allocates three vectors of nElem floats, fills the
 * two inputs with random data, and computes their sum on the CPU.
 * Returns 0 on success, EXIT_FAILURE if any allocation fails.
 * (This is the starting point; the lab adds the GPU path on top.)
 */
int main(int argc, char **argv)
{
    int nElem = 1024;
    size_t nBytes = nElem * sizeof(float);

    float *h_A = (float *)malloc(nBytes);
    float *h_B = (float *)malloc(nBytes);
    float *h_C = (float *)malloc(nBytes);

    /* fixed: the original never checked the malloc results */
    if (h_A == NULL || h_B == NULL || h_C == NULL)
    {
        free(h_A);
        free(h_B);
        free(h_C);
        return EXIT_FAILURE;
    }

    initialData(h_A, nElem);
    initialData(h_B, nElem);

    sumArraysOnHost(h_A, h_B, h_C, nElem);

    free(h_A);
    free(h_B);
    free(h_C);
    return 0;
}
Some help:
//Memory allocation:
float *d_A, *d_B, *d_C;
cudaMalloc((float**)&d_A, nBytes);
cudaMalloc((float**)&d_B, nBytes);
cudaMalloc((float**)&d_C, nBytes);
// Transfer the data from the CPU memory to the GPU global memory
cudaMemcpy(d_A, h_A, nBytes, cudaMemcpyHostToDevice);
cudaMemcpy(d_B, h_B, nBytes, cudaMemcpyHostToDevice);
// with the parameter cudaMemcpyHostToDevice specifying the transfer direction.
//Copy the result from the GPU memory back to the host:
cudaMemcpy(h_C, d_C, nBytes, cudaMemcpyDeviceToHost); // h_C is the host result array from the example above
// Release the memory used on the GPU
cudaFree(d_A);
cudaFree(d_B);
cudaFree(d_C);
}