TIL rocprof -d flag

I work with AMD GPUs and ROCm/HIP a little bit. If I write an MPI program that also uses HIP such that each MPI task is running on its own GPU, it would sometimes be useful for me to profile each rank. AMD provides a tool called rocprof which is very useful, but by default it will output a single results.csv file. If you’re profiling multiple ranks, different ranks can end up overwriting each other’s results.csv.

Say you had a silly little program where each MPI task is adding two arrays using a GPU. It might look something like:

#include <stdio.h>
#include <math.h>
#include <mpi.h>
#include <hip/hip_runtime.h>

/* ---------------------------------------------------------------------------------
Macro for checking errors in HIP API calls
--------------------------------------------------------------------------------- */
#define hipErrorCheck(call)                                                                 \
do{                                                                                         \
    hipError_t hipErr = call;                                                               \
    if(hipSuccess != hipErr){                                                               \
        printf("HIP Error - %s:%d: '%s'\n", __FILE__, __LINE__, hipGetErrorString(hipErr)); \
        exit(1); /* nonzero status: an error path must not report success */                \
    }                                                                                       \
}while(0) /* do/while(0) so the macro behaves as a single statement */
/* ---------------------------------------------------------------------------------
Vector addition kernel
__global__ void add_vectors(double *a, double *b, double *c, int n){
    int id = blockDim.x * blockIdx.x + threadIdx.x;
    if(id < n) c[id] = a[id] + b[id];

/* ---------------------------------------------------------------------------------
Main program
int main(int argc, char *argv[]){
    MPI_Init(&argc, &argv);
    int size;
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    int rank;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    char name[MPI_MAX_PROCESSOR_NAME];
    int resultlength;
    MPI_Get_processor_name(name, &resultlength);
    if(rank == 0) {
      printf("number of ranks: %d\n", size);
    // Start Total Runtime Timer
    double start_time, end_time, elapsed_time;
    start_time = MPI_Wtime();

    // Array length
    long long int N = 256*1024*1024;

    size_t buffer_size = N * sizeof(double);

    double *A = (double*)malloc(buffer_size);
    double *B = (double*)malloc(buffer_size);
    double *C = (double*)malloc(buffer_size);

    for(int i=0; i<N; i++){
        A[i] = 1;
        B[i] = 1;
        C[i] = 0.0;

    double *d_A, *d_B, *d_C;
    hipErrorCheck( hipMalloc(&d_A, buffer_size) );
    hipErrorCheck( hipMalloc(&d_B, buffer_size) );
    hipErrorCheck( hipMalloc(&d_C, buffer_size) );

    hipErrorCheck( hipMemcpy(d_A, A, buffer_size, hipMemcpyHostToDevice) );
    hipErrorCheck( hipMemcpy(d_B, B, buffer_size, hipMemcpyHostToDevice) );

    hipEvent_t start, end;
    hipErrorCheck( hipEventCreate(&start) );
    hipErrorCheck( hipEventCreate(&end) );

    // Set execution configuration parameters
    int thr_per_blk = 256;
    int blk_in_grid = ceil( float(N) / thr_per_blk );

    hipErrorCheck( hipEventRecord(start, NULL) );
    add_vectors<<<dim3(blk_in_grid), dim3(thr_per_blk), 0, hipStreamDefault>>>(d_A, d_B, d_C, N);

    hipErrorCheck( hipEventRecord(end, NULL) );
    hipErrorCheck( hipEventSynchronize(end) );
    float milliseconds = 0.0;
    hipErrorCheck( hipEventElapsedTime(&milliseconds, start, end) ); 

    hipErrorCheck( hipMemcpy(C, d_C, buffer_size, hipMemcpyDeviceToHost) );

    float max_gpu_time;
    MPI_Reduce(&milliseconds, &max_gpu_time, 1, MPI_FLOAT, MPI_MAX, 0, MPI_COMM_WORLD);

    double sum = 0.0;
    for(int i=0; i<N; i++){
        sum = sum + C[i];

    double result = sum / (double)(2*N);

    if(result != 1){
        printf("In rank %d: Test failed!\n", rank);

    hipErrorCheck( hipFree(d_A) );
    hipErrorCheck( hipFree(d_B) );
    hipErrorCheck( hipFree(d_C) );


    end_time = MPI_Wtime();
    elapsed_time = end_time - start_time;
    double total_time_max;
    MPI_Reduce(&elapsed_time, &total_time_max, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);

    if(rank == 0) {
      printf("Result              = %.16f\n", result);
      printf("Array buffer size   = %zu\n", buffer_size);
      printf("Max GPU time (s)    = %.6f\n", max_gpu_time / 1000.0);
      printf("Max MPI time (s)    = %.6f\n", total_time_max / 1000.0);
    return 0;

And you run on a system with a Slurm scheduler like below which starts 4 MPI tasks:

srun -N1 -n4 --gpus-per-task=1 rocprof ./vecAdd

This would create four instances of rocprof that would clobber each other’s results.csv output.

A way around this is by using the -d flag to specify a directory. So instead your srun command could look like

srun -N1 -n4 --gpus-per-task=1 rocprof -d results ./vecAdd

Now each rank will generate its own subdirectory inside the results directory, which will be created in your current working directory. If you look at your results directory you will see

$ ls results
rpl_data_230815_193349_122832  rpl_data_230815_193349_122833  rpl_data_230815_193349_76989  rpl_data_230815_193349_76990

where each rank has its own directory. You can find the profiling results for one of the ranks for example in rpl_data_230815_193349_76990/input_results_230815_193349/results.txt

There’s probably a more streamlined way to get each rank to print its profiling results instead of having to dig through the results directory (and this could potentially have problems when you’re using MPI across multiple nodes and the PIDs of two MPI ranks on separate nodes happen to be the same). But at least learning about the -d flag is a start.