Correct way to measure execution and data transfer time

Hi

I’m new to CUDA in general. The following is part of my code, with a number of timing events.

import math
import xarray as xr
from numba import cuda

satur_xr=xr.open_dataset('/f/work/julich/dg-rr/dg_rr_elshambakey/plant_available_water/satur.nc')
poro_xr=xr.open_dataset('/f/work/julich/dg-rr/dg_rr_elshambakey/plant_available_water/poro.nc')
s_cuda_event=cuda.event() # To record kernel start
e_cuda_event=cuda.event() # To record kernel end
data_to_device_start_time_event=cuda.event() # To record start of moving data from host to device
data_to_device_end_time_event=cuda.event() # To record end of moving data from host to device
data_to_host_start_time_event=cuda.event() # To record start of moving data back from device to host
data_to_host_end_time_event=cuda.event() # To record end of moving data back from device to host

@cuda.jit
def PlantAvailableWater_cuda(Poro=None,satur=None,Dz=1,Dzmult=None,wlt_pt_p=-150,fld_cp_p=-0.6,Nx=0,Ny=0,Nz=0,Alpha=None,Nvg=None,Sres=None,init_paw=None):
    
    ty,tz=cuda.grid(2)
    Ssat=1
    satur_vol=0.0
    wlt_pt_vol=0.0
    fld_cp_vol=0.0

    for k in range(Nz):
        kk = Nz-k-1 # iterate levels in reverse order (kk = Nz-1 ... 0)
        m = 1.0 - 1.0/Nvg[kk,ty%Ny,tz%Nx] # van Genuchten exponent m = 1 - 1/n

        # relative saturation at the wilting-point pressure head (van Genuchten retention curve)
        if wlt_pt_p<0.0:
            wlt_pt=(Ssat - Sres[kk,ty%Ny,tz%Nx])/((1.0 + (Alpha[kk,ty%Ny,tz%Nx]*abs(wlt_pt_p))**Nvg[kk,ty%Ny,tz%Nx])**m) + Sres[kk,ty%Ny,tz%Nx]
        else:
            wlt_pt=1.0

        # relative saturation at the field-capacity pressure head (van Genuchten retention curve)
        if fld_cp_p<0.0:
            fld_cp=(Ssat - Sres[kk,ty%Ny,tz%Nx])/((1.0 + (Alpha[kk,ty%Ny,tz%Nx]*abs(fld_cp_p))**Nvg[kk,ty%Ny,tz%Nx])**m) + Sres[kk,ty%Ny,tz%Nx]
        else:
            fld_cp=1.0

        # accumulate water volumes weighted by layer thickness (Dz*Dzmult) and porosity
        satur_vol += (satur[int(ty/Ny),kk,ty%Ny,tz%Nx] * Dz * Dzmult[kk] * Poro[int(ty/Ny),kk,ty%Ny,tz%Nx])
        wlt_pt_vol += (wlt_pt * Dz * Dzmult[kk] * Poro[int(ty/Ny),kk,ty%Ny,tz%Nx])
        fld_cp_vol += (fld_cp * Dz * Dzmult[kk] * Poro[int(ty/Ny),kk,ty%Ny,tz%Nx])

        # plant-available water as a percentage of the field-capacity range
        init_paw[int(ty/Ny),kk,ty%Ny,tz%Nx] = (satur_vol - wlt_pt_vol) / (fld_cp_vol - wlt_pt_vol) *100.

data_to_device_start_time_event.record()
poro_cu=cuda.to_device(poro_xr['porosity'].data)
satur_cu=cuda.to_device(satur_xr['satur'].data)
Dzmult_cu=cuda.to_device(Dzmult)
alpha_cu=cuda.to_device(d['alpha'].numpy())
nvg_cu=cuda.to_device(d['nvg'].numpy())
sres_cu=cuda.to_device(d['sres'].numpy())
init_paw_cu=cuda.to_device(init_paw)
data_to_device_end_time_event.record()
data_to_device_end_time_event.synchronize()
print('Time to move data to device using cuda events: '+str(cuda.event_elapsed_time(data_to_device_start_time_event,data_to_device_end_time_event)/1000)) # cuda.event_elapsed_time returns milliseconds, so divide by 1000 for seconds

for tk in [(10,10),(15,15),(20,20),(25,25),(26,26),(27,27),(28,28),(29,29),(30,30),(10,10),(20,20)]:
   
    try:
        
        no_threads_per_block=tk
        print("No thread per block "+str(tk))
        no_blocks_per_grid_y=math.ceil(poro_xr.dims['z']*poro_xr.dims['lat']/no_threads_per_block[0])
        no_blocks_per_grid_z=math.ceil(poro_xr.dims['z']*poro_xr.dims['lon']/no_threads_per_block[1])
        no_blocks_per_grid=(no_blocks_per_grid_y,no_blocks_per_grid_z)

        s_cuda_event.record()

        PlantAvailableWater_cuda[no_blocks_per_grid,no_threads_per_block](poro_cu,satur_cu,\
                      nrw_paw_params['dz'],Dzmult_cu,\
                      nrw_paw_params['wlt_pt_p'],nrw_paw_params['fld_cp_p'],\
                      poro_xr['lon'].size,poro_xr['lat'].size,poro_xr['lev'].size,\
                      alpha_cu,nvg_cu,sres_cu,init_paw_cu)
        e_cuda_event.record()
        e_cuda_event.synchronize()
        print('Numba cuda computation time using cuda events: '+str(cuda.event_elapsed_time(s_cuda_event,e_cuda_event)/1000)) # cuda.event_elapsed_time returns milliseconds, so divide by 1000 for seconds

        # Move data back from device to host
        data_to_host_start_time_event.record()
        init_paw=init_paw_cu.copy_to_host()
        data_to_host_end_time_event.record()
        data_to_host_end_time_event.synchronize()
        print('Time to move data back to host using cuda event: '+str(cuda.event_elapsed_time(data_to_host_start_time_event,data_to_host_end_time_event)/1000)) # cuda.event_elapsed_time returns milliseconds, so divide by 1000 for seconds
    except Exception as e:
        print(e) # Print error
        continue

I’m trying to measure the time it takes to transfer data between the host and device, as well as the kernel execution time. Is this the correct way to measure these times? I also tried nvprof with the --csv option, but it fails with an error that the application is not found, even though other options such as --log-file work.
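
In case it helps to reproduce the timing question without my data, below is a stripped-down sketch of the same event-timing pattern with a toy kernel. The kernel, array size, and variable names are made up purely for illustration; I added a warm-up launch (so the first-call JIT compilation isn’t counted in the kernel timing) and a host-side timer as a rough cross-check:

import time
import numpy as np
from numba import cuda

@cuda.jit
def scale_kernel(arr, factor):
    i = cuda.grid(1)
    if i < arr.size: # guard threads that fall outside the array
        arr[i] *= factor

x = np.random.rand(1 << 20)
threads = 256
blocks = (x.size + threads - 1) // threads

d_x = cuda.to_device(x)
scale_kernel[blocks, threads](d_x, 2.0) # warm-up launch: triggers JIT compilation
cuda.synchronize()

s_evt = cuda.event(timing=True)
e_evt = cuda.event(timing=True)

host_t0 = time.perf_counter()
s_evt.record()
scale_kernel[blocks, threads](d_x, 2.0)
e_evt.record()
e_evt.synchronize() # wait until the kernel and the end event have completed
host_t1 = time.perf_counter()

print('kernel time via events [s]: '+str(cuda.event_elapsed_time(s_evt, e_evt)/1000)) # milliseconds -> seconds
print('kernel time via host timer [s]: '+str(host_t1 - host_t0)) # includes launch/sync overhead, so slightly larger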

Regards