A question about CUDA multithreaded memory management

Hello everyone, I have another new question :slightly_smiling_face:

A program of mine produces a large amount of temporary data, which makes the GPU run out of memory. I therefore have to shrink the temporary buffers to match the number of parallel threads, but now the data in memory gets mixed up.

The simulation code is as follows. Its intent is to copy data set A to data set B through a temporary buffer that is smaller than A and B.
If you use the function “testThreadsPerBlockMemoryNormal”, the result is correct. The only difference between it and “testThreadsPerBlockMemory” is that the assignment operations sit in different loops. (In my algorithm, the two loops must be separate.)
I need some advice on how to get the same result as “testThreadsPerBlockMemoryNormal”.
----code----

import numpy as np
from numba import cuda


@cuda.jit()
def testThreadsPerBlockMemoryNormal(dataA, dataB, dataBuff, randomBuff):
    tx = cuda.threadIdx.x
    bx = cuda.blockIdx.x
    bw = cuda.blockDim.x
    pos = tx + bx * bw
    if pos < dataA.shape[0]:
        for i in range(dataBuff.shape[0]):
            dataBuff[tx] = dataA[pos * dataBuff.shape[0] + i]
            # delay
            b = 0
            for j in range(randomBuff[pos * dataBuff.shape[0] + i]):
                b += 1
            # delay end
            dataB[pos * dataBuff.shape[0] + i] = dataBuff[tx]


@cuda.jit()
def testThreadsPerBlockMemory(dataA, dataB, dataBuff, randomBuff):
    tx = cuda.threadIdx.x
    bx = cuda.blockIdx.x
    bw = cuda.blockDim.x
    pos = tx + bx * bw
    if pos < dataA.shape[0]:
        for i in range(dataBuff.shape[0]):
            dataBuff[tx] = dataA[pos * dataBuff.shape[0] + i]
        # delay begin
        b = 0
        for j in range(randomBuff[pos * dataBuff.shape[0] + i]):
            b += 1
        # delay end
        for i in range(dataBuff.shape[0]):
            dataB[pos * dataBuff.shape[0] + i] = dataBuff[tx]


def doTest():
    data_A_host = np.zeros(100, dtype=np.int32)
    data_B_host = np.array(data_A_host, dtype=np.int32)
    dataBuff_host = np.zeros(10, dtype=np.int32)
    randomBuff_host = np.array(data_A_host, dtype=np.int32)
    for i in range(data_A_host.shape[0]):
        data_A_host[i] = i
    for i in range(randomBuff_host.shape[0]):
        randomBuff_host[i] = np.random.randint(1, 100000000)

    dataA = cuda.to_device(data_A_host)
    dataB = cuda.to_device(data_B_host)
    dataBuff = cuda.to_device(dataBuff_host)
    randomBuff = cuda.to_device(randomBuff_host)
    testThreadsPerBlockMemory[10, 10](dataA, dataB, dataBuff, randomBuff)

    dataB.copy_to_host(data_B_host)
    cuda.synchronize()
    dataBuff.copy_to_host(dataBuff_host)
    cuda.synchronize()
    print('-----data_A_host------')
    print(data_A_host)
    print('-----data_B_host------')
    print(data_B_host)
    print('-----dataBuff_host------')
    print(dataBuff_host)

    for i in range(data_A_host.shape[0]):
        if data_A_host[i] != data_B_host[i]:
            print(data_A_host[i], data_B_host[i])


if __name__ == "__main__":
    doTest()

----print----

-----data_A_host------
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71
 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95
 96 97 98 99]
-----data_B_host------
[ 9  0  0  0  0  0  0  0  0  0 19  0  0  0  0  0  0  0  0  0 29  0  0  0
  0  0  0  0  0  0 39  0  0  0  0  0  0  0  0  0 49  0  0  0  0  0  0  0
  0  0 59  0  0  0  0  0  0  0  0  0 69  0  0  0  0  0  0  0  0  0 79  0
  0  0  0  0  0  0  0  0 89  0  0  0  0  0  0  0  0  0 99  0  0  0  0  0
  0  0  0  0]
-----dataBuff_host------
[0 0 0 0 0 0 0 0 0 0]
0 9
1 0
2 0
3 0
4 0
5 0
6 0
7 0
8 0
9 0
10 19
11 0
12 0
13 0
14 0
15 0
16 0
17 0
18 0
19 0
20 29
21 0
22 0
23 0
24 0
25 0
26 0
27 0
28 0
29 0
30 39
31 0
32 0
33 0
34 0
35 0
36 0
37 0
38 0
39 0
40 49
41 0
42 0
43 0
44 0
45 0
46 0
47 0
48 0
49 0
50 59
51 0
52 0
53 0
54 0
55 0
56 0
57 0
58 0
59 0
60 69
61 0
62 0
63 0
64 0
65 0
66 0
67 0
68 0
69 0
70 79
71 0
72 0
73 0
74 0
75 0
76 0
77 0
78 0
79 0
80 89
81 0
82 0
83 0
84 0
85 0
86 0
87 0
88 0
89 0
90 99
91 0
92 0
93 0
94 0
95 0
96 0
97 0
98 0
99 0

I’m a little slow today so I’m finding it hard to really understand the transformation here, but I note you have:

for j in range(randomBuff[pos * dataBuff.shape[0] + i])

in testThreadsPerBlockMemory() where i will happen to be its last value from the previous loop - did you mean to use i in this way?
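
For example, this is just plain Python behaviour, nothing CUDA-specific - after a for loop over a range finishes, the loop variable keeps the last value it took:

for i in range(10):
    pass
print(i)  # prints 9 - the final value of the loop variable

so in that version the delay loop runs only once, using the final value of i, rather than once per element as in testThreadsPerBlockMemoryNormal().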

Sorry, I reviewed my simulation code and the two functions were indeed not equivalent. I have now uploaded new code that makes them equivalent for a single thread, but with multiple threads the memory reads and writes still get out of sync.

----code----

import numpy as np
from numba import cuda


@cuda.jit()
def testThreadsPerBlockMemoryNormal(dataA, dataB, dataBuff, randomBuff):
    tx = cuda.threadIdx.x
    bx = cuda.blockIdx.x
    bw = cuda.blockDim.x
    pos = tx + bx * bw
    if bx <= dataA.shape[0] // dataBuff.shape[1]:
        for i in range(dataBuff.shape[1]):
            dataBuff[tx][i] = dataA[bx * dataBuff.shape[1] + i]
            # delay
            b = 0
            for j in range(randomBuff[bx * dataBuff.shape[1] + i]):
                b += 1
            # delay end
            dataB[bx * dataBuff.shape[1] + i] = dataBuff[tx][i]


@cuda.jit()
def testThreadsPerBlockMemory(dataA, dataB, dataBuff, randomBuff):
    tx = cuda.threadIdx.x
    bx = cuda.blockIdx.x
    bw = cuda.blockDim.x
    pos = tx + bx * bw

    if bx <= dataA.shape[0]//dataBuff.shape[1]:
        for i in range(dataBuff.shape[1]):
            dataBuff[tx][i] = dataA[bx * dataBuff.shape[1] + i]
            # delay begin
            b = 0
            for j in range(randomBuff[bx * dataBuff.shape[1] + i]):
                b += 1
            # delay end
        for i in range(dataBuff.shape[1]):
            dataB[bx * dataBuff.shape[1] + i] = dataBuff[tx][i]


def doTest():
    data_A_host = np.zeros(100, dtype=np.int32)
    data_B_host = np.array(data_A_host, dtype=np.int32)
    dataBuff_host = np.zeros((10, 10), dtype=np.int32)
    randomBuff_host = np.array(data_A_host, dtype=np.int32)
    for i in range(data_A_host.shape[0]):
        data_A_host[i] = i
    for i in range(randomBuff_host.shape[0]):
        randomBuff_host[i] = np.random.randint(1, 100000000)

    dataA = cuda.to_device(data_A_host)
    dataB = cuda.to_device(data_B_host)
    dataBuff = cuda.to_device(dataBuff_host)
    randomBuff = cuda.to_device(randomBuff_host)
    testThreadsPerBlockMemory[10, 10](dataA, dataB, dataBuff, randomBuff)

    dataB.copy_to_host(data_B_host)
    cuda.synchronize()
    dataBuff.copy_to_host(dataBuff_host)
    cuda.synchronize()
    print('-----data_A_host------')
    print(data_A_host)
    print('-----data_B_host------')
    print(data_B_host)
    print('-----dataBuff_host------')
    print(dataBuff_host)

    for i in range(data_A_host.shape[0]):
        if data_A_host[i] != data_B_host[i]:
            print(data_A_host[i], data_B_host[i])


if __name__ == "__main__":
    doTest()

----print of testThreadsPerBlockMemory()----

-----data_A_host------
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71
 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95
 96 97 98 99]
-----data_B_host------
[ 0 21 32 33 34 35 36 37 38 39  0 21 32 33 34 35 36 37 38 39  0 21 32 33
 34 35 36 37 38 39  0 21 32 33 34 35 36 37 38 39  0 21 32 33 34 35 36 37
 38 39  0 21 32 33 34 35 36 37 38 39  0 21 32 33 34 35 36 37 38 39  0 21
 32 33 34 35 36 37 38 39  0 21 32 33 34 35 36 37 38 39  0 21 32 33 34 35
 36 37 38 39]
-----dataBuff_host------
[[ 0 21 32 33 34 35 36 37 38 39]
 [ 0 21 32 33 34 35 36 37 38 39]
 [ 0 21 32 33 34 35 36 37 38 39]
 [ 0 21 32 33 34 35 36 37 38 39]
 [ 0 21 32 33 34 35 36 37 38 39]
 [ 0 21 32 33 34 35 36 37 38 39]
 [ 0 21 32 33 34 35 36 37 38 39]
 [ 0 21 32 33 34 35 36 37 38 39]
 [ 0 21 32 33 34 35 36 37 38 39]
 [ 0 21 32 33 34 35 36 37 38 39]]
1 21
2 32
3 33
4 34
5 35
6 36
7 37
8 38
9 39
10 0
11 21
12 32
13 33
14 34
15 35
16 36
17 37
18 38
19 39
20 0
22 32
23 33
24 34
25 35
26 36
27 37
28 38
29 39
30 0
31 21
40 0
41 21
42 32
43 33
44 34
45 35
46 36
47 37
48 38
49 39
50 0
51 21
52 32
53 33
54 34
55 35
56 36
57 37
58 38
59 39
60 0
61 21
62 32
63 33
64 34
65 35
66 36
67 37
68 38
69 39
70 0
71 21
72 32
73 33
74 34
75 35
76 36
77 37
78 38
79 39
80 0
81 21
82 32
83 33
84 34
85 35
86 36
87 37
88 38
89 39
90 0
91 21
92 32
93 33
94 34
95 35
96 36
97 37
98 38
99 39

----print of testThreadsPerBlockMemoryNormal()----

-----data_A_host------
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71
 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95
 96 97 98 99]
-----data_B_host------
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71
 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95
 96 97 98 99]
-----dataBuff_host------
[[10 51 32 33 34 35 96 97 98 99]
 [10 51 32 33 34 35 96 97 98 99]
 [10 51 32 33 34 35 96 97 98 99]
 [10 51 32 33 34 35 96 97 98 99]
 [10 51 32 33 34 35 96 97 98 99]
 [10 51 32 33 34 35 96 97 98 99]
 [10 51 32 33 34 35 96 97 98 99]
 [10 51 32 33 34 35 96 97 98 99]
 [10 51 32 33 34 35 96 97 98 99]
 [10 51 32 33 34 35 96 97 98 99]]

I’m now finding it hard to see how the output differs from what you expect - with the new version of the code, what is wrong with the output and what should it be?

It should copy data_A_host to data_B_host; dataBuff is a per-thread cache. I want to know why the structure of testThreadsPerBlockMemory() makes the threads of the same block go out of sync.

I’m not quite sure what you mean by “out of sync” - in what sense are they out of sync?

There are some explanations of how thread blocks execute in “GTC 2022 - How CUDA Programming Works - Stephen Jones, CUDA Architect, NVIDIA” on YouTube, particularly starting around the 15-minute mark, which might help clarify the programming / memory model - does that section of the talk address any of your questions?
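
One point from that talk that may be relevant here: within a kernel there is no implicit barrier between one loop and the next - if every thread of a block must finish one phase before any thread starts the next, that has to be requested explicitly with cuda.syncthreads(). Here is a minimal sketch of that barrier (an illustration of the model only, not necessarily a fix for the race in your kernels; the kernel name and buffer shapes are made up):

from numba import cuda

@cuda.jit
def two_phase(dataA, dataB, scratch):
    tx = cuda.threadIdx.x
    # phase 1: each thread fills its own slot of a per-block scratch buffer
    scratch[tx] = dataA[tx]
    # explicit in-block barrier: no thread in this block continues
    # until every thread of the block has reached this point
    cuda.syncthreads()
    # phase 2: starts only after phase 1 has completed block-wide
    dataB[tx] = scratch[tx]

Note that this barrier only covers the threads of one block - there is no equivalent barrier between different blocks within a single kernel launch.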

Thank you gmarkall. By “out of sync” I mean: I expected that all threads of a block would be synchronized once their execution completed, and only then would the threads of the next block launch, but the results show that the threads within a block are not synchronized like that.
As the video says, “a block has a fixed number of threads which are guaranteed to be running simultaneously on the same SM”. Maybe I have some misunderstanding - I assumed the threads of each block get synchronized once, which is why the differing results confuse me.

Now I have a solution that doesn’t look perfect, but it works; the code is below.
The idea is to launch only one block’s worth of threads at a time, synchronize after each launch, and step through the data block by block in a host-side loop according to the data size.

----code----

import numpy as np
from numba import cuda

@cuda.jit()
def testThreadsPerBlockMemory(dataA, dataB, dataBuff, randomBuff, cuda_block):
    tx = cuda.threadIdx.x
    bx = cuda.blockIdx.x
    bw = cuda.blockDim.x
    bx = bx + cuda_block

    if bx <= dataA.shape[0] // dataBuff.shape[1]:
        for i in range(dataBuff.shape[1]):
            dataBuff[tx][i] = dataA[bx * dataBuff.shape[1] + i]
            # delay begin
            b = 0
            for j in range(randomBuff[bx * dataBuff.shape[1] + i]):
                b += 1
            # delay end
        for i in range(dataBuff.shape[1]):
            dataB[bx * dataBuff.shape[1] + i] = dataBuff[tx][i]

def doTest():
    data_A_host = np.zeros(100, dtype=np.int32)
    data_B_host = np.array(data_A_host, dtype=np.int32)
    dataBuff_host = np.zeros((10, 10), dtype=np.int32)
    randomBuff_host = np.array(data_A_host, dtype=np.int32)
    for i in range(data_A_host.shape[0]):
        data_A_host[i] = i
    for i in range(randomBuff_host.shape[0]):
        randomBuff_host[i] = np.random.randint(1, 100000000)

    dataA = cuda.to_device(data_A_host)
    dataB = cuda.to_device(data_B_host)
    dataBuff = cuda.to_device(dataBuff_host)
    randomBuff = cuda.to_device(randomBuff_host)
    # launch all threads in one block synchronize them,
    # and separate the steps as the blocks into the loop according to the data size
    for i in range(data_A_host.shape[0] // dataBuff_host.shape[1]):
        testThreadsPerBlockMemory[1, dataBuff_host.shape[0]](dataA, dataB, dataBuff, randomBuff, i)
        cuda.synchronize()

    dataB.copy_to_host(data_B_host)
    cuda.synchronize()
    dataBuff.copy_to_host(dataBuff_host)
    cuda.synchronize()
    print('-----data_A_host------')
    print(data_A_host)
    print('-----data_B_host------')
    print(data_B_host)
    print('-----dataBuff_host------')
    print(dataBuff_host)

    for i in range(data_A_host.shape[0]):
        if data_A_host[i] != data_B_host[i]:
            print(data_A_host[i], data_B_host[i])

if __name__ == "__main__":
    doTest()

----print----

-----data_A_host------
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71
 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95
 96 97 98 99]
-----data_B_host------
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71
 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95
 96 97 98 99]
-----dataBuff_host------
[[90 91 92 93 94 95 96 97 98 99]
 [90 91 92 93 94 95 96 97 98 99]
 [90 91 92 93 94 95 96 97 98 99]
 [90 91 92 93 94 95 96 97 98 99]
 [90 91 92 93 94 95 96 97 98 99]
 [90 91 92 93 94 95 96 97 98 99]
 [90 91 92 93 94 95 96 97 98 99]
 [90 91 92 93 94 95 96 97 98 99]
 [90 91 92 93 94 95 96 97 98 99]
 [90 91 92 93 94 95 96 97 98 99]]