Tips for performance improvement of my code

sacdevops · November 26, 2022, 10:59am

Hi,

I’m new to the community and two weeks ago I first taught myself Numba for my thesis, which I am programming in Python. Here I came across Numba to improve the runtime of my program. However, I still have a runtime of 2-3 minutes, so I wanted to ask if someone could help me to improve the performance of my program. I’m sure there are some tricks that could be used, or maybe I’m using something incorrectly that could affect performance.

I’m posting all my code below. Feel free to run it. Everything possible is included in the code. There are no additional parts that have not been mentioned or anything.

I intentionally generate the arrays during each run because otherwise, as before, the size of the 3D arrays, if I create everything once beforehand, will require about 16 GB of memory for 2000 simulations alone. With 5000 simulations, my memory is not even enough. That’s why I do the generation at runtime of the simulation, which makes more use of the CPU, but saves the memory.

As the saying goes: memory costs, but time does not

Code:

import matplotlib.pyplot as plt
import numpy as np
from numba import njit, prange
import time as time

t = time.time()

max_iteration = 30
N = 1000
pattern_nums = np.array([40, 60, 80, 100, 120], dtype=np.uint8)
S = 2000
T = np.array([0.01, 0.3], dtype=np.float32)


# calculating the coupling constants for all patterns
@njit(fastmath=True, nogil=True)
def add_coupling(patterns, N):
    return (np.ascontiguousarray(patterns.transpose(1, 0)) @ patterns) / N


# Generating a random spin and pattern_num patterns of the size max(p)
@njit(fastmath=True, nogil=True)
def init_neurons(max_p, N):
    patterns = np.random.choice(np.array([-1, 1], dtype=np.float32), size=(max_p, N))

    spins = np.empty(N, dtype=np.float32)
    n = np.random.randint((N / 2) + 1, N * 0.975)

    L = np.random.choice(np.arange(N + 1), size=n, replace=False)

    for i in range(N):
        spins[i] = patterns[0, i] if i in L else (patterns[0, i] * -1)

    return spins, patterns


@njit(fastmath=True, parallel=True, nogil=True)
def simulate(T, N, S, max_iteration, max_patterns):
    result = np.zeros((len(T), len(max_patterns), 20), dtype=np.float64)

    for r, t in enumerate(T):
        B = 1 / t
        for j, n in enumerate(max_patterns):
            a = n / N

            values = np.zeros(20, dtype=np.float32)
            count = np.zeros(20, dtype=np.float32)

            for i in prange(S):
                spins, patterns = init_neurons(n, N)
                J = add_coupling(patterns, N)

                M = np.zeros((max_iteration, N), dtype=np.float32)
                H = np.zeros((max_iteration, N), dtype=np.float32)
                u = np.zeros(max_iteration, dtype=np.float32)

                H[0] = (8 / B) * spins
                M[0] = np.tanh(B * H[0, :])
                H[1] = (8 / B) * spins

                M0 = np.sum(patterns[0] * np.tanh(B * H[0])) / N

                # Calculate u for theta 0
                u[0] = B * (1 - M0) if B < 10 else ((B ** (a)) * M0)

                y = iterate(H, J, M, a, B, u)

                key = round(M0 * 20)
                values[key] += y
                count[key] += 1

            result[r, j] = [v / c if c > 0 else 0 for v, c in zip(values, count)]

    return result


@njit(fastmath=True, error_model="numpy", nogil=True)
def iterate(H, J, M, a, B, u):
    N = len(J)  # Size of Neurons N
    L = len(H)  # max iterations

    for t in range(1, L - 1):
        M[t] = np.tanh(B * H[t, :])

        if B >= 10:
            q = np.absolute(np.sum(M[t])) / N
            u[t] = (B ** (a)) * q
        else:
            q = np.sum(M[t] ** 2) / N
            u[t] = B * (1 - q)

        x = (a * u[t]) / (1 - u[t - 1])

        for i in range(N):
            H[t + 1, i] = (
                (J[i, :] @ M[t, :]) - (a * M[t, i]) - (u[t] * H[t, i]) - x * M[t - 1, i]
            )
        H[t + 1, :] /= 1 - u[t]

        if np.mean(np.absolute(H[t + 1, :] - H[t, :])) < 0.000001:
            return 1
    return 0


Y1, Y2 = [], []
result = simulate(T, N, S, max_iteration, pattern_nums)
Y1 = result[0]
Y2 = result[1]

print(
    f"Generated {len(pattern_nums)} Hopfield Models and iterated each with T1 = 0.01 and T2 = 0.3 in {time.time() - t}s"
)


plt.subplot(1, 2, 1)
for i in range(len(Y1)):
    plt.plot(np.arange(0, 1, 0.05), Y1[i], label=pattern_nums[i])
plt.legend()
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.subplot(1, 2, 2)
for i in range(len(Y2)):
    plt.plot(np.arange(0, 1, 0.05), Y2[i], label=pattern_nums[i])
plt.legend()
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.show()

Thanks for your help.

SACDevOps

sacdevops · November 26, 2022, 1:30pm

Okay, I was able to decrease the runtime to nearly ~80s.

Here the new code:

import matplotlib.pyplot as plt
import numpy as np
from numba import njit, prange
import time as time

t = time.time()

max_iteration = 30
N = 1000
pattern_nums = np.array([40, 60, 80, 100, 120], dtype=np.uint8)
S = 1000
T = np.array([0.01, 0.3], dtype=np.float32)


# calculating the coupling constants for all patterns
@njit(fastmath=True, nogil=True)
def add_coupling(patterns, N):
    return (np.ascontiguousarray(patterns.transpose(1, 0)) @ patterns) / N


# Generating a random spin and pattern_num patterns of the size max(p)
@njit(fastmath=True, nogil=True)
def init_neurons(max_p, N):
    patterns = np.random.choice(np.array([-1, 1], dtype=np.float32), size=(max_p, N))

    spins = np.copy(patterns[0]) * -1

    L = np.random.choice(
        np.arange(N + 1), size=np.random.randint((N / 2) + 1, N * 0.975), replace=False
    )

    for i in L:
        spins[i] = patterns[0, i]

    return spins, patterns


@njit(fastmath=True, parallel=True, nogil=True)
def simulate(T, N, S, max_iteration, max_patterns):
    result = np.zeros((len(T), len(max_patterns), 20), dtype=np.float64)

    for r, t in enumerate(T):
        B = 1 / t
        for j, n in enumerate(max_patterns):
            a = n / N

            values = np.zeros(20, dtype=np.float32)
            count = np.zeros(20, dtype=np.float32)

            for i in prange(S):
                spins, patterns = init_neurons(n, N)
                J = add_coupling(patterns, N)

                M = np.zeros((max_iteration, N), dtype=np.float32)
                H = np.zeros((max_iteration, N), dtype=np.float32)
                u = np.zeros(max_iteration, dtype=np.float32)

                H[0] = (8 / B) * spins
                M[0] = np.tanh(B * H[0, :])
                H[1] = (8 / B) * spins

                M0 = np.sum(patterns[0] * np.tanh(B * H[0])) / N

                # Calculate u for theta 0
                u[0] = B * (1 - M0) if B < 10 else ((B ** (a)) * M0)

                y = iterate(H, J, M, a, B, u)

                key = round(M0 * 20)
                values[key] += y
                count[key] += 1

            result[r, j] = [v / c if c > 0 else 0 for v, c in zip(values, count)]

    return result


@njit(fastmath=True, error_model="numpy", nogil=True)
def iterate(H, J, M, a, B, u):
    N = len(J)  # Size of Neurons N
    L = len(H)  # max iterations

    for t in range(1, L - 1):
        M[t] = np.tanh(B * H[t, :])

        if B >= 10:
            q = np.absolute(np.sum(M[t])) / N
            u[t] = (B ** (a)) * q
        else:
            q = np.sum(M[t] ** 2) / N
            u[t] = B * (1 - q)

        x = (a * u[t]) / (1 - u[t - 1])

        for i in range(N):
            H[t + 1, i] = (
                (J[i, :] @ M[t, :]) - (a * M[t, i]) - (u[t] * H[t, i]) - x * M[t - 1, i]
            )
        H[t + 1, :] /= 1 - u[t]

        if np.mean(np.absolute(H[t + 1, :] - H[t, :])) < 0.000001:
            return 1
    return 0


Y1, Y2 = [], []
result = simulate(T, N, S, max_iteration, pattern_nums)
Y1 = result[0]
Y2 = result[1]

print(
    f"Generated {len(pattern_nums)} Hopfield Models and iterated each with T1 = 0.01 and T2 = 0.3 in {time.time() - t}s"
)


plt.subplot(1, 2, 1)
for i in range(len(Y1)):
    plt.plot(np.arange(0, 1, 0.05), Y1[i], label=pattern_nums[i])
plt.legend()
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.subplot(1, 2, 2)
for i in range(len(Y2)):
    plt.plot(np.arange(0, 1, 0.05), Y2[i], label=pattern_nums[i])
plt.legend()
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.show()

sschaer · November 26, 2022, 1:51pm

Hi @sacdevops

You can eventually get rid of the repeated memory allocation by doing this:

import numba as nb
from numba.np.ufunc.parallel import _iget_thread_id, _iget_num_threads

@nb.njit(parallel=True)
def foo():  
    nthreads = _iget_num_threads()
    
    aa = np.empty((nthreads, 42))
    
    for i in nb.prange(2_000):
        j = _iget_thread_id()
        a = aa[j]
        ...

I only had a very quick look at your code. Definitely make sure it does not change the behaviour of your program.

sacdevops · November 26, 2022, 2:31pm

HI @sschaer

thanks for your solution, but how I can use it? I didn’t understand which line and method it can be helpfully.

sschaer · November 26, 2022, 4:57pm

@sacdevops

I’m sorry for being unclear. Since you already mentioned the memory allocation issue, I thought you would understand where to apply it.

Below is your code with some minor changes and comments (see #NOTE). Most of this won`t have much effect on performance, but it may help you become aware of some common pitfalls of high level languages. The changes I made are very general. I haven’t looked at what your program does (nor could I without spending too much time).
Also, I can’t guarantee that it didn’t change the result of your program. Please check it carefully.

Your program takes about 10 seconds to compile, which is a lot compared to the runtime. You should consider caching it with njit(cache=True, ...).

Finally, I suspect that your second version only became faster because you reduced S.

Hope this helps.

# calculating the coupling constants for all patterns
@njit(fastmath=True, nogil=True)
def add_coupling(patterns, N):
    return (np.ascontiguousarray(patterns.transpose(1, 0)) @ patterns) / N


# Generating a random spin and pattern_num patterns of the size max(p)
@njit(fastmath=True, nogil=True)
def init_neurons(max_p, N):
    patterns = np.random.choice(np.array([-1, 1], dtype=np.float32), size=(max_p, N))

    # NOTE: The assignment + operation already create a new array
    # spins = np.copy(patterns[0]) * -1
    spins = patterns[0] * -1

    L = np.random.choice(
        np.arange(N + 1), size=np.random.randint((N / 2) + 1, N * 0.975), replace=False
    )

    for i in L:
        spins[i] = patterns[0, i]

    return spins, patterns


@njit(fastmath=True, error_model="numpy", nogil=True)
def iterate(H, J, M, a, B, u):
    N = len(J)  # Size of Neurons N
    L = len(H)  # max iterations

    for t in range(1, L - 1):
        M[t] = np.tanh(B * H[t, :])

        if B >= 10:
            q = np.absolute(np.sum(M[t])) / N
            u[t] = (B ** (a)) * q
        else:
            q = np.sum(M[t] ** 2) / N
            u[t] = B * (1 - q)

        x = (a * u[t]) / (1 - u[t - 1])

        for i in range(N):
            H[t + 1, i] = (
                (J[i, :] @ M[t, :]) - (a * M[t, i]) - (u[t] * H[t, i]) - x * M[t - 1, i]
            )
        H[t + 1, :] /= 1 - u[t]

        if np.mean(np.absolute(H[t + 1, :] - H[t, :])) < 0.000001:
            return 1
    return 0


@njit(fastmath=True, parallel=True, nogil=True)
def simulate(T, N, S, max_iteration, max_patterns):
    # NOTE: Use single precision as everywhere else. You won't get higher 
    # precision anyway.  Also allocate an empty array.
    # result = np.zeros((len(T), len(max_patterns), 20), dtype=np.float64)
    result = np.empty((len(T), len(max_patterns), 20), dtype=np.float32)

    # NOTE: Pre-allocate seperate empty array for each thread 
    nthreads = _iget_num_threads()
    MM = np.empty((nthreads, max_iteration, N), dtype=np.float32)
    HH = np.empty((nthreads, max_iteration, N), dtype=np.float32)
    uu = np.empty((nthreads, max_iteration), dtype=np.float32)
                
    for r, t in enumerate(T):
        B = 1 / t
        for j, n in enumerate(max_patterns):
            a = n / N

            # NOTE: I leave those because they are small and not 
            # allocated very often
            values = np.zeros(20, dtype=np.float32)
            count = np.zeros(20, dtype=np.float32)

            for i in prange(S):
                spins, patterns = init_neurons(n, N)
                J = add_coupling(patterns, N)

                # NOTE: Each thread gets its own piece of memory
                # M = np.zeros((max_iteration, N), dtype=np.float32)
                # H = np.zeros((max_iteration, N), dtype=np.float32)
                # u = np.zeros(max_iteration, dtype=np.float32)
                tid = _iget_thread_id()
                M = MM[tid]
                H = HH[tid]
                u = uu[tid]

                H[0] = (8 / B) * spins
                M[0] = np.tanh(B * H[0, :])
                H[1] = (8 / B) * spins

                M0 = np.sum(patterns[0] * np.tanh(B * H[0])) / N

                # Calculate u for theta 0
                u[0] = B * (1 - M0) if B < 10 else ((B ** (a)) * M0)

                y = iterate(H, J, M, a, B, u)

                key = round(M0 * 20)
                values[key] += y
                count[key] += 1

            # NOTE: Your assignment first creates an array and then copies over but
            # we can directly write to result
            # result[r, j] = [v / c if c > 0 else 0 for v, c in zip(values, count)]
            for i, c in enumerate(count):
                if c > 0:
                    result[r, j, i] = values[i] / c
                else:
                    result[r, j, i] = 0

    return result

sacdevops · January 5, 2023, 2:26pm

Hi,

thanks for your solution. I changed a bit my code and got better performance also, but was not able to do all the tips that u give me.

Here are my new code:

import matplotlib.pyplot as plt
import numpy as np
from numba import njit, prange, vectorize
import time as time

t = time.time()

MAX_ITER = np.uint8(30)
N = 1000
P_NUMS = np.array([40, 60, 80, 100, 120], np.uint8)
S = np.uint16(1000)
T = np.array([0.01, 0.3], np.float32)


# Generating a random spin and pattern_num patterns of the size max(p)
@njit(fastmath=True, nogil=True, cache=True)
def init_neurons(max_p, N):
    patterns = np.random.choice(np.array([-1, 1], np.float32), (max_p, N))

    J = (np.ascontiguousarray(patterns.transpose(1, 0)).dot(patterns)) / N

    L = np.random.choice(np.arange(N), np.random.randint(N * 0.5, N * 0.975), False)

    spins = patterns[0] * -1
    spins[L] *= -1

    return spins, patterns, J


@njit(fastmath=True, parallel=True, nogil=True, cache=True)
def simulate(T, N, S, L, max_patterns):
    B = 1 / T

    result = np.empty((len(T), len(max_patterns), 20), dtype=np.float32)

    for j in range(len(max_patterns)):
        a = np.float32(max_patterns[j] / N)

        values_tap = np.zeros((len(T), 20), np.uint16)
        count_tap = np.zeros((len(T), 20), np.uint16)

        # t2 = time.time()

        for s in prange(S):
            spins, patterns, J = init_neurons(max_patterns[j], N)

            # TAP
            for i in range(len(T)):
                c1, M0 = iterate_tap(a, B[i], L, spins, patterns[0], J)

                key = round(M0 * 20)
                values_tap[i, key] += c1
                count_tap[i, key] += 1

        result[:, j] = values_tap / count_tap

    return result


@njit(fastmath=True, error_model="numpy", nogil=True, cache=True)
def iterate_tap(a, B, L, spins, pattern, J):
    N = len(spins)  # Size of Neurons N

    M = np.empty((L, N), np.float32)
    H = np.empty((L, N), np.float32)
    u = np.empty(L, np.float32)

    H[0] = H[1] = (8 / B) * spins
    M[0] = np.tanh(B * H[0])

    M0 = (pattern * M[0]).mean()

    # Calculate u for theta 0
    u[0] = B * (1 - M0)

    for t in range(1, L - 1):
        M[t] = np.tanh(B * H[t])

        u[t] = B * (1 - (M[t] ** 2).mean())

        H[t + 1] = (
            J.dot(M[t]) - a * M[t] - u[t] * (H[t] - (a * M[t - 1] / (1 - u[t - 1])))
        ) / (1 - u[t])

        if np.abs(H[t + 1] - H[t]).mean() < 0.000001:
            return 1, M0
    return 0, M0


Y1, Y2 = [], []
result = simulate(T, N, S, MAX_ITER, P_NUMS)
Y1 = result[0]
Y2 = result[1]

print(f"Generated {len(P_NUMS)} Hopfield Models in {time.time() - t}s")


plt.subplot(1, 2, 1)
for i in range(len(Y1)):
    plt.plot(np.arange(0, 1, 0.05), Y1[i], label=P_NUMS[i])
plt.legend()
plt.xlim([0, 1])
plt.ylim([0, 1])

plt.subplot(1, 2, 2)
for i in range(len(Y2)):
    plt.plot(np.arange(0, 1, 0.05), Y2[i], label=P_NUMS[i])
plt.legend()
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.show()

Maybe a numba vectorize or cuda version would perform better? But I don’t know how I can do this. Any tips continue?

Edit: Changed a bit the code.

Thanks,
SACDevOPs

Topic		Replies	Views
Performance question / issue on particular problem Support: How do I do ...?	19	1098	January 20, 2021
How could i best optimize this with numba Support: How do I do ...?	9	594	May 4, 2021
Help with Optimisation Numba	1	178	December 1, 2023
Feedback on tips for first-timers Community Support	14	832	August 15, 2023
Dictionary performance Community Support	2	736	November 28, 2022

Tips for performance improvement of my code

Related Topics