Seeking alternative ways to set multiple items at a time

I’m trying to implement this function to make it work in jit. It runs about 0.0000341 with n=52, k=2

def combinations_py(n, k):
    a = np.ones((k, n - k + 1), dtype=np.int64)
    a[0] = np.arange(n - k + 1)
    for j in range(1, k):
        reps = (n - k + j) - a[j - 1]
        a = np.repeat(a, reps, axis=1)
        ind = np.add.accumulate(reps)
        a[j, ind[:-1]] = 1 - reps[1:]
        a[j, 0] = j
        a[j] = np.add.accumulate(a[j])

    return a

And this one runs about 0.0000075

@njit(cache=True)
def combinations(n, k):
    a = np.ones((k, n - k + 1), dtype=np.int64)
    a[0] = np.arange(n - k + 1)
    for j in range(1, k):
        reps = (n - k + j) - a[j - 1]
        size = np.sum(reps)
        n_a = np.empty((k, size), dtype=np.int64)
        for i in range(k):
            n_a[i] = np.repeat(a[i, :], reps)

        a = n_a
        ind = np.cumsum(reps)
        for m, l in np.nditer((ind[:-1], reps[1:])):
            a[j, m] = 1 - l

        a[j, 0] = j
        a[j] = np.cumsum(a[j])

    return a

But if I’m turning off jit, it is dropping to 0.0001369.
Replacing back these multiple itemsets resulting 0.0000492.

a[j, ind[:-1]] = 1 - reps[1:]

So the question is how to effectively replace this part of code because I think using nditer is the wrong way.

My timing and checking of the results produced by the code:

import numpy as np
from numba import njit

def combinations_py(n, k):
    a = np.ones((k, n - k + 1), dtype=np.int64)
    a[0] = np.arange(n - k + 1)
    for j in range(1, k):
        reps = (n - k + j) - a[j - 1]
        a = np.repeat(a, reps, axis=1)
        ind = np.add.accumulate(reps)
        a[j, ind[:-1]] = 1 - reps[1:]
        a[j, 0] = j
        a[j] = np.add.accumulate(a[j])

    return a

@njit(cache=True)
def combinations(n, k):
    a = np.ones((k, n - k + 1), dtype=np.int64)
    a[0] = np.arange(n - k + 1)
    for j in range(1, k):
        reps = (n - k + j) - a[j - 1]
        size = np.sum(reps)
        n_a = np.empty((k, size), dtype=np.int64)
        for i in range(k):
            n_a[i] = np.repeat(a[i, :], reps)

        a = n_a
        ind = np.cumsum(reps)
        for m, l in np.nditer((ind[:-1], reps[1:])):
            a[j, m] = 1 - l

        a[j, 0] = j
        a[j] = np.cumsum(a[j])

    return a

n = 52
k = 2

py_result = combinations_py(n, k)
c_result = combinations(n, k)

%timeit combinations_py(n, k)
%timeit combinations(n, k)

np.testing.assert_equal(py_result, c_result)

Gives:

$ ipython repro.ipy 
18.8 µs ± 337 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
4.45 µs ± 7.6 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)

So the question is how to effectively replace this part of code because I think using nditer is the wrong way.

The code is functioning correctly (the assert does not fail), and the performance of the jitted function is better than that of the native function - can you explain why you think that using nditer is wrong?