Cuda Kernal for Row wise historgram

Hi,
I am trying to write a Cuda kernel to generate a row-wise histogram based on input feature set (2 x 6) where each feature row (each having 6 features) is to generate a histogram having nbins=10.
I have implemented the below code but it doesn’t seem to generate the correct row wise histogram.

import numba
import numpy as np
from numba import cuda

np.random.seed(0)
feature = np.random.randint(1, high=6, size=(2,6), dtype=int)
output = np.zeros(20).astype(np.float32).reshape(2,10)

### Kernal Configuration
threads_per_block = 6
blocks = 2

# moving data to device
d_feature = cuda.to_device(feature)
d_output = cuda.to_device(output)
feature_size = d_feature.shape[1]
@cuda.jit
def row_wise_histogram(feature, output, n):
    xmin = np.float32(-4.0)
    xmax = np.float32(4.0)
    idx = cuda.grid(1)
    nbins = 10
    bin_width = (xmax - xmin) / nbins
    for i in range(n):
        # Each thread will take all the row features to generate historgram
        input = feature[idx][i]
        bin_number = np.int32(nbins * (np.float32(input) - np.float32(xmin)) / (np.float32(xmax) - np.float32(xmin)))
        if bin_number >= 0 and bin_number < output.shape[1]:
            cuda.atomic.add(output[idx], bin_number, 1)

row_wise_histogram[blocks, threads_per_block](d_feature, d_output, feature_size)

print(d_output.copy_to_host())

[[ 0.  0.   0.   0.   0.  0.  81111.  81111.  0.  0.]
 [ 0.  0.  0.   0.   0.   0. 162222.  0.  81111. 0.]]

Will appreciate it if I can get help with the issue inside the row_wise_historgram function!