I originally had some code that operates on very large arrays using for
loops. I wanted to see whether I could speed it up with numpy and numba,
so I tried 4 incremental steps to make it faster.
Setup:
<code>from timeit import timeit
import numpy.typing as npt
FRAME_MASKS = np.ascontiguousarray(
np.random.randint(0, 2, (TIMESTAMPS, HEIGHT, WIDTH), dtype=bool)
REGION_MASKS = np.ascontiguousarray(
np.random.randint(0, 2, (REGIONS, HEIGHT, WIDTH), dtype=bool)
REGION_AREAS = np.sum(REGION_MASKS, axis=(-1, -2))
# OLFR = "Overlap of Frames with Regions"
# Run once to compile the code (although this is not trivial either...)
olfr_native_for_loops(*test_params)
olfr_fully_vectorized(*test_params)
olfr_with_numba(*test_params)
olfr_all_flags_numba(*test_params)
test_params = (FRAME_MASKS, REGION_MASKS, REGION_AREAS)
["for_loops", lambda: olfr_native_for_loops(*test_params)],
["fully_vectorized", lambda: olfr_fully_vectorized(*test_params)],
["with_numba", lambda: olfr_with_numba(*test_params)],
["all_flags_numba", lambda: olfr_all_flags_numba(*test_params)],
speed = timeit(func, number=5)
print(f"Done. {prefix} speed={speed:.4f}")
<code>from timeit import timeit
import numba as nb
import numpy as np
import numpy.typing as npt
TIMESTAMPS = 100
REGIONS = 5
HEIGHT = 240
WIDTH = 320
np.random.seed(0)
FRAME_MASKS = np.ascontiguousarray(
np.random.randint(0, 2, (TIMESTAMPS, HEIGHT, WIDTH), dtype=bool)
)
REGION_MASKS = np.ascontiguousarray(
np.random.randint(0, 2, (REGIONS, HEIGHT, WIDTH), dtype=bool)
)
REGION_AREAS = np.sum(REGION_MASKS, axis=(-1, -2))
# OLFR = "Overlap of Frames with Regions"
# Run once to compile the code (although this is not trivial either...)
olfr_native_for_loops(*test_params)
olfr_fully_vectorized(*test_params)
olfr_with_numba(*test_params)
olfr_all_flags_numba(*test_params)
...
# Timing
test_params = (FRAME_MASKS, REGION_MASKS, REGION_AREAS)
for prefix, func in [
["for_loops", lambda: olfr_native_for_loops(*test_params)],
["fully_vectorized", lambda: olfr_fully_vectorized(*test_params)],
["with_numba", lambda: olfr_with_numba(*test_params)],
["all_flags_numba", lambda: olfr_all_flags_numba(*test_params)],
]:
speed = timeit(func, number=5)
print(f"Done. {prefix} speed={speed:.4f}")
</code>
from timeit import timeit
import numba as nb
import numpy as np
import numpy.typing as npt

# Benchmark dimensions: 100 boolean frames and 5 region masks, each 240x320.
TIMESTAMPS = 100
REGIONS = 5
HEIGHT = 240
WIDTH = 320

np.random.seed(0)  # deterministic masks so all timing runs see the same data
FRAME_MASKS = np.ascontiguousarray(
    np.random.randint(0, 2, (TIMESTAMPS, HEIGHT, WIDTH), dtype=bool)
)
REGION_MASKS = np.ascontiguousarray(
    np.random.randint(0, 2, (REGIONS, HEIGHT, WIDTH), dtype=bool)
)
# Pixel count of each region: the denominator of every overlap ratio.
REGION_AREAS = np.sum(REGION_MASKS, axis=(-1, -2))

# OLFR = "Overlap of Frames with Regions"
# BUG FIX: test_params was originally defined AFTER the warm-up calls below,
# which raised a NameError; it must be bound before first use.
test_params = (FRAME_MASKS, REGION_MASKS, REGION_AREAS)

# Run each implementation once so the timed loop below does not include
# Numba's JIT compilation time (although this is not trivial either...).
olfr_native_for_loops(*test_params)
olfr_fully_vectorized(*test_params)
olfr_with_numba(*test_params)
olfr_all_flags_numba(*test_params)
...
# Timing: 5 repetitions of each implementation.
for prefix, func in [
    ["for_loops", lambda: olfr_native_for_loops(*test_params)],
    ["fully_vectorized", lambda: olfr_fully_vectorized(*test_params)],
    ["with_numba", lambda: olfr_with_numba(*test_params)],
    ["all_flags_numba", lambda: olfr_all_flags_numba(*test_params)],
]:
    speed = timeit(func, number=5)
    print(f"Done. {prefix} speed={speed:.4f}")
- Native for loops
<code>def olfr_native_for_loops(
frame_masks: npt.ArrayLike, # T x H x W
region_masks: npt.ArrayLike, # R x H x W
region_areas: npt.ArrayLike, # R
ratios = np.zeros((TIMESTAMPS, REGIONS), dtype=np.int32)
for t in range(TIMESTAMPS):
raw_intersection = frame_masks[t] & region_masks[r]
intersection_area = np.sum(raw_intersection, axis=(-1, -2))
ratios[t, r] = intersection_area / region_areas[r]
<code>def olfr_native_for_loops(
frame_masks: npt.ArrayLike, # T x H x W
region_masks: npt.ArrayLike, # R x H x W
region_areas: npt.ArrayLike, # R
):
ratios = np.zeros((TIMESTAMPS, REGIONS), dtype=np.int32)
for t in range(TIMESTAMPS):
for r in range(REGIONS):
raw_intersection = frame_masks[t] & region_masks[r]
intersection_area = np.sum(raw_intersection, axis=(-1, -2))
ratios[t, r] = intersection_area / region_areas[r]
return ratios
</code>
def olfr_native_for_loops(
    frame_masks: npt.ArrayLike,  # T x H x W boolean frame masks
    region_masks: npt.ArrayLike,  # R x H x W boolean region masks
    region_areas: npt.ArrayLike,  # R — pixel count of each region
):
    """Compute the overlap ratio of every frame with every region.

    Returns a T x R float array where entry (t, r) is
    |frame_t AND region_r| / |region_r|.
    """
    # Derive sizes from the inputs instead of module-level globals so the
    # function works for arrays of any shape, not just the benchmark's.
    timestamps = frame_masks.shape[0]
    regions = region_masks.shape[0]
    # BUG FIX: the original allocated this with dtype=np.int32, which
    # truncated every ratio (a float <= 1) to zero; the vectorized
    # variants return floats, so this must too.
    ratios = np.zeros((timestamps, regions), dtype=np.float64)
    for t in range(timestamps):
        for r in range(regions):
            raw_intersection = frame_masks[t] & region_masks[r]
            intersection_area = np.sum(raw_intersection, axis=(-1, -2))
            ratios[t, r] = intersection_area / region_areas[r]
    return ratios
<code>Done. for_loops speed=0.0792
<code>Done. for_loops speed=0.0792
</code>
Done. for_loops speed=0.0792
It already has some vectorization (as I learned from StackOverflow question 38760898, "np.newaxis with numba nopython"), but the for loops are easy to vectorize, so I tried that next.
- Fully vectorized
<code>def olfr_fully_vectorized(
frame_masks: npt.ArrayLike, # T x H x W
region_masks: npt.ArrayLike, # R x H x W
region_areas: npt.ArrayLike, # R
raw_intersection = frame_masks[:, np.newaxis, ...] & region_masks[np.newaxis]
intersection_area = np.sum(raw_intersection, axis=(-1, -2))
ratios = intersection_area / region_areas
<code>def olfr_fully_vectorized(
frame_masks: npt.ArrayLike, # T x H x W
region_masks: npt.ArrayLike, # R x H x W
region_areas: npt.ArrayLike, # R
):
raw_intersection = frame_masks[:, np.newaxis, ...] & region_masks[np.newaxis]
intersection_area = np.sum(raw_intersection, axis=(-1, -2))
ratios = intersection_area / region_areas
return ratios
</code>
def olfr_fully_vectorized(
    frame_masks: npt.ArrayLike,  # T x H x W boolean frame masks
    region_masks: npt.ArrayLike,  # R x H x W boolean region masks
    region_areas: npt.ArrayLike,  # R — pixel count of each region
):
    """Fully vectorized overlap ratios.

    Returns a T x R float array where entry (t, r) is
    |frame_t AND region_r| / |region_r|.

    NOTE(review): this materializes a T x R x H x W boolean temporary,
    which can be large and cache-unfriendly for big inputs.
    """
    # Broadcast frames against regions to T x R x H x W, AND elementwise,
    # then reduce over the two spatial axes.
    raw_intersection = frame_masks[:, np.newaxis, ...] & region_masks[np.newaxis]
    intersection_area = np.sum(raw_intersection, axis=(-1, -2))
    # Broadcasting divides every row of the T x R counts by the R areas.
    ratios = intersection_area / region_areas
    return ratios
<code>Done. fully_vectorized speed=0.0808
<code>Done. fully_vectorized speed=0.0808
</code>
Done. fully_vectorized speed=0.0808
This is great, because it is clean, but it got slower. As mentioned in StackOverflow – 38760898, this isn’t too surprising because the array is huge so it is probably getting evicted from the memory cache more easily.
So I want a solution that allows me to write vectorized code, but get the performance benefits of smaller array sizes that don’t get evicted from the cache.
In my production example, I saw that CPU usage was half of what the "native python" solution had, so I assumed BLAS was not able to schedule enough threads to parallelize these operations (a separate issue), and I turned to numba.
- Regular numba
frame_masks: npt.ArrayLike, # T x H x W
region_masks: npt.ArrayLike, # R x H x W
region_areas: npt.ArrayLike, # R
raw_intersection = frame_masks.reshape(TIMESTAMPS, 1, -1) & region_masks.reshape(
intersection_area = np.sum(raw_intersection, axis=-1)
result = intersection_area / region_areas
<code>@nb.njit
def olfr_with_numba(
frame_masks: npt.ArrayLike, # T x H x W
region_masks: npt.ArrayLike, # R x H x W
region_areas: npt.ArrayLike, # R
):
raw_intersection = frame_masks.reshape(TIMESTAMPS, 1, -1) & region_masks.reshape(
1, REGIONS, -1
)
intersection_area = np.sum(raw_intersection, axis=-1)
result = intersection_area / region_areas
return result
</code>
@nb.njit
def olfr_with_numba(
    frame_masks: npt.ArrayLike,  # T x H x W boolean frame masks
    region_masks: npt.ArrayLike,  # R x H x W boolean region masks
    region_areas: npt.ArrayLike,  # R — pixel count of each region
):
    """Numba-compiled overlap ratios (same math as the vectorized version).

    The masks are flattened so the reduction runs over a single trailing
    axis, which Numba's np.sum supports.
    """
    # Derive sizes from the inputs: Numba treats module-level globals as
    # compile-time constants, so using TIMESTAMPS/REGIONS here would bake
    # the benchmark sizes into the compiled code and silently break for
    # any other input shape.
    timestamps = frame_masks.shape[0]
    regions = region_masks.shape[0]
    raw_intersection = frame_masks.reshape(timestamps, 1, -1) & region_masks.reshape(
        1, regions, -1
    )
    intersection_area = np.sum(raw_intersection, axis=-1)
    result = intersection_area / region_areas
    return result
<code>Done. with_numba speed=0.7177
<code>Done. with_numba speed=0.7177
</code>
Done. with_numba speed=0.7177
Okay, now it got even slower. Fine — I read more docs and learned that numba is not great with big arrays (StackOverflow answer 57819913), and that I should enable numba's optimization flags.
- numba with flags
<code>@nb.njit(parallel=True, fastmath=True, nogil=True, cache=True)
def olfr_all_flags_numba(
frame_masks: npt.ArrayLike, # T x H x W
region_masks: npt.ArrayLike, # R x H x W
region_areas: npt.ArrayLike, # R
raw_intersection = frame_masks.reshape(TIMESTAMPS, 1, -1) & region_masks.reshape(
intersection_area = np.sum(raw_intersection, axis=-1)
result = intersection_area / region_areas
<code>@nb.njit(parallel=True, fastmath=True, nogil=True, cache=True)
def olfr_all_flags_numba(
frame_masks: npt.ArrayLike, # T x H x W
region_masks: npt.ArrayLike, # R x H x W
region_areas: npt.ArrayLike, # R
):
raw_intersection = frame_masks.reshape(TIMESTAMPS, 1, -1) & region_masks.reshape(
1, REGIONS, -1
)
intersection_area = np.sum(raw_intersection, axis=-1)
result = intersection_area / region_areas
return result
</code>
@nb.njit(parallel=True, fastmath=True, nogil=True, cache=True)
def olfr_all_flags_numba(
    frame_masks: npt.ArrayLike,  # T x H x W boolean frame masks
    region_masks: npt.ArrayLike,  # R x H x W boolean region masks
    region_areas: npt.ArrayLike,  # R — pixel count of each region
):
    """Parallel Numba overlap ratios.

    Returns the same T x R float array as the other variants, but uses an
    explicit nb.prange loop over timestamps so parallel=True has a loop to
    parallelize, and accumulates each intersection count directly instead
    of materializing the T x R x (H*W) boolean temporary.
    """
    # Sizes come from the inputs, not globals: Numba bakes globals in as
    # compile-time constants.
    timestamps = frame_masks.shape[0]
    regions = region_masks.shape[0]
    pixels = frame_masks.shape[1] * frame_masks.shape[2]
    frames_flat = frame_masks.reshape(timestamps, pixels)
    regions_flat = region_masks.reshape(regions, pixels)
    result = np.empty((timestamps, regions), dtype=np.float64)
    # prange distributes the outer loop across threads; each (t, r) cell
    # is written by exactly one thread, so no synchronization is needed.
    for t in nb.prange(timestamps):
        for r in range(regions):
            count = 0
            for p in range(pixels):
                if frames_flat[t, p] and regions_flat[r, p]:
                    count += 1
            result[t, r] = count / region_areas[r]
    return result
<code>Done. all_flags_numba speed=0.3640
<code>Done. all_flags_numba speed=0.3640
</code>
Done. all_flags_numba speed=0.3640
That helped, but we’re still worse than where we started.
In my production code I am able to make it faster by using multiprocessing.pool.ThreadPool and sending out threads for all my slow "huge vectorization" operations — but why can't BLAS/numba do that for me automatically?
Thank you in advance!