I originally had some code that operates on very large arrays using for
loops. I wanted to see whether I could speed it up with numpy and numba,
so I tried 4 incremental steps to make it faster.
Setup:
<code>from timeit import timeit
import numpy.typing as npt
FRAME_MASKS = np.ascontiguousarray(
np.random.randint(0, 2, (TIMESTAMPS, HEIGHT, WIDTH), dtype=bool)
REGION_MASKS = np.ascontiguousarray(
np.random.randint(0, 2, (REGIONS, HEIGHT, WIDTH), dtype=bool)
REGION_AREAS = np.sum(REGION_MASKS, axis=(-1, -2))
# OLFR = "Overlap of Frames with Regions"
# Run once to compile the code (although this is not trivial either...)
olfr_native_for_loops(*test_params)
olfr_fully_vectorized(*test_params)
olfr_with_numba(*test_params)
olfr_all_flags_numba(*test_params)
test_params = (FRAME_MASKS, REGION_MASKS, REGION_AREAS)
["for_loops", lambda: olfr_native_for_loops(*test_params)],
["fully_vectorized", lambda: olfr_fully_vectorized(*test_params)],
["with_numba", lambda: olfr_with_numba(*test_params)],
["all_flags_numba", lambda: olfr_all_flags_numba(*test_params)],
speed = timeit(func, number=5)
print(f"Done. {prefix} speed={speed:.4f}")
<code>from timeit import timeit
import numba as nb
import numpy as np
import numpy.typing as npt
TIMESTAMPS = 100
REGIONS = 5
HEIGHT = 240
WIDTH = 320
np.random.seed(0)
FRAME_MASKS = np.ascontiguousarray(
np.random.randint(0, 2, (TIMESTAMPS, HEIGHT, WIDTH), dtype=bool)
)
REGION_MASKS = np.ascontiguousarray(
np.random.randint(0, 2, (REGIONS, HEIGHT, WIDTH), dtype=bool)
)
REGION_AREAS = np.sum(REGION_MASKS, axis=(-1, -2))
# OLFR = "Overlap of Frames with Regions"
# Run once to compile the code (although this is not trivial either...)
olfr_native_for_loops(*test_params)
olfr_fully_vectorized(*test_params)
olfr_with_numba(*test_params)
olfr_all_flags_numba(*test_params)
...
# Timing
test_params = (FRAME_MASKS, REGION_MASKS, REGION_AREAS)
for prefix, func in [
["for_loops", lambda: olfr_native_for_loops(*test_params)],
["fully_vectorized", lambda: olfr_fully_vectorized(*test_params)],
["with_numba", lambda: olfr_with_numba(*test_params)],
["all_flags_numba", lambda: olfr_all_flags_numba(*test_params)],
]:
speed = timeit(func, number=5)
print(f"Done. {prefix} speed={speed:.4f}")
</code>
from timeit import timeit
import numba as nb
import numpy as np
import numpy.typing as npt

# Benchmark dimensions: 100 boolean frames and 5 region masks, each 240x320.
TIMESTAMPS = 100
REGIONS = 5
HEIGHT = 240
WIDTH = 320

np.random.seed(0)  # deterministic masks so all timing runs see the same data
FRAME_MASKS = np.ascontiguousarray(
    np.random.randint(0, 2, (TIMESTAMPS, HEIGHT, WIDTH), dtype=bool)
)
REGION_MASKS = np.ascontiguousarray(
    np.random.randint(0, 2, (REGIONS, HEIGHT, WIDTH), dtype=bool)
)
# Pixel count of each region: the denominator of every overlap ratio.
REGION_AREAS = np.sum(REGION_MASKS, axis=(-1, -2))

# OLFR = "Overlap of Frames with Regions"
# BUG FIX: test_params was originally defined AFTER the warm-up calls below,
# which raised a NameError; it must be bound before first use.
test_params = (FRAME_MASKS, REGION_MASKS, REGION_AREAS)

# Run each implementation once so the timed loop below does not include
# Numba's JIT compilation time (although this is not trivial either...).
olfr_native_for_loops(*test_params)
olfr_fully_vectorized(*test_params)
olfr_with_numba(*test_params)
olfr_all_flags_numba(*test_params)
...
# Timing: 5 repetitions of each implementation.
for prefix, func in [
    ["for_loops", lambda: olfr_native_for_loops(*test_params)],
    ["fully_vectorized", lambda: olfr_fully_vectorized(*test_params)],
    ["with_numba", lambda: olfr_with_numba(*test_params)],
    ["all_flags_numba", lambda: olfr_all_flags_numba(*test_params)],
]:
    speed = timeit(func, number=5)
    print(f"Done. {prefix} speed={speed:.4f}")
- Native for loops
<code>def olfr_native_for_loops(
frame_masks: npt.ArrayLike, # T x H x W
region_masks: npt.ArrayLike, # R x H x W
region_areas: npt.ArrayLike, # R
ratios = np.zeros((TIMESTAMPS, REGIONS), dtype=np.int32)
for t in range(TIMESTAMPS):
raw_intersection = frame_masks[t] & region_masks[r]
intersection_area = np.sum(raw_intersection, axis=(-1, -2))
ratios[t, r] = intersection_area / region_areas[r]
<code>def olfr_native_for_loops(
frame_masks: npt.ArrayLike, # T x H x W
region_masks: npt.ArrayLike, # R x H x W
region_areas: npt.ArrayLike, # R
):
ratios = np.zeros((TIMESTAMPS, REGIONS), dtype=np.int32)
for t in range(TIMESTAMPS):
for r in range(REGIONS):
raw_intersection = frame_masks[t] & region_masks[r]
intersection_area = np.sum(raw_intersection, axis=(-1, -2))
ratios[t, r] = intersection_area / region_areas[r]
return ratios
</code>
def olfr_native_for_loops(
    frame_masks: npt.ArrayLike,  # T x H x W boolean frame masks
    region_masks: npt.ArrayLike,  # R x H x W boolean region masks
    region_areas: npt.ArrayLike,  # R — pixel count of each region
):
    """Compute the overlap ratio of every frame with every region.

    Returns a T x R float array where entry (t, r) is
    |frame_t AND region_r| / |region_r|.
    """
    # Derive sizes from the inputs instead of module-level globals so the
    # function works for arrays of any shape, not just the benchmark's.
    timestamps = frame_masks.shape[0]
    regions = region_masks.shape[0]
    # BUG FIX: the original allocated this with dtype=np.int32, which
    # truncated every ratio (a float <= 1) to zero; the vectorized
    # variants return floats, so this must too.
    ratios = np.zeros((timestamps, regions), dtype=np.float64)
    for t in range(timestamps):
        for r in range(regions):
            raw_intersection = frame_masks[t] & region_masks[r]
            intersection_area = np.sum(raw_intersection, axis=(-1, -2))
            ratios[t, r] = intersection_area / region_areas[r]
    return ratios
<code>Done. for_loops speed=0.0792
<code>Done. for_loops speed=0.0792
</code>
Done. for_loops speed=0.0792
It already has some vectorization (as I learned from StackOverflow question 38760898, "np.newaxis with numba nopython"), but the for loops are easy to vectorize, so I tried that next.
- Fully vectorized
<code>def olfr_fully_vectorized(
frame_masks: npt.ArrayLike, # T x H x W
region_masks: npt.ArrayLike, # R x H x W
region_areas: npt.ArrayLike, # R
raw_intersection = frame_masks[:, np.newaxis, ...] & region_masks[np.newaxis]
intersection_area = np.sum(raw_intersection, axis=(-1, -2))
ratios = intersection_area / region_areas
<code>def olfr_fully_vectorized(
frame_masks: npt.ArrayLike, # T x H x W
region_masks: npt.ArrayLike, # R x H x W
region_areas: npt.ArrayLike, # R
):
raw_intersection = frame_masks[:, np.newaxis, ...] & region_masks[np.newaxis]
intersection_area = np.sum(raw_intersection, axis=(-1, -2))
ratios = intersection_area / region_areas
return ratios
</code>
def olfr_fully_vectorized(
    frame_masks: npt.ArrayLike,  # T x H x W boolean frame masks
    region_masks: npt.ArrayLike,  # R x H x W boolean region masks
    region_areas: npt.ArrayLike,  # R — pixel count of each region
):
    """Fully vectorized overlap ratios.

    Returns a T x R float array where entry (t, r) is
    |frame_t AND region_r| / |region_r|.

    NOTE(review): this materializes a T x R x H x W boolean temporary,
    which can be large and cache-unfriendly for big inputs.
    """
    # Broadcast frames against regions to T x R x H x W, AND elementwise,
    # then reduce over the two spatial axes.
    raw_intersection = frame_masks[:, np.newaxis, ...] & region_masks[np.newaxis]
    intersection_area = np.sum(raw_intersection, axis=(-1, -2))
    # Broadcasting divides every row of the T x R counts by the R areas.
    ratios = intersection_area / region_areas
    return ratios
<code>Done. fully_vectorized speed=0.0808
<code>Done. fully_vectorized speed=0.0808
</code>
Done. fully_vectorized speed=0.0808
This is great, because it is clean, but it got slower. As mentioned in StackOverflow – 38760898, this isn’t too surprising because the array is huge so it is probably getting evicted from the memory cache more easily.
So I want a solution that allows me to write vectorized code, but get the performance benefits of smaller array sizes that don’t get evicted from the cache.
In my production example, I saw that CPU usage was half of what the "native python" solution had, so I assumed BLAS was not able to schedule enough threads to parallelize these operations (a separate issue), and I turned to numba.
- Regular numba
frame_masks: npt.ArrayLike, # T x H x W
region_masks: npt.ArrayLike, # R x H x W
region_areas: npt.ArrayLike, # R
raw_intersection = frame_masks.reshape(TIMESTAMPS, 1, -1) & region_masks.reshape(
intersection_area = np.sum(raw_intersection, axis=-1)
result = intersection_area / region_areas
<code>@nb.njit
def olfr_with_numba(
frame_masks: npt.ArrayLike, # T x H x W
region_masks: npt.ArrayLike, # R x H x W
region_areas: npt.ArrayLike, # R
):
raw_intersection = frame_masks.reshape(TIMESTAMPS, 1, -1) & region_masks.reshape(
1, REGIONS, -1
)
intersection_area = np.sum(raw_intersection, axis=-1)
result = intersection_area / region_areas
return result
</code>
@nb.njit
def olfr_with_numba(
    frame_masks: npt.ArrayLike,  # T x H x W boolean frame masks
    region_masks: npt.ArrayLike,  # R x H x W boolean region masks
    region_areas: npt.ArrayLike,  # R — pixel count of each region
):
    """Numba-compiled overlap ratios (same math as the vectorized version).

    The masks are flattened so the reduction runs over a single trailing
    axis, which Numba's np.sum supports.
    """
    # Derive sizes from the inputs: Numba treats module-level globals as
    # compile-time constants, so using TIMESTAMPS/REGIONS here would bake
    # the benchmark sizes into the compiled code and silently break for
    # any other input shape.
    timestamps = frame_masks.shape[0]
    regions = region_masks.shape[0]
    raw_intersection = frame_masks.reshape(timestamps, 1, -1) & region_masks.reshape(
        1, regions, -1
    )
    intersection_area = np.sum(raw_intersection, axis=-1)
    result = intersection_area / region_areas
    return result
<code>Done. with_numba speed=0.7177
<code>Done. with_numba speed=0.7177
</code>
Done. with_numba speed=0.7177
Okay, now it got even slower. Fine — I read more docs and learned that numba is not great with big arrays (StackOverflow answer 57819913), and that I should enable numba's optimization flags.
- numba with flags
<code>@nb.njit(parallel=True, fastmath=True, nogil=True, cache=True)
def olfr_all_flags_numba(
frame_masks: npt.ArrayLike, # T x H x W
region_masks: npt.ArrayLike, # R x H x W
region_areas: npt.ArrayLike, # R
raw_intersection = frame_masks.reshape(TIMESTAMPS, 1, -1) & region_masks.reshape(
intersection_area = np.sum(raw_intersection, axis=-1)
result = intersection_area / region_areas
<code>@nb.njit(parallel=True, fastmath=True, nogil=True, cache=True)
def olfr_all_flags_numba(
frame_masks: npt.ArrayLike, # T x H x W
region_masks: npt.ArrayLike, # R x H x W
region_areas: npt.ArrayLike, # R
):
raw_intersection = frame_masks.reshape(TIMESTAMPS, 1, -1) & region_masks.reshape(
1, REGIONS, -1
)
intersection_area = np.sum(raw_intersection, axis=-1)
result = intersection_area / region_areas
return result
</code>
@nb.njit(parallel=True, fastmath=True, nogil=True, cache=True)
def olfr_all_flags_numba(
    frame_masks: npt.ArrayLike,  # T x H x W boolean frame masks
    region_masks: npt.ArrayLike,  # R x H x W boolean region masks
    region_areas: npt.ArrayLike,  # R — pixel count of each region
):
    """Parallel Numba overlap ratios.

    Returns the same T x R float array as the other variants, but uses an
    explicit nb.prange loop over timestamps so parallel=True has a loop to
    parallelize, and accumulates each intersection count directly instead
    of materializing the T x R x (H*W) boolean temporary.
    """
    # Sizes come from the inputs, not globals: Numba bakes globals in as
    # compile-time constants.
    timestamps = frame_masks.shape[0]
    regions = region_masks.shape[0]
    pixels = frame_masks.shape[1] * frame_masks.shape[2]
    frames_flat = frame_masks.reshape(timestamps, pixels)
    regions_flat = region_masks.reshape(regions, pixels)
    result = np.empty((timestamps, regions), dtype=np.float64)
    # prange distributes the outer loop across threads; each (t, r) cell
    # is written by exactly one thread, so no synchronization is needed.
    for t in nb.prange(timestamps):
        for r in range(regions):
            count = 0
            for p in range(pixels):
                if frames_flat[t, p] and regions_flat[r, p]:
                    count += 1
            result[t, r] = count / region_areas[r]
    return result
<code>Done. all_flags_numba speed=0.3640
<code>Done. all_flags_numba speed=0.3640
</code>
Done. all_flags_numba speed=0.3640
That helped, but we’re still worse than where we started.
In my production code I am able to make it faster by using multiprocessing.pool.ThreadPool and sending out threads for all my slow "huge vectorization" operations — but why can't BLAS/numba do that for me automatically?
Thank you in advance!