This is a follow-up question to a question I asked before, but I think I should start over. I am trying to implement a Monte-Carlo simulation of pi, and I am using numba
to improve performance. Since each iteration of the loop is independent of the others, I thought I could get better performance with parallel=True
and numba.prange
. I tried it and got that for small values of n
, the parallelization isn’t worth it. I tried an improved version where I use parallelization after a certain threshold for n
is crossed, but I found it performs worse than my previous attempts most of the time. I now have a comparison of 3 versions of the algorithm: a regular one without parallelization, a parallel version using numba.prange
and an “improved” hybrid version that uses parallelization after a specified threshold for n
is crossed:
from datetime import timedelta
from time import perf_counter
import numba as nb
import numpy as np
import numpy.typing as npt
# JIT options shared by every compiled kernel below.
jit_opts = {
    "nopython": True,
    "nogil": True,
    "cache": False,
    "error_model": "numpy",
    "fastmath": True,
}

# Module-level random generator shared by all simulation entry points.
rng = np.random.default_rng()
@nb.jit(
    [
        # Two eagerly-compiled signatures:
        # float64 (n, 2) points -> (bool mask, int64 count)
        # float32 (n, 2) points -> (bool mask, int32 count)
        nb.types.Tuple((nb.bool_[:], nb.int64))(nb.float64[:, :]),
        nb.types.Tuple((nb.bool_[:], nb.int32))(nb.float32[:, :]),
    ],
    **jit_opts,
    parallel=True,
)
def count_points_in_circle_parallel(
    points: npt.NDArray[float],
) -> tuple[npt.NDArray[bool], int]:
    """Flag and count the points of an (n, 2) array inside the unit circle.

    Parallel variant: the loop is distributed over threads via nb.prange.
    Returns (boolean mask of shape (n,), number of True entries).
    """
    in_circle = np.empty(points.shape[0], dtype=np.bool_)
    in_circle_count = 0
    # numba treats `in_circle_count += in_` as a scalar reduction over the
    # prange loop, so the accumulation is combined safely across threads.
    for i in nb.prange(points.shape[0]):
        # Chained assignment: store the per-point flag and reuse it for the
        # running count in a single expression.
        in_ = in_circle[i] = points[i, 0] ** 2 + points[i, 1] ** 2 < 1
        in_circle_count += in_
    return in_circle, in_circle_count
def monte_carlo_pi_parallel(
    n: int,
) -> tuple[npt.NDArray[float], npt.NDArray[bool], float]:
    """Estimate pi from n uniform points using the parallel counting kernel.

    Returns (sampled points, in-circle mask, pi estimate).
    """
    samples = rng.random((n, 2))
    mask, inside = count_points_in_circle_parallel(samples)
    estimate = 4 * inside / n
    return samples, mask, estimate
@nb.jit(
    [
        nb.types.Tuple((nb.bool_[:], nb.int64))(nb.float64[:, :]),
        nb.types.Tuple((nb.bool_[:], nb.int32))(nb.float32[:, :]),
    ],
    **jit_opts,
    parallel=False,
)
def count_points_in_circle(points: npt.NDArray[float]) -> tuple[npt.NDArray[bool], int]:
    """Serial variant: flag and count the (n, 2) points inside the unit circle.

    Returns (boolean mask of shape (n,), number of True entries).
    """
    n_points = points.shape[0]
    flags = np.empty(n_points, dtype=np.bool_)
    total = 0
    for idx in range(n_points):
        hit = points[idx, 0] ** 2 + points[idx, 1] ** 2 < 1
        flags[idx] = hit
        total += hit
    return flags, total
def monte_carlo_pi(n: int) -> tuple[npt.NDArray[float], npt.NDArray[bool], float]:
    """Estimate pi from n uniform points using the serial counting kernel.

    Returns (sampled points, in-circle mask, pi estimate).
    """
    samples = rng.random((n, 2))
    mask, inside = count_points_in_circle(samples)
    return samples, mask, 4 * inside / n
def count_points_in_circle_improved(
    points: npt.NDArray[float],
) -> tuple[npt.NDArray[bool], int]:
    """Pure-Python template that is compiled twice below (serial + parallel).

    Flags and counts the points of an (n, 2) array inside the unit circle.
    Returns (boolean mask of shape (n,), number of True entries).
    """
    in_circle = np.empty(points.shape[0], dtype=np.bool_)
    in_circle_count = 0
    # nb.prange is used so the parallel=True compilation can distribute the
    # loop; under parallel=False it is treated like a plain range.
    for i in nb.prange(points.shape[0]):
        in_ = in_circle[i] = points[i, 0] ** 2 + points[i, 1] ** 2 < 1
        in_circle_count += in_
    return in_circle, in_circle_count
# Compile the same pure-Python function twice with identical signatures.
# NOTE: order matters — the parallel variant must be created first, because
# the statement after it rebinds the name `count_points_in_circle_improved`
# to the serial compilation.
count_points_in_circle_improved_parallel = nb.jit(
    [
        nb.types.Tuple((nb.bool_[:], nb.int64))(nb.float64[:, :]),
        nb.types.Tuple((nb.bool_[:], nb.int32))(nb.float32[:, :]),
    ],
    **jit_opts,
    parallel=True,
)(count_points_in_circle_improved)
# Rebind the original name to the serial (parallel=False) compilation; the
# uncompiled pure-Python version is no longer reachable after this line.
count_points_in_circle_improved = nb.jit(
    [
        nb.types.Tuple((nb.bool_[:], nb.int64))(nb.float64[:, :]),
        nb.types.Tuple((nb.bool_[:], nb.int32))(nb.float32[:, :]),
    ],
    **jit_opts,
    parallel=False,
)(count_points_in_circle_improved)
def monte_carlo_pi_improved(
    n: int, parallel_threshold: int = 1000
) -> tuple[npt.NDArray[float], npt.NDArray[bool], float]:
    """Hybrid estimator: serial kernel up to the threshold, parallel above it.

    Returns (sampled points, in-circle mask, pi estimate).
    """
    samples = rng.random((n, 2))
    if n > parallel_threshold:
        mask, inside = count_points_in_circle_improved_parallel(samples)
    else:
        mask, inside = count_points_in_circle_improved(samples)
    return samples, mask, 4 * inside / n
def _report_result(n: int, pi_approx: float, duration: float) -> None:
    """Print one benchmark row: sample size, pi estimate, and elapsed time."""
    delta = timedelta(seconds=duration)
    # A zero timedelta is falsy; fall back to the raw seconds in that case.
    elapsed_msg = (
        f"[{delta} (Raw time: {duration} s)]"
        if delta
        else f"[Raw time: {duration} s]"
    )
    print(
        f"n = {n:,}:".ljust(20),
        # \N{...} escapes produce the Unicode pi and almost-equal-to signs.
        f"\N{GREEK SMALL LETTER PI} \N{ALMOST EQUAL TO} {pi_approx}".ljust(20),
        elapsed_msg,
    )


def _run_benchmark(label, simulate, n_values, time_results, column) -> None:
    """Time `simulate(n)` for every n, storing durations in the given column."""
    print(label)
    for i, n in enumerate(n_values):
        start = perf_counter()
        points, in_circle, pi_approx = simulate(n)
        end = perf_counter()
        duration = end - start
        time_results[i, column] = duration
        _report_result(n, pi_approx, duration)
    print()


def main() -> None:
    """Benchmark the three Monte-Carlo pi implementations and rank them."""
    n_values = 10 ** np.arange(1, 9)
    # Duplicate the first n so JIT compilation overhead lands on a throwaway
    # warm-up run instead of distorting the first measurement.
    n_values = np.concatenate(([10], n_values))
    time_results = np.empty((len(n_values), 3), dtype=np.float64)
    if jit_opts.get("cache", False):
        print("Using cached JIT compilation")
    else:
        print("Using JIT compilation without caching")
    print()
    _run_benchmark(
        "Using parallel count_points_in_circle",
        monte_carlo_pi_parallel,
        n_values,
        time_results,
        0,
    )
    _run_benchmark(
        "Using non-parallel count_points_in_circle",
        monte_carlo_pi,
        n_values,
        time_results,
        1,
    )
    _run_benchmark(
        "Improved version:", monte_carlo_pi_improved, n_values, time_results, 2
    )
    print("Comparison:")
    result_types = ("parallel", "non-parallel", "improved")
    for n, res in zip(n_values, time_results):
        # Rank the three implementations from fastest to slowest for this n.
        res_idx = np.argsort(res)
        print(
            f"n = {n:,}:".ljust(20),
            f"{result_types[res_idx[0]]} \N{LESS-THAN OR EQUAL TO} "
            f"{result_types[res_idx[1]]} \N{LESS-THAN OR EQUAL TO} "
            f"{result_types[res_idx[2]]}",
        )


if __name__ == "__main__":
    main()
(I know, I know, this code isn’t very clean and has repetitions, but it’s for testing purposes, and I’ll end up with one of the algorithms in the end). I tried running it with cache=True
and cache=False
to check if it helps with something and the results are very confusing. It looks like sometimes the non-parallel version is faster even for large values of n
and the hybrid version doesn’t really improve anything. Here’s an example of the results I get:
These results are very confusing and they’re not consistent. In a different run, I got that the non-parallel version is faster, and in another run, I got that the parallel version is faster. It looks like I am doing something wrong here, but I can’t understand what’s going on. Why do I not see a consistent performance improvement in the parallel version, especially for large values of n
, and why does my hybrid approach not seem to improve performance in most cases? Any insight into what's happening here would be appreciated.