This is a follow-up question to a question I asked before, but I think I should start over. I am trying to implement a Monte-Carlo simulation of pi, and I am using numba
to improve performance. Since each iteration of the loop is independent of the others, I thought I could get better performance with parallel=True
and numba.prange
. I tried it and got that for small values of n
, the parallelization isn’t worth it. I tried an improved version where I use parallelization after a certain threshold for n
is crossed, but I found it performs worse than my previous attempts most of the time. I now have a comparison of 3 versions of the algorithm: a regular one without parallelization, a parallel version using numba.prange
and an “improved” hybrid version that uses parallelization after a specified threshold for n
is crossed:
from datetime import timedelta
from time import perf_counter
import numba as nb
import numpy as np
import numpy.typing as npt
# JIT options shared by every compiled kernel below.
jit_opts = {
    "nopython": True,
    "nogil": True,
    "cache": False,
    "error_model": "numpy",
    "fastmath": True,
}

# Module-level random generator shared by all simulation entry points.
rng = np.random.default_rng()
@nb.jit(
    [
        # Two eagerly-compiled signatures:
        # float64 (n, 2) points -> (bool mask, int64 count)
        # float32 (n, 2) points -> (bool mask, int32 count)
        nb.types.Tuple((nb.bool_[:], nb.int64))(nb.float64[:, :]),
        nb.types.Tuple((nb.bool_[:], nb.int32))(nb.float32[:, :]),
    ],
    **jit_opts,
    parallel=True,
)
def count_points_in_circle_parallel(
    points: npt.NDArray[float],
) -> tuple[npt.NDArray[bool], int]:
    """Flag and count the points of an (n, 2) array inside the unit circle.

    Parallel variant: the loop is distributed over threads via nb.prange.
    Returns (boolean mask of shape (n,), number of True entries).
    """
    in_circle = np.empty(points.shape[0], dtype=np.bool_)
    in_circle_count = 0
    # numba treats `in_circle_count += in_` as a scalar reduction over the
    # prange loop, so the accumulation is combined safely across threads.
    for i in nb.prange(points.shape[0]):
        # Chained assignment: store the per-point flag and reuse it for the
        # running count in a single expression.
        in_ = in_circle[i] = points[i, 0] ** 2 + points[i, 1] ** 2 < 1
        in_circle_count += in_
    return in_circle, in_circle_count
def monte_carlo_pi_parallel(
    n: int,
) -> tuple[npt.NDArray[float], npt.NDArray[bool], float]:
    """Estimate pi from n uniform points using the parallel counting kernel.

    Returns (sampled points, in-circle mask, pi estimate).
    """
    samples = rng.random((n, 2))
    mask, inside = count_points_in_circle_parallel(samples)
    estimate = 4 * inside / n
    return samples, mask, estimate
@nb.jit(
    [
        nb.types.Tuple((nb.bool_[:], nb.int64))(nb.float64[:, :]),
        nb.types.Tuple((nb.bool_[:], nb.int32))(nb.float32[:, :]),
    ],
    **jit_opts,
    parallel=False,
)
def count_points_in_circle(points: npt.NDArray[float]) -> tuple[npt.NDArray[bool], int]:
    """Serial variant: flag and count the (n, 2) points inside the unit circle.

    Returns (boolean mask of shape (n,), number of True entries).
    """
    n_points = points.shape[0]
    flags = np.empty(n_points, dtype=np.bool_)
    total = 0
    for idx in range(n_points):
        hit = points[idx, 0] ** 2 + points[idx, 1] ** 2 < 1
        flags[idx] = hit
        total += hit
    return flags, total
def monte_carlo_pi(n: int) -> tuple[npt.NDArray[float], npt.NDArray[bool], float]:
    """Estimate pi from n uniform points using the serial counting kernel.

    Returns (sampled points, in-circle mask, pi estimate).
    """
    samples = rng.random((n, 2))
    mask, inside = count_points_in_circle(samples)
    return samples, mask, 4 * inside / n
def count_points_in_circle_improved(
    points: npt.NDArray[float],
) -> tuple[npt.NDArray[bool], int]:
    """Pure-Python template that is compiled twice below (serial + parallel).

    Flags and counts the points of an (n, 2) array inside the unit circle.
    Returns (boolean mask of shape (n,), number of True entries).
    """
    in_circle = np.empty(points.shape[0], dtype=np.bool_)
    in_circle_count = 0
    # nb.prange is used so the parallel=True compilation can distribute the
    # loop; under parallel=False it is treated like a plain range.
    for i in nb.prange(points.shape[0]):
        in_ = in_circle[i] = points[i, 0] ** 2 + points[i, 1] ** 2 < 1
        in_circle_count += in_
    return in_circle, in_circle_count
# Compile the same pure-Python function twice with identical signatures.
# NOTE: order matters — the parallel variant must be created first, because
# the statement after it rebinds the name `count_points_in_circle_improved`
# to the serial compilation.
count_points_in_circle_improved_parallel = nb.jit(
    [
        nb.types.Tuple((nb.bool_[:], nb.int64))(nb.float64[:, :]),
        nb.types.Tuple((nb.bool_[:], nb.int32))(nb.float32[:, :]),
    ],
    **jit_opts,
    parallel=True,
)(count_points_in_circle_improved)
# Rebind the original name to the serial (parallel=False) compilation; the
# uncompiled pure-Python version is no longer reachable after this line.
count_points_in_circle_improved = nb.jit(
    [
        nb.types.Tuple((nb.bool_[:], nb.int64))(nb.float64[:, :]),
        nb.types.Tuple((nb.bool_[:], nb.int32))(nb.float32[:, :]),
    ],
    **jit_opts,
    parallel=False,
)(count_points_in_circle_improved)
def monte_carlo_pi_improved(
    n: int, parallel_threshold: int = 1000
) -> tuple[npt.NDArray[float], npt.NDArray[bool], float]:
    """Hybrid estimator: serial kernel up to the threshold, parallel above it.

    Returns (sampled points, in-circle mask, pi estimate).
    """
    samples = rng.random((n, 2))
    if n > parallel_threshold:
        mask, inside = count_points_in_circle_improved_parallel(samples)
    else:
        mask, inside = count_points_in_circle_improved(samples)
    return samples, mask, 4 * inside / n
def _report_result(n: int, pi_approx: float, duration: float) -> None:
    """Print one benchmark row: sample size, pi estimate, and elapsed time."""
    delta = timedelta(seconds=duration)
    # A zero timedelta is falsy; fall back to the raw seconds in that case.
    elapsed_msg = (
        f"[{delta} (Raw time: {duration} s)]"
        if delta
        else f"[Raw time: {duration} s]"
    )
    print(
        f"n = {n:,}:".ljust(20),
        # \N{...} escapes produce the Unicode pi and almost-equal-to signs.
        f"\N{GREEK SMALL LETTER PI} \N{ALMOST EQUAL TO} {pi_approx}".ljust(20),
        elapsed_msg,
    )


def _run_benchmark(label, simulate, n_values, time_results, column) -> None:
    """Time `simulate(n)` for every n, storing durations in the given column."""
    print(label)
    for i, n in enumerate(n_values):
        start = perf_counter()
        points, in_circle, pi_approx = simulate(n)
        end = perf_counter()
        duration = end - start
        time_results[i, column] = duration
        _report_result(n, pi_approx, duration)
    print()


def main() -> None:
    """Benchmark the three Monte-Carlo pi implementations and rank them."""
    n_values = 10 ** np.arange(1, 9)
    # Duplicate the first n so JIT compilation overhead lands on a throwaway
    # warm-up run instead of distorting the first measurement.
    n_values = np.concatenate(([10], n_values))
    time_results = np.empty((len(n_values), 3), dtype=np.float64)
    if jit_opts.get("cache", False):
        print("Using cached JIT compilation")
    else:
        print("Using JIT compilation without caching")
    print()
    _run_benchmark(
        "Using parallel count_points_in_circle",
        monte_carlo_pi_parallel,
        n_values,
        time_results,
        0,
    )
    _run_benchmark(
        "Using non-parallel count_points_in_circle",
        monte_carlo_pi,
        n_values,
        time_results,
        1,
    )
    _run_benchmark(
        "Improved version:", monte_carlo_pi_improved, n_values, time_results, 2
    )
    print("Comparison:")
    result_types = ("parallel", "non-parallel", "improved")
    for n, res in zip(n_values, time_results):
        # Rank the three implementations from fastest to slowest for this n.
        res_idx = np.argsort(res)
        print(
            f"n = {n:,}:".ljust(20),
            f"{result_types[res_idx[0]]} \N{LESS-THAN OR EQUAL TO} "
            f"{result_types[res_idx[1]]} \N{LESS-THAN OR EQUAL TO} "
            f"{result_types[res_idx[2]]}",
        )


if __name__ == "__main__":
    main()
(I know, I know, this code isn’t very clean and has repetitions, but it’s for testing purposes, and I’ll end up with one of the algorithms in the end). I tried running it with cache=True
and cache=False
to check if it helps with something and the results are very confusing. It looks like sometimes the non-parallel version is faster even for large values of n
and the hybrid version doesn’t really improve anything. Here’s an example of the results I get:
These results are very confusing and they’re not consistent. In a different run, I got that the non-parallel version is faster, and in another run, I got that the parallel version is faster. It looks like I am doing something wrong here, but I can’t understand what’s going on. Why do I not see a consistent performance improvement in the parallel version, especially for large values of n
, and why does my hybrid approach not seem to improve performance in most cases? Any insight into what's happening here would be appreciated.