I am writing a finite volume code to solve the inviscid, compressible Euler equations. As a part of this, I am performing what is known as the Cauchy-Kovalevskaya process. A code snippet is given below
def cauchyKovalevskaya(drho_x: npt.NDArray[np.float64], dq_x: npt.NDArray[np.float64], dE_x: npt.NDArray[np.float64], k: int, gamma: float) -> npt.NDArray:
if k == 0:
return np.array([drho_x[0], dq_x[0], dE_x[0]], dtype=np.float64)
elif k == 1:
rho_t = -dq_x[1]
q_t = -dq_x[0]**2*drho_x[1]*gamma/(2*drho_x[0]**2) + 3*dq_x[0]**2*drho_x[1]/(2*drho_x[0]**2) + dq_x[0]*dq_x[1]*gamma/drho_x[0] - 3*dq_x[0]*dq_x[1]/drho_x[0] - dE_x[1]*gamma + dE_x[1]
E_t = -dq_x[0]**3*drho_x[1]*gamma/drho_x[0]**3 + dq_x[0]**3*drho_x[1]/drho_x[0]**3 + 3*dq_x[0]**2*dq_x[1]*gamma/(2*drho_x[0]**2) - 3*dq_x[0]**2*dq_x[1]/(2*drho_x[0]**2) - dq_x[0]*dE_x[1]*gamma/drho_x[0] + dq_x[0]*drho_x[1]*dE_x[0]*gamma/drho_x[0]**2 - dq_x[1]*dE_x[0]*gamma/drho_x[0]
return np.array([rho_t, q_t, E_t], dtype=np.float64)
elif k == 2:
rho_tt = dq_x[0]**2*drho_x[2]*gamma/(2*drho_x[0]**2) - 3*dq_x[0]**2*drho_x[2]/(2*drho_x[0]**2) - dq_x[0]**2*drho_x[1]**2*gamma/drho_x[0]**3 + 3*dq_x[0]**2*drho_x[1]**2/drho_x[0]**3 + 2*dq_x[0]*dq_x[1]*drho_x[1]*gamma/drho_x[0]**2 - 6*dq_x[0]*dq_x[1]*drho_x[1]/drho_x[0]**2 - dq_x[0]*dq_x[2]*gamma/drho_x[0] + 3*dq_x[0]*dq_x[2]/drho_x[0] - dq_x[1]**2*gamma/drho_x[0] + 3*dq_x[1]**2/drho_x[0] + dE_x[2]*gamma - dE_x[2]
q_tt = dq_x[0]**3*drho_x[2]*gamma**2/(2*drho_x[0]**3) + dq_x[0]**3*drho_x[2]*gamma/drho_x[0]**3 - 7*dq_x[0]**3*drho_x[2]/(2*drho_x[0]**3) - 3*dq_x[0]**3*drho_x[1]**2*gamma**2/(2*drho_x[0]**4) - 3*dq_x[0]**3*drho_x[1]**2*gamma/drho_x[0]**4 + 21*dq_x[0]**3*drho_x[1]**2/(2*drho_x[0]**4) + 5*dq_x[0]**2*dq_x[1]*drho_x[1]*gamma**2/(2*drho_x[0]**3) + 8*dq_x[0]**2*dq_x[1]*drho_x[1]*gamma/drho_x[0]**3 - 45*dq_x[0]**2*dq_x[1]*drho_x[1]/(2*drho_x[0]**3) - dq_x[0]**2*dq_x[2]*gamma**2/(2*drho_x[0]**2) - 5*dq_x[0]**2*dq_x[2]*gamma/(2*drho_x[0]**2) + 6*dq_x[0]**2*dq_x[2]/drho_x[0]**2 - dq_x[0]*dq_x[1]**2*gamma**2/drho_x[0]**2 - 5*dq_x[0]*dq_x[1]**2*gamma/drho_x[0]**2 + 12*dq_x[0]*dq_x[1]**2/drho_x[0]**2 + 3*dq_x[0]*dE_x[2]*gamma/drho_x[0] - 3*dq_x[0]*dE_x[2]/drho_x[0] - dq_x[0]*drho_x[1]*dE_x[1]*gamma**2/drho_x[0]**2 - 2*dq_x[0]*drho_x[1]*dE_x[1]*gamma/drho_x[0]**2 + 3*dq_x[0]*drho_x[1]*dE_x[1]/drho_x[0]**2 - dq_x[0]*drho_x[2]*dE_x[0]*gamma**2/drho_x[0]**2 + dq_x[0]*drho_x[2]*dE_x[0]*gamma/drho_x[0]**2 + 2*dq_x[0]*drho_x[1]**2*dE_x[0]*gamma**2/drho_x[0]**3 - 2*dq_x[0]*drho_x[1]**2*dE_x[0]*gamma/drho_x[0]**3 + dq_x[1]*dE_x[1]*gamma**2/drho_x[0] + 2*dq_x[1]*dE_x[1]*gamma/drho_x[0] - 3*dq_x[1]*dE_x[1]/drho_x[0] - 2*dq_x[1]*drho_x[1]*dE_x[0]*gamma**2/drho_x[0]**2 + 2*dq_x[1]*drho_x[1]*dE_x[0]*gamma/drho_x[0]**2 + dq_x[2]*dE_x[0]*gamma**2/drho_x[0] - dq_x[2]*dE_x[0]*gamma/drho_x[0]
E_tt = dq_x[0]**4*drho_x[2]*gamma**2/(4*drho_x[0]**4) + 2*dq_x[0]**4*drho_x[2]*gamma/drho_x[0]**4 - 9*dq_x[0]**4*drho_x[2]/(4*drho_x[0]**4) - dq_x[0]**4*drho_x[1]**2*gamma**2/drho_x[0]**5 - 8*dq_x[0]**4*drho_x[1]**2*gamma/drho_x[0]**5 + 9*dq_x[0]**4*drho_x[1]**2/drho_x[0]**5 + dq_x[0]**3*dq_x[1]*drho_x[1]*gamma**2/drho_x[0]**4 + 37*dq_x[0]**3*dq_x[1]*drho_x[1]*gamma/(2*drho_x[0]**4) - 39*dq_x[0]**3*dq_x[1]*drho_x[1]/(2*drho_x[0]**4) - 7*dq_x[0]**3*dq_x[2]*gamma/(2*drho_x[0]**3) + 7*dq_x[0]**3*dq_x[2]/(2*drho_x[0]**3) - 21*dq_x[0]**2*dq_x[1]**2*gamma/(2*drho_x[0]**3) + 21*dq_x[0]**2*dq_x[1]**2/(2*drho_x[0]**3) - dq_x[0]**2*dE_x[2]*gamma**2/(2*drho_x[0]**2) + 3*dq_x[0]**2*dE_x[2]*gamma/drho_x[0]**2 - 3*dq_x[0]**2*dE_x[2]/(2*drho_x[0]**2) + dq_x[0]**2*drho_x[1]*dE_x[1]*gamma**2/(2*drho_x[0]**3) - 15*dq_x[0]**2*drho_x[1]*dE_x[1]*gamma/(2*drho_x[0]**3) + 3*dq_x[0]**2*drho_x[1]*dE_x[1]/drho_x[0]**3 - dq_x[0]**2*drho_x[2]*dE_x[0]*gamma**2/(2*drho_x[0]**3) - 3*dq_x[0]**2*drho_x[2]*dE_x[0]*gamma/(2*drho_x[0]**3) + 3*dq_x[0]**2*drho_x[1]**2*dE_x[0]*gamma**2/(2*drho_x[0]**4) + 9*dq_x[0]**2*drho_x[1]**2*dE_x[0]*gamma/(2*drho_x[0]**4) - dq_x[0]*dq_x[1]*dE_x[1]*gamma**2/drho_x[0]**2 + 8*dq_x[0]*dq_x[1]*dE_x[1]*gamma/drho_x[0]**2 - 3*dq_x[0]*dq_x[1]*dE_x[1]/drho_x[0]**2 - dq_x[0]*dq_x[1]*drho_x[1]*dE_x[0]*gamma**2/drho_x[0]**3 - 7*dq_x[0]*dq_x[1]*drho_x[1]*dE_x[0]*gamma/drho_x[0]**3 + 2*dq_x[0]*dq_x[2]*dE_x[0]*gamma/drho_x[0]**2 + 2*dq_x[1]**2*dE_x[0]*gamma/drho_x[0]**2 + dE_x[0]*dE_x[2]*gamma**2/drho_x[0] - dE_x[0]*dE_x[2]*gamma/drho_x[0] + dE_x[1]**2*gamma**2/drho_x[0] - dE_x[1]**2*gamma/drho_x[0] - drho_x[1]*dE_x[0]*dE_x[1]*gamma**2/drho_x[0]**2 + drho_x[1]*dE_x[0]*dE_x[1]*gamma/drho_x[0]**2
return np.array([rho_tt, q_tt, E_tt], dtype=np.float64)
else:
raise Exception("Invalid order")
There are also statements for k = 3, 4, and 5 which I have excluded for brevity. Suffice to say that these lines become longer and longer (80000+ characters in a line) with loads of similar memory accesses, powers, and multiplications like above.
When I use njit with this code, a) it is slower than without njit (for k = 5 the difference is 500 to 600 times), and b) it gets slower and slower the higher the k I include elif statements for, regardless of which k I am evaluating the function for. As far as I can tell, there should be nothing in the function which makes the performance using Numba so bad.