I am using the MoveNet MultiPose pose estimation model. In a dedicated file that only carries out the pose estimation and nothing else, the keypoint scaling is accurate.
However, in the Flask application, when I apply the model the keypoints are not scaled correctly and look "squished". With this in mind, would it be better to keep the code for the model separate and somehow pass the input from my app.py file to the model (see the sketch after the code below for what I mean)? Or is there a way to implement MoveNet MultiPose directly in my app.py file? The part of the code concerning the keypoints is below:
import os

import cv2
import numpy as np
import tensorflow as tf

def draw_keypoints(frame, keypoints, confidence_threshold, original_frame_size):
    y, x = original_frame_size  # Unpack the height and width
    shaped = np.squeeze(np.multiply(keypoints, [y, x, 1]))
    for kp in shaped:
        ky, kx, kp_conf = kp
        if kp_conf > confidence_threshold:
            # Scale keypoint coordinates to match resized frame size
            scaled_kx = int(kx * frame.shape[1] / x)
            scaled_ky = int(ky * frame.shape[0] / y)
            cv2.circle(frame, (scaled_kx, scaled_ky), 6, (0, 255, 0), -1)
def draw_connections(frame, keypoints, edges, confidence_threshold, original_frame_size):
    y, x = original_frame_size  # Unpack the height and width
    shaped = np.squeeze(np.multiply(keypoints, [y, x, 1]))
    for edge, color in edges.items():
        p1, p2 = edge
        y1, x1, c1 = shaped[p1]
        y2, x2, c2 = shaped[p2]
        if (c1 > confidence_threshold) and (c2 > confidence_threshold):
            # Scale keypoint coordinates to match resized frame size
            scaled_x1 = int(x1 * frame.shape[1] / x)
            scaled_y1 = int(y1 * frame.shape[0] / y)
            scaled_x2 = int(x2 * frame.shape[1] / x)
            scaled_y2 = int(y2 * frame.shape[0] / y)
            cv2.line(frame, (scaled_x1, scaled_y1), (scaled_x2, scaled_y2), (0, 0, 255), 4)
def process_video(video_path):
    cap = cv2.VideoCapture(video_path)
    frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps = int(cap.get(cv2.CAP_PROP_FPS))

    # Create a VideoWriter object to save the processed frames as a video
    processed_video_path = os.path.splitext(video_path)[0] + '_out.mp4'
    out = cv2.VideoWriter(processed_video_path, cv2.VideoWriter_fourcc(*'mp4v'), fps, (frame_width, frame_height))

    # Process each frame of the video
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        original_frame_size = frame.shape[:2]  # Original (height, width)
        # Resize frame to match the input size expected by the model
        # (note: cv2.resize takes (width, height), so this is 384 wide by 640 tall)
        input_frame = cv2.resize(frame, (384, 640))
        # Perform pose estimation on the frame
        # (movenet is the MultiPose model, loaded elsewhere in the file)
        results = movenet(tf.cast(tf.expand_dims(input_frame, axis=0), dtype=tf.int32))
        keypoints_with_scores = results['output_0'].numpy()[:, :, :51].reshape((6, 17, 3))
        # Draw keypoints and connections on the original frame
        draw_connections(frame, keypoints_with_scores[0], EDGES, 0.1, original_frame_size)
        draw_keypoints(frame, keypoints_with_scores[0], 0.1, original_frame_size)
        # Write the processed frame to the output video
        out.write(frame)
        # Debugging: print the processed frame shape
        print("Processed frame shape:", frame.shape)

    # Release the video capture object and the output video writer
    cap.release()
    out.release()
    return processed_video_path
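
For reference, this is roughly what I mean by keeping the model separate. It is only a minimal sketch, assuming the video arrives as a file upload; the module name pose_model.py, the /upload route, the form field name, and the uploads folder are placeholders, not my actual app:

# pose_model.py (hypothetical module) -- load the model once and expose process_video
import tensorflow_hub as hub

# Load MoveNet MultiPose a single time, at import
movenet = hub.load("https://tfhub.dev/google/movenet/multipose/lightning/1").signatures['serving_default']

def process_video(video_path):
    ...  # the process_video code shown above

# app.py (hypothetical) -- import the module and call it from a route
import os
from flask import Flask, request
from pose_model import process_video

app = Flask(__name__)

@app.route('/upload', methods=['POST'])
def upload():
    video = request.files['video']  # uploaded file (placeholder field name)
    video_path = os.path.join('uploads', video.filename)
    video.save(video_path)
    return process_video(video_path)  # path of the processed output video

The idea is that the model would load once at import instead of once per request.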