I’m trying to detect multi hand gestures using mediapipe. I want to detect both the gestures of both hands independently. Both hands can have the same gesture or different gestures. I the given code the function print_result
is printing the the contents of the object after the inference has been run on the frame. the max_num_hands
parameter has been set to 2
here with mp_hands.Hands(max_num_hands=2, min_detection_confidence=0.5) as hands:
import cv2
import mediapipe as mp
import time
cap = cv2.VideoCapture(1)
BaseOptions = mp.tasks.BaseOptions
GestureRecognizer = mp.tasks.vision.GestureRecognizer
GestureRecognizerOptions = mp.tasks.vision.GestureRecognizerOptions
GestureRecognizerResult = mp.tasks.vision.GestureRecognizerResult
VisionRunningMode = mp.tasks.vision.RunningMode
# Callback function to print gesture recognition results
def print_result(result: GestureRecognizerResult, output_image: mp.Image, timestamp_ms: int):
if result.gestures:
# Get the category name of the recognized gesture
category_name = result.gestures[0][0].category_name
# print(category_name)
print(result)
else:
print("No gestures recognized")
# Initialize MediaPipe drawing utils and hands module
mp_drawing = mp.solutions.drawing_utils
mp_hands = mp.solutions.hands
# Configure options for the gesture recognizer
options = GestureRecognizerOptions(
base_options=BaseOptions(model_asset_path='C:\Users\golut\OneDrive\Documents\Projects\Virtual Mouse\models\gesture_recognizer.task'),
running_mode=VisionRunningMode.LIVE_STREAM,
result_callback=print_result
)
# Create a gesture recognizer instance
with GestureRecognizer.create_from_options(options) as recognizer:
print('Gesture recognizer created')
while True:
success, img = cap.read()
if not success:
print("Ignoring empty camera frame.")
continue
# Convert BGR image to RGB for MediaPipe processing
rgb_img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
# Detect hand landmarks using MediaPipe Hands
with mp_hands.Hands(max_num_hands=2, min_detection_confidence=0.5) as hands:
results = hands.process(rgb_img)
if results.multi_hand_landmarks:
for hand_landmarks in results.multi_hand_landmarks:
# Draw hand landmarks on the image with specified color and thickness
mp_drawing.draw_landmarks(
img, hand_landmarks, mp_hands.HAND_CONNECTIONS,
mp_drawing.DrawingSpec(color=(0, 255, 0), thickness=2, circle_radius=2),
mp_drawing.DrawingSpec(color=(0, 0, 255), thickness=2, circle_radius=2)
)
# Prepare image for gesture recognition
mp_image = mp.Image(image_format=mp.ImageFormat.SRGB, data=rgb_img)
current_time_ms = int(time.time() * 1000)
# Perform gesture recognition on the processed image
detected_gestures = recognizer.recognize_async(mp_image, current_time_ms)
img = cv2.flip(img, 1) # Flips the image horizontally
cv2.imshow("Imshow", img)
if cv2.waitKey(10) == ord('q'):
break
cap.release()
cv2.destroyAllWindows()
In the object GestureRecognizerResult
we see a list handedness
that contains the category_name
which is either left or right.
The problem is the gesture recognizer only gives one output either left or right hand in the output depending on which hand got detected first and the latter is ignored. In mediapipe’s given try on example, both hands when shown to the camera with different gesture are recognized independently. Link to mediapipe demo
GestureRecognizerResult(gestures=[[Category(index=-1, score=0.7995390892028809,
display_name='', category_name='Open_Palm')]], handedness=[[Category(index=0, score=0.9178019165992737, display_name='Right', category_name='Right')]],
hand_landmarks=[[NormalizedLandmark(x=0.23192565143108368, y=0.8508237600326538, z=3.7175095712882467e-07, visibility=0.0, presence=0.0),
NormalizedLandmark(x=0.2964465022087097, y=0.807819128036499, z=-0.02174699306488037, visibility=0.0, presence=0.0),
NormalizedLandmark(x=0.3386477530002594, y=0.7381684184074402, z=-0.026875635609030724, visibility=0.0, presence=0.0),
NormalizedLandmark(x=0.3652242422103882, y=0.6717657446861267, z=-0.03148443624377251, visibility=0.0, presence=0.0),
NormalizedLandmark(x=0.39171433448791504, y=0.627888560295105, z=-0.03597773239016533, visibility=0.0, presence=0.0),
NormalizedLandmark(x=0.30005523562431335, y=0.6441321969032288, z=-0.002747688442468643, visibility=0.0, presence=0.0),
NormalizedLandmark(x=0.3194928765296936, y=0.5634738802909851, z=-0.015889683738350868, visibility=0.0, presence=0.0),
NormalizedLandmark(x=0.3276906907558441, y=0.5102080702781677, z=-0.0299211535602808, visibility=0.0, presence=0.0),
NormalizedLandmark(x=0.33434727787971497, y=0.46343517303466797, z=-0.04088740795850754, visibility=0.0, presence=0.0),
NormalizedLandmark(x=0.2615800201892853, y=0.6335919499397278, z=-0.002842121757566929, visibility=0.0, presence=0.0),
NormalizedLandmark(x=0.26276978850364685, y=0.5426733493804932, z=-0.014345655217766762, visibility=0.0, presence=0.0),
NormalizedLandmark(x=0.2621628940105438, y=0.48378312587738037, z=-0.028536789119243622, visibility=0.0, presence=0.0),
NormalizedLandmark(x=0.26235222816467285, y=0.43310630321502686, z=-0.03940063342452049, visibility=0.0, presence=0.0),
NormalizedLandmark(x=0.22592493891716003, y=0.6417601108551025, z=-0.006861940026283264, visibility=0.0, presence=0.0),
NormalizedLandmark(x=0.2230750024318695, y=0.5614591240882874, z=-0.01952073909342289, visibility=0.0, presence=0.0),
NormalizedLandmark(x=0.22449643909931183, y=0.5094373822212219, z=-0.029860520735383034, visibility=0.0, presence=0.0),
NormalizedLandmark(x=0.229284405708313, y=0.46403464674949646, z=-0.03746004030108452, visibility=0.0, presence=0.0),
NormalizedLandmark(x=0.19173786044120789, y=0.663299024105072, z=-0.0136506836861372, visibility=0.0, presence=0.0),
NormalizedLandmark(x=0.18222525715827942, y=0.604834794998169, z=-0.025881653651595116, visibility=0.0, presence=0.0),
NormalizedLandmark(x=0.18415895104408264, y=0.5673394799232483, z=-0.03144041821360588, visibility=0.0, presence=0.0),
NormalizedLandmark(x=0.19118154048919678, y=0.5324922800064087, z=-0.034897807985544205, visibility=0.0, presence=0.0)]],
hand_world_landmarks=[[Landmark(x=-0.012245522812008858, y=0.09203963726758957, z=-0.0038926522247493267, visibility=0.0, presence=0.0),
Landmark(x=0.021369636058807373, y=0.06962162256240845, z=-0.009559692814946175, visibility=0.0, presence=0.0),
Landmark(x=0.042654991149902344, y=0.04227661341428757, z=-0.012077674269676208, visibility=0.0, presence=0.0),
Landmark(x=0.0617685541510582, y=0.014768477529287338, z=-0.011491118930280209, visibility=0.0, presence=0.0),
Landmark(x=0.07398916780948639, y=-0.012367911636829376, z=-0.0075836945325136185, visibility=0.0, presence=0.0),
Landmark(x=0.025482138618826866, y=-0.0010876771993935108, z=0.006445789244025946, visibility=0.0, presence=0.0),
Landmark(x=0.03543740138411522, y=-0.02912675403058529, z=-0.00173004565294832, visibility=0.0, presence=0.0),
Landmark(x=0.040552493184804916, y=-0.0489623099565506, z=-0.007902431301772594, visibility=0.0, presence=0.0),
Landmark(x=0.04358145594596863, y=-0.06487865746021271, z=-0.0319957509636879, visibility=0.0, presence=0.0),
Landmark(x=0.0016808465588837862, y=-0.004498452879488468, z=0.006683729123324156, visibility=0.0, presence=0.0),
Landmark(x=0.004972374066710472, y=-0.04138147830963135, z=-0.003927251789718866, visibility=0.0, presence=0.0),
Landmark(x=0.00558849610388279, y=-0.06327502429485321, z=-0.020593348890542984, visibility=0.0, presence=0.0),
Landmark(x=0.0066368915140628815, y=-0.08291880786418915, z=-0.039193443953990936, visibility=0.0, presence=0.0),
Landmark(x=-0.018360454589128494, y=-0.0009643810335546732, z=-0.0038148483727127314, visibility=0.0, presence=0.0),
Landmark(x=-0.015782665461301804, y=-0.03162727132439613, z=-0.013909644447267056, visibility=0.0, presence=0.0),
Landmark(x=-0.013191262260079384, y=-0.05145301669836044, z=-0.028273196890950203, visibility=0.0, presence=0.0),
Landmark(x=-0.009723789989948273, y=-0.0685187503695488, z=-0.04024944826960564, visibility=0.0, presence=0.0),
Landmark(x=-0.035820234566926956, y=0.011946788057684898, z=-0.0120608601719141, visibility=0.0, presence=0.0),
Landmark(x=-0.03725161403417587, y=-0.009996423497796059, z=-0.017715157940983772, visibility=0.0, presence=0.0),
Landmark(x=-0.036166295409202576, y=-0.028470497578382492, z=-0.026987750083208084, visibility=0.0, presence=0.0),
Landmark(x=-0.030654065310955048, y=-0.03972318768501282, z=-0.03699912130832672, visibility=0.0, presence=0.0)]])
I want to achieve the same result as the demo of recognizing both hands individually at once with different gestures.