Reference:
https://github.com/google/mediapipe
https://developers.google.com/mediapipe/solutions/guide
The framework covers vision, text (NLP), audio, and other tasks, and inference is fast. The sections below walk through several of the vision solutions.
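All of the examples below assume the Python packages are installed, e.g.:
pip install mediapipe opencv-python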
1. Object detection
Reference: https://developers.google.com/mediapipe/solutions/vision/object_detector/python
https://github.com/google/mediapipe/blob/master/docs/solutions/face_mesh.md
Model download: https://developers.google.com/mediapipe/solutions/vision/object_detector
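The example below expects the EfficientDet-Lite0 model file next to the script. The model page above hosts the file; a typical fetch looks like this (the storage URL follows the pattern shown on the MediaPipe model page and may change, so verify it there):
wget -O efficientdet_lite0.tflite https://storage.googleapis.com/mediapipe-models/object_detector/efficientdet_lite0/int8/1/efficientdet_lite0.tflite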
Code:
import cv2
import numpy as np
IMAGE_FILE = "cat_dog.png"
MARGIN = 10 # pixels
ROW_SIZE = 10 # pixels
FONT_SIZE = 1
FONT_THICKNESS = 1
TEXT_COLOR = (255, 0, 0) # red
def visualize(
image,
detection_result
) -> np.ndarray:
"""Draws bounding boxes on the input image and return it.
Args:
image: The input RGB image.
detection_result: The list of all "Detection" entities to be visualize.
Returns:
Image with bounding boxes.
"""
for detection in detection_result.detections:
# Draw bounding_box
bbox = detection.bounding_box
start_point = bbox.origin_x, bbox.origin_y
end_point = bbox.origin_x + bbox.width, bbox.origin_y + bbox.height
cv2.rectangle(image, start_point, end_point, TEXT_COLOR, 3)
# Draw label and score
category = detection.categories[0]
category_name = category.category_name
probability = round(category.score, 2)
result_text = category_name + ' (' + str(probability) + ')'
text_location = (MARGIN + bbox.origin_x,
MARGIN + ROW_SIZE + bbox.origin_y)
cv2.putText(image, result_text, text_location, cv2.FONT_HERSHEY_PLAIN,
FONT_SIZE, TEXT_COLOR, FONT_THICKNESS)
return image
# STEP 1: Import the necessary modules.
import numpy as np
import mediapipe as mp
from mediapipe.tasks import python
from mediapipe.tasks.python import vision
# STEP 2: Create an ObjectDetector object.
base_options = python.BaseOptions(model_asset_path='efficientdet_lite0.tflite')
options = vision.ObjectDetectorOptions(base_options=base_options,
score_threshold=0.5)
detector = vision.ObjectDetector.create_from_options(options)
# STEP 3: Load the input image.
image = mp.Image.create_from_file(IMAGE_FILE)
# STEP 4: Detect objects in the input image.
detection_result = detector.detect(image)
# STEP 5: Process the detection result. In this case, visualize it.
image_copy = np.copy(image.numpy_view())  # mp.Image data is RGB
annotated_image = visualize(image_copy, detection_result)
# cv2.imshow expects BGR, so swap the channels before displaying
bgr_annotated_image = cv2.cvtColor(annotated_image, cv2.COLOR_RGB2BGR)
# cv2_imshow(bgr_annotated_image)  # in Colab, use cv2_imshow instead
cv2.imshow('my_window', bgr_annotated_image)
cv2.waitKey(0)
cv2.destroyAllWindows()
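The Tasks API also accepts in-memory frames, so the same detector can be pointed at an OpenCV camera. A minimal sketch reusing the detector and visualize defined above (for true streams MediaPipe also offers running_mode=VIDEO with detect_for_video; IMAGE mode is kept here for simplicity):
# Sketch: run the object detector above on live camera frames.
cap = cv2.VideoCapture(0)
while cap.isOpened():
    success, frame = cap.read()
    if not success:
        break
    # wrap the BGR frame as an RGB mp.Image
    rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    mp_frame = mp.Image(image_format=mp.ImageFormat.SRGB, data=rgb_frame)
    detection_result = detector.detect(mp_frame)
    annotated = visualize(np.copy(rgb_frame), detection_result)
    cv2.imshow('my_window', cv2.cvtColor(annotated, cv2.COLOR_RGB2BGR))
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break
cap.release()
cv2.destroyAllWindows()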
2. Face detection
The face detector outputs only bounding-box coordinates and a confidence score. It produces no feature vector, so its output cannot be used directly for face-database retrieval; an additional model would be needed to extract face embeddings.
This example uses the high-level Solutions interface, whose models are bundled into the package directory when mediapipe is installed. For the solutions currently supported in Python, see:
https://github.com/google/mediapipe/blob/master/docs/solutions/solutions.md
Real-time face detection from an OpenCV camera:
import cv2
import time
import mediapipe as mp
class FaceDetector():
def __init__(self, confidence=0.5, model=0) -> None:
self.confidence = confidence
self.model = model
self.mp_draws = mp.solutions.drawing_utils
self.mp_faces = mp.solutions.face_detection
self.faces = self.mp_faces.FaceDetection(min_detection_confidence=confidence, model_selection=model)
    def face_detection(self, image, draw=True):
        # the solutions API expects RGB input
        img_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        results = self.faces.process(img_rgb)
        lst_box = list()
        if results.detections:
            for id, detection in enumerate(results.detections):
                h, w, c = image.shape
                r_bbox = detection.location_data.relative_bounding_box
                bbox = int(r_bbox.xmin * w), int(r_bbox.ymin * h), \
                       int(r_bbox.width * w), int(r_bbox.height * h)
                score = detection.score
                lst_box.append([id, bbox, score])
                if draw:
                    self.draw_box_detection(image, bbox, score)
                    # self.mp_draws.draw_detection(image, detection)
        return lst_box
    def draw_box_detection(self, image, bbox, score):
        xmin, ymin = bbox[0], bbox[1]
        corner_len = 30  # length of the corner accent lines, in pixels
        cv2.rectangle(image, bbox, color=(255, 0, 255), thickness=1)
        cv2.line(image, (xmin, ymin), (xmin + corner_len, ymin), (255, 0, 255), thickness=5)
        cv2.line(image, (xmin, ymin), (xmin, ymin + corner_len), (255, 0, 255), thickness=5)
        cv2.putText(image, f"{int(score[0] * 100)}%", (xmin, ymin - 10),
                    cv2.FONT_HERSHEY_PLAIN, fontScale=1.3,
                    color=(0, 255, 0), thickness=1)
def main():
capture = cv2.VideoCapture(0)
face_detector = FaceDetector()
prev_time = 0
while True:
        success, frame = capture.read()
lst_position = face_detector.face_detection(frame)
if len(lst_position) != 0:
print(lst_position[0])
# calculate fps
current_time = time.time()
fps = 1 / (current_time - prev_time)
prev_time = current_time
# put fps of video in display
cv2.putText(frame, f"{str(int(fps))}", (19, 50),
cv2.FONT_HERSHEY_PLAIN, 1.5,
(0, 255, 255), thickness=2)
# display video window
cv2.imshow("Video Display", frame)
if cv2.waitKey(1) & 0xFF == ord('q'):
break
capture.release()
cv2.destroyAllWindows()
if __name__ == "__main__":
main()
Real-time face mesh (FaceMesh supports max_num_faces, the maximum number of faces to detect; default 1), e.g.:
with mp_face_mesh.FaceMesh(
max_num_faces=3,
refine_landmarks=True,
min_detection_confidence=0.5,
min_tracking_confidence=0.5) as face_mesh:
import cv2
import time
import mediapipe as mp
class FaceMesh():
def __init__(self, mode=False, max_face=1,
refine_landmarks=False,
detect_confidence=0.5, track_confidence=0.5) -> None:
self.mode = mode
self.max_face = max_face
self.refine_landmarks = refine_landmarks
self.detect_confidence = detect_confidence
self.track_confidence = track_confidence
self.mp_draws = mp.solutions.drawing_utils
self.mp_face_mesh = mp.solutions.face_mesh
self.face_mesh = self.mp_face_mesh.FaceMesh(static_image_mode=self.mode,
max_num_faces=self.max_face,
refine_landmarks=self.refine_landmarks,
min_detection_confidence=self.detect_confidence,
min_tracking_confidence=self.track_confidence)
def draw_mesh(self, image, thickness=1, circle_radius=1, color=(0,255, 0)):
draw_spec = self.mp_draws.DrawingSpec(thickness=thickness, circle_radius=circle_radius, color=color)
img_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
results = self.face_mesh.process(img_rgb)
lst_mark = list()
if results.multi_face_landmarks:
h, w, c = image.shape
for face_id, landmarks in enumerate(results.multi_face_landmarks):
self.mp_draws.draw_landmarks(image, landmarks,
self.mp_face_mesh.FACEMESH_FACE_OVAL, draw_spec)
                for id, mark in enumerate(landmarks.landmark):
                    # landmark coordinates are normalized; convert to pixels
                    cx, cy = int(mark.x * w), int(mark.y * h)
                    lst_mark.append([face_id, id, cx, cy])
return lst_mark
def main():
capture = cv2.VideoCapture(0)
face_mesh = FaceMesh()
prev_time = 0
while True:
        success, frame = capture.read()
lst_position = face_mesh.draw_mesh(frame)
if len(lst_position) != 0:
print(lst_position[0])
# calculate fps
current_time = time.time()
fps = 1 / (current_time - prev_time)
prev_time = current_time
# put fps of video in display
cv2.putText(frame, f"{str(int(fps))}", (19, 50), cv2.FONT_HERSHEY_PLAIN, 1.5, (0, 255, 255), thickness=2)
# display video window
cv2.imshow("Video Display", frame)
if cv2.waitKey(1) & 0xFF == ord('q'):
break
capture.release()
cv2.destroyAllWindows()
if __name__ == "__main__":
main()
import cv2
import mediapipe as mp
mp_drawing = mp.solutions.drawing_utils
mp_drawing_styles = mp.solutions.drawing_styles
mp_face_mesh = mp.solutions.face_mesh
# For static images:
IMAGE_FILES = []  # fill in paths to input image files
drawing_spec = mp_drawing.DrawingSpec(thickness=1, circle_radius=1)
with mp_face_mesh.FaceMesh(
static_image_mode=True,
max_num_faces=1,
refine_landmarks=True,
min_detection_confidence=0.5) as face_mesh:
for idx, file in enumerate(IMAGE_FILES):
image = cv2.imread(file)
# Convert the BGR image to RGB before processing.
results = face_mesh.process(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
# Print and draw face mesh landmarks on the image.
if not results.multi_face_landmarks:
continue
annotated_image = image.copy()
for face_landmarks in results.multi_face_landmarks:
print('face_landmarks:', face_landmarks)
mp_drawing.draw_landmarks(
image=annotated_image,
landmark_list=face_landmarks,
connections=mp_face_mesh.FACEMESH_TESSELATION,
landmark_drawing_spec=None,
connection_drawing_spec=mp_drawing_styles
.get_default_face_mesh_tesselation_style())
mp_drawing.draw_landmarks(
image=annotated_image,
landmark_list=face_landmarks,
connections=mp_face_mesh.FACEMESH_CONTOURS,
landmark_drawing_spec=None,
connection_drawing_spec=mp_drawing_styles
.get_default_face_mesh_contours_style())
mp_drawing.draw_landmarks(
image=annotated_image,
landmark_list=face_landmarks,
connections=mp_face_mesh.FACEMESH_IRISES,
landmark_drawing_spec=None,
connection_drawing_spec=mp_drawing_styles
.get_default_face_mesh_iris_connections_style())
cv2.imwrite('/tmp/annotated_image' + str(idx) + '.png', annotated_image)
# For webcam input:
drawing_spec = mp_drawing.DrawingSpec(thickness=1, circle_radius=1)
cap = cv2.VideoCapture(0)
with mp_face_mesh.FaceMesh(
max_num_faces=1,
refine_landmarks=True,
min_detection_confidence=0.5,
min_tracking_confidence=0.5) as face_mesh:
while cap.isOpened():
success, image = cap.read()
if not success:
print("Ignoring empty camera frame.")
# If loading a video, use 'break' instead of 'continue'.
continue
# To improve performance, optionally mark the image as not writeable to
# pass by reference.
image.flags.writeable = False
image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
results = face_mesh.process(image)
# Draw the face mesh annotations on the image.
image.flags.writeable = True
image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
if results.multi_face_landmarks:
for face_landmarks in results.multi_face_landmarks:
mp_drawing.draw_landmarks(
image=image,
landmark_list=face_landmarks,
connections=mp_face_mesh.FACEMESH_TESSELATION,
landmark_drawing_spec=None,
connection_drawing_spec=mp_drawing_styles
.get_default_face_mesh_tesselation_style())
mp_drawing.draw_landmarks(
image=image,
landmark_list=face_landmarks,
connections=mp_face_mesh.FACEMESH_CONTOURS,
landmark_drawing_spec=None,
connection_drawing_spec=mp_drawing_styles
.get_default_face_mesh_contours_style())
mp_drawing.draw_landmarks(
image=image,
landmark_list=face_landmarks,
connections=mp_face_mesh.FACEMESH_IRISES,
landmark_drawing_spec=None,
connection_drawing_spec=mp_drawing_styles
.get_default_face_mesh_iris_connections_style())
# Flip the image horizontally for a selfie-view display.
cv2.imshow('MediaPipe Face Mesh', cv2.flip(image, 1))
if cv2.waitKey(5) & 0xFF == 27:
break
cap.release()
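The Solutions FaceMesh returns normalized coordinates: x and y lie in [0, 1] relative to image width and height, and z is roughly on the same scale as x with the center of the head as origin. With refine_landmarks=True there are 478 landmarks per face (468 mesh points plus 10 iris points). A small helper, as an illustration, to convert one face's landmarks to pixel coordinates:
import numpy as np

def landmarks_to_pixels(face_landmarks, image_shape):
    """Convert one face's normalized landmarks to an (N, 2) int array of pixel coords."""
    h, w = image_shape[:2]
    return np.array([[int(lm.x * w), int(lm.y * h)]
                     for lm in face_landmarks.landmark])

# usage: for face_landmarks in results.multi_face_landmarks:
#            pts = landmarks_to_pixels(face_landmarks, image.shape)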
3. Human body keypoint detection
Reference: https://www.hackersrealm.net/post/realtime-human-pose-estimation-using-python
https://github.com/realsanjeev/Object-Detection-using-OpenCV
https://github.com/google/mediapipe/blob/master/docs/solutions/pose.md
import cv2
import mediapipe as mp
import time
class PoseDetector():
def __init__(self, mode=False, complexity=1, smooth_landmarks=True,
enable_segmentation=False, smooth_segmentation=True,
detection_confidence=0.5, tracking_confidence=0.5) -> None:
self.mode = mode
self.complexity = complexity
self.smooth_landmarks = smooth_landmarks
self.enable_segmentation = enable_segmentation
self.smooth_segmentations = smooth_segmentation
self.detection_confidence = detection_confidence
self.tracking_confidence = tracking_confidence
self.mp_pose = mp.solutions.pose
self.mp_draw = mp.solutions.drawing_utils
self.poses = self.mp_pose.Pose(static_image_mode=self.mode,
model_complexity=self.complexity,
smooth_landmarks=self.smooth_landmarks,
enable_segmentation=self.enable_segmentation,
smooth_segmentation=self.smooth_segmentations,
min_detection_confidence=self.detection_confidence,
min_tracking_confidence=self.tracking_confidence
)
    def findPose(self, image, draw=True, position_mark=False):
        img_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        results = self.poses.process(img_rgb)
        lst_mark_position = list()
        if results.pose_landmarks:
            if draw:
                self.mp_draw.draw_landmarks(image, results.pose_landmarks,
                                            self.mp_pose.POSE_CONNECTIONS)
            if position_mark:
                for id, mark in enumerate(results.pose_landmarks.landmark):
                    h, w, c = image.shape
                    cx, cy = int(mark.x * w), int(mark.y * h)
                    lst_mark_position.append([id, cx, cy])
        return lst_mark_position
pose_detector = PoseDetector()
cap = cv2.VideoCapture(0)
while cap.isOpened():
    # read frame
    success, frame = cap.read()
    if not success:
        break
    # resize the frame for portrait video
    # frame = cv2.resize(frame, (350, 600))
    # convert to RGB
    frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    # process the frame for pose detection
    pose_results = pose_detector.poses.process(frame_rgb)
    # print(pose_results.pose_landmarks)
    # draw skeleton on the frame (draw_landmarks is a no-op when no pose is found)
    pose_detector.mp_draw.draw_landmarks(frame, pose_results.pose_landmarks,
                                         pose_detector.mp_pose.POSE_CONNECTIONS)
    # display the frame
    cv2.imshow('Output', frame)
    if cv2.waitKey(1) == ord('q'):
        break
cap.release()
cv2.destroyAllWindows()
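Pose landmarks are often turned into joint angles (e.g., for exercise-repetition counting). A minimal sketch, using MediaPipe Pose's landmark indices (11 = left shoulder, 13 = left elbow, 15 = left wrist) and the [id, cx, cy] list returned by findPose(..., position_mark=True):
import math

def joint_angle(a, b, c):
    """Return the angle ABC in degrees, given three (x, y) pixel points."""
    ang = math.degrees(math.atan2(c[1] - b[1], c[0] - b[0]) -
                       math.atan2(a[1] - b[1], a[0] - b[0]))
    ang = abs(ang)
    return 360 - ang if ang > 180 else ang

# usage sketch:
# lst = pose_detector.findPose(frame, position_mark=True)
# if lst:
#     elbow_angle = joint_angle(lst[11][1:], lst[13][1:], lst[15][1:])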
4. Hand keypoint detection
# opencv-python
import cv2
# MediaPipe toolkit
import mediapipe as mp
# progress-bar library (only needed when batch-processing video files)
from tqdm import tqdm
# time library
import time
# import the hands solution
mp_hands = mp.solutions.hands
# load the model
hands = mp_hands.Hands(static_image_mode=False,       # static images vs. a continuous video stream
                       max_num_hands=2,               # maximum number of hands to detect
                       min_detection_confidence=0.7,  # detection confidence threshold
                       min_tracking_confidence=0.5)   # tracking confidence threshold
# import the drawing utilities
mpDraw = mp.solutions.drawing_utils
def process_frame(img):
    # record the time at which processing of this frame starts
    start_time = time.time()
    scaler = 1  # text-size scaling factor (defined up front so the FPS overlay below always works)
    # get image height and width
    h, w = img.shape[0], img.shape[1]
    # mirror the image horizontally so on-screen left/right match the real hands
    # second argument -- 1: horizontal flip, 0: vertical flip, -1: both
    img = cv2.flip(img, 1)
    # convert BGR to RGB
    img_RGB = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    # feed the RGB image to the model and get the predictions
    results = hands.process(img_RGB)
    if results.multi_hand_landmarks:  # if any hands were detected
        handness_str = ''
        index_finger_tip_str = ''
        for hand_idx in range(len(results.multi_hand_landmarks)):
            # get the 21 keypoint coordinates of this hand
            hand_21 = results.multi_hand_landmarks[hand_idx]
            # visualize the keypoints and the skeleton connections
            mpDraw.draw_landmarks(img, hand_21, mp_hands.HAND_CONNECTIONS)
            # record the left/right-hand label
            temp_handness = results.multi_handedness[hand_idx].classification[0].label
            handness_str += '{}:{} '.format(hand_idx, temp_handness)
            # depth coordinate of the wrist (landmark 0)
            cz0 = hand_21.landmark[0].z
            for i in range(21):  # iterate over the 21 keypoints of this hand
                # get the 3D coordinates
                cx = int(hand_21.landmark[i].x * w)
                cy = int(hand_21.landmark[i].y * h)
                cz = hand_21.landmark[i].z
                depth_z = cz0 - cz
                # use the circle radius to reflect depth
                radius = max(int(6 * (1 + depth_z * 5)), 0)
                if i == 0:  # wrist
                    img = cv2.circle(img, (cx, cy), radius, (0, 0, 255), -1)
                if i == 8:  # index-finger tip
                    img = cv2.circle(img, (cx, cy), radius, (193, 182, 255), -1)
                    # show the depth distance relative to the wrist on screen
                    index_finger_tip_str += '{}:{:.2f} '.format(hand_idx, depth_z)
                if i in [1, 5, 9, 13, 17]:  # finger bases
                    img = cv2.circle(img, (cx, cy), radius, (16, 144, 247), -1)
                if i in [2, 6, 10, 14, 18]:  # first knuckles
                    img = cv2.circle(img, (cx, cy), radius, (1, 240, 255), -1)
                if i in [3, 7, 11, 15, 19]:  # second knuckles
                    img = cv2.circle(img, (cx, cy), radius, (140, 47, 240), -1)
                if i in [4, 12, 16, 20]:  # fingertips (except the index-finger tip)
                    img = cv2.circle(img, (cx, cy), radius, (223, 155, 60), -1)
        img = cv2.putText(img, handness_str, (25 * scaler, 100 * scaler), cv2.FONT_HERSHEY_SIMPLEX, 1.25 * scaler, (255, 0, 255), 2 * scaler)
        img = cv2.putText(img, index_finger_tip_str, (25 * scaler, 150 * scaler), cv2.FONT_HERSHEY_SIMPLEX, 1.25 * scaler, (255, 0, 255), 2 * scaler)
    # record the time at which processing of this frame finished
    end_time = time.time()
    # frames processed per second (FPS)
    FPS = 1 / (end_time - start_time)
    # draw the FPS value on the image; arguments: image, text, top-left corner, font, size, color, thickness
    img = cv2.putText(img, 'FPS ' + str(int(FPS)), (25 * scaler, 50 * scaler), cv2.FONT_HERSHEY_SIMPLEX, 1.25 * scaler, (255, 0, 255), 2 * scaler)
    return img
# template for real-time, frame-by-frame camera processing
# no changes needed -- just define the process_frame function above
# import opencv-python
import cv2
import time
# open the camera; 0 selects the system default camera
cap = cv2.VideoCapture(0)
# open the capture device
cap.open(0)
# loop until break is triggered
while cap.isOpened():
    # grab a frame
    success, frame = cap.read()
    if not success:
        break
    ## !!! per-frame processing function
    frame = process_frame(frame)
    # show the processed three-channel image
    cv2.imshow('my_window', frame)
    if cv2.waitKey(1) in [ord('q'), 27]:  # press q or Esc to quit (with an English input method active)
        break
# release the camera
cap.release()
# close the display window
cv2.destroyAllWindows()
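The landmark indices used above (fingertips 4, 8, 12, 16, 20; knuckles 6, 10, 14, 18) also support simple gesture heuristics. A rough finger-counting sketch, not MediaPipe's own gesture recognizer, assuming a roughly upright hand (a finger counts as raised when its tip is above its middle joint; the thumb uses a horizontal-distance heuristic):
def count_fingers(hand_21):
    """Roughly count raised fingers from one hand's normalized landmarks."""
    lm = hand_21.landmark
    tips, pips = [8, 12, 16, 20], [6, 10, 14, 18]
    # image y grows downward, so a raised fingertip has a smaller y than its middle joint
    count = sum(1 for tip, pip in zip(tips, pips) if lm[tip].y < lm[pip].y)
    # thumb heuristic: raised if its tip (4) is farther from the pinky base (17) than its joint (3)
    if abs(lm[4].x - lm[17].x) > abs(lm[3].x - lm[17].x):
        count += 1
    return count

# usage inside process_frame: count_fingers(results.multi_hand_landmarks[hand_idx])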