Gesture-controlled volume and AI face replacement based on OpenCV
HandTrackingModule.py
import cv2
import mediapipe as mp
import time
class handDetector():
    """Detect and track hands in BGR images using the MediaPipe Hands solution.

    Typical usage: call findHands() on each frame, then findPosition() to get
    pixel coordinates of the 21 hand landmarks.
    """

    def __init__(self, mode=False, maxHands=2, model_complexity=1, detectionCon=0.5, trackCon=0.5):
        """Create the MediaPipe Hands pipeline and drawing utilities.

        Args:
            mode: static-image mode; False means treat input as a video stream.
            maxHands: maximum number of hands to detect simultaneously.
            model_complexity: model complexity level (0, 1 or 2).
            detectionCon: minimum detection confidence threshold.
            trackCon: minimum tracking confidence threshold.
        """
        self.mode = mode
        self.maxHands = maxHands
        self.model_complexity = model_complexity
        self.detectionCon = detectionCon
        self.trackCon = trackCon
        # Result of the last process() call; set by findHands().
        self.results = None
        self.mpHands = mp.solutions.hands
        # Pass parameters by keyword: the positional order of Hands() has
        # changed between mediapipe releases, so positional args are fragile.
        self.hands = self.mpHands.Hands(
            static_image_mode=self.mode,
            max_num_hands=self.maxHands,
            model_complexity=self.model_complexity,
            min_detection_confidence=self.detectionCon,
            min_tracking_confidence=self.trackCon,
        )
        self.mpDraw = mp.solutions.drawing_utils

    def findHands(self, img, draw=True):
        """Run hand detection on a BGR image and optionally draw the landmarks.

        Args:
            img: input BGR image (modified in place when draw=True).
            draw: whether to draw landmarks and connections on img.

        Returns:
            The (possibly annotated) image.
        """
        # MediaPipe Hands only accepts RGB images.
        imgRGB = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        self.results = self.hands.process(imgRGB)
        if self.results.multi_hand_landmarks:
            if draw:
                # Green connection lines, created once per frame (not per hand).
                self.connection_drawing_spec = self.mpDraw.DrawingSpec(color=(0, 255, 0), thickness=2)
                for handLms in self.results.multi_hand_landmarks:
                    self.mpDraw.draw_landmarks(
                        img, handLms, self.mpHands.HAND_CONNECTIONS,
                        connection_drawing_spec=self.connection_drawing_spec)
        return img

    def findPosition(self, img, handNum=0, draw=True):
        """Return pixel positions of the landmarks of one detected hand.

        Args:
            img: image whose shape provides the pixel scale.
            handNum: index of the hand to read among the detected hands.
            draw: whether to draw a filled circle at each landmark.

        Returns:
            A list of [landmark_id, x_px, y_px] entries; empty if no hand was
            detected, findHands() has not been called yet, or handNum is out
            of range (the original raised AttributeError/IndexError here).
        """
        lmList = []
        results = getattr(self, "results", None)
        if results and results.multi_hand_landmarks:
            if handNum < len(results.multi_hand_landmarks):
                myHand = results.multi_hand_landmarks[handNum]
                h, w, c = img.shape
                for id, lm in enumerate(myHand.landmark):
                    # Landmarks are normalized [0, 1]; scale to pixels.
                    cx, cy = int(lm.x * w), int(lm.y * h)
                    lmList.append([id, cx, cy])
                    if draw:
                        cv2.circle(img, (cx, cy), 7, (255, 0, 0), cv2.FILLED)
        return lmList
A class called handDetector is defined for detecting and tracking hands. The following is a detailed analysis of the code:
Imported libraries:
- cv2: the OpenCV library, used for image processing.
- mediapipe (as mp): Google's multimedia ML library, used here for hand detection.
- time: used for time management (needed by the FPS counter, though not used in this snippet).
The handDetector class

Initialization method __init__: this method initializes a handDetector object and sets its parameters.
- mode: boolean controlling the still-image mode of the MediaPipe Hands solution. Default: False.
- maxHands: maximum number of hands detected at the same time. Default: 2.
- model_complexity: model complexity, with three levels (0, 1 and 2). Default: 1.
- detectionCon: detection confidence threshold. Default: 0.5.
- trackCon: tracking confidence threshold. Default: 0.5.
Additionally, an instance of the MediaPipe hand solution is created and the drawing tool is initialized.
methodfindHands
This method is used to find hands in a given image and draw hand markers as needed.
img
: Input image.draw
: Boolean, controls whether hand markers are drawn. The default value isTrue
.
The method first converts the image from BGR to RGB and then processes the image to find the hand landmarks. If a hand marker is found, and draw
the parameter is True
, the hand marker and connecting lines are drawn on the image.
methodfindPosition
This method is used to find the positions of the hand markers in the given image and returns a list containing the positions of each marker.
img
: Input image.handNum
: hand index, used to select a specific one among multiple detected hands. The default value is0
.draw
: Boolean, controls whether to draw circles for each marker on the image. The default value isTrue
.
This method iterates over each marker for a given hand and calculates its position in the image. If draw
the parameter is True
, a circle is drawn at each marker's location.
Summarize
handDetector
class is a tool for detecting and tracking hands. It uses MediaPipe's hand solution and provides the ability to draw hand markers and connecting lines on the image. By calling these methods, you can track hands in a video stream or still image, and even find the location of specific hand markers.
VolumeHandControl.py
import cv2
import time
import numpy as np
import HandTrackingModule as htm
import math
from ctypes import cast, POINTER
from comtypes import CLSCTX_ALL
from pycaw.pycaw import AudioUtilities, IAudioEndpointVolume
# ---- Camera, detector and system-audio setup ----
wCam, hCam = 640, 480
cap = cv2.VideoCapture(0)
# Set camera frame width (property id 3 = CAP_PROP_FRAME_WIDTH)
cap.set(3, wCam)
# Set camera frame height (property id 4 = CAP_PROP_FRAME_HEIGHT)
cap.set(4, hCam)
# Timestamp of the previous frame, used by the FPS counter in the main loop.
pTime = 0
# Overlay image; IMREAD_UNCHANGED keeps the alpha channel if the file has one.
tiga_img = cv2.imread("tiga.jpg", cv2.IMREAD_UNCHANGED)
detector = htm.handDetector(detectionCon=0.7)
# Haar cascade for frontal-face detection (XML file must be next to the script).
face_Cascade = cv2.CascadeClassifier("haarcascade_frontalface_default.xml")
# pycaw: get the default audio endpoint and its master-volume COM interface.
devices = AudioUtilities.GetSpeakers()
interface = devices.Activate(IAudioEndpointVolume._iid_, CLSCTX_ALL, None)
volume = cast(interface, POINTER(IAudioEndpointVolume))
# volume.GetMute()
# volume.GetMasterVolumeLevel()
# System volume range in dB, e.g. (-65.25, 0.0, step) — device dependent.
volRange = volume.GetVolumeRange()
print(volRange)
# Minimum volume level (dB)
minVol = volRange[0]
# Maximum volume level (dB)
maxVol = volRange[1]
vol = 0
# Volume-bar top y-coordinate (400 = empty, 150 = full); drawn each frame.
volBar = 400
volPer = 0
def overlay_img(img, img_over, img_over_x, img_over_y):
    """Overlay img_over onto img at (img_over_x, img_over_y), in place.

    Only non-transparent overlay pixels (alpha != 0) are copied. Pixels that
    would land outside the background image are skipped.

    Args:
        img: background BGR image (numpy array, modified in place).
        img_over: overlay image, BGR or BGRA; BGR is converted to BGRA first.
        img_over_x: left edge of the overlay in background coordinates.
        img_over_y: top edge of the overlay in background coordinates.

    Returns:
        The background image img.
    """
    # numpy shape is (height, width, channels). The original unpacked it as
    # (w, h, c), so its bounds check compared x against the image HEIGHT and
    # y against the WIDTH — wrong for non-square frames.
    img_h, img_w, _ = img.shape
    img_over_h, img_over_w, img_over_c = img_over.shape
    # Ensure the overlay has an alpha channel.
    if img_over_c == 3:
        img_over = cv2.cvtColor(img_over, cv2.COLOR_BGR2BGRA)
    for w in range(img_over_w):
        for h in range(img_over_h):
            if img_over[h, w, 3] == 0:
                continue  # fully transparent pixel
            x = img_over_x + w
            # Drawn 40 px above the requested y so the overlay sits over the
            # face region — NOTE(review): confirm this offset is intended.
            y = img_over_y + h - 40
            # Skip out-of-bounds pixels; the original let y-40 go negative,
            # which wrapped the overlay onto the bottom of the image.
            if x >= img_w or y >= img_h or y < 0:
                continue
            # Copy the three color channels in one slice assignment.
            img[y, x, 0:3] = img_over[h, w, 0:3]
    return img
# ---- Main loop: face overlay + gesture-controlled volume ----
while True:
    success, img = cap.read()
    if not success:
        # Frame grab failed; skip instead of crashing in cvtColor.
        continue
    # Grayscale frame for the Haar cascade face detector.
    gray_frame = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    height, width, channel = img.shape
    # scaleFactor=1.15, minNeighbors=5
    faces = face_Cascade.detectMultiScale(gray_frame, 1.15, 5)
    for (x, y, w, h) in faces:
        gw = w
        gh = int(height * w / width)
        # Resize a fresh copy from the source overlay every frame. The
        # original resized tiga_img in place, so each frame re-resized an
        # already-resized image and the quality degraded cumulatively.
        overlay = cv2.resize(tiga_img, (gw, gh + gh))
        if 0 <= x < img.shape[1] and 0 <= y < img.shape[0]:
            overlay_img(img, overlay, x, y)
    img = detector.findHands(img)
    lmList = detector.findPosition(img, draw=False)
    if len(lmList) != 0:
        # Thumb tip (landmark 4) and index fingertip (landmark 8).
        x1, y1 = lmList[4][1], lmList[4][2]
        x2, y2 = lmList[8][1], lmList[8][2]
        cv2.circle(img, (x1, y1), 15, (255, 0, 255), cv2.FILLED)
        cv2.circle(img, (x2, y2), 15, (255, 0, 255), cv2.FILLED)
        cv2.line(img, (x1, y1), (x2, y2), (255, 0, 255), 3)
        cx, cy = (x1 + x2) // 2, (y1 + y2) // 2
        cv2.circle(img, (cx, cy), 15, (255, 0, 255), cv2.FILLED)
        # Distance between the two fingertips in pixels.
        length = math.hypot(x2 - x1, y2 - y1)
        # Map the hand range (~25..175 px) onto the volume range (dB),
        # the on-screen bar (400..150) and a percentage (0..100).
        vol = np.interp(length, [25, 175], [minVol, maxVol])
        volBar = np.interp(length, [25, 175], [400, 150])
        volPer = np.interp(length, [25, 175], [0, 100])
        volume.SetMasterVolumeLevel(vol, None)
        if length < 25:
            # Fingers pinched together: highlight the midpoint in green.
            cv2.circle(img, (cx, cy), 15, (0, 255, 0), cv2.FILLED)
    # Volume bar outline, fill level and percentage text.
    cv2.rectangle(img, (50, 150), (85, 400), (255, 0, 0), 3)
    cv2.rectangle(img, (50, int(volBar)), (85, 400), (255, 0, 0), cv2.FILLED)
    cv2.putText(img, f'{int(volPer)} %', (40, 450), cv2.FONT_HERSHEY_COMPLEX, 1, (255, 0, 0), 3)
    # FPS counter from the inter-frame interval.
    cTime = time.time()
    fps = 1 / (cTime - pTime)
    pTime = cTime
    cv2.putText(img, f'FPS:{int(fps)}', (40, 50), cv2.FONT_HERSHEY_COMPLEX, 1, (255, 0, 0), 3)
    cv2.imshow("img", img)
    # Esc exits cleanly; the original ignored waitKey and looped forever.
    if cv2.waitKey(1) & 0xFF == 27:
        break
1. Import the necessary libraries
- OpenCV (cv2): for image processing, such as reading images, converting color spaces, drawing shapes, etc.
- NumPy (np): for numerical computation, especially linear interpolation.
- HandTrackingModule (as htm): the custom hand-detection module defined above.
- math: provides mathematical functions, such as the distance between two points.
- ctypes, comtypes, pycaw.pycaw: used to interact with the operating system's volume control.
2. Initialize parameters and objects
- Camera Size (
wCam
,hCam
) : Defines the width and height of the camera. - Camera (
cap
) : Initialize the camera through OpenCV, and set the width and height. - time(
pTime
) : Used to calculate the frame rate. - imageoverlay(
tiga_img
) : reads an image file to be used later for overlay. - HandDetector (
detector
) : Create a detector object with a custom hand detection module and set the detection confidence to 0.7. - FaceDetect(
face_Cascade
) : Loads OpenCV's Haar cascade classifier to detect faces. - volume control (
volume
) : access the volume control of the system through pycaw to get the volume range.
3. Define the image overlay functionoverlay_img
This function is responsible for superimposing one image on top of another at a specific position. It iterates through each pixel of the overlay image and copies non-transparent pixels to the corresponding position of the background image.
4. Main loop
In the infinite loop, the code performs the following tasks:
a. Face detection and image overlay
- Read Image : Captures an image from the camera.
- Grayscale conversion : Convert the image to grayscale for face detection.
- Face Detection : Detect faces using a cascade of classifiers.
- Resize Overlay Image : Adjust the size of the overlay image according to the face size.
- Overlay Image : Call
overlay_img
the function to overlay the image on the face.
b. Hand detection and volume control
- Detect Hands : call to
detector.findHands
detect and draw hands on the image. - Find position : Call
detector.findPosition
to get the position of the hand marker. - Calculate Distance : Calculate the distance between hand marks 4 and 8.
- Draw Shapes : Draw a circle on these two points and a line between them.
- Volume Mapping : Uses NumPy's
np.interp
function to map hand distances to volume ranges. - Set Volume : Call
volume.SetMasterVolumeLevel
to set the system volume.
c. Visualization
- Draw Volume Bar : Draws a rectangular bar representing the volume level on the image.
- Calculate frame rate : Use the current time and the time of the previous frame to calculate the frame rate.
- Draw Frame Rate : Draw frame rate text on the image.
d. Display the result
- Display Image : Use OpenCV's
imshow
method to display the processed image. - Wait
waitKey
: Wait for 1 millisecond through the method of OpenCV , so that the image can be updated in real time.
Summarize
This code integrates multiple functions: capture images through the camera, detect faces and overlay images on them, detect hands and control the system volume through the distance between fingers, and then display the results in real time through OpenCV. It combines image processing, face and hand detection, system interaction, and real-time visualization to demonstrate the power of computer vision and human-computer interaction.
Effect
Demo video (Bilibili): https://www.bilibili.com/video/BV1Xu41177Gz/?spm_id_from=333.999.0.0