Video frame extraction, batch file operations, and batch file preprocessing in Python (a toolbox for dataset creation)

Toolbox for Dataset Creation

Table of contents

  1. Environmental preparation
  2. Dataset production
  3. Batch file renaming
  4. Batch file movement
  5. Rename files in batches according to a certain format
  6. How to modify the content of the xml file
  7. Common interfaces of the Pathlib library

Introduction

In computer vision projects, file batch operations and file batch preprocessing are essential steps. They involve handling large numbers of image files, including reading, processing, saving, and preprocessing. This article will introduce some common techniques and methods to help you efficiently perform file batch operations and file batch preprocessing in computer vision projects.

Environmental preparation

from pathlib import Path, PurePath
import xml.etree.ElementTree as ET
from typing import Union
import numpy as np
from tqdm import tqdm
import time
import cv2
import os

Dataset production

As the saying goes, an AI system is only as intelligent as the human effort that went into its data. In computer vision projects, datasets are very important. The following shows how to build a dataset by extracting frames from a video. In the code below, the `_videoPlay` function displays the input video in real time. The `CutVideo` function implements manual frame extraction: while the video is playing, press the `c` key to save the current frame and press `Esc` to close the video. The `ExtractAll` function extracts frames automatically; its `frameGap` parameter sets the interval, so one frame is saved every `frameGap` frames.

class ExtractImg(object):
    """Extract frames from a video with OpenCV and save them as JPEG files.

    Frames are shown in a window named "cv". Saved images are named after the
    current ``time.time()`` value and written to the save directory.
    """

    def __init__(self, videopath: Path, savepath: Path, delay: int = 1) -> None:
        """Open the video, create the preview window and the save directory.

        :param videopath: path of the video handed to ``cv2.VideoCapture``
        :param savepath: directory that receives the extracted images; it is
            created (including missing parents) when it does not exist
        :param delay: value passed to ``cv2.waitKey`` after each frame
        """
        # Path(...) leaves Path objects unchanged and also accepts plain strings.
        self.spath = Path(savepath)
        self.vpath = videopath
        self.delay = delay
        cv2.namedWindow("cv", cv2.WINDOW_NORMAL)
        cv2.resizeWindow("cv", 640, 480)
        self.cap = cv2.VideoCapture(str(self.vpath))
        self._timeflag = 0  # frames read so far by ExtractAll
        # os.mkdir() failed when the parent was missing (e.g. "./dataset/imgs").
        self.spath.mkdir(parents=True, exist_ok=True)

    def _saveFrame(self, frame) -> None:
        """Write *frame* to the save directory as ``<time.time()>.jpg``."""
        cv2.imwrite(str(self.spath / "{}.jpg".format(time.time())), frame)
        print("保存成功")

    def _videoPlay(self, size: list) -> None:
        """Play the video; ``c`` saves the current frame, ``Esc`` stops.

        :param size: ``[width, height]`` requested from the capture
        """
        self.cap.set(cv2.CAP_PROP_FRAME_WIDTH, size[0])
        self.cap.set(cv2.CAP_PROP_FRAME_HEIGHT, size[1])
        while self.cap.isOpened():
            ret, frame = self.cap.read()
            if not ret:
                # End of stream or read error: there is no frame to show.
                break
            cv2.imshow("cv", frame)
            # Poll the keyboard once per frame. Calling waitKey() twice meant
            # a key consumed by the first call was invisible to the second.
            key = cv2.waitKey(self.delay) & 0xFF
            if key == ord('c'):
                self._saveFrame(frame)
                time.sleep(1)
            elif key == 27:  # Esc
                break

    def ExtractAll(self, frameGap: int = 3) -> None:
        """Play the whole video and save one frame every *frameGap* frames.

        Pressing ``Esc`` stops early. The window and the capture are released
        when the method returns, also when an error occurs.

        :param frameGap: interval, in frames, between two saved images
        :return: None
        """
        try:
            while self.cap.isOpened():
                ret, frame = self.cap.read()
                if not ret:
                    break
                self._timeflag += 1
                cv2.imshow("cv", frame)
                if self._timeflag % frameGap == 0:
                    self._saveFrame(frame)
                if cv2.waitKey(self.delay) & 0xFF == 27:  # Esc
                    break
        finally:
            cv2.destroyAllWindows()
            self.cap.release()
            self._timeflag = 0

    def CutVideo(self) -> None:
        """Manual frame extraction driven by a Y/N prompt.

        The prompt reports how many files the save directory already holds.
        ``Y`` starts playback (see ``_videoPlay``) and releases the window and
        capture afterwards. ``N`` returns without touching them. Any other
        answer repeats the prompt; the answer is case-insensitive.

        :return: None
        """
        prompt = ("文件中已经存在{}张图片,是否有继续添加"
                  "(Y or N):")
        while True:
            ifm = input(prompt.format(len(os.listdir(self.spath)))).strip().upper()
            if ifm in ('Y', 'N'):
                break
            # Ask again instead of tearing the capture down on a typo.
            print("\n请输入Y(yes)或者N(no)")
        if ifm == 'N':
            return None
        try:
            self._videoPlay(size=[640, 480])
        finally:
            cv2.destroyAllWindows()
            self.cap.release()

Batch file renaming

Rename the pictures in the folder in ascending order.

  def statistics(path: Union[str, Path], dstpath: Union[Path, str], count: int = 0, random: bool = False) -> None:
      """Move numerically named files to *dstpath*, adding *count* to each number.

      A file ``<n><suffix>`` in *path* becomes ``<count + n><suffix>`` in
      *dstpath*. An empty source directory is a no-op.

      :param path: directory whose file stems are integers
      :param dstpath: destination directory, created when missing
      :param count: offset added to every numeric stem
      :param random: shuffle the processing order; the resulting names do not
          depend on it because they are derived from each file's stem
      :raises TypeError: if *path* is neither ``str`` nor ``Path``
      :raises ValueError: if a stem is not an integer; raised before any file
          has been renamed
      """
      if not isinstance(path, (Path, str)):
          raise TypeError("请输入的路径")
      # Normalise once so that plain strings work everywhere below.
      src_dir, dst_dir = Path(path), Path(dstpath)
      names = os.listdir(src_dir)
      dst_dir.mkdir(parents=True, exist_ok=True)
      print(names)
      print("存在文件{}张!".format(len(names)))
      if random:
          np.random.shuffle(names)
      # Build the complete plan first so a non-numeric name aborts before
      # anything is moved. Every file keeps its own suffix: reusing the first
      # file's suffix made "1.jpg" and "1.xml" collide on the same target.
      plan = [
          (src_dir / name,
           dst_dir / (str(count + int(Path(name).stem)) + Path(name).suffix))
          for name in names
      ]
      for old, new in tqdm(plan):
          os.rename(old, new)

Batch file movement

The following is to select files in batches according to certain rules and put them in the target folder.

    def choosen(src: Union[str, Path] , folder: Union[Path,str] ,dst: Union[str, Path] , suffix: str) -> None:
        """Move the counterparts of the files in *src* from *folder* to *dst*.

        For every entry ``<name>.*`` in *src*, the file ``<name><suffix>`` is
        moved from *folder* into *dst*. The entries of *src* are not touched.

        :param src: directory whose file names select what is moved
        :param folder: directory that holds the counterpart files
        :param dst: directory that receives the selected files; created when
            missing
        :param suffix: suffix of the counterpart files, e.g. ``".xml"``
        :return: None
        :raises FileNotFoundError: if a counterpart is missing in *folder*
        """
        # Convert unconditionally: the old code only bound its locals when
        # the arguments were NOT Path objects, so passing a Path raised
        # UnboundLocalError.
        src_dir, folder_dir, dst_dir = Path(src), Path(folder), Path(dst)
        dst_dir.mkdir(parents=True, exist_ok=True)

        for entry in src_dir.iterdir():
            file = Path(entry.name).with_suffix(suffix)
            (folder_dir / file).rename(dst_dir / file)

Rename files in batches according to a certain format

Next, rename the files to a fixed five-digit, zero-padded format, e.g. 1.jpg -> 00001.jpg.

    def batchrenames(src: Union[str, Path], dst: Union[str, Path], sorted: bool = False) -> None:
        """Move numerically named files to *dst* with zero-padded names.

        ``1.jpg`` becomes ``00001.jpg``. Stems are padded to five digits, and
        longer stems are kept as they are. An empty *src* is a no-op.

        :param src: directory with the original files
        :param dst: directory that receives the renamed files
        :param sorted: only ``True`` performs the renaming; the ``False``
            branch is not implemented and does nothing. The name shadows the
            builtin, but it is part of the public signature.
        :return: None
        :raises ValueError: if a file name does not start with an integer
        """
        src_dir, dst_dir = Path(src), Path(dst)
        names = os.listdir(src_dir)
        if not names:
            return
        names.sort(key=lambda x: int(x.split('.')[0]))
        if not sorted:
            # Renaming of unordered files was never implemented.
            return
        for obj in tqdm(names):
            stem = obj.split('.')[0]
            # zfill replaces the hand-written padding table, which raised
            # KeyError for stems longer than five digits. Each file keeps its
            # own suffix instead of inheriting the first file's.
            os.rename(src_dir / obj, dst_dir / (stem.zfill(5) + Path(obj).suffix))

How to modify the content of the xml file

Here is the code that modifies the content of the xml files.

    def revampXml(xml_path: Union[Path, str], update_content: str, old_content: str = "motorboat") -> None:
        """Rename one object class in every xml file of a directory.

        In each ``*.xml`` file of *xml_path*, every ``<object>`` whose
        ``<name>`` text equals *old_content* gets *update_content* instead.
        Files without a match are not rewritten.

        :param xml_path: directory that holds the xml files
        :param update_content: new class name
        :param old_content: class name to replace
        :return: None
        """
        xml_dir = Path(xml_path)
        # iterdir() already yields paths that include the directory. Joining
        # them onto xml_path again duplicated the directory for relative
        # paths. Listing up front also gives tqdm a total.
        xml_files = [p for p in xml_dir.iterdir()
                     if p.is_file() and p.suffix.lower() == ".xml"]
        for xmlfile in tqdm(xml_files):
            doc = ET.parse(xmlfile)
            changed = False
            for obj in doc.getroot().iter("object"):
                name_node = obj.find("name")
                # An <object> without <name> is skipped instead of crashing.
                if name_node is not None and name_node.text == old_content:
                    name_node.text = update_content
                    changed = True
            if changed:
                # Write once per file rather than once per matching object.
                doc.write(xmlfile)

Common file operation APIs of Pathlib library and os library

pathlib is one of the standard libraries for manipulating filesystem paths. The library can conveniently perform path splicing, file/directory creation, copy/move, delete and other operations.

Functional description pathlib operation os and os.path operations
get absolute path Path.resolve() os.path.abspath()
Modify file permissions and timestamps Path.chmod() os.chmod()
Create a directory Path.mkdir() os.mkdir()
File or folder renaming, moved and renamed if path is different Path.rename() os.rename()
File or folder rename, moves and renames if path is different, destroys existing target if exists Path.replace() os.replace()
delete directory Path.rmdir() os.rmdir()
delete a file Path.unlink() os.remove()
delete a file Path.unlink() os.unlink()
get the current working directory Path.cwd() os.getcwd()
Determine whether a file or directory name exists Path.exists() os.path.exists()
Return to the user directory of the computer Path.home() os.path.expanduser()
Verify that the given path is a directory Path.is_dir() os.path.isdir()
Verify that the given path is a file Path.is_file() os.path.isfile()
Verify that the given path is a symbolic link Path.is_symlink() os.path.islink()
get file properties Path.stat() os.stat()
Determine whether it is an absolute path PurePath.is_absolute() os.path.isabs()
concatenate directory with filename or directory PurePath.joinpath() os.path.join()
return filename PurePath.name os.path.basename()
return file path PurePath.parent os.path.dirname()
Determine whether two paths are the same Path.samefile() os.path.samefile()
Separate filename and extension PurePath.suffix os.path.splitext()

Summary

This article introduces the file batch operation and file batch preprocessing technology in the computer vision project. Mastering these techniques will enable you to efficiently process large-scale image data and provide strong support for the successful implementation of computer vision projects.

Hope this article inspires you on batch file operation and batch preprocessing in computer vision projects!

The following is the complete code

# -*- coding: utf-8 -*-
# @Author  : cvYouTian
# @Software: PyCharm

from pathlib import Path, PurePath
import xml.etree.ElementTree as ET
from typing import Union
import numpy as np
# import torch
from tqdm import tqdm
import time
import cv2
import os


class ExtractImg(object):
    """Toolbox for building image datasets.

    An instance extracts frames from a video with OpenCV: frames are shown in
    a window named "cv" and saved as ``<time.time()>.jpg`` in the save
    directory. The static methods are independent batch helpers for renaming,
    moving and editing dataset files.
    """

    def __init__(self, videopath: Path, savepath: Path, delay: int = 1) -> None:
        """Open the video, create the preview window and the save directory.

        :param videopath: path of the video handed to ``cv2.VideoCapture``
        :param savepath: directory that receives the extracted images; it is
            created (including missing parents) when it does not exist
        :param delay: value passed to ``cv2.waitKey`` after each frame
        """
        # Path(...) leaves Path objects unchanged and also accepts plain strings.
        self.spath = Path(savepath)
        self.vpath = videopath
        self.delay = delay
        cv2.namedWindow("cv", cv2.WINDOW_NORMAL)
        cv2.resizeWindow("cv", 640, 480)
        self.cap = cv2.VideoCapture(str(self.vpath))
        self._timeflag = 0  # frames read so far by ExtractAll
        # os.mkdir() failed when the parent was missing (e.g. "./dataset/imgs").
        self.spath.mkdir(parents=True, exist_ok=True)

    def _saveFrame(self, frame) -> None:
        """Write *frame* to the save directory as ``<time.time()>.jpg``."""
        cv2.imwrite(str(self.spath / "{}.jpg".format(time.time())), frame)
        print("保存成功")

    def _videoPlay(self, size: list) -> None:
        """Play the video; ``c`` saves the current frame, ``Esc`` stops.

        :param size: ``[width, height]`` requested from the capture
        """
        self.cap.set(cv2.CAP_PROP_FRAME_WIDTH, size[0])
        self.cap.set(cv2.CAP_PROP_FRAME_HEIGHT, size[1])
        while self.cap.isOpened():
            ret, frame = self.cap.read()
            if not ret:
                # End of stream or read error: there is no frame to show.
                break
            cv2.imshow("cv", frame)
            # Poll the keyboard once per frame. Calling waitKey() twice meant
            # a key consumed by the first call was invisible to the second.
            key = cv2.waitKey(self.delay) & 0xFF
            if key == ord('c'):
                self._saveFrame(frame)
                time.sleep(1)
            elif key == 27:  # Esc
                break

    def ExtractAll(self, frameGap: int = 3) -> None:
        """Play the whole video and save one frame every *frameGap* frames.

        Pressing ``Esc`` stops early. The window and the capture are released
        when the method returns, also when an error occurs.

        :param frameGap: interval, in frames, between two saved images
        :return: None
        """
        try:
            while self.cap.isOpened():
                ret, frame = self.cap.read()
                if not ret:
                    break
                self._timeflag += 1
                cv2.imshow("cv", frame)
                if self._timeflag % frameGap == 0:
                    self._saveFrame(frame)
                if cv2.waitKey(self.delay) & 0xFF == 27:  # Esc
                    break
        finally:
            cv2.destroyAllWindows()
            self.cap.release()
            self._timeflag = 0

    def CutVideo(self) -> None:
        """Manual frame extraction driven by a Y/N prompt.

        The prompt reports how many files the save directory already holds.
        ``Y`` starts playback (see ``_videoPlay``) and releases the window and
        capture afterwards. ``N`` returns without touching them. Any other
        answer repeats the prompt; the answer is case-insensitive.

        :return: None
        """
        prompt = ("文件中已经存在{}张图片,是否有继续添加"
                  "(Y or N):")
        while True:
            ifm = input(prompt.format(len(os.listdir(self.spath)))).strip().upper()
            if ifm in ('Y', 'N'):
                break
            # Ask again instead of tearing the capture down on a typo.
            print("\n请输入Y(yes)或者N(no)")
        if ifm == 'N':
            return None
        try:
            self._videoPlay(size=[640, 480])
        finally:
            cv2.destroyAllWindows()
            self.cap.release()

    @staticmethod
    def statistics(path: Union[str, Path], dstpath: Union[Path, str], count: int = 5305, random: bool = False) -> None:
        """Move numerically named files to *dstpath*, adding *count* to each number.

        A file ``<n><suffix>`` in *path* becomes ``<count + n><suffix>`` in
        *dstpath*. An empty source directory is a no-op.

        :param path: directory whose file stems are integers
        :param dstpath: destination directory, created when missing
        :param count: offset added to every numeric stem
        :param random: shuffle the processing order; the resulting names do
            not depend on it because they are derived from each file's stem
        :raises TypeError: if *path* is neither ``str`` nor ``Path``
        :raises ValueError: if a stem is not an integer; raised before any
            file has been renamed
        """
        if not isinstance(path, (Path, str)):
            raise TypeError("请输入的路径")
        # Normalise once so that plain strings work everywhere below.
        src_dir, dst_dir = Path(path), Path(dstpath)
        names = os.listdir(src_dir)
        dst_dir.mkdir(parents=True, exist_ok=True)
        print(names)
        print("存在文件{}张!".format(len(names)))
        if random:
            np.random.shuffle(names)
        # Build the complete plan first so a non-numeric name aborts before
        # anything is moved. Every file keeps its own suffix: reusing the
        # first file's suffix made "1.jpg" and "1.xml" collide.
        plan = [
            (src_dir / name,
             dst_dir / (str(count + int(Path(name).stem)) + Path(name).suffix))
            for name in names
        ]
        for old, new in tqdm(plan):
            os.rename(old, new)

    @staticmethod
    def choosen(src: Union[str, Path]="/home/you/Desktop/2023海上高速目标检测/val",
                folder: Union[Path,str]="/home/you/Desktop/2023海上高速目标检测/annotations",
                dst: Union[str, Path]="/home/you/Desktop/2023海上高速目标检测/train",
                suffix: str=".xml") -> None:
        """Move the counterparts of the files in *src* from *folder* to *dst*.

        For every entry ``<name>.*`` in *src*, the file ``<name><suffix>`` is
        moved from *folder* into *dst*. The entries of *src* are not touched.

        :param src: directory whose file names select what is moved
        :param folder: directory that holds the counterpart files
        :param dst: directory that receives the selected files; created when
            missing
        :param suffix: suffix of the counterpart files, e.g. ``".xml"``
        :return: None
        :raises FileNotFoundError: if a counterpart is missing in *folder*
        """
        # Convert unconditionally: the old code only bound its locals when
        # the arguments were NOT Path objects, so passing a Path raised
        # UnboundLocalError.
        src_dir, folder_dir, dst_dir = Path(src), Path(folder), Path(dst)
        dst_dir.mkdir(parents=True, exist_ok=True)

        for entry in src_dir.iterdir():
            file = Path(entry.name).with_suffix(suffix)
            (folder_dir / file).rename(dst_dir / file)

    @staticmethod
    def batchrenames(src: Union[str, Path], dst: Union[str, Path], sorted: bool = False) -> None:
        """Move numerically named files to *dst* with zero-padded names.

        ``1.jpg`` becomes ``00001.jpg``. Stems are padded to five digits, and
        longer stems are kept as they are. An empty *src* is a no-op.

        :param src: directory with the original files
        :param dst: directory that receives the renamed files
        :param sorted: only ``True`` performs the renaming; the ``False``
            branch is not implemented and does nothing. The name shadows the
            builtin, but it is part of the public signature.
        :return: None
        :raises ValueError: if a file name does not start with an integer
        """
        src_dir, dst_dir = Path(src), Path(dst)
        names = os.listdir(src_dir)
        if not names:
            return
        names.sort(key=lambda x: int(x.split('.')[0]))
        if not sorted:
            # Renaming of unordered files was never implemented.
            return
        for obj in tqdm(names):
            stem = obj.split('.')[0]
            # zfill replaces the hand-written padding table, which raised
            # KeyError for stems longer than five digits. Each file keeps its
            # own suffix instead of inheriting the first file's.
            os.rename(src_dir / obj, dst_dir / (stem.zfill(5) + Path(obj).suffix))

    @staticmethod
    def text(file: Union[Path, str]):
        """List the xml names that neighbour each numbered entry of a text file.

        Every non-blank line of *file* is read as a file name with an integer
        stem ``n`` and contributes ``"<n-1>.xml"`` and ``"<n+1>.xml"``.

        :param file: text file with one file name per line
        :return: the collected names, which are also printed
        """
        neighbours = []
        # The context manager closes the handle, which used to be leaked.
        with open(file) as f:
            for line in f:
                line = line.strip()
                if not line:
                    # A blank line (e.g. a trailing newline) is not an entry.
                    continue
                number = int(Path(line).stem)
                neighbours.append(f"{number - 1}.xml")
                neighbours.append(f"{number + 1}.xml")
        print(neighbours)
        return neighbours

    @staticmethod
    def revampXml(xml_path: Union[Path, str], update_content: str, old_content: str = "motorboat") -> None:
        """Rename one object class in every xml file of a directory.

        In each ``*.xml`` file of *xml_path*, every ``<object>`` whose
        ``<name>`` text equals *old_content* gets *update_content* instead.
        Files without a match are not rewritten.

        :param xml_path: directory that holds the xml files
        :param update_content: new class name
        :param old_content: class name to replace
        :return: None
        """
        xml_dir = Path(xml_path)
        # iterdir() already yields paths that include the directory. Joining
        # them onto xml_path again duplicated the directory for relative
        # paths. Listing up front also gives tqdm a total.
        xml_files = [p for p in xml_dir.iterdir()
                     if p.is_file() and p.suffix.lower() == ".xml"]
        for xmlfile in tqdm(xml_files):
            doc = ET.parse(xmlfile)
            changed = False
            for obj in doc.getroot().iter("object"):
                name_node = obj.find("name")
                # An <object> without <name> is skipped instead of crashing.
                if name_node is not None and name_node.text == old_content:
                    name_node.text = update_content
                    changed = True
            if changed:
                # Write once per file rather than once per matching object.
                doc.write(xmlfile)

    @staticmethod
    def movefile(folder_path: Union[Path, str], dst: Union[Path, str], suffix: str) -> None:
        """Move every file with a given suffix out of a directory tree.

        *folder_path* is searched recursively and each match is moved
        directly into *dst*, keeping only its file name.

        :param folder_path: root directory that is searched
        :param dst: destination directory; created when missing
        :param suffix: suffix of the files to move, e.g. ``".jpg"``
        :return: None
        """
        src_dir, dst_dir = Path(folder_path), Path(dst)
        dst_dir.mkdir(parents=True, exist_ok=True)
        # Collect the matches before moving anything so the tree is not
        # modified while rglob() is still walking it.
        matches = [p for p in src_dir.rglob(f"*{suffix}") if p.is_file()]
        for match in tqdm(matches):
            # dst may be a plain string, so join on the converted Path.
            match.rename(dst_dir / match.name)

    @staticmethod
    def convert_box(size, box):
        """Convert a pixel box to normalised centre/size form.

        :param size: ``(width, height)`` of the image; ``size[0]`` scales the
            x values and ``size[1]`` the y values
        :param box: ``(xmin, xmax, ymin, ymax)``, as implied by the index
            arithmetic below
        :return: ``(x_center, y_center, width, height)``, each divided by the
            matching image dimension
        """
        dw, dh = 1. / size[0], 1. / size[1]
        # NOTE(review): the "- 1" presumably compensates for 1-based pixel
        # coordinates in the annotations — confirm against the label source.
        x, y, w, h = (box[0] + box[1]) / 2.0 - 1, (box[2] + box[3]) / 2.0 - 1, box[1] - box[0], box[3] - box[2]
        return x * dw, y * dh, w * dw, h * dh

  

if __name__ == "__main__":
    # Source video file
    video_file = Path("videoSet/seabird6.mp4")
    # Directory the extracted images are saved to
    image_dir = Path("./dataset/imgs")

    # Paths used by the optional examples further down:
    # xin = Path("./VOC6detect/imgss")
    # image_dir = Path("frameSave")
    # xmlpath = Path("./VOC6detect/annotations")   # target xml files
    # old = Path("/home/you/Desktop/dateset/20(pass)/seabird5")
    # new = Path("/home/you/Desktop/dateset/11(pass)/temp")
    # pa = Path("./labels/")
    # xin = Path()
    # renamepath = Path("/home/you/Desktop/dateset/4(pass)/a-1")

    # Create the extractor, then run choosen() with its default arguments
    extractor = ExtractImg(savepath=image_dir, videopath=video_file)
    extractor.choosen()

    # VOC -> YOLO label conversion
    # extractor.convert_label()

    # Extract frames automatically over the whole video
    # extractor.ExtractAll(frameGap=8)

    # Extract frames manually
    # extractor.CutVideo()

    # Pick the files that match the xml files
    # NOTE(review): choosen() takes src/folder/dst/suffix; the keyword names
    # in this example look stale.
    # extractor.choosen(xmlsrc=xmlpath, imgsrc=image_dir, dst=xin)

    # Rename numerically named files by adding an offset to each number
    # extractor.statistics(path=Path("./DATA/xml"), dstpath=Path("./DATA/t"), count=5305)

    # Rename files to the zero-padded format, e.g. 78.jpg -> 00078.jpg
    # extractor.batchrenames(src=new, dst=old, sorted=True)
    # extractor.text("./data1.txt")

    # Modify the xml files
    # extractor.revampXml(xml_path= "/home/you/Desktop/tools/dataset/annotations", update_content="speedboat")

    # Move every file of a given format out of a folder
    # extractor.movefile(folder_path="/home/you/Desktop/网上快艇", dst=pa, suffix=".jpg")

Guess you like

Origin blog.csdn.net/m0_46114594/article/details/131956883