DPDK Analysis: UIO

Table of Contents

1. Overview

2. Implementation

2.1 Linux UIO

2.1.1 API

2.1.2 UIO initialization

2.1.3 UIO registration

2.2 igb_uio

2.2.1 Initialization

2.3 User-space usage


1. Overview

DPDK uses the kernel's UIO mechanism to map hardware resources (MMIO, I/O ports, interrupts) into user space. The underlying principle is simple and relies entirely on basic kernel mechanisms.


2. Implementation

2.1 Linux UIO

2.1.1 API

  •  uio_register_device(parent, info)

Here, info corresponds to the structure below and must be provided by the caller:

struct uio_info {
	struct uio_device	*uio_dev;
	const char		*name;
	const char		*version;
	struct uio_mem		mem[MAX_UIO_MAPS];
	struct uio_port		port[MAX_UIO_PORT_REGIONS];
	long			irq;
	unsigned long		irq_flags;
	void			*priv;
	irqreturn_t (*handler)(int irq, struct uio_info *dev_info);
	int (*mmap)(struct uio_info *info, struct vm_area_struct *vma);
	int (*open)(struct uio_info *info, struct inode *inode);
	int (*release)(struct uio_info *info, struct inode *inode);
	int (*irqcontrol)(struct uio_info *info, s32 irq_on);
};
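
To make the API concrete, here is a minimal sketch (not DPDK code) of a hypothetical platform driver that fills a uio_info and registers it; the driver name "my_uio", the physical base 0xfe000000 and the region size are made-up values for illustration only:

#include <linux/module.h>
#include <linux/platform_device.h>
#include <linux/uio_driver.h>

static struct uio_info my_uio_info;

static int my_uio_probe(struct platform_device *pdev)
{
	my_uio_info.name    = "my_uio";
	my_uio_info.version = "0.1";
	my_uio_info.irq     = UIO_IRQ_NONE;	/* polling only in this sketch */

	/* one MMIO region, exported to user space as maps/map0 */
	my_uio_info.mem[0].name    = "regs";
	my_uio_info.mem[0].addr    = 0xfe000000;	/* hypothetical physical base */
	my_uio_info.mem[0].size    = 0x1000;
	my_uio_info.mem[0].memtype = UIO_MEM_PHYS;

	return uio_register_device(&pdev->dev, &my_uio_info);
}

static int my_uio_remove(struct platform_device *pdev)
{
	uio_unregister_device(&my_uio_info);
	return 0;
}

static struct platform_driver my_uio_driver = {
	.probe  = my_uio_probe,
	.remove = my_uio_remove,
	.driver = { .name = "my_uio" },
};
module_platform_driver(my_uio_driver);
MODULE_LICENSE("GPL");

igb_uio does the same thing from a PCI driver's probe, as shown in section 2.2.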

2.1.2 UIO initialization

[drivers/uio/uio.c]

uio_init->uio_major_init

The module init function uio_init essentially just registers a character device driver. This is routine; note the file operations of the uio cdev:

static const struct file_operations uio_fops = {
	.owner		= THIS_MODULE,
	.open		= uio_open,
	.release	= uio_release,
	.read		= uio_read,
	.write		= uio_write,
	.mmap		= uio_mmap,
	.poll		= uio_poll,
	.fasync		= uio_fasync,
	.llseek		= noop_llseek,
};

Finally, note where the UIO major number and cdev are saved:

	uio_major = MAJOR(uio_dev);
	uio_cdev = cdev;

2.1.3 UIO registration

#define uio_register_device(parent, info) \
    __uio_register_device(THIS_MODULE, parent, info)

int __uio_register_device(struct module *owner,
			  struct device *parent,
			  struct uio_info *info)
{
	struct uio_device *idev;
	int ret = 0;

	idev = devm_kzalloc(parent, sizeof(*idev), GFP_KERNEL);

	idev->owner = owner;
	idev->info = info;
	init_waitqueue_head(&idev->wait);
	atomic_set(&idev->event, 0);

	ret = uio_get_minor(idev);
	if (ret)
		return ret;

	idev->dev = device_create(&uio_class, parent,
				  MKDEV(uio_major, idev->minor), idev,
				  "uio%d", idev->minor);

	ret = uio_dev_add_attributes(idev);
	if (ret)
		goto err_uio_dev_add_attributes;

	info->uio_dev = idev;

	if (info->irq && (info->irq != UIO_IRQ_CUSTOM)) {
		ret = request_irq(info->irq, uio_interrupt,
				  info->irq_flags, info->name, idev);
		if (ret)
			goto err_request_irq;
	}

	return 0;
}

Irrelevant parts of the code have been removed. As you can see, registering a UIO device amounts to building a uio_device from the uio_info and adding it to the driver model. Let's look at uio_device:

struct uio_device {
        struct module           *owner;
        struct device           *dev;
        int                     minor;
        atomic_t                event;
        struct fasync_struct    *async_queue;
        wait_queue_head_t       wait;
        struct uio_info         *info;
        struct kobject          *map_dir;
        struct kobject          *portio_dir;
};
  • uio_get_minor allocates a new minor number
  • Of the rest, the most important part is uio_dev_add_attributes

uio_dev_add_attributes records the resources and exposes them through sysfs. First, look at UIO's resource abstraction:

struct uio_mem {
	const char		*name;
	phys_addr_t		addr;
	unsigned long		offs;
	resource_size_t		size;
	int			memtype;
	void __iomem		*internal_addr;
	struct uio_map		*map;
};

Let's walk through uio_dev_add_attributes piece by piece. First, the handling of the uio_mem entries:

	for (mi = 0; mi < MAX_UIO_MAPS; mi++) {
		mem = &idev->info->mem[mi];
		if (mem->size == 0)
			break;
		if (!map_found) {
			map_found = 1;
			idev->map_dir = kobject_create_and_add("maps",
							&idev->dev->kobj);
			if (!idev->map_dir) {
				ret = -ENOMEM;
				goto err_map;
			}
		}
		map = kzalloc(sizeof(*map), GFP_KERNEL);
		if (!map) {
			ret = -ENOMEM;
			goto err_map;
		}
		kobject_init(&map->kobj, &map_attr_type);
		map->mem = mem;
		mem->map = map;
		ret = kobject_add(&map->kobj, idev->map_dir, "map%d", mi);
		if (ret)
			goto err_map_kobj;
		ret = kobject_uevent(&map->kobj, KOBJ_ADD);
		if (ret)
			goto err_map_kobj;
	}
  • Based on the mem entries passed in, a maps directory is created under the new uio device's sysfs directory (which sits under the parent device), and a maps/mapX entry is created for each memory mapping.
	for (pi = 0; pi < MAX_UIO_PORT_REGIONS; pi++) {
		port = &idev->info->port[pi];
		if (port->size == 0)
			break;
		if (!portio_found) {
			portio_found = 1;
			idev->portio_dir = kobject_create_and_add("portio",
							&idev->dev->kobj);
			if (!idev->portio_dir) {
				ret = -ENOMEM;
				goto err_portio;
			}
		}
		portio = kzalloc(sizeof(*portio), GFP_KERNEL);
		if (!portio) {
			ret = -ENOMEM;
			goto err_portio;
		}
		kobject_init(&portio->kobj, &portio_attr_type);
		portio->port = port;
		port->portio = portio;
		ret = kobject_add(&portio->kobj, idev->portio_dir,
							"port%d", pi);
		if (ret)
			goto err_portio_kobj;
		ret = kobject_uevent(&portio->kobj, KOBJ_ADD);
		if (ret)
			goto err_portio_kobj;
	}
  • This is similar to the loop above, except the directory is named portio, representing the port I/O mappings.
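
At this point everything UIO exposes to user space is in place: the map attributes under /sys/class/uio/uioX/maps/mapN/ and the character device /dev/uioX. A minimal user-space sketch (assuming the device registered as uio0 and its first region is map0; by UIO convention, map N is selected by an mmap offset of N * page size):

#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	unsigned long size;
	FILE *f = fopen("/sys/class/uio/uio0/maps/map0/size", "r");

	if (f == NULL || fscanf(f, "%lx", &size) != 1)	/* size is exported in hex */
		return 1;
	fclose(f);

	int fd = open("/dev/uio0", O_RDWR);
	if (fd < 0)
		return 1;

	/* map0 is selected by offset 0, map1 by 1 * getpagesize(), ... */
	void *regs = mmap(NULL, size, PROT_READ | PROT_WRITE,
			  MAP_SHARED, fd, 0 * getpagesize());
	if (regs == MAP_FAILED)
		return 1;

	printf("mapped %lu bytes at %p\n", size, regs);
	munmap(regs, size);
	close(fd);
	return 0;
}

In the DPDK version analyzed here the BAR mapping itself goes through the sysfs resourceN files rather than /dev/uioX (see section 2.3), but the sketch above is the generic UIO way.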

2.2 igb_uio

igb_uio is a PCI driver. This section looks at how DPDK uses the Linux UIO interface.

2.2.1 Initialization

This is all standard, so the code is pasted directly:

static int __init igbuio_pci_init_module(void)
{
	igbuio_config_intr_mode(intr_mode);
	return pci_register_driver(&igbuio_pci_driver);
}

intr_mode is a module parameter with possible values msix, msi, and legacy; the default is MSI-X interrupts.

static struct pci_driver igbuio_pci_driver = {
	.name = "igb_uio",
	.id_table = NULL,
	.probe = igbuio_pci_probe,
	.remove = igbuio_pci_remove,
};
  • The above is the igb_uio pci_driver. Note that id_table is NULL, so the driver does not claim any device IDs by itself; devices are bound to it explicitly through sysfs (which is what dpdk-devbind.py does).

When a device is matched to this driver (for igb_uio, when it is explicitly bound), the driver's probe is executed, i.e. igbuio_pci_probe.

[kernel/linux/igb_uio/igb_uio.c]

static int igbuio_pci_probe(struct pci_dev *dev, const struct pci_device_id *id)
{
	struct rte_uio_pci_dev *udev;
	int err;

	/* error handling trimmed for brevity */
	udev = kzalloc(sizeof(struct rte_uio_pci_dev), GFP_KERNEL);
	pci_enable_device(dev);
	pci_set_master(dev);
	igbuio_setup_bars(dev, &udev->info);
	pci_set_dma_mask(dev, DMA_BIT_MASK(64));

    /* fill uio infos */
	udev->info.name = "igb_uio";
	udev->info.version = "0.1";
	udev->info.irqcontrol = igbuio_pci_irqcontrol;
	udev->info.open = igbuio_pci_open;
	udev->info.release = igbuio_pci_release;
	udev->info.priv = udev;
	udev->pdev = dev;
	atomic_set(&udev->refcnt, 0);

	err = sysfs_create_group(&dev->dev.kobj, &dev_attr_grp);
	if (err != 0)
		goto fail_release_iomem;

	/* register uio driver */
	err = uio_register_device(&dev->dev, &udev->info);
	if (err != 0)
		goto fail_remove_group;

	pci_set_drvdata(dev, udev);

	return 0;

	/* fail_remove_group / fail_release_iomem unwind paths omitted */
}
  • igbuio_pci_probe performs the standard PCI driver setup, such as enabling the device and configuring DMA; its main work, however, is mapping the device's configuration space, memory space, and MSI-X interrupt.

The uio_info analyzed in the previous section lives inside the driver's per-device private data:

struct rte_uio_pci_dev {
	struct uio_info info;
	struct pci_dev *pdev;
	enum rte_intr_mode mode;
	atomic_t refcnt;
};

igbuio_setup_bars reads each BAR's address and, depending on the type of space the BAR describes (memory or I/O), records it in the uio_info as a uio_mem or uio_port entry.
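
A simplified paraphrase of that scan is sketched below (igbuio_setup_bars_sketch is a made-up name for illustration; igbuio_pci_setup_ioport is the I/O-port counterpart of the helper shown right after, and the exact logic may differ slightly between DPDK versions):

#include <linux/pci.h>
#include <linux/uio_driver.h>

static int igbuio_setup_bars_sketch(struct pci_dev *dev, struct uio_info *info)
{
	static const char *bar_names[] = {
		"BAR0", "BAR1", "BAR2", "BAR3", "BAR4", "BAR5",
	};
	int i, iom = 0, iop = 0;

	for (i = 0; i < 6; i++) {
		if (pci_resource_len(dev, i) == 0 ||
		    pci_resource_start(dev, i) == 0)
			continue;

		if (pci_resource_flags(dev, i) & IORESOURCE_MEM) {
			/* memory BAR -> info->mem[iom] */
			if (igbuio_pci_setup_iomem(dev, info, iom, i,
						   bar_names[i]) == 0)
				iom++;
		} else if (pci_resource_flags(dev, i) & IORESOURCE_IO) {
			/* I/O BAR -> info->port[iop] */
			if (igbuio_pci_setup_ioport(dev, info, iop, i,
						    bar_names[i]) == 0)
				iop++;
		}
	}

	return (iom != 0 || iop != 0) ? 0 : -ENOENT;
}

The memory-BAR helper it relies on, igbuio_pci_setup_iomem, looks like this: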

static int igbuio_pci_setup_iomem(struct pci_dev *dev, struct uio_info *info,
		       int n, int pci_bar, const char *name)
{
	unsigned long addr, len;
	void *internal_addr;

	addr = pci_resource_start(dev, pci_bar);
	len = pci_resource_len(dev, pci_bar);
	if (addr == 0 || len == 0)
		return -1;
	if (wc_activate == 0) {
		internal_addr = ioremap(addr, len);
		if (internal_addr == NULL)
			return -1;
	} else {
		internal_addr = NULL;
	}
	info->mem[n].name = name;
	info->mem[n].addr = addr;
	info->mem[n].internal_addr = internal_addr;
	info->mem[n].size = len;
	info->mem[n].memtype = UIO_MEM_PHYS;
	return 0;
}
  • addr is the start address of the memory space (in the PCI domain); internal_addr is the ioremap'ed address, which lies in the vmalloc range but does not allocate any memory. I/O ports are handled similarly.

2.3 User-space usage

At this point, UIO has recorded the igb_uio resources (MMIO and I/O ports) and exposed a corresponding character device file to user space. Next, let's analyze how user space maps the device resources.

During the PCI driver probe in DPDK, if the driver sets the RTE_PCI_DRV_NEED_MAPPING flag, address mapping is performed by the following function:

int rte_pci_map_device(struct rte_pci_device *dev)
{
	int ret = -1;

	/* try mapping the NIC resources using VFIO if it exists */
	switch (dev->kdrv) {
	case RTE_KDRV_VFIO:
#ifdef VFIO_PRESENT
		if (pci_vfio_is_enabled())
			ret = pci_vfio_map_resource(dev);
#endif
		break;
	case RTE_KDRV_IGB_UIO:
	case RTE_KDRV_UIO_GENERIC:
		if (rte_eal_using_phys_addrs()) {
			/* map resources for devices that use uio */
			ret = pci_uio_map_resource(dev);
		}
		break;
	default:
		RTE_LOG(DEBUG, EAL,
			"  Not managed by a supported kernel driver, skipped\n");
		ret = 1;
		break;
	}

	return ret;
}

Here we focus on pci_uio_map_resource, which has two main parts. First, pci_uio_alloc_resource:

[drivers/bus/pci/pci_common_uio.c]

int pci_uio_alloc_resource(struct rte_pci_device *dev,
		struct mapped_pci_resource **uio_res)
{
	char dirname[PATH_MAX];
	char cfgname[PATH_MAX];
	char devname[PATH_MAX]; /* contains the /dev/uioX */
	int uio_num;
	struct rte_pci_addr *loc;

	loc = &dev->addr;

	/* find uio resource */
	uio_num = pci_get_uio_dev(dev, dirname, sizeof(dirname), 1);
	if (uio_num < 0) {
		RTE_LOG(WARNING, EAL, "  "PCI_PRI_FMT" not managed by UIO driver, "
				"skipping\n", loc->domain, loc->bus, loc->devid, loc->function);
		return 1;
	}
	snprintf(devname, sizeof(devname), "/dev/uio%u", uio_num);

	/* save fd if in primary process */
	dev->intr_handle.fd = open(devname, O_RDWR);
	if (dev->intr_handle.fd < 0) {
		RTE_LOG(ERR, EAL, "Cannot open %s: %s\n",
			devname, strerror(errno));
		goto error;
	}

	snprintf(cfgname, sizeof(cfgname),
			"/sys/class/uio/uio%u/device/config", uio_num);
	dev->intr_handle.uio_cfg_fd = open(cfgname, O_RDWR);
	if (dev->intr_handle.uio_cfg_fd < 0) {
		RTE_LOG(ERR, EAL, "Cannot open %s: %s\n",
			cfgname, strerror(errno));
		goto error;
	}

	if (dev->kdrv == RTE_KDRV_IGB_UIO)
		dev->intr_handle.type = RTE_INTR_HANDLE_UIO;
	else {
		dev->intr_handle.type = RTE_INTR_HANDLE_UIO_INTX;

		/* set bus master that is not done by uio_pci_generic */
		if (pci_uio_set_bus_master(dev->intr_handle.uio_cfg_fd)) {
			RTE_LOG(ERR, EAL, "Cannot set up bus mastering!\n");
			goto error;
		}
	}

	/* allocate the mapping details for secondary processes*/
	*uio_res = rte_zmalloc("UIO_RES", sizeof(**uio_res), 0);
	if (*uio_res == NULL) {
		RTE_LOG(ERR, EAL,
			"%s(): cannot store uio mmap details\n", __func__);
		goto error;
	}

	snprintf((*uio_res)->path, sizeof((*uio_res)->path), "%s", devname);
	memcpy(&(*uio_res)->pci_addr, &dev->addr, sizeof((*uio_res)->pci_addr));

	return 0;

error:
	pci_uio_free_resource(dev, *uio_res);
	return -1;
}
  • First locate the corresponding character device: under the /sys/bus/pci/devices/DBDF/uio directory, find the actual uioX entry; then /dev/uioX can be opened and its fd stored in dev->intr_handle.fd.
  • Also open /sys/class/uio/uio%u/device/config and store its fd in dev->intr_handle.uio_cfg_fd.

Opening /dev/uioX invokes the UIO character driver's open method, uio_open.

The function does two things. First, it allocates the file's private data:

struct uio_listener {
	struct uio_device *dev;
	s32 event_count;
};

Second, if idev->info->open is registered, it is called; in igb_uio this is igbuio_pci_open -> igbuio_pci_enable_interrupts.

Here we only analyze the default case, MSI-X interrupts:

	case RTE_INTR_MODE_MSIX:
		/* Only 1 msi-x vector needed */
#ifndef HAVE_ALLOC_IRQ_VECTORS
		msix_entry.entry = 0;
		if (pci_enable_msix(udev->pdev, &msix_entry, 1) == 0) {
			dev_dbg(&udev->pdev->dev, "using MSI-X");
			udev->info.irq_flags = IRQF_NO_THREAD;
			udev->info.irq = msix_entry.vector;
			udev->mode = RTE_INTR_MODE_MSIX;
			break;
		}
#else
		if (pci_alloc_irq_vectors(udev->pdev, 1, 1, PCI_IRQ_MSIX) == 1) {
			dev_dbg(&udev->pdev->dev, "using MSI-X");
			udev->info.irq_flags = IRQF_NO_THREAD;
			udev->info.irq = pci_irq_vector(udev->pdev, 0);
			udev->mode = RTE_INTR_MODE_MSIX;
			break;
		}
#endif

if (udev->info.irq != UIO_IRQ_NONE)
	err = request_irq(udev->info.irq, igbuio_pci_irqhandler,
				  udev->info.irq_flags, udev->info.name,
				  udev);
dev_info(&udev->pdev->dev, "uio device registered with irq %ld\n",
		 udev->info.irq);
  • Enable MSI-X (pci_enable_msix or pci_alloc_irq_vectors, depending on kernel version), obtain the irq number, then register the handler with request_irq; the sketch below shows how user space then consumes this interrupt.
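
As an aside, once the interrupt is wired to uio_interrupt this way, user space can consume it through the UIO fd: a read() on /dev/uioX blocks until the next interrupt and returns a 4-byte event count, and writing a 32-bit 1/0 is forwarded by uio_write to info->irqcontrol (igbuio_pci_irqcontrol) to enable/disable the interrupt. A minimal sketch, assuming the device is uio0; DPDK's interrupt thread uses essentially the same fd-based protocol:

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/dev/uio0", O_RDWR);
	if (fd < 0)
		return 1;

	for (;;) {
		uint32_t enable = 1, events;

		/* re-arm the interrupt via info->irqcontrol */
		if (write(fd, &enable, sizeof(enable)) != sizeof(enable))
			break;

		/* block until the next interrupt */
		if (read(fd, &events, sizeof(events)) != sizeof(events))
			break;

		printf("got interrupt, total events = %u\n", events);
	}

	close(fd);
	return 0;
}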

After opening and storing the two fds, the function fills in the following structure:

struct mapped_pci_resource {
    TAILQ_ENTRY(mapped_pci_resource) next;

    struct rte_pci_addr pci_addr;
    char path[PATH_MAX];  //       /dev/uioX
    int nb_maps;
    struct pci_map maps[PCI_MAX_RESOURCE];
    struct pci_msix_table msix_table;
};

After that, the actual mapping work can begin:

int pci_uio_map_resource_by_index(struct rte_pci_device *dev, int res_idx,
		struct mapped_pci_resource *uio_res, int map_idx)
{
	int fd = -1;
	char devname[PATH_MAX];
	void *mapaddr;
	struct rte_pci_addr *loc;
	struct pci_map *maps;
	int wc_activate = 0;

	if (dev->driver != NULL)
		wc_activate = dev->driver->drv_flags & RTE_PCI_DRV_WC_ACTIVATE;

	loc = &dev->addr;
	maps = uio_res->maps;

	/* allocate memory to keep path */
	maps[map_idx].path = rte_malloc(NULL, sizeof(devname), 0);
	if (maps[map_idx].path == NULL) {
		RTE_LOG(ERR, EAL, "Cannot allocate memory for path: %s\n",
				strerror(errno));
		return -1;
	}

	/*
	 * open resource file, to mmap it
	 */
	if (!wc_activate || fd < 0) {
		snprintf(devname, sizeof(devname),
			"%s/" PCI_PRI_FMT "/resource%d",
			rte_pci_get_sysfs_path(),
			loc->domain, loc->bus, loc->devid,
			loc->function, res_idx);

		/* then try to map resource file */
		fd = open(devname, O_RDWR);
	}

	/* try mapping somewhere close to the end of hugepages */
	if (pci_map_addr == NULL)
		pci_map_addr = pci_find_max_end_va();

	mapaddr = pci_map_resource(pci_map_addr, fd, 0,
			(size_t)dev->mem_resource[res_idx].len, 0);
	close(fd);
	if (mapaddr == MAP_FAILED)
		goto error;

	pci_map_addr = RTE_PTR_ADD(mapaddr,
			(size_t)dev->mem_resource[res_idx].len);

	maps[map_idx].phaddr = dev->mem_resource[res_idx].phys_addr;
	maps[map_idx].size = dev->mem_resource[res_idx].len;
	maps[map_idx].addr = mapaddr;
	maps[map_idx].offset = 0;
	strcpy(maps[map_idx].path, devname);
	dev->mem_resource[res_idx].addr = mapaddr;

	return 0;

error:
	rte_free(maps[map_idx].path);
	return -1;
}
  • The resources are still mapped through /sys/bus/pci/devices/DBDF/resourceN
  • Find an address near the end of the hugepage mappings and, based on the resource length, map the region there with mmap via pci_map_resource (see the sketch below)
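
pci_map_resource itself is essentially a thin wrapper around mmap on the resource fd. A sketch of its core (error handling and logging trimmed; the exact signature may vary between DPDK versions):

#include <sys/mman.h>
#include <sys/types.h>

static void *
pci_map_resource_sketch(void *requested_addr, int fd, off_t offset,
			size_t size, int additional_flags)
{
	/* map the BAR exported via /sys/.../resourceN at the hinted address */
	void *mapaddr = mmap(requested_addr, size, PROT_READ | PROT_WRITE,
			     MAP_SHARED | additional_flags, fd, offset);

	return mapaddr;	/* MAP_FAILED on error */
}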

For the PCI device resources exposed through sysfs, see Documentation/filesystems/sysfs-pci.txt:

       resource		   PCI resource host addresses (ascii, ro)
       resource0..N	   PCI resource N, if present (binary, mmap, rw[1])
       resource0_wc..N_wc  PCI WC map resource N, if prefetchable (binary, mmap)

This part is implemented in:

[drivers/pci/pci-sysfs.c]

static int pci_create_attr(struct pci_dev *pdev, int num, int write_combine)
{
	/* allocate attribute structure, piggyback attribute name */
	int name_len = write_combine ? 13 : 10;
	struct bin_attribute *res_attr;
	char *res_attr_name;
	int retval;

	res_attr = kzalloc(sizeof(*res_attr) + name_len, GFP_ATOMIC);
	if (!res_attr)
		return -ENOMEM;

	res_attr_name = (char *)(res_attr + 1);

	sysfs_bin_attr_init(res_attr);
	if (write_combine) {
		pdev->res_attr_wc[num] = res_attr;
		sprintf(res_attr_name, "resource%d_wc", num);
		res_attr->mmap = pci_mmap_resource_wc;
	} else {
		pdev->res_attr[num] = res_attr;
		sprintf(res_attr_name, "resource%d", num);
		if (pci_resource_flags(pdev, num) & IORESOURCE_IO) {
			res_attr->read = pci_read_resource_io;
			res_attr->write = pci_write_resource_io;
			if (arch_can_pci_mmap_io())
				res_attr->mmap = pci_mmap_resource_uc;
		} else {
			res_attr->mmap = pci_mmap_resource_uc;
		}
	}
	res_attr->attr.name = res_attr_name;
	res_attr->attr.mode = S_IRUSR | S_IWUSR;
	res_attr->size = pci_resource_len(pdev, num);
	res_attr->private = (void *)(unsigned long)num;
	retval = sysfs_create_bin_file(&pdev->dev.kobj, res_attr);
	if (retval)
		kfree(res_attr);

	return retval;
}

Note that mapping PCI MMIO space differs from mapping ordinary memory: no physical memory is allocated, only the corresponding page table entries are set up. When the virtual addresses of this PCI range are accessed, the MMU translates them to physical addresses that are "claimed" by the PCI host controller (rather than the memory controller), and the host controller translates the physical address into a bus address to complete the access.

When user space calls mmap on the resource file, the sysfs handler pci_mmap_resource_uc is invoked, which eventually reaches pci_mmap_resource_range:

int pci_mmap_resource_range(struct pci_dev *pdev, int bar,
			    struct vm_area_struct *vma,
			    enum pci_mmap_state mmap_state, int write_combine)
{
	unsigned long size;
	int ret;

	size = ((pci_resource_len(pdev, bar) - 1) >> PAGE_SHIFT) + 1;
	if (vma->vm_pgoff + vma_pages(vma) > size)
		return -EINVAL;

	if (write_combine)
		vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot);
	else
		vma->vm_page_prot = pgprot_device(vma->vm_page_prot);

	if (mmap_state == pci_mmap_io) {
		ret = pci_iobar_pfn(pdev, bar, vma);
		if (ret)
			return ret;
	} else
		vma->vm_pgoff += (pci_resource_start(pdev, bar) >> PAGE_SHIFT);

	vma->vm_ops = &pci_phys_vm_ops;

	return io_remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff,
				  vma->vm_end - vma->vm_start,
				  vma->vm_page_prot);
}

The implementation is quite conventional: adjust the vma and then build the page tables with io_remap_pfn_range.

OK, so at this point the mapping from BAR space into user space has been established; what gets filled in is:

struct pci_map {
	void *addr;
	char *path;
	uint64_t offset;
	uint64_t size;
	uint64_t phaddr;
};
	maps[map_idx].phaddr = dev->mem_resource[res_idx].phys_addr;
	maps[map_idx].size = dev->mem_resource[res_idx].len;
	maps[map_idx].addr = mapaddr;
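
With maps[].addr (and dev->mem_resource[].addr) populated, a poll-mode driver can then access device registers directly from user space with ordinary loads and stores. A hypothetical illustration (REG_CTRL is a made-up offset, not taken from any real NIC):

#include <stdint.h>

#define REG_CTRL 0x0000	/* hypothetical register offset */

static inline uint32_t reg_read32(void *bar, uint32_t off)
{
	return *(volatile uint32_t *)((uintptr_t)bar + off);
}

static inline void reg_write32(void *bar, uint32_t off, uint32_t val)
{
	*(volatile uint32_t *)((uintptr_t)bar + off) = val;
}

/* e.g.: uint32_t ctrl = reg_read32(dev->mem_resource[0].addr, REG_CTRL); */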


Reposted from blog.csdn.net/whenloce/article/details/88374867