利用周末时间做了一个MDEV虚拟化PCI设备的小试验,简单记录一下:
DEMO架构,此图参考了内核文档:Documentation/driver-api/vfio-mediated-device.rst
host kernel watchdog pci driver:
#include <linux/init.h>
#include <linux/module.h>
#include <linux/device.h>
#include <linux/kernel.h>
#include <linux/fs.h>
#include <linux/poll.h>
#include <linux/slab.h>
#include <linux/cdev.h>
#include <linux/sched.h>
#include <linux/wait.h>
#include <linux/uuid.h>
#include <linux/vfio.h>
#include <linux/iommu.h>
#include <linux/sysfs.h>
#include <linux/ctype.h>
#include <linux/file.h>
#include <linux/mdev.h>
#include <linux/pci.h>#define IO_BAR0_SIZE 32
#define IO_CONF_SIZE 0x100
#define CZL_WDG_DEVICE_VENDOR_ID 0xbeef
#define CZL_WDG_DEVICE_DEVICE_ID 0x1001
#define API_DBG(fmt, ...) do { \printk("%s line %d, "fmt, __func__, __LINE__, ##__VA_ARGS__); \} while (0)struct czl_wdg_dev {dev_t wdg_devt;struct class *wdg_class;struct cdev wdg_cdev;struct device dev;
};struct mdev_region_info {u64 start;u64 phys_start;u32 size;u64 vfio_offset;
};struct wdg_mdev_state {u8 *config;u8 *iobase;struct mdev_device *mdev;struct mdev_region_info region_info[VFIO_PCI_NUM_REGIONS];u32 bar_mask[VFIO_PCI_NUM_REGIONS];struct list_head next;struct vfio_device_info dev_info;int index;struct mutex ops_lock;
};static const struct file_operations czl_wdg_fops = {.owner = THIS_MODULE,
};static struct mutex wdg_mdev_list_lock;
static struct list_head wdg_mdev_devices_list;
/*
 * A VFIO file offset encodes the region index in the bits at and above
 * WDG_VFIO_PCI_OFFSET_SHIFT and the offset inside that region below it.
 */
#define WDG_VFIO_PCI_OFFSET_SHIFT (40)
/* Extract the region index from a file offset. */
#define WDG_VFIO_PCI_OFFSET_TO_INDEX(off) ((off) >> WDG_VFIO_PCI_OFFSET_SHIFT)
/* Build the file offset at which region @index starts. */
#define WDG_VFIO_PCI_INDEX_TO_OFFSET(index) \
	((u64)(index) << WDG_VFIO_PCI_OFFSET_SHIFT)
/* Mask selecting the offset-within-region bits of a file offset. */
#define WDG_VFIO_PCI_OFFSET_MASK \
	(((u64)(1) << WDG_VFIO_PCI_OFFSET_SHIFT) - 1)
/* Instance count that available_instances is computed against. */
#define MAX_WDGS (16)
/* Module-wide driver state; its embedded device is the mdev parent. */
static struct czl_wdg_dev czl_wdg;

/*
 * sysfs show callback for the "czl_wdg_dev" attribute.
 *
 * Writes a fixed banner string into @buf and returns its length.
 * @dev and @attr are not consulted.
 */
static ssize_t czl_wdg_dev_show(struct device *dev,
				struct device_attribute *attr, char *buf)
{
	ssize_t len;

	len = sprintf(buf, "mdev emulated pci watchdog device by caozilong.\n");

	return len;
}
static DEVICE_ATTR_RO(czl_wdg_dev);

/* Attributes published in the "czl_wdg" group of the parent device. */
static struct attribute *wdg_dev_attrs[] = {
	&dev_attr_czl_wdg_dev.attr,
	NULL,
};

static const struct attribute_group wdg_dev_group = {
	.name = "czl_wdg",
	.attrs = wdg_dev_attrs,
};

static const struct attribute_group *wdg_dev_groups[] = {
	&wdg_dev_group,
	NULL,
};

/*
 * sysfs show callback for the per-mdev "mdev_dev" attribute.
 *
 * Prints the device name when @dev is a mediated device, otherwise an
 * empty line. Returns the number of bytes written to @buf.
 */
static ssize_t mdev_dev_show(struct device *dev, struct device_attribute *attr,
			     char *buf)
{
	if (mdev_from_dev(dev))
		return sprintf(buf, "This is watchdog %s\n", dev_name(dev));

	return sprintf(buf, "\n");
}

static DEVICE_ATTR_RO(mdev_dev);

/* Attributes published in the "caozilong" group of each mdev instance. */
static struct attribute *mdev_dev_attrs[] = {
	&dev_attr_mdev_dev.attr,
	NULL,
};

static const struct attribute_group mdev_dev_group = {
	.name = "caozilong",
	.attrs = mdev_dev_attrs,
};

static const struct attribute_group *mdev_dev_groups[] = {
	&mdev_dev_group,
	NULL,
};

/*
 * mdev type attribute "name".
 *
 * Matches the type kobject name against "<driver string>-<1..3>" and
 * prints the corresponding human readable name.
 * Returns the number of bytes written, or -EINVAL for an unknown type.
 */
static ssize_t name_show(struct kobject *kobj, struct device *dev, char *buf)
{
	int i;
	char name[128];
	const char *name_str[3] = {
		"Soft Watchdog", "Hardware Watchdog", "Dummy Watchdog"
	};

	for (i = 0; i < 3; i++) {
		snprintf(name, sizeof(name), "%s-%d",
			 dev_driver_string(dev), i + 1);
		if (!strcmp(kobj->name, name))
			return sprintf(buf, "%s\n", name_str[i]);
	}

	return -EINVAL;
}

/* mdev type attribute "device_api": always the VFIO PCI API string. */
static ssize_t device_api_show(struct kobject *kobj, struct device *dev,
			       char *buf)
{
	return sprintf(buf, "%s\n", VFIO_DEVICE_API_PCI_STRING);
}

/*
 * mdev type attribute "available_instances".
 *
 * Reports MAX_WDGS minus the number of instances currently on
 * wdg_mdev_devices_list. The list is walked under wdg_mdev_list_lock
 * because create/remove modify it concurrently, and the result is
 * clamped at zero so a negative count is never printed.
 */
static ssize_t
available_instances_show(struct kobject *kobj, struct device *dev, char *buf)
{
	struct wdg_mdev_state *mds;
	int used = 0;
	int avail;

	mutex_lock(&wdg_mdev_list_lock);
	list_for_each_entry(mds, &wdg_mdev_devices_list, next)
		used++;
	mutex_unlock(&wdg_mdev_list_lock);

	avail = MAX_WDGS - used;
	if (avail < 0)
		avail = 0;

	return sprintf(buf, "%d\n", avail);
}

static MDEV_TYPE_ATTR_RO(name);
static MDEV_TYPE_ATTR_RO(device_api);
static MDEV_TYPE_ATTR_RO(available_instances);static struct attribute *mdev_types_attrs[] = {&mdev_type_attr_name.attr,&mdev_type_attr_device_api.attr,&mdev_type_attr_available_instances.attr,NULL,
};static struct attribute_group mdev_type_group1 = {.name = "1",.attrs = mdev_types_attrs,
};static struct attribute_group mdev_type_group2 = {.name = "2",.attrs = mdev_types_attrs,
};static struct attribute_group mdev_type_group3 = {.name = "3",.attrs = mdev_types_attrs,
};static struct attribute_group *mdev_type_groups[] = {&mdev_type_group1,&mdev_type_group2,&mdev_type_group3,NULL,
};static int czl_wdg_open(struct mdev_device *mdev)
{pr_info("%s line %d, wdg device opened.\n",__func__, __LINE__);return 0;
}static void czl_wdg_close(struct mdev_device *mdev)
{pr_info("%s line %d, wdg device close.\n",__func__, __LINE__);return;
}// fill pci config space meta data & capabilities.
int wdg_create_config_space(struct wdg_mdev_state *mstate)
{// vendor id, device id.*((unsigned int *)&mstate->config[0]) = CZL_WDG_DEVICE_VENDOR_ID |(CZL_WDG_DEVICE_DEVICE_ID << 16);*((unsigned short *)&mstate->config[4]) = 0x0001;*((unsigned short *)&mstate->config[6]) = 0x0200;mstate->config[0x8] = 0x10;mstate->config[0x9] = 0x02;mstate->config[0xa] = 0x00;mstate->config[0xb] = 0x07;*((unsigned int *)&mstate->config[0x10]) = 0x000001;mstate->bar_mask[0] = ~(IO_BAR0_SIZE) + 1;*((unsigned int *)&mstate->config[0x2c]) = 0x10011af4;// cap ptr.mstate->config[0x34] = 0x00;mstate->config[0x3d] = 0x01;mstate->config[0x40] = 0x23;mstate->config[0x43] = 0x80;mstate->config[0x44] = 0x23;mstate->config[0x48] = 0x23;mstate->config[0x4c] = 0x23;mstate->config[0x60] = 0x50;mstate->config[0x61] = 0x43;mstate->config[0x62] = 0x49;mstate->config[0x63] = 0x20;mstate->config[0x64] = 0x53;mstate->config[0x65] = 0x65;mstate->config[0x66] = 0x72;mstate->config[0x67] = 0x69;mstate->config[0x68] = 0x61;mstate->config[0x69] = 0x6c;mstate->config[0x6a] = 0x2f;mstate->config[0x6b] = 0x55;mstate->config[0x6c] = 0x41;mstate->config[0x6d] = 0x52;mstate->config[0x6e] = 0x54;return 0;
}static int czl_wdg_create(struct kobject *kobj, struct mdev_device *mdev)
{int i;struct wdg_mdev_state *mstate;char name[32];if (!mdev)return -EINVAL;for (i = 0; i < 3; i++) {snprintf(name, 32, "%s-%d", dev_driver_string(mdev_parent_dev(mdev)), i + 1);if (!strcmp(kobj->name, name)) {break;}}if (i >= 3) {return -EINVAL;}mstate = kzalloc(sizeof(struct wdg_mdev_state), GFP_KERNEL);if (mstate == NULL)return -ENOMEM;// group number in mdev_type.mstate->index = i + 1;mstate->config = kzalloc(IO_CONF_SIZE, GFP_KERNEL);if (mstate->config == NULL) {pr_err("%s line %d, alloc pci config buffer failure.\n",__func__, __LINE__);kfree(mstate);return -ENOMEM;}mstate->iobase = kzalloc(IO_BAR0_SIZE, GFP_KERNEL);if (mstate->iobase == NULL) {pr_err("%s line %d, alloc pci io buffer failure.\n",__func__, __LINE__);kfree(mstate->config);kfree(mstate);return -ENOMEM;}memset(mstate->config, 0x00, IO_CONF_SIZE);mutex_init(&mstate->ops_lock);mstate->mdev = mdev;mdev_set_drvdata(mdev, mstate);wdg_create_config_space(mstate);mutex_lock(&wdg_mdev_list_lock);list_add(&mstate->next, &wdg_mdev_devices_list);mutex_unlock(&wdg_mdev_list_lock);return 0;
}static int czl_wdg_remove(struct mdev_device *mdev)
{struct wdg_mdev_state *mds, *tmp_mds;struct wdg_mdev_state *mstate = mdev_get_drvdata(mdev);int ret = -EINVAL;mutex_lock(&wdg_mdev_list_lock);list_for_each_entry_safe(mds, tmp_mds, &wdg_mdev_devices_list, next) {if (mstate == mds) {list_del(&mstate->next);mdev_set_drvdata(mdev, NULL);kfree(mstate->config);kfree(mstate->iobase);kfree(mstate);ret = 0;break;}}mutex_unlock(&wdg_mdev_list_lock);return ret;
}static void handle_pci_cfg_space_write(struct wdg_mdev_state *mstate, u16 offset,u8 *buf, u32 count)
{u32 cfg_addr, bar_mask;switch (offset) {case 0x04: /* device control */case 0x06: /* device status */// do nothingbreak;case 0x3c:mstate->config[0x3c] = buf[0];break;case 0x3d:break;case 0x10: /* BAR0 */cfg_addr = *(u32 *)buf;pr_info("BAR0 addr 0x%x\n", cfg_addr);if (cfg_addr == 0xffffffff) {bar_mask = mstate->bar_mask[0];cfg_addr = (cfg_addr & bar_mask);}cfg_addr |= (mstate->config[offset] & 0x3ul);*((unsigned int *)&mstate->config[offset]) = cfg_addr;break;case 0x14: /* BAR1 */case 0x18: /* BAR2 */case 0x20: /* BAR4 */*((unsigned int *)&mstate->config[offset]) = 0;break;default:pr_info("PCI config write @0x%x of %d bytes not handled\n",offset, count);break;}return;
}static void handle_pci_cfg_space_read(struct wdg_mdev_state *mstate, u16 offset,u8 *buf, u32 count)
{memcpy(buf, (mstate->config + offset), count);return;
}static void mdev_read_base(struct wdg_mdev_state *mstate)
{int index, pos;u32 start_lo, start_hi;u32 mem_type;pos = PCI_BASE_ADDRESS_0;for (index = 0; index <= VFIO_PCI_BAR5_REGION_INDEX; index++) {if (!mstate->region_info[index].size)continue;start_lo = (*(u32 *)(mstate->config + pos)) &PCI_BASE_ADDRESS_MEM_MASK;mem_type = (*(u32 *)(mstate->config + pos)) &PCI_BASE_ADDRESS_MEM_TYPE_MASK;switch (mem_type) {case PCI_BASE_ADDRESS_MEM_TYPE_64:start_hi = (*(u32 *)(mstate->config + pos + 4));pos += 4;break;case PCI_BASE_ADDRESS_MEM_TYPE_32:case PCI_BASE_ADDRESS_MEM_TYPE_1M:default:start_hi = 0;break;}pos += 4;mstate->region_info[index].start = ((u64)start_hi << 32) | start_lo;}return;
}static void handle_bar_write(unsigned int index, struct wdg_mdev_state *mstate,u16 offset, u8 *buf, u32 count)
{pr_info("%s line %d, bar %d, write offset 0x%x, count 0x%x, val 0x%x.\n",__func__, __LINE__, index, offset, count, *buf);memcpy(mstate->iobase + offset, buf, count);return;
}static void handle_bar_read(unsigned int index, struct wdg_mdev_state *mstate,u16 offset, u8 *buf, u32 count)
{pr_info("%s line %d, bar %d, read offset 0x%x, count 0x%x, val 0x%x.\n",__func__, __LINE__, index, offset, count, *buf);memcpy(buf, mstate->iobase + offset, count);return;
}static ssize_t mdev_access(struct mdev_device *mdev, u8 *buf, size_t count,loff_t pos, bool is_write)
{int ret = 0;unsigned int index;loff_t offset;struct wdg_mdev_state *mstate;if (!mdev || !buf)return -EINVAL;mstate = mdev_get_drvdata(mdev);if (!mstate) {pr_err("%s line %d. get mstate failure.\n", __func__, __LINE__);return -EINVAL;}mutex_lock(&mstate->ops_lock);index = WDG_VFIO_PCI_OFFSET_TO_INDEX(pos);offset = pos & WDG_VFIO_PCI_OFFSET_MASK;switch (index) {case VFIO_PCI_CONFIG_REGION_INDEX:pr_info("%s: PCI config space %s at offset 0x%llx\n",__func__, is_write ? "write" : "read", offset);if (is_write) {handle_pci_cfg_space_write(mstate, offset, buf, count);} else {handle_pci_cfg_space_read(mstate, offset, buf, count);}break;case VFIO_PCI_BAR0_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX:if (!mstate->region_info[index].start)mdev_read_base(mstate);if (is_write) {pr_info("%s: write bar%d offset 0x%llx, val 0x%x.\n",__func__, index, offset, *buf);handle_bar_write(index, mstate, offset, buf, count);} else {pr_info("%s: read bar%d offset 0x%llx, val 0x%x.\n",__func__, index, offset, *buf);handle_bar_read(index, mstate, offset, buf, count);}break;default:ret = -1;goto failed;}ret = count;failed:mutex_unlock(&mstate->ops_lock);return ret;
}static ssize_t czl_wdg_read(struct mdev_device *mdev, char __user *buf,size_t count, loff_t *ppos)
{unsigned int done = 0;int ret;pr_info("%s line %d, read count 0x%lx, pos 0x%llx.\n", __func__, __LINE__, count, *ppos);while (count) {size_t filled;if (count >= 4 && !(*ppos % 4)) {u32 val;ret = mdev_access(mdev, (u8 *)&val, sizeof(val),*ppos, false);if (ret <= 0)goto read_err;if (copy_to_user(buf, &val, sizeof(val)))goto read_err;filled = 4;} else if (count >= 2 && !(*ppos % 2)) {u16 val;ret = mdev_access(mdev, (u8 *)&val, sizeof(val),*ppos, false);if (ret <= 0)goto read_err;if (copy_to_user(buf, &val, sizeof(val)))goto read_err;filled = 2;} else {u8 val;ret = mdev_access(mdev, (u8 *)&val, sizeof(val),*ppos, false);if (ret <= 0)goto read_err;if (copy_to_user(buf, &val, sizeof(val)))goto read_err;filled = 1;}count -= filled;done += filled;*ppos += filled;buf += filled;}pr_info("%s line %d, read count 0x%x.\n", __func__, __LINE__, done);return done;read_err:pr_err("%s line %d, read err happend.\n", __func__, __LINE__);return -EFAULT;
}static ssize_t czl_wdg_write(struct mdev_device *mdev, const char __user *buf,size_t count, loff_t *ppos)
{unsigned int done = 0;int ret;pr_info("%s line %d, write count 0x%lx, pos 0x%llx.\n", __func__, __LINE__, count, *ppos);while (count) {size_t filled;if (count >= 4 && !(*ppos % 4)) {u32 val;if (copy_from_user(&val, buf, sizeof(val)))goto write_err;ret = mdev_access(mdev, (u8 *)&val, sizeof(val),*ppos, true);if (ret <= 0)goto write_err;filled = 4;} else if (count >= 2 && !(*ppos % 2)) {u16 val;if (copy_from_user(&val, buf, sizeof(val)))goto write_err;ret = mdev_access(mdev, (u8 *)&val, sizeof(val),*ppos, true);if (ret <= 0)goto write_err;filled = 2;} else {u8 val;if (copy_from_user(&val, buf, sizeof(val)))goto write_err;ret = mdev_access(mdev, (u8 *)&val, sizeof(val),*ppos, true);if (ret <= 0)goto write_err;filled = 1;}count -= filled;done += filled;*ppos += filled;buf += filled;}pr_info("%s line %d, write count 0x%x.\n", __func__, __LINE__, done);return done;write_err:pr_err("%s line %d, write failure.\n", __func__, __LINE__);return -EFAULT;
}static int wdg_get_device_info(struct mdev_device *mdev, struct vfio_device_info *dev_info)
{dev_info->flags = VFIO_DEVICE_FLAGS_PCI;dev_info->num_regions = VFIO_PCI_NUM_REGIONS;dev_info->num_irqs = VFIO_PCI_NUM_IRQS;return 0;
}static int wdg_get_region_info(struct mdev_device *mdev, struct vfio_region_info *region_info)
{unsigned int size = 0;struct wdg_mdev_state *mstate;u32 bar_index;if (!mdev) {pr_err("%s line %d,mdev is null.\n", __func__, __LINE__);return -EINVAL;}mstate = mdev_get_drvdata(mdev);if (!mstate) {pr_err("%s line %d,mstat is null.\n", __func__, __LINE__);return -EINVAL;}bar_index = region_info->index;if (bar_index >= VFIO_PCI_NUM_REGIONS) {pr_err("%s line %d,bar index %d exceeds.\n", __func__, __LINE__, bar_index);return -EINVAL;}mutex_lock(&mstate->ops_lock);switch (bar_index) {case VFIO_PCI_CONFIG_REGION_INDEX:size = IO_CONF_SIZE;break;case VFIO_PCI_BAR0_REGION_INDEX:size = IO_BAR0_SIZE;break;default:size = 0;break;}mstate->region_info[bar_index].size = size;mstate->region_info[bar_index].vfio_offset =WDG_VFIO_PCI_INDEX_TO_OFFSET(bar_index);region_info->size = size;region_info->offset = WDG_VFIO_PCI_INDEX_TO_OFFSET(bar_index);region_info->flags = VFIO_REGION_INFO_FLAG_READ |VFIO_REGION_INFO_FLAG_WRITE;mutex_unlock(&mstate->ops_lock);return 0;
}static int wdg_get_irq_info(struct mdev_device *mdev, struct vfio_irq_info *irq_info)
{switch (irq_info->index) {case VFIO_PCI_INTX_IRQ_INDEX:case VFIO_PCI_MSI_IRQ_INDEX:case VFIO_PCI_REQ_IRQ_INDEX:break;default:pr_err("%s line %d, irq idx %d is invalid.\n",__func__, __LINE__, irq_info->index);return -EINVAL;}irq_info->flags = VFIO_IRQ_INFO_EVENTFD;irq_info->count = 1;if (irq_info->index == VFIO_PCI_INTX_IRQ_INDEX)irq_info->flags |= (VFIO_IRQ_INFO_MASKABLE |VFIO_IRQ_INFO_AUTOMASKED);elseirq_info->flags |= VFIO_IRQ_INFO_NORESIZE;return 0;
}static long czl_wdg_ioctl(struct mdev_device *mdev, unsigned int cmd,unsigned long arg)
{int ret = 0;unsigned long minsz;struct wdg_mdev_state *mstate;pr_info("czl wdg ioctl enter.\n");if (!mdev) {pr_err("%s line %d, mdev is null.\n", __func__, __LINE__);return -EINVAL;}mstate = mdev_get_drvdata(mdev);if (!mstate) {pr_err("%s line %d, cant find mstate data.\n", __func__, __LINE__);return -ENODEV;}switch (cmd) {case VFIO_DEVICE_GET_INFO: {struct vfio_device_info info;minsz = offsetofend(struct vfio_device_info, num_irqs);if (copy_from_user(&info, (void __user *)arg, minsz))return -EFAULT;if (info.argsz < minsz) {pr_err("%s line %d, info.argsz %d < minsz %ld.\n",__func__, __LINE__, info.argsz, minsz);return -EINVAL;}ret = wdg_get_device_info(mdev, &info);if (ret) {pr_err("%s line %d, get device info failure.\n", __func__, __LINE__);return ret;}memcpy(&mstate->dev_info, &info, sizeof(info));if (copy_to_user((void __user *)arg, &info, minsz))return -EFAULT;return 0;}case VFIO_DEVICE_GET_REGION_INFO: {struct vfio_region_info info;minsz = offsetofend(struct vfio_region_info, offset);if (copy_from_user(&info, (void __user *)arg, minsz))return -EFAULT;if (info.argsz < minsz) {pr_err("%s line %d, info.argsz %d < minsz %ld.\n",__func__, __LINE__, info.argsz, minsz);return -EINVAL;}ret = wdg_get_region_info(mdev, &info);if (ret) {pr_err("%s line %d, get region info failure.\n", __func__, __LINE__);return ret;}if (copy_to_user((void __user *)arg, &info, minsz))return -EFAULT;return 0;}case VFIO_DEVICE_GET_IRQ_INFO: {struct vfio_irq_info info;minsz = offsetofend(struct vfio_irq_info, count);if (copy_from_user(&info, (void __user *)arg, minsz))return -EFAULT;if ((info.argsz < minsz) ||(info.index >= mstate->dev_info.num_irqs))return -EINVAL;ret = wdg_get_irq_info(mdev, &info);if (ret)return ret;if (copy_to_user((void __user *)arg, &info, minsz))return -EFAULT;return 0;}case VFIO_DEVICE_SET_IRQS: {pr_info("%s line %d, set irqs.\n", __func__, __LINE__);return 0;}case VFIO_DEVICE_RESET:pr_info("%s line %d, reset.\n", __func__, __LINE__);return 0;}return -EINVAL;
}static const struct mdev_parent_ops wdg_mdev_fops = {.owner = THIS_MODULE,.dev_attr_groups = wdg_dev_groups,.mdev_attr_groups = mdev_dev_groups,.supported_type_groups = mdev_type_groups,.create = czl_wdg_create,.remove = czl_wdg_remove,.open = czl_wdg_open,.release = czl_wdg_close,.read = czl_wdg_read,.write = czl_wdg_write,.ioctl = czl_wdg_ioctl,
};static void wdg_device_release(struct device *dev)
{pr_info("czl wdg devide release.\n");
}static int mdev_wdg_init(void)
{int ret = 0;pr_info("czl wdg init.\n");memset(&czl_wdg, 0x00, sizeof(czl_wdg));ret = alloc_chrdev_region(&czl_wdg.wdg_devt, 0, MINORMASK + 1, "czl_wdg");if (ret < 0) {pr_err("error: failed to register czl wdg device, err:%d\n", ret);return -1;}cdev_init(&czl_wdg.wdg_cdev, &czl_wdg_fops);cdev_add(&czl_wdg.wdg_cdev, czl_wdg.wdg_devt, MINORMASK + 1);pr_info("major_number:%d\n", MAJOR(czl_wdg.wdg_devt));czl_wdg.wdg_class = class_create(THIS_MODULE, "czl_wdg");if (IS_ERR(czl_wdg.wdg_class)) {pr_err("error: failed to create wdg class.\n");ret = -1;goto failed1;}czl_wdg.dev.class = czl_wdg.wdg_class;czl_wdg.dev.release = wdg_device_release;dev_set_name(&czl_wdg.dev, "%s", "czl_wdg");ret = device_register(&czl_wdg.dev);if (ret) {pr_err("%s line %d, register wdg device failure.\n", __func__, __LINE__);ret = -1;goto failed2;}ret = mdev_register_device(&czl_wdg.dev, &wdg_mdev_fops);if (ret) {pr_err("%s line %d, register wdg mdev device failure.\n", __func__, __LINE__);ret = -1;goto failed3;}mutex_init(&wdg_mdev_list_lock);INIT_LIST_HEAD(&wdg_mdev_devices_list);pr_info("czl wdg init success.\n");goto done;
failed3:device_unregister(&czl_wdg.dev);
failed2:class_destroy(czl_wdg.wdg_class);
failed1:cdev_del(&czl_wdg.wdg_cdev);unregister_chrdev_region(czl_wdg.wdg_devt, MINORMASK + 1);
done:return ret;
}static void mdev_wdg_exit(void)
{czl_wdg.dev.bus = NULL;mdev_unregister_device(&czl_wdg.dev);device_unregister(&czl_wdg.dev);cdev_del(&czl_wdg.wdg_cdev);unregister_chrdev_region(czl_wdg.wdg_devt, MINORMASK + 1);class_destroy(czl_wdg.wdg_class);czl_wdg.wdg_class = NULL;pr_info("czl_wdg_unload.\n");return;
}module_init(mdev_wdg_init)
module_exit(mdev_wdg_exit)
MODULE_LICENSE("GPL v2");
virtual machine watchdog pci driver:
#include <linux/init.h>
#include <linux/module.h>
#include <linux/device.h>
#include <linux/kernel.h>
#include <linux/fs.h>
#include <linux/poll.h>
#include <linux/slab.h>
#include <linux/cdev.h>
#include <linux/sched.h>
#include <linux/wait.h>
#include <linux/uuid.h>
#include <linux/vfio.h>
#include <linux/iommu.h>
#include <linux/sysfs.h>
#include <linux/ctype.h>
#include <linux/file.h>
#include <linux/mdev.h>
#include <linux/pci.h>
#include <linux/idr.h>

/* Char device major number obtained from register_chrdev(). */
static int devno;
/* Maps a minor number to the wdg_pci_state of the bound PCI device. */
static DEFINE_IDR(wdg_minors);
/* Protects wdg_minors. */
static DEFINE_MUTEX(wdg_minors_lock);
#define WDG_MINORS_COUNT 256struct wdg_pci_state {struct pci_dev *pdev;struct device *dev;int iobase;int iolen;int major;int minor;
};static struct class *wdg_class;
static const struct pci_device_id czl_pci_table[] = {{ PCI_DEVICE(0xbeef, 0x1001), },{ 0, }
};static int czl_wdg_open(struct inode *inode, struct file *file)
{int rc = 0;int major, minor;major = imajor(inode);minor = iminor(inode);mutex_lock(&wdg_minors_lock);file->private_data = idr_find(&wdg_minors, minor);mutex_unlock(&wdg_minors_lock);if (!file->private_data) {pr_err("%s line %d, cant find wdg structure.\n",__func__, __LINE__);rc = -1;}return rc;
}static int czl_wdg_release(struct inode *inode, struct file *file)
{return 0;
}ssize_t czl_wdg_read(struct file *file, char __user *buf, size_t size, loff_t *ppos)
{int i;struct wdg_pci_state *wdgdev = NULL;unsigned char *kbuf = NULL;int actuallen = 0;wdgdev = file->private_data;if (!wdgdev) {pr_err("%s line %d, read failure.\n", __func__, __LINE__);return -1;}if (*ppos > wdgdev->iolen) {pr_err("%s line %d, read pos %lld exceed max io len %d.\n",__func__, __LINE__, *ppos, wdgdev->iolen);return -1;}kbuf = kzalloc(GFP_KERNEL, size);if (kbuf == NULL) {pr_err("%s line %d, alloc kbuf failure.\n",__func__, __LINE__);return -1;}for (i = 0; (i < size) && ((*ppos + i) < wdgdev->iolen); i++) {kbuf[i] = inb(wdgdev->iobase + *ppos + i);actuallen ++;}copy_to_user(buf, kbuf, actuallen);kfree(kbuf);return actuallen;
}static ssize_t czl_wdg_write(struct file *file, const char __user *buf,size_t count, loff_t *ppos)
{int i;struct wdg_pci_state *wdgdev = NULL;unsigned char *kbuf = NULL;int actuallen = 0;wdgdev = file->private_data;if (!wdgdev) {pr_err("%s line %d, read failure.\n", __func__, __LINE__);return -1;}if (*ppos > wdgdev->iolen) {pr_err("%s line %d, read pos %lld exceed max io len %d.\n",__func__, __LINE__, *ppos, wdgdev->iolen);return -1;}kbuf = kzalloc(GFP_KERNEL, count);if (kbuf == NULL) {pr_err("%s line %d, alloc kbuf failure.\n",__func__, __LINE__);return -1;}copy_from_user(kbuf, buf, count);for (i = 0; (i < count) && ((*ppos + i) < wdgdev->iolen); i++) {outb((u8)kbuf[i], wdgdev->iobase + *ppos + i);actuallen ++;}kfree(kbuf);return actuallen;
}static const struct file_operations czl_wdg_fops = {.owner = THIS_MODULE,.open = czl_wdg_open,.release = czl_wdg_release,.read = czl_wdg_read,.write = czl_wdg_write,
};static char *wdg_devnode(struct device *dev, umode_t *mode)
{if (mode)*mode = 06666;return kasprintf(GFP_KERNEL, "%s", dev_name(dev));
}static int wdg_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
{struct wdg_pci_state *wdgdev = NULL;pr_info("%s line %d, wdg pci device & driver binding.\n", __func__, __LINE__);wdgdev = kzalloc(GFP_KERNEL, sizeof(*wdgdev));if (!wdgdev) {pr_err("%s line %d, fail to alloc buffer.\n",__func__, __LINE__);goto err0;}wdgdev->major = devno;wdgdev->pdev = pci_dev_get(pdev);wdgdev->iobase = pci_resource_start(pdev, 0);wdgdev->iolen = pci_resource_len(pdev, 0);mutex_lock(&wdg_minors_lock);wdgdev->minor = idr_alloc(&wdg_minors, wdgdev, 0, WDG_MINORS_COUNT, GFP_KERNEL);mutex_unlock(&wdg_minors_lock);if (wdgdev->minor < 0) {pr_err("%s line %d, get minor failure from idr.\n", __func__, __LINE__);goto err1;}pr_info("%s line %d, major %d, minor %d, iobase 0x%x.\n", __func__, __LINE__,devno, wdgdev->minor, wdgdev->iobase);wdgdev->dev = device_create(wdg_class, NULL, MKDEV(devno, wdgdev->minor),NULL, "czl-wdg-%d", wdgdev->minor);if (!wdgdev->dev || IS_ERR(wdgdev->dev)) {pr_err("%s line %d, create wdg device failure.\n",__func__, __LINE__);goto err2;}pci_set_drvdata(pdev, wdgdev);return 0;
err2:idr_remove(&wdg_minors, wdgdev->minor);
err1:if (wdgdev) {kfree(wdgdev);}
err0:return -1;
}static void wdg_pci_remove(struct pci_dev *pdev)
{struct wdg_pci_state *wdgdev;pr_info("%s line %d, wdg pci device & driver removing.\n", __func__, __LINE__);wdgdev = pci_get_drvdata(pdev);pci_set_drvdata(pdev, NULL);pci_dev_put(pdev);wdgdev->pdev = NULL;device_destroy(wdg_class, MKDEV(devno, wdgdev->minor));idr_remove(&wdg_minors, wdgdev->minor);kfree(wdgdev);return;
}static struct pci_driver czl_wdg_driver = {.name = "czl-mdev-wdg",.id_table = czl_pci_table,.probe = wdg_pci_probe,.remove = wdg_pci_remove,
};
static int czl_wdg_init(void)
{int ret;wdg_class = class_create(THIS_MODULE, "czl-wdg");if (!wdg_class) {pr_err("%s line %d, create watchdog class failure.\n",__func__, __LINE__);return -1;}wdg_class->devnode = wdg_devnode;devno = register_chrdev(0, "czl-wdg", &czl_wdg_fops);if (devno < 0) {pr_err("%s line %d, register wdg device chrno failure.\n",__func__, __LINE__);class_destroy(wdg_class);return -1;}ret = pci_register_driver(&czl_wdg_driver);return ret;
}static void czl_wdg_exit(void)
{pci_unregister_driver(&czl_wdg_driver);unregister_chrdev(devno, "czl-wdg");class_destroy(wdg_class);idr_destroy(&wdg_minors);return;
}module_init(czl_wdg_init)
module_exit(czl_wdg_exit)
MODULE_LICENSE("GPL v2");
virtual machine user space test case:
#include <stdlib.h>
#include <unistd.h>
#include <string.h>
#include <fcntl.h>
#include <stddef.h>
#include <stdint.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdarg.h>

/* Print @len bytes of @buf as hex, 16 bytes per line with an offset prefix. */
void dump_buf(unsigned char *buf, int len)
{
	int i;

	for (i = 0; i < len; i++) {
		if (i % 16 == 0)
			printf("\n0x%04x: ", i);
		printf("0x%02x ", buf[i]);
	}
	printf("\n");
}

/*
 * Once per second, read the 32-byte BAR0 window of /dev/czl-wdg-0, dump
 * it, then overwrite it with 0x5a. Runs until a read or write fails.
 */
int main(void)
{
	int wdgfd;
	int status;
	unsigned char buf[32];

	wdgfd = open("/dev/czl-wdg-0", O_RDWR);
	if (wdgfd < 0) {
		printf("%s line %d, open failure.\n", __func__, __LINE__);
		return -1;
	}

	while (1) {
		memset(buf, 0x00, sizeof(buf));
		/* always start from the beginning of the window */
		lseek(wdgfd, 0, SEEK_SET);
		status = read(wdgfd, buf, sizeof(buf));
		if (status < 0) {
			printf("%s line %d, read failure.\n",
			       __func__, __LINE__);
			close(wdgfd);
			return -1;
		}
		printf("%s line %d, read %d.\n", __func__, __LINE__, status);
		dump_buf(buf, sizeof(buf));

		memset(buf, 0x5a, sizeof(buf));
		lseek(wdgfd, 0, SEEK_SET);
		status = write(wdgfd, buf, sizeof(buf));
		if (status < 0) {
			printf("%s line %d, write failure.\n",
			       __func__, __LINE__);
			close(wdgfd);
			return -1;
		}
		printf("%s line %d, write %d.\n", __func__, __LINE__, status);

		sleep(1);
	}

	close(wdgfd);
	return 0;
}
测试过程:
1.安装WDG MDEV驱动:
sudo insmod czl-mdev-wdg.ko
2.创建mdev设备
创建两个mdev设备
echo "f422fd86-35c0-11ef-8e50-9342c1138a56" > /sys/devices/virtual/czl_wdg/czl_wdg/mdev_supported_types/czl_wdg-1/create
echo "c04de378-35d8-11ef-95c3-339660dfc874" > /sys/devices/virtual/czl_wdg/czl_wdg/mdev_supported_types/czl_wdg-2/create
3.将第二步创建的mdev设备透传给QEMU虚拟机启动:
qemu-system-x86_64 -m 4096 -smp 4 --enable-kvm -drive file=/home/zlcao/Workspace/iso/ps.img -device vfio-pci,sysfsdev=/sys/bus/mdev/devices/f422fd86-35c0-11ef-8e50-9342c1138a56 -device vfio-pci,sysfsdev=/sys/bus/mdev/devices/c04de378-35d8-11ef-95c3-339660dfc874
系统启动后,可以看到虚拟机环境下出现了透传的MDEV PCI设备,设备vendor/device id为0xbeef1001,符合代码设定。
4.虚拟机内安装WDG PCI设备驱动:
上图中可以看到,两个透传的MDEV设备已经和一个名为"serial"的PCI设备驱动绑定,这并不符合预期,需要将默认的"serial"驱动和MDEV设备解绑,在QEMU虚拟机控制台中输入如下命令解绑驱动:
echo -n 0000:00:04.0 > /sys/bus/pci/drivers/serial/unbind
echo -n 0000:00:05.0 > /sys/bus/pci/drivers/serial/unbind
之后就可以安装我们的WDG PCI驱动了:
sudo insmod czl-mdev-drv.ko
安装成功后,虚拟机设备目录下出现了WDG PCI的设备节点:
此时,两个MDEV PCI设备也显示绑定到了正确的驱动:
5.运行测试用例,读写WDG PCI设备的BAR0地址空间:
此时可以看到,虚拟机中对WDG设备BAR0空间的读写调用被“透传”到了HOST机的MDEV PCI设备驱动上,可以基于对BAR0空间的回调实现我们的业务逻辑。