These are my reading notes on the nvme_dev_start function, from kernel version 3.13.
static int nvme_dev_start(struct nvme_dev *dev)
{
	int result;

	result = nvme_dev_map(dev);			/* map the PCIe BAR space */
	if (result)
		return result;

	result = nvme_configure_admin_queue(dev);	/* configure the admin queue */
	if (result)
		goto unmap;

	spin_lock(&dev_list_lock);
	list_add(&dev->node, &dev_list);	/* add to the driver-global device list (see note below) */
	spin_unlock(&dev_list_lock);

	result = nvme_setup_io_queues(dev);	/* configure the I/O queues */
	if (result && result != -EBUSY)
		goto disable;

	return result;

 disable:
	spin_lock(&dev_list_lock);
	list_del_init(&dev->node);
	spin_unlock(&dev_list_lock);
 unmap:
	nvme_dev_unmap(dev);
	return result;
}
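A note on dev_list, since I wasn't sure at first where it is defined: it lives in drivers/block/nvme-core.c itself, together with its lock and the driver's kernel thread. Quoting the definitions from memory (so treat the exact form as approximate):

static DEFINE_SPINLOCK(dev_list_lock);
static LIST_HEAD(dev_list);
static struct task_struct *nvme_thread;

The nvme kernel thread periodically walks dev_list to poll each device's completion queues and resubmit congested bios, which is why every started device must be put on this list.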
This function does four main things:
1. nvme_dev_map: the usual PCI BAR programming routine
2. Admin queue configuration
3. Adding the device to dev_list
4. I/O queue configuration
nvme_dev_map
static int nvme_dev_map(struct nvme_dev *dev)
{
	int bars, result = -ENOMEM;
	struct pci_dev *pdev = dev->pci_dev;

	if (pci_enable_device_mem(pdev))
		return result;

	dev->entry[0].vector = pdev->irq;
	pci_set_master(pdev);
	bars = pci_select_bars(pdev, IORESOURCE_MEM);
	if (pci_request_selected_regions(pdev, bars, "nvme"))
		goto disable_pci;

	if (dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64)) &&
	    dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(32)))
		goto disable;

	pci_set_drvdata(pdev, dev);
	dev->bar = ioremap(pci_resource_start(pdev, 0), 8192);	/* BAR0 */
	if (!dev->bar)
		goto disable;

	dev->db_stride = NVME_CAP_STRIDE(readq(&dev->bar->cap));	/* doorbell stride, CAP bits 35:32 */
	dev->dbs = ((void __iomem *)dev->bar) + 4096;	/* SQ0 tail doorbell */
	return 0;

 disable:
	pci_release_regions(pdev);
 disable_pci:
	pci_disable_device(pdev);
	return result;
}
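The last two lines deserve a note. The doorbell region starts 4096 bytes into BAR0, and CAP.DSTRD encodes the spacing between consecutive doorbells as (4 << DSTRD) bytes; each queue owns an SQ tail doorbell immediately followed by a CQ head doorbell. A minimal sketch of how a queue's doorbells are located, using hypothetical helper names of my own (the real driver just computes q_db inline in nvme_alloc_queue, shown later):

/* dev->dbs is a u32 __iomem *, so the indices below are in 4-byte units */
static u32 __iomem *nvme_sq_doorbell(struct nvme_dev *dev, int qid)
{
	/* SQ tail doorbell of queue qid; equals dev->dbs[qid << (db_stride + 1)] */
	return &dev->dbs[(2 * qid) << dev->db_stride];
}

static u32 __iomem *nvme_cq_doorbell(struct nvme_dev *dev, int qid)
{
	/* the CQ head doorbell follows the SQ tail doorbell of the same queue */
	return &dev->dbs[(2 * qid + 1) << dev->db_stride];
}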
There is not much else to this function; the corresponding unmap is:
static void nvme_dev_unmap(struct nvme_dev *dev)
{
	if (dev->pci_dev->msi_enabled)
		pci_disable_msi(dev->pci_dev);
	else if (dev->pci_dev->msix_enabled)
		pci_disable_msix(dev->pci_dev);

	if (dev->bar) {
		iounmap(dev->bar);
		dev->bar = NULL;
	}

	pci_release_regions(dev->pci_dev);
	if (pci_is_enabled(dev->pci_dev))
		pci_disable_device(dev->pci_dev);
}
nvme_configure_admin_queue
static int nvme_configure_admin_queue(struct nvme_dev *dev)
{
	int result;
	u32 aqa;
	u64 cap = readq(&dev->bar->cap);
	struct nvme_queue *nvmeq;

	result = nvme_disable_ctrl(dev, cap);	/* per the spec, disable the controller before configuring the queues */
	if (result < 0)
		return result;

	nvmeq = dev->queues[0];
	if (!nvmeq) {
		nvmeq = nvme_alloc_queue(dev, 0, 64, 0);	/* admin queue, depth 64 */
		if (!nvmeq)
			return -ENOMEM;
		dev->queues[0] = nvmeq;	/* save the nvmeq pointer */
	}

	aqa = nvmeq->q_depth - 1;
	aqa |= aqa << 16;

	/* controller enable bit and command set */
	dev->ctrl_config = NVME_CC_ENABLE | NVME_CC_CSS_NVM;
	/* memory page size */
	dev->ctrl_config |= (PAGE_SHIFT - 12) << NVME_CC_MPS_SHIFT;
	dev->ctrl_config |= NVME_CC_ARB_RR | NVME_CC_SHN_NONE;
	/* SQE and CQE sizes: 2^6 and 2^4 bytes */
	dev->ctrl_config |= NVME_CC_IOSQES | NVME_CC_IOCQES;

	writel(aqa, &dev->bar->aqa);	/* admin SQ and CQ depths (0's based) */
	writeq(nvmeq->sq_dma_addr, &dev->bar->asq);	/* DMA base address of the admin SQ */
	writeq(nvmeq->cq_dma_addr, &dev->bar->acq);	/* DMA base address of the admin CQ */

	/*
	 * CC - Controller Configuration
	 * 31-24 : Reserved
	 * 23-20 : I/O Completion Queue Entry Size
	 * 19-16 : I/O Submission Queue Entry Size
	 * 15-14 : Shutdown Notification
	 * 13-11 : Arbitration Mechanism Selected
	 * 10-7  : Memory Page Size
	 * 6-4   : I/O Command Set Selected
	 * 3-1   : Reserved
	 * 0     : Enable
	 */
	writel(dev->ctrl_config, &dev->bar->cc);	/* this enables the controller */

	/* poll CSTS.RDY (Controller Status) until the controller reports ready */
	result = nvme_enable_ctrl(dev, cap);
	if (result)
		return result;

	result = queue_request_irq(dev, nvmeq, "nvme admin");
	if (result)
		return result;

	spin_lock(&nvmeq->q_lock);
	nvme_init_queue(nvmeq, 0);	/* initialize the admin queue */
	spin_unlock(&nvmeq->q_lock);
	return result;
}
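To make the register writes concrete, here is a worked example (my own illustration, not driver code) that assembles the same CC value by hand for 4 KiB pages (PAGE_SHIFT == 12), following the field layout in the comment above:

#include <stdio.h>

int main(void)
{
	unsigned cc = 0;

	cc |= 1 << 0;		/* EN: enable (NVME_CC_ENABLE) */
	cc |= 0 << 4;		/* CSS: NVM command set (NVME_CC_CSS_NVM) */
	cc |= (12 - 12) << 7;	/* MPS: 2^(12 + 0) = 4 KiB pages */
	cc |= 0 << 11;		/* AMS: round robin (NVME_CC_ARB_RR) */
	cc |= 0 << 14;		/* SHN: no shutdown notification (NVME_CC_SHN_NONE) */
	cc |= 6 << 16;		/* IOSQES: 2^6 = 64-byte SQEs (NVME_CC_IOSQES) */
	cc |= 4 << 20;		/* IOCQES: 2^4 = 16-byte CQEs (NVME_CC_IOCQES) */

	printf("cc = 0x%08x\n", cc);	/* prints cc = 0x00460001 */
	return 0;
}

Likewise, for the depth-64 admin queue, aqa = 63 | (63 << 16) = 0x003f003f, since both ASQS and ACQS are 0's based.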
The inline comments explain most of it; below is the code for some of the related functions it calls.
First, enabling/disabling the controller, which essentially comes down to polling a few register values.
static int nvme_wait_ready(struct nvme_dev *dev, u64 cap, bool enabled)
{
	unsigned long timeout;
	u32 bit = enabled ? NVME_CSTS_RDY : 0;

	timeout = ((NVME_CAP_TIMEOUT(cap) + 1) * HZ / 2) + jiffies;

	while ((readl(&dev->bar->csts) & NVME_CSTS_RDY) != bit) {
		msleep(100);
		if (fatal_signal_pending(current))	/* the wait can be interrupted by fatal signals */
			return -EINTR;
		if (time_after(jiffies, timeout)) {	/* true once jiffies has passed timeout */
			dev_err(&dev->pci_dev->dev,
				"Device not ready; aborting initialisation\n");
			return -ENODEV;
		}
	}
	return 0;
}

static int nvme_disable_ctrl(struct nvme_dev *dev, u64 cap)
{
	u32 cc = readl(&dev->bar->cc);

	if (cc & NVME_CC_ENABLE)
		writel(cc & ~NVME_CC_ENABLE, &dev->bar->cc);	/* clear the enable bit */
	return nvme_wait_ready(dev, cap, false);
}

static int nvme_enable_ctrl(struct nvme_dev *dev, u64 cap)
{
	return nvme_wait_ready(dev, cap, true);
}
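Note the timeout computation: CAP.TO is expressed in 500 ms units, so (NVME_CAP_TIMEOUT(cap) + 1) * HZ / 2 converts the controller's worst-case ready time into jiffies. For reference, the CAP field helpers used throughout this driver look roughly like this (quoting include/linux/nvme.h of this era from memory, so treat as approximate):

#define NVME_CAP_MQES(cap)	((cap) & 0xffff)	/* Maximum Queue Entries Supported, bits 15:0 */
#define NVME_CAP_TIMEOUT(cap)	(((cap) >> 24) & 0xff)	/* worst-case ready timeout in 500 ms units, bits 31:24 */
#define NVME_CAP_STRIDE(cap)	(((cap) >> 32) & 0xf)	/* doorbell stride, bits 35:32 */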
Queue allocation:
static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid,
							int depth, int vector)
{
	struct device *dmadev = &dev->pci_dev->dev;
	unsigned extra = nvme_queue_extra(depth);
	struct nvme_queue *nvmeq = kzalloc(sizeof(*nvmeq) + extra, GFP_KERNEL);
	if (!nvmeq)
		return NULL;

	nvmeq->cqes = dma_alloc_coherent(dmadev, CQ_SIZE(depth),
					&nvmeq->cq_dma_addr, GFP_KERNEL);
	if (!nvmeq->cqes)
		goto free_nvmeq;
	memset((void *)nvmeq->cqes, 0, CQ_SIZE(depth));

	nvmeq->sq_cmds = dma_alloc_coherent(dmadev, SQ_SIZE(depth),
					&nvmeq->sq_dma_addr, GFP_KERNEL);
	if (!nvmeq->sq_cmds)
		goto free_cqdma;

	nvmeq->q_dmadev = dmadev;
	nvmeq->dev = dev;
	spin_lock_init(&nvmeq->q_lock);		/* per-queue spinlock */
	nvmeq->cq_head = 0;			/* CQ head position */
	nvmeq->cq_phase = 1;			/* phase bit, used later to spot new CQ entries */
	init_waitqueue_head(&nvmeq->sq_full);	/* initialize the wait-queue head */
	init_waitqueue_entry(&nvmeq->sq_cong_wait, nvme_thread);	/* initialize the wait-queue entry */
	bio_list_init(&nvmeq->sq_cong);		/* initialize the bio list (singly linked) */
	nvmeq->q_db = &dev->dbs[qid << (dev->db_stride + 1)];	/* doorbell base address of queue qid */
	nvmeq->q_depth = depth;			/* queue depth */
	nvmeq->cq_vector = vector;		/* interrupt vector of this queue */
	nvmeq->q_suspended = 1;			/* not initialized yet, so start as suspended */
	dev->queue_count++;			/* bump the total queue count */

	return nvmeq;

 free_cqdma:
	dma_free_coherent(dmadev, CQ_SIZE(depth), (void *)nvmeq->cqes,
							nvmeq->cq_dma_addr);
 free_nvmeq:
	kfree(nvmeq);
	return NULL;
}

static unsigned nvme_queue_extra(int depth)
{
	/*
	 * DIV_ROUND_UP(depth, 8): a bitmap with one bit per command slot;
	 * depth * sizeof(struct nvme_cmd_info): one nvme_cmd_info per slot
	 */
	return DIV_ROUND_UP(depth, 8) + (depth * sizeof(struct nvme_cmd_info));
}
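So the extra region kzalloc'd behind struct nvme_queue holds two things: a bitmap with one bit per command slot, followed by one struct nvme_cmd_info per slot. The helper that locates the info array (reproduced from nvme-core.c from memory; treat as a sketch) looks like:

static struct nvme_cmd_info *nvme_cmd_info(struct nvme_queue *nvmeq)
{
	/* the per-command info array sits right after the cmdid bitmap */
	return (void *)&nvmeq->cmdid_data[BITS_TO_LONGS(nvmeq->q_depth)];
}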
Queue initialization:
static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid)
{
	struct nvme_dev *dev = nvmeq->dev;
	unsigned extra = nvme_queue_extra(nvmeq->q_depth);

	nvmeq->sq_tail = 0;
	nvmeq->cq_head = 0;
	nvmeq->cq_phase = 1;
	nvmeq->q_db = &dev->dbs[qid << (dev->db_stride + 1)];
	memset(nvmeq->cmdid_data, 0, extra);
	memset((void *)nvmeq->cqes, 0, CQ_SIZE(nvmeq->q_depth));
	nvme_cancel_ios(nvmeq, false);
	nvmeq->q_suspended = 0;
}

static void nvme_cancel_ios(struct nvme_queue *nvmeq, bool timeout)
{
	int depth = nvmeq->q_depth - 1;
	struct nvme_cmd_info *info = nvme_cmd_info(nvmeq);
	unsigned long now = jiffies;
	int cmdid;

	/* walk every set bit in the bitmap, i.e. every outstanding command ID */
	for_each_set_bit(cmdid, nvmeq->cmdid_data, depth) {
		void *ctx;
		nvme_completion_fn fn;
		static struct nvme_completion cqe = {
			.status = cpu_to_le16(NVME_SC_ABORT_REQ << 1),	/* 7 << 1 */
		};

		/*
		 * When called with timeout == false (as during init), every
		 * outstanding command is cancelled.  With timeout == true,
		 * a command is only cancelled once its deadline has passed,
		 * i.e. time_after(now, info[cmdid].timeout) returns true.
		 */
		if (timeout && !time_after(now, info[cmdid].timeout))
			continue;
		/*
		 * The command may also already be marked cancelled (a failed
		 * submission, perhaps?); its bit has not been cleared yet, so
		 * the walk can still encounter it.
		 */
		if (info[cmdid].ctx == CMD_CTX_CANCELLED)
			continue;
		dev_warn(nvmeq->q_dmadev, "Cancelling I/O %d\n", cmdid);
		ctx = cancel_cmdid(nvmeq, cmdid, &fn);
		fn(nvmeq->dev, ctx, &cqe);
	}
}
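For context on where those bits and deadlines come from: the submission path reserves a command ID by atomically claiming a free bit in cmdid_data, then records the completion handler, context, and timeout in the matching nvme_cmd_info slot. An abridged sketch of 3.13's alloc_cmdid, quoted from memory (the driver also has an alloc_cmdid_killable variant that sleeps on sq_full when the queue is full):

static int alloc_cmdid(struct nvme_queue *nvmeq, void *ctx,
				nvme_completion_fn handler, unsigned timeout)
{
	int depth = nvmeq->q_depth - 1;
	struct nvme_cmd_info *info = nvme_cmd_info(nvmeq);
	int cmdid;

	do {
		/* find a free slot in the bitmap and claim it atomically */
		cmdid = find_first_zero_bit(nvmeq->cmdid_data, depth);
		if (cmdid >= depth)
			return -EBUSY;
	} while (test_and_set_bit(cmdid, nvmeq->cmdid_data));

	info[cmdid].fn = handler;
	info[cmdid].ctx = ctx;
	info[cmdid].timeout = jiffies + timeout;	/* the deadline nvme_cancel_ios checks */
	return cmdid;
}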
I/O queue configuration
static int nvme_setup_io_queues(struct nvme_dev *dev)
{
	struct pci_dev *pdev = dev->pci_dev;
	int result, cpu, i, vecs, nr_io_queues, size, q_depth;

	nr_io_queues = num_online_cpus();
	result = set_queue_count(dev, nr_io_queues);
	if (result < 0)
		return result;
	if (result < nr_io_queues)
		nr_io_queues = result;

	size = db_bar_size(dev, nr_io_queues);
	if (size > 8192) {	/* nvme_dev_map only mapped 8192 bytes, so anything larger must be remapped */
		iounmap(dev->bar);	/* drop the old mapping first */
		do {
			dev->bar = ioremap(pci_resource_start(pdev, 0), size);	/* then remap */
			if (dev->bar)	/* best case: the new mapping succeeds right away */
				break;
			if (!--nr_io_queues)	/* otherwise shrink nr_io_queues until the mapping fits */
				return -ENOMEM;
			size = db_bar_size(dev, nr_io_queues);	/* recompute the size for the reduced queue count */
		} while (1);
		/* the admin queue's doorbell pointers must be recomputed */
		dev->dbs = ((void __iomem *)dev->bar) + 4096;
		dev->queues[0]->q_db = dev->dbs;
	}

	/* release the admin queue's interrupt */
	free_irq(dev->entry[0].vector, dev->queues[0]);

	vecs = nr_io_queues;
	/* initialize the MSI-X entries */
	for (i = 0; i < vecs; i++)
		dev->entry[i].entry = i;
	for (;;) {
		/* request vecs interrupt vectors; 0 means success */
		result = pci_enable_msix(pdev, dev->entry, vecs);
		if (result <= 0)
			break;
		vecs = result;
	}

	if (result < 0) {	/* MSI-X failed outright, fall back to MSI */
		vecs = nr_io_queues;
		if (vecs > 32)
			vecs = 32;
		for (;;) {
			result = pci_enable_msi_block(pdev, vecs);
			if (result == 0) {
				for (i = 0; i < vecs; i++)
					dev->entry[i].vector = i + pdev->irq;
				break;
			} else if (result < 0) {
				vecs = 1;
				break;
			}
			vecs = result;
		}
	}

	/*
	 * Should investigate whether there is a performance win from
	 * allocating more queues than interrupt vectors; it might allow
	 * the submission path to scale better, even if the receive path
	 * is limited by the number of interrupts.
	 */
	nr_io_queues = vecs;

	result = queue_request_irq(dev, dev->queues[0], "nvme admin");
	if (result) {
		dev->queues[0]->q_suspended = 1;
		goto free_queues;
	}

	/*
	 * Free previously allocated queues that are no longer usable,
	 * i.e. those whose qid is larger than nr_io_queues.  (It seems
	 * this loop cannot actually run here, since only the admin queue
	 * has been created so far.)
	 */
	spin_lock(&dev_list_lock);
	for (i = dev->queue_count - 1; i > nr_io_queues; i--) {
		struct nvme_queue *nvmeq = dev->queues[i];

		spin_lock(&nvmeq->q_lock);
		nvme_cancel_ios(nvmeq, false);
		spin_unlock(&nvmeq->q_lock);

		nvme_free_queue(nvmeq);
		dev->queue_count--;
		dev->queues[i] = NULL;
	}
	spin_unlock(&dev_list_lock);

	/* bind each vector to a CPU; see https://zhuanlan.zhihu.com/p/163850501 */
	cpu = cpumask_first(cpu_online_mask);
	for (i = 0; i < nr_io_queues; i++) {
		irq_set_affinity_hint(dev->entry[i].vector, get_cpu_mask(cpu));
		cpu = cpumask_next(cpu, cpu_online_mask);
	}

	/*
	 * Maximum Queue Entries Supported (MQES), spec page 45: the largest
	 * individual queue size the controller supports.  For PCIe-based
	 * NVMe this covers both the I/O submission and I/O completion
	 * queues created by the host; for NVMe over Fabrics it covers only
	 * the I/O submission queues.  This is a 0's based value, with a
	 * minimum of 1h, i.e. 2 entries.
	 */
	q_depth = min_t(int, NVME_CAP_MQES(readq(&dev->bar->cap)) + 1,
								NVME_Q_DEPTH);
	for (i = dev->queue_count - 1; i < nr_io_queues; i++) {
		dev->queues[i + 1] = nvme_alloc_queue(dev, i + 1, q_depth, i);
		if (!dev->queues[i + 1]) {
			result = -ENOMEM;
			goto free_queues;
		}
	}

	/* remaining CPUs share queues; rounddown_pow_of_two rounds down to the nearest power of two */
	for (; i < num_possible_cpus(); i++) {
		int target = i % rounddown_pow_of_two(dev->queue_count - 1);
		dev->queues[i + 1] = dev->queues[target + 1];
	}

	/* create the I/O queues on the controller, starting from qid 1 */
	for (i = 1; i < dev->queue_count; i++) {
		result = nvme_create_queue(dev->queues[i], i);
		if (result) {
			for (--i; i > 0; i--)
				nvme_disable_queue(dev, i);
			goto free_queues;
		}
	}

	return 0;

 free_queues:
	nvme_free_queues(dev);
	return result;
}
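db_bar_size is not shown here; conceptually it has to cover the 4096 bytes of control registers plus one SQ-tail/CQ-head doorbell pair for the admin queue and each I/O queue. A hypothetical reimplementation under that assumption (not the exact 3.13 source):

static size_t db_bar_size(struct nvme_dev *dev, unsigned nr_io_queues)
{
	/* 4 KiB of registers, then (nr_io_queues + 1) queues with two
	 * doorbells each, (4 << CAP.DSTRD) bytes per doorbell */
	return 4096 + ((nr_io_queues + 1) * 2 * (4 << dev->db_stride));
}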
static int set_queue_count(struct nvme_dev *dev, int count)
{
	int status;
	u32 result, q_count = (count - 1) | ((count - 1) << 16);

	/*
	 * Set the number of queues (excluding the admin queue), spec page 212:
	 * bits  0-15: Number of I/O Submission Queues Requested (NSQR)
	 * bits 16-31: Number of I/O Completion Queues Requested (NCQR)
	 * Both fields are 0's based, so q_count == 0 requests one queue of
	 * each kind; the maximum queue count is 65535, and writing 65535
	 * is an error (to request 65535 queues, write 65534).
	 */
	status = nvme_set_features(dev, NVME_FEAT_NUM_QUEUES, q_count, 0,
								&result);
	if (status)
		return status < 0 ? -EIO : -EBUSY;
	return min(result & 0xffff, result >> 16) + 1;
}
Not much to say about this function either; see the comments.
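For completeness, the nvme_set_features it calls just builds an admin Set Features command and submits it synchronously. Quoting 3.13's version from memory (treat as a sketch):

int nvme_set_features(struct nvme_dev *dev, unsigned fid, unsigned dword11,
					dma_addr_t dma_addr, u32 *result)
{
	struct nvme_command c;

	memset(&c, 0, sizeof(c));
	c.features.opcode = nvme_admin_set_features;
	c.features.prp1 = cpu_to_le64(dma_addr);	/* no data buffer for this feature */
	c.features.fid = cpu_to_le32(fid);		/* NVME_FEAT_NUM_QUEUES here */
	c.features.dword11 = cpu_to_le32(dword11);	/* NSQR | (NCQR << 16) */

	return nvme_submit_admin_cmd(dev, &c, result);	/* completion dword 0 lands in *result */
}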
That about wraps up this walkthrough of nvme_dev_start.