NVME Doorbell 寄存器数据请求时doorbell 处理

3.NVMe寄存器配置
3.1 寄存器定义
NVMe寄存器主要分为两部分，一部分定义了Controller整体属性，一部分用来存放每组队列的头尾DB寄存器。

CAP——控制器能力，定义了内存页大小的最大最小值、支持的I/O指令集、DB寄存器步长、等待时间界限、仲裁机制、队列是否物理上连续、队列大小；
VS——版本号，定义了控制器实现NVMe协议的版本号；
INTMS——中断掩码设置（Interrupt Mask Set），每个bit对应一个中断向量，向某bit写1即屏蔽对应中断；使用MSI-X中断时，此寄存器无效；
INTMC——中断掩码清除（Interrupt Mask Clear），每个bit对应一个中断向量，向某bit写1即取消对对应中断的屏蔽；使用MSI-X中断时，此寄存器无效；
CC——控制器配置，定义了I/O SQ和CQ队列元素大小、关机状态提醒、仲裁机制、内存页大小、支持的I/O指令集、使能；
CSTS——控制器状态，包括关机状态、控制器致命错误、就绪状态；
AQA——Admin 队列属性，包括SQ大小和CQ大小；
ASQ——Admin SQ基地址；
ACQ——Admin CQ基地址；
1000h之后的寄存器定义了队列的头、尾DB寄存器。
3.2寄存器理解
CAP寄存器标识的是Controller具有多少能力，而CC寄存器则是指当前Controller选择了哪些能力，可以理解为CC是CAP的一个子集；如果重启（reset）的话，可以更换CC配置；
CC.EN置一，表示Controller已经可以开始处理NVM命令，从1到0表示Controller重启；
CC.EN与CSTS.RDY关系密切，CSTS.RDY总是在CC.EN之后由Controller改变，其他不符合执行顺序的操作都将产生未定义的行为；
Admin队列由host直接创建，AQA、ASQ、ACQ三个寄存器标识了Admin队列，而其他I/O队列则由Admin命令创建（例如，创建I/O CQ命令）；
Admin队列的头、尾DB寄存器标识为0，其他I/O队列标识由host按照一定规则分配；只有16bit的有效位，是因为队列深度最大64K。
实际的物理设备CAP.DSTRD值为0，对应驱动中的dev->db_stride为1（db_stride = 1 << CAP.DSTRD，以4字节为单位），之后的分析中默认db_stride为1。

原文链接：https://blog.csdn.net/qq_39021670/article/details/114896973

由dev->dbs的使用方式可知，每一个DB寄存器对中，前4个字节为SQ Tail DB，后4个字节为CQ Head DB。

/*
 * Write sq tail if we are asked to, or if the next command would wrap.
 *
 * @nvmeq:    queue whose submission-queue tail doorbell may be written
 * @write_sq: when false, the doorbell is only considered if the slot after
 *            sq_tail (wrapping at q_depth) equals last_sq_tail; when true,
 *            that check is skipped
 *
 * last_sq_tail is updated to sq_tail whenever the early return is not taken,
 * regardless of whether the MMIO write was actually issued.
 */
static inline void nvme_write_sq_db(struct nvme_queue *nvmeq, bool write_sq)
{
	if (!write_sq) {
		u16 next_tail = nvmeq->sq_tail + 1;

		if (next_tail == nvmeq->q_depth)
			next_tail = 0;
		if (next_tail != nvmeq->last_sq_tail)
			return;
	}

	/*
	 * Write the SQ tail into the first 4 bytes of this queue's doorbell
	 * pair (q_db itself, with no stride offset).
	 * NOTE(review): nvme_dbbuf_update_and_check_event() is presumably what
	 * decides whether the MMIO write is needed - confirm in its definition.
	 */
	if (nvme_dbbuf_update_and_check_event(nvmeq->sq_tail,
			nvmeq->dbbuf_sq_db, nvmeq->dbbuf_sq_ei))
		writel(nvmeq->sq_tail, nvmeq->q_db);
	nvmeq->last_sq_tail = nvmeq->sq_tail;
}

/*
 * Publish the current completion-queue head through the CQ head doorbell.
 *
 * @nvmeq: queue whose cq_head is written
 */
static inline void nvme_ring_cq_doorbell(struct nvme_queue *nvmeq)
{
	u16 head = nvmeq->cq_head;

	/*
	 * Write the CQ head into the second 4 bytes of the doorbell pair:
	 * the register sits db_stride entries past q_db.
	 * NOTE(review): nvme_dbbuf_update_and_check_event() is presumably what
	 * decides whether the MMIO write is needed - confirm in its definition.
	 */
	if (nvme_dbbuf_update_and_check_event(head, nvmeq->dbbuf_cq_db,
			nvmeq->dbbuf_cq_ei))
		writel(head, nvmeq->q_db + nvmeq->dev->db_stride);
}

/*
 * Interrupt handler for one NVMe queue.
 *
 * @irq:  interrupt number (unused in this function)
 * @data: the struct nvme_queue this interrupt belongs to
 *
 * Returns IRQ_HANDLED when cq_head differs from last_cq_head on entry or when
 * this run consumed at least one completion entry; IRQ_NONE otherwise.
 */
static irqreturn_t nvme_irq(int irq, void *data)
{
	struct nvme_queue *nvmeq = data;
	irqreturn_t ret = IRQ_NONE;
	u16 start, end;

	/*
	 * The rmb/wmb pair ensures we see all updates from a previous run of
	 * the irq handler, even if that was on another CPU.
	 */
	rmb();
	if (nvmeq->cq_head != nvmeq->last_cq_head)
		ret = IRQ_HANDLED;

	/* Walk to the current end of the CQ; this advances cq_head. */
	nvme_process_cq(nvmeq, &start, &end, -1);
	nvmeq->last_cq_head = nvmeq->cq_head;
	wmb();

	if (start != end) {
		/* Handle the consumed CQ entries one after another. */
		nvme_complete_cqes(nvmeq, start, end);
		return IRQ_HANDLED;
	}

	return ret;
}

依次取出SSD已经返回的完成数据，然后把CQ的head写入Doorbell寄存器。

/*
 * Consume every pending completion-queue entry and ring the CQ doorbell if
 * the head moved.
 *
 * @nvmeq: queue to scan
 * @start: out - cq_head before scanning
 * @end:   out - cq_head after scanning
 * @tag:   command_id to count; -1U counts every consumed entry
 *
 * Returns the number of consumed entries that matched @tag.
 */
static inline int nvme_process_cq(struct nvme_queue *nvmeq, u16 *start,
		u16 *end, unsigned int tag)
{
	const bool match_any = (tag == -1U);
	int matched = 0;

	*start = nvmeq->cq_head;
	for (; nvme_cqe_pending(nvmeq); nvme_update_cq_head(nvmeq)) {
		if (match_any || nvmeq->cqes[nvmeq->cq_head].command_id == tag)
			matched++;
	}
	*end = nvmeq->cq_head;

	/* Only touch the doorbell when at least one entry was consumed. */
	if (*end != *start)
		nvme_ring_cq_doorbell(nvmeq);

	return matched;
}

/*
 * Publish the current completion-queue head through the CQ head doorbell,
 * which sits db_stride entries past q_db.
 *
 * @nvmeq: queue whose cq_head is written
 */
static inline void nvme_ring_cq_doorbell(struct nvme_queue *nvmeq)
{
	u16 cq_head_now = nvmeq->cq_head;

	/*
	 * NOTE(review): nvme_dbbuf_update_and_check_event() is presumably what
	 * decides whether the MMIO write is needed - confirm in its definition.
	 */
	if (!nvme_dbbuf_update_and_check_event(cq_head_now, nvmeq->dbbuf_cq_db,
			nvmeq->dbbuf_cq_ei))
		return;

	writel(cq_head_now, nvmeq->q_db + nvmeq->dev->db_stride);
}

依次处理CQ中的数据，并将结果返回给block层。

/*
 * Record the completion status and result on the request, then complete it.
 *
 * @req:    block-layer request being completed
 * @status: little-endian status field taken from the completion entry
 * @result: result payload taken from the completion entry
 */
static inline void nvme_end_request(struct request *req, __le16 status,
		union nvme_result result)
{
	struct nvme_request *rq = nvme_req(req);

	/*
	 * The lowest bit of the status field is shifted out.
	 * NOTE(review): presumably the phase tag - confirm against the NVMe spec.
	 */
	rq->status = le16_to_cpu(status) >> 1;
	rq->result = result;
	/* inject error when permitted by fault injection framework */
	nvme_should_fail(req);
	/* Hand the request back to the block layer. */
	blk_mq_complete_request(req);
}

/*
 * Complete the single completion-queue entry at index @idx.
 *
 * @nvmeq: queue that owns the entry
 * @idx:   index into nvmeq->cqes
 */
static inline void nvme_handle_cqe(struct nvme_queue *nvmeq, u16 idx)
{
	volatile struct nvme_completion *cqe = &nvmeq->cqes[idx];
	struct request *req;

	/*
	 * AEN requests are special as they don't time out and can
	 * survive any kind of queue freeze and often don't respond to
	 * aborts.  We don't even bother to allocate a struct request
	 * for them but rather special case them here.
	 */
	if (unlikely(nvmeq->qid == 0 &&
			cqe->command_id >= NVME_AQ_BLK_MQ_DEPTH)) {
		nvme_complete_async_event(&nvmeq->dev->ctrl,
				cqe->status, &cqe->result);
		return;
	}

	/* Map the tag (command_id) back to its struct request. */
	req = blk_mq_tag_to_rq(nvme_queue_tagset(nvmeq), cqe->command_id);
	if (unlikely(!req)) {
		dev_warn(nvmeq->dev->ctrl.device,
			"invalid id %d completed on queue %d\n",
			cqe->command_id, le16_to_cpu(cqe->sq_id));
		return;
	}

	trace_nvme_sq(req, cqe->sq_head, nvmeq->sq_tail);
	nvme_end_request(req, cqe->status, cqe->result);
}

/*
 * Complete every completion-queue entry in [start, end), wrapping the index
 * back to 0 when it reaches q_depth.
 *
 * @nvmeq: queue that owns the entries
 * @start: index of the first entry to complete
 * @end:   index one past the last entry to complete
 */
static void nvme_complete_cqes(struct nvme_queue *nvmeq, u16 start, u16 end)
{
	while (start != end) {
		nvme_handle_cqe(nvmeq, start);
		if (++start == nvmeq->q_depth)
			start = 0;
	}
}

static const struct blk_mq_ops nvme_mq_admin_ops = {.queue_rq       = nvme_queue_rq,.complete       = nvme_pci_complete_rq,.init_hctx      = nvme_admin_init_hctx,.init_request   = nvme_init_request,.timeout        = nvme_timeout,
};static const struct blk_mq_ops nvme_mq_ops = {.queue_rq       = nvme_queue_rq,.complete       = nvme_pci_complete_rq,.commit_rqs     = nvme_commit_rqs,.init_hctx      = nvme_init_hctx,.init_request   = nvme_init_request,.map_queues     = nvme_pci_map_queues,.timeout        = nvme_timeout,.poll           = nvme_poll,
};

admin queue

nvme_queue_rq

io queue

nvme_queue_rq

nvme_commit_rqs

/*
 * Pass one request straight to the driver's ->queue_rq() and account for
 * the outcome.
 *
 * @hctx:   hardware context the request is issued on
 * @rq:     request to issue
 * @cookie: out - set to the request's cookie on BLK_STS_OK, to
 *          BLK_QC_T_NONE on any error other than the two resource
 *          statuses, and left untouched on BLK_STS_RESOURCE /
 *          BLK_STS_DEV_RESOURCE
 * @last:   forwarded to the driver as blk_mq_queue_data.last
 *
 * Returns the status reported by ->queue_rq().
 */
static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx,
		struct request *rq, blk_qc_t *cookie, bool last)
{
	struct request_queue *queue = rq->q;
	struct blk_mq_queue_data bd = {
		.rq = rq,
		.last = last,
	};
	blk_qc_t issued_cookie = request_to_qc_t(hctx, rq);
	blk_status_t status;

	/*
	 * For OK queue, we are done. For error, caller may kill it.
	 * Any other error (busy), just add it to our list as we
	 * previously would have done.
	 */
	status = queue->mq_ops->queue_rq(hctx, &bd);

	if (status == BLK_STS_RESOURCE || status == BLK_STS_DEV_RESOURCE) {
		/* Driver is busy: flag the hctx and requeue the request. */
		blk_mq_update_dispatch_busy(hctx, true);
		__blk_mq_requeue_request(rq);
		return status;
	}

	blk_mq_update_dispatch_busy(hctx, false);
	*cookie = (status == BLK_STS_OK) ? issued_cookie : BLK_QC_T_NONE;
	return status;
}

*/
bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,bool got_budget)
{struct blk_mq_hw_ctx *hctx;struct request *rq, *nxt;bool no_tag = false;int errors, queued;blk_status_t ret = BLK_STS_OK;bool no_budget_avail = false;if (list_empty(list))return false;WARN_ON(!list_is_singular(list) && got_budget);/** Now process all the entries, sending them to the driver.*/errors = queued = 0;do {struct blk_mq_queue_data bd;rq = list_first_entry(list, struct request, queuelist);hctx = rq->mq_hctx;if (!got_budget && !blk_mq_get_dispatch_budget(hctx)) {blk_mq_put_driver_tag(rq);no_budget_avail = true;break;}if (!blk_mq_get_driver_tag(rq)) {/** The initial allocation attempt failed, so we need to* rerun the hardware queue when a tag is freed. The* waitqueue takes care of that. If the queue is run* before we add this entry back on the dispatch list,* we'll re-run it below.*/if (!blk_mq_mark_tag_wait(hctx, rq)) {blk_mq_put_dispatch_budget(hctx);/** For non-shared tags, the RESTART check* will suffice.*/if (hctx->flags & BLK_MQ_F_TAG_SHARED)no_tag = true;break;}}list_del_init(&rq->queuelist);bd.rq = rq;/** Flag last if we have no more requests, or if we have more* but can't assign a driver tag to it.*/if (list_empty(list))bd.last = true;else {nxt = list_first_entry(list, struct request, queuelist);bd.last = !blk_mq_get_driver_tag(nxt);}//下发ioret = q->mq_ops->queue_rq(hctx, &bd);if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE) {blk_mq_handle_dev_resource(rq, list);break;}if (unlikely(ret != BLK_STS_OK)) {errors++;blk_mq_end_request(rq, BLK_STS_IOERR);continue;}queued++;} while (!list_empty(list));hctx->dispatched[queued_to_index(queued)]++;/** Any items that need requeuing? 
Stuff them into hctx->dispatch,* that is where we will continue on next queue run.*/if (!list_empty(list)) {bool needs_restart;/** If we didn't flush the entire list, we could have told* the driver there was more coming, but that turned out to* be a lie.*/if (q->mq_ops->commit_rqs)//nvme io commitq->mq_ops->commit_rqs(hctx);spin_lock(&hctx->lock);list_splice_tail_init(list, &hctx->dispatch);spin_unlock(&hctx->lock);/** Order adding requests to hctx->dispatch and checking* SCHED_RESTART flag. The pair of this smp_mb() is the one* in blk_mq_sched_restart(). Avoid restart code path to* miss the new added requests to hctx->dispatch, meantime* SCHED_RESTART is observed here.*/smp_mb();/** If SCHED_RESTART was set by the caller of this function and* it is no longer set that means that it was cleared by another* thread and hence that a queue rerun is needed.** If 'no_tag' is set, that means that we failed getting* a driver tag with an I/O scheduler attached. If our dispatch* waitqueue is no longer active, ensure that we run the queue* AFTER adding our entries back to the list.** If no I/O scheduler has been configured it is possible that* the hardware queue got stopped and restarted before requests* were pushed back onto the dispatch list. Rerun the queue to* avoid starvation. Notes:* - blk_mq_run_hw_queue() checks whether or not a queue has*   been stopped before rerunning a queue.* - Some but not all block drivers stop a queue before*   returning BLK_STS_RESOURCE. Two exceptions are scsi-mq*   and dm-rq.** If driver returns BLK_STS_RESOURCE and SCHED_RESTART* bit is set, run queue after a delay to avoid IO stalls* that could otherwise occur if the queue is idle.  
We'll do* similar if we couldn't get budget and SCHED_RESTART is set.*/needs_restart = blk_mq_sched_needs_restart(hctx);if (!needs_restart ||(no_tag && list_empty_careful(&hctx->dispatch_wait.entry)))blk_mq_run_hw_queue(hctx, true);else if (needs_restart && (ret == BLK_STS_RESOURCE ||no_budget_avail))blk_mq_delay_run_hw_queue(hctx, BLK_MQ_RESOURCE_DELAY);blk_mq_update_dispatch_busy(hctx, true);return false;} elseblk_mq_update_dispatch_busy(hctx, false);/** If the host/device is unable to accept more work, inform the* caller of that.*/if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE)return false;return (queued + errors) != 0;
}