From a2f8265e5d437e49244ffeb2926b46c056adb9fb Mon Sep 17 00:00:00 2001 From: Ian Moffett Date: Sat, 13 Jul 2024 17:39:10 -0400 Subject: kernel: nvme: Add nvme bdevsw + support for reads Signed-off-by: Ian Moffett --- sys/dev/ic/nvme.c | 138 +++++++++++++++++++++++++++++++++++++++++++ sys/include/dev/ic/nvmevar.h | 26 ++++++++ 2 files changed, 164 insertions(+) (limited to 'sys') diff --git a/sys/dev/ic/nvme.c b/sys/dev/ic/nvme.c index 76127f2..4bceffd 100644 --- a/sys/dev/ic/nvme.c +++ b/sys/dev/ic/nvme.c @@ -34,6 +34,8 @@ #include #include #include +#include +#include #include #include #include @@ -46,6 +48,7 @@ #define pr_trace(fmt, ...) kprintf("nvme: " fmt, ##__VA_ARGS__) #define pr_error(...) pr_trace(__VA_ARGS__) +static struct bdevsw nvme_bdevsw; static TAILQ_HEAD(,nvme_ns) namespaces; static struct pci_device *nvme_dev; static struct timer tmr; @@ -58,6 +61,25 @@ is_4k_aligned(void *ptr) return ((uintptr_t)ptr & (0x1000 - 1)) == 0; } +/* + * Fetch a namespace from its device ID + * + * @dev: Device ID of namespace to fetch. + */ +static struct nvme_ns * +nvme_get_ns(dev_t dev) +{ + struct nvme_ns *ns; + + TAILQ_FOREACH(ns, &namespaces, link) { + if (ns->dev == dev) { + return ns; + } + } + + return NULL; +} + /* * Poll register to have 'bits' set/unset. * @@ -347,6 +369,107 @@ nvme_init_pci(void) pci_writel(nvme_dev, PCIREG_CMDSTATUS, tmp); } +/* + * Issue a read/write command for a specific + * namespace. + * + * `buf' must be 4k aligned. + */ +static int +nvme_rw(struct nvme_ns *ns, char *buf, off_t slba, size_t count, bool write) +{ + struct nvme_cmd cmd = {0}; + struct nvme_rw_cmd *rw = &cmd.rw; + + if (!is_4k_aligned(buf)) { + return -1; + } + + rw->opcode = write ? NVME_OP_WRITE : NVME_OP_READ; + rw->nsid = ns->nsid; + rw->slba = slba; + rw->len = count - 1; + rw->prp1 = VIRT_TO_PHYS(buf); + return nvme_poll_submit_cmd(&ns->ioq, cmd); +} + +/* + * Device interface read/write helper. + * + * @dev: Device ID. + * @sio: SIO transaction descriptor. + * @write: True if this is a write operation. + * + * This routine uses an internal buffer aligned on a + * 4 KiB boundary to enable flexibility with the input + * SIO buffer. This allows the SIO buffer to be unaligned + * and/or sized smaller than the namespace block size. + */ +static int +nvme_dev_rw(dev_t dev, struct sio_txn *sio, bool write) +{ + struct nvme_ns *ns; + size_t block_count, len; + off_t block_off, read_off; + int status; + char *buf; + + if (sio == NULL) + return -EINVAL; + if (sio->len == 0 || sio->buf == NULL) + return -EINVAL; + + /* + * Get the NVMe namespace. This should not fail + * but handle if it does just in case. + */ + ns = nvme_get_ns(dev); + if (__unlikely(ns == NULL)) + return -EIO; + + /* Calculate the block count and offset */ + block_count = ALIGN_UP(sio->len, ns->lba_bsize); + block_count /= ns->lba_bsize; + block_off = sio->offset / ns->lba_bsize; + + /* Allocate internal buffer */ + len = block_count * ns->lba_bsize; + buf = dynalloc_memalign(len, 0x1000); + if (buf == NULL) + return -ENOMEM; + + /* + * If this is a write, zero the internal buffer and copy over + * the contents of the SIO buffer. + */ + if (write) { + memset(buf, 0, len); + memcpy(buf, sio->buf, sio->len); + } + + /* + * Perform the r/w operation and copy internal buffer + * out if this is a read operation. + */ + status = nvme_rw(ns, buf, block_off, block_count, write); + if (status == 0 && !write) { + read_off = sio->offset & (ns->lba_bsize - 1); + memcpy(sio->buf, buf + read_off, sio->len); + } + + dynfree(buf); + return status; +} + +/* + * Device interface read + */ +static int +nvme_dev_read(dev_t dev, struct sio_txn *sio, int flags) +{ + return nvme_dev_rw(dev, sio, false); +} + /* * Initializes an NVMe namespace. * @@ -356,6 +479,8 @@ nvme_init_pci(void) static int nvme_init_ns(struct nvme_ctrl *ctrl, uint8_t nsid) { + devmajor_t major; + char devname[128]; struct nvme_ns *ns = NULL; struct nvme_id_ns *idns = NULL; uint8_t lba_format; @@ -391,6 +516,15 @@ nvme_init_ns(struct nvme_ctrl *ctrl, uint8_t nsid) } TAILQ_INSERT_TAIL(&namespaces, ns, link); + snprintf(devname, sizeof(devname), "nvme0n%d", ns->nsid); + + /* Allocate major and minor */ + major = dev_alloc_major(); + ns->dev = dev_alloc(major); + + /* Register the namespace */ + dev_register(major, ns->dev, &nvme_bdevsw); + devfs_create_entry(devname, major, ns->dev, 0444); done: if (ns != NULL && status != 0) dynfree(ns); @@ -519,4 +653,8 @@ nvme_init(void) return nvme_init_ctrl(bar); } +static struct bdevsw nvme_bdevsw = { + .read = nvme_dev_read +}; + DRIVER_EXPORT(nvme_init); diff --git a/sys/include/dev/ic/nvmevar.h b/sys/include/dev/ic/nvmevar.h index b16f51d..ee829dc 100644 --- a/sys/include/dev/ic/nvmevar.h +++ b/sys/include/dev/ic/nvmevar.h @@ -41,6 +41,10 @@ #define ID_CNS_CTRL 0x01 /* Identify controller */ #define ID_CNS_NSID_LIST 0x07 /* Active NSID list */ +/* I/O commands */ +#define NVME_OP_WRITE 0x01 +#define NVME_OP_READ 0x02 + struct nvme_identify_cmd { uint8_t opcode; uint8_t flags; @@ -94,11 +98,32 @@ struct nvme_create_iosq_cmd { uint64_t unused3[2]; }; +/* Read/write */ +struct nvme_rw_cmd { + uint8_t opcode; + uint8_t flags; + uint16_t cid; + uint32_t nsid; + uint64_t unused; + uint64_t metadata; + uint64_t prp1; + uint64_t prp2; + uint64_t slba; + uint16_t len; + uint16_t control; + uint32_t dsmgmt; + uint32_t ref; + uint16_t apptag; + uint16_t appmask; +}; + + struct nvme_cmd { union { struct nvme_identify_cmd identify; struct nvme_create_iocq_cmd create_iocq; struct nvme_create_iosq_cmd create_iosq; + struct nvme_rw_cmd rw; }; }; @@ -203,6 +228,7 @@ struct nvme_ns { struct nvme_queue ioq; /* I/O queue */ struct nvme_lbaf lba_fmt; /* LBA format */ struct nvme_ctrl *ctrl; /* NVMe controller */ + dev_t dev; TAILQ_ENTRY(nvme_ns) link; }; -- cgit v1.2.3