#include <linux/scatterlist.h>
#include <linux/bio.h>
#include <linux/blkdev.h>     // For SECTOR_SHIFT
#include <linux/completion.h> // For complete() and reinit_completion()
#include <linux/atomic.h>     // For atomic64_t

// Next sector to assign a mapped_buf to. Skip the first disk block.
atomic64_t nvgpu_swap_next_sector = ATOMIC64_INIT(4);
// Callback for completion of the I/O chain
// TODO: Error checking and handling
static void complete_swap_io(struct bio *bio) {
        struct nvgpu_mapped_buf *m = bio->bi_private;
        bio_put(bio);
        complete(&m->os_priv.swap_io_done);
}
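// A minimal sketch of the error handling the TODO above calls for, assuming a
// ~4.9-era kernel (matching the bi_bdev usage below) where the bio's status is
// carried in bio->bi_error, and assuming a hypothetical swap_io_error field in
// nvgpu_mapped_buf_priv that the waiter could inspect after the completion
// fires. Not wired in anywhere; illustration only.
#if 0
static void complete_swap_io_checked(struct bio *bio) {
        struct nvgpu_mapped_buf *m = bio->bi_private;
        if (unlikely(bio->bi_error))
                // swap_io_error is hypothetical; nvgpu_mapped_buf_priv has no such field yet
                m->os_priv.swap_io_error = bio->bi_error;
        bio_put(bio);
        complete(&m->os_priv.swap_io_done);
}
#endif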
// Queue a command to copy out an SGT to disk
// TODO: Cache bdev
// TODO: Track, allocate, and recycle individual swap buffers on disk instead
// of only supporting a global reset
int copy(struct sg_table *sgt, int op, struct nvgpu_mapped_buf *m) {
        unsigned int i;
        struct scatterlist *sg;
        struct bio *bio;
        int err = 0;
        int sg_cnt = sgt->nents;
        sector_t sector = m->os_priv.swap_sector;
        // Find and open the block device
        struct block_device *bdev = blkdev_get_by_path("/dev/nvme0n1", FMODE_READ | FMODE_WRITE, copy);
        if (unlikely(IS_ERR(bdev))) {
                printk(KERN_WARNING "Unable to open `/dev/nvme0n1`, err %ld!\n", PTR_ERR(bdev));
                return -ENODEV;
        }
        // Assign a sector on-disk (0 indicates unassigned, we start at 4)
        if (sector == 0) {
                // Read the block device size in sectors, and fail if we'd use more
                // than 1/3rd of the disk (to stay in SLC-emulation-mode).
                // TODO: Issue NVMe DSM commands to try to manage this better? Read-only
                //       regions should be able to be moved to TLC safely, whereas other
                //       data should be kept in the SLC cache to reduce wear.
                if (atomic64_read(&nvgpu_swap_next_sector) >=
                    (i_size_read(bdev->bd_inode) >> SECTOR_SHIFT) / 3) {
                        err = -ENOMEM;
                        goto out_put;
                }
                // Hand out sectors sequentially, and statically
                // TODO: Intelligent sector allocation
                sector = atomic64_add_return(m->size >> SECTOR_SHIFT, &nvgpu_swap_next_sector);
                sector -= (m->size >> SECTOR_SHIFT);
                m->os_priv.swap_sector = sector;
        }
        // Reset the .done variable in the completion
        reinit_completion(&m->os_priv.swap_io_done);
        // bio_alloc() will never fail when allocating <= BIO_MAX_PAGES
        bio = bio_alloc(GFP_KERNEL, min(sg_cnt, BIO_MAX_PAGES));
        bio->bi_bdev = bdev; // Switch to bio_set_dev(bdev) in newer kernels
        bio->bi_iter.bi_sector = sector;
        bio_set_op_attrs(bio, op, REQ_SYNC); // REQ_SYNC gives WRITE_ODIRECT-like synchronous behavior
        bio->bi_private = m;
        bio->bi_end_io = complete_swap_io;
        // Copy the scatter-gather table (sgt) into a block I/O vector (bio vec)
        // bio_chain() approach borrowed from drivers/nvme/target/io-cmd.c:nvmet_execute_rw()
        for_each_sg(sgt->sgl, sg, sgt->nents, i) {
                // On most iterations, this inner loop shouldn't run at all. Its
                // condition only triggers if we fill up the bio and are unable to
                // map the full length of an SGL entry.
                while (bio_add_page(bio, sg_page(sg), sg_dma_len(sg), sg->offset) != sg_dma_len(sg)) {
                        // Uh oh! We ran out of space in the bio. Allocate a new one and chain it...
                        struct bio *prev = bio;
                        bio = bio_alloc(GFP_KERNEL, min(sg_cnt, BIO_MAX_PAGES));
                        bio->bi_bdev = bdev; // Switch to bio_set_dev(bdev) in newer kernels
                        bio->bi_iter.bi_sector = sector;
                        bio_set_op_attrs(bio, op, op == REQ_OP_WRITE ? WRITE_ODIRECT : 0);
                        bio_chain(bio, prev);
                        // Get the I/O started
                        submit_bio(prev);
                        // No need to call bio_put() as that's automatically managed for chained bios
                }
                sector += sg_dma_len(sg) >> SECTOR_SHIFT;
                sg_cnt--;
        }
        // Async submit. Caller should wait_for_completion_io(&m->os_priv.swap_io_done);
        // Does not fail. Error reported via completion handler.
        submit_bio(bio);
out_put:
        // Release our block device handle
        blkdev_put(bdev, FMODE_WRITE | FMODE_READ); // Is this safe?
        return err;
}
// Patterned on how __nvgpu_vm_find_mapped_buf_reverse() works in vm.c
// Requires struct nvgpu_rbtree_node *node, struct nvgpu_rbtree_node *root,
// and struct nvgpu_mapped_buf *m to be in scope.
// Steps until the end of the rbtree OR !m
#define for_each_buffer(node, root, m) \
        for (nvgpu_rbtree_enum_start(0, &node, root); \
             node && (uintptr_t)(m = mapped_buffer_from_rbtree_node(node)); \
             nvgpu_rbtree_enum_next(&node, node))

// A fast replacement for scanning all buffers with the above macro to find the
// nvgpu_mapped_buf backing a given dma_buf
struct nvgpu_mapped_buf* dmabuf_to_mapped_buf(struct dma_buf *dmabuf) {
        struct list_head *nvmap_priv = nvmap_get_priv_list(dmabuf);
        struct nvgpu_mapped_buf *mapped_buffer;
        struct nvgpu_mapped_buf_priv *priv;

        if (IS_ERR(nvmap_priv))
                return ERR_PTR(-EOPNOTSUPP);

        priv = list_first_entry_or_null(nvmap_priv,
                        struct nvgpu_mapped_buf_priv, nvmap_priv_entry);
        if (unlikely(!priv)) {
                printk(KERN_ERR "nvgpu: State tracking error for fast reverse lookups. Got an unattached dmabuf!\n");
                return ERR_PTR(-ENOTRECOVERABLE);
        }

        mapped_buffer = container_of(priv, struct nvgpu_mapped_buf, os_priv);
        if (unlikely(mapped_buffer->os_priv.dmabuf != dmabuf)) {
                printk(KERN_ERR "nvgpu: dmabuf_to_mapped_buf mapping inconsistent! BUG!\n");
                return ERR_PTR(-ENOTRECOVERABLE);
        }

        if (!list_is_singular(nvmap_priv)) {
                printk(KERN_WARNING "nvgpu: Requesting paging on memory with multiple mappings! Aborting...\n");
                return ERR_PTR(-EOPNOTSUPP);
        }

        return mapped_buffer;
}
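// Hypothetical caller sketch: dmabuf_to_mapped_buf() reports failure via
// ERR_PTR() rather than NULL, so callers must test the result with IS_ERR()
// before dereferencing it. The function name below is made up for illustration.
#if 0
static int example_reverse_lookup(struct dma_buf *dmabuf) {
        struct nvgpu_mapped_buf *m = dmabuf_to_mapped_buf(dmabuf);
        if (IS_ERR(m))
                return PTR_ERR(m); // -EOPNOTSUPP or -ENOTRECOVERABLE
        // ... m is now safe to use ...
        return 0;
}
#endif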
int copy_all(struct vm_gk20a *vm) {
        struct nvgpu_rbtree_node *node;
        struct nvgpu_mapped_buf *m;

        for_each_buffer(node, vm->mapped_buffers, m) {
                // TODO
                continue;
        }
        return 0;
}
int copy_out(struct sg_table *sgt, struct nvgpu_mapped_buf *m) {
        return copy(sgt, REQ_OP_WRITE, m);
}

int copy_in(struct sg_table *sgt, struct nvgpu_mapped_buf *m) {
        return copy(sgt, REQ_OP_READ, m);
}
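// Hypothetical usage sketch: as the comment in copy() notes, submission is
// asynchronous, so a caller pages a buffer out by issuing copy_out() and then
// blocking on the completion. How the caller obtains the sg_table for the
// buffer is driver-specific and assumed to be handled already; the function
// name below is made up for illustration.
#if 0
static int example_page_out(struct sg_table *sgt, struct nvgpu_mapped_buf *m) {
        int err = copy_out(sgt, m);
        if (err)
                return err;
        // Block until complete_swap_io() runs for the whole bio chain
        wait_for_completion_io(&m->os_priv.swap_io_done);
        return 0;
}
#endif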