#include <linux/bio.h>
#include <linux/completion.h>
#include <linux/blkdev.h> // For SECTOR_SHIFT

// Next sector to assign a mapped_buf to. Skip first disk block
atomic64_t nvgpu_swap_next_sector = {4};

// Callback for completion of the I/O chain
// TODO: Error checking and handling
static void complete_swap_io(struct bio *bio) {
	struct nvgpu_mapped_buf *m = bio->bi_private;
	bio_put(bio);
	complete(&m->os_priv.swap_io_done);
}

// Queue a command to copy out an SGT to disk
// TODO: Cache bdev
// TODO: Track, allocate, and recycle individual swap buffers on disk instead
//       of only supporting a global reset
int copy(struct sg_table *sgt, int op, struct nvgpu_mapped_buf *m) {
	unsigned int i;
	struct scatterlist *sg;
	struct bio *bio;
	int err = 0;
	int sg_cnt = sgt->nents;
	sector_t sector = m->os_priv.swap_sector;
	// Find and open the block device
	struct block_device *bdev = blkdev_get_by_path("/dev/nvme0n1",
			FMODE_READ | FMODE_WRITE, copy);
	if (unlikely(IS_ERR(bdev))) {
		printk(KERN_WARNING "Unable to find `nvme0`, err %ld!\n",
				PTR_ERR(bdev));
		return -ENODEV;
	}
	// Assign a sector on-disk (0 indicates unassigned, we start at 4)
	if (sector == 0) {
		// Read block device size in sectors, and fail if we'd use more than 1/3rd
		// of the disk (to stay in SLC-emulation-mode).
		// TODO: Issue NVMe DSM commands to try to manage this better? Read-only
		//       regions should be able to be moved to TLC safely, whereas other
		//       data should be kept in the SLC cache to reduce wear.
		if (atomic64_read(&nvgpu_swap_next_sector) >= i_size_read(bdev->bd_inode)/3) {
			err = -ENOMEM;
			goto out_put;
		}
		// Hand out sectors sequentially, and statically
		// TODO: Intelligent sector allocation
		sector = atomic64_add_return(m->size >> SECTOR_SHIFT,
				&nvgpu_swap_next_sector);
		sector -= (m->size >> SECTOR_SHIFT);
		m->os_priv.swap_sector = sector;
	}
	// Reset the .done variable in the completion
	reinit_completion(&m->os_priv.swap_io_done);

	// bio_alloc() will never fail when allocating <= BIO_MAX_PAGES
	bio = bio_alloc(GFP_KERNEL, min(sg_cnt, BIO_MAX_PAGES));
	bio->bi_bdev = bdev; // Switch to bio_set_dev(bdev) in newer kernels
	bio->bi_iter.bi_sector = sector;
	bio_set_op_attrs(bio, op, REQ_SYNC); // REQ_SYNC is identical to WRITE_ODIRECT
	bio->bi_private = m;
	bio->bi_end_io = complete_swap_io;

	// Copy the scatter-gather table (sgt) into a block I/O vector (bio vec)
	// bio_chain() approach borrowed from drivers/nvme/target/io-cmd.c:nvmet_execute_rw()
	for_each_sg(sgt->sgl, sg, sgt->nents, i) {
		// On most iterations, this inner loop shouldn't happen at all. This loop
		// conditional only triggers if we fill up the bio and are unable to map
		// the full length of an SGL entry.
		while (bio_add_page(bio, sg_page(sg), sg_dma_len(sg), sg->offset) != sg_dma_len(sg)) {
			// Uh oh! We ran out of space in the bio. Allocate a new one and chain it...
			struct bio *prev = bio;
			bio = bio_alloc(GFP_KERNEL, min(sg_cnt, BIO_MAX_PAGES));
			bio->bi_bdev = bdev; // Switch to bio_set_dev(bdev) in newer kernels
			bio->bi_iter.bi_sector = sector;
			bio_set_op_attrs(bio, op, op == REQ_OP_WRITE ? WRITE_ODIRECT : 0);
			bio_chain(bio, prev);
			// Get the I/O started
			submit_bio(prev);
			// No need to call bio_put() as that's automatically managed for chained bios
		}
		sector += sg_dma_len(sg) >> 9;
		sg_cnt--;
	}

	// Async submit. Caller should wait_for_completion_io(&m->os_priv.swap_io_done);
	// Does not fail. Error reported via completion handler.
	submit_bio(bio);

out_put:
	// Release our block device handle
	blkdev_put(bdev, FMODE_WRITE | FMODE_READ); // Is this safe?
	return err;
}

// Patterned off how __nvgpu_vm_find_mapped_buf_reverse() works in vm.c
// Needs struct nvgpu_rbtree_node *node, struct nvgpu_rbtree_node *root,
// and struct nvgpu_mapped_buf *m.
// Steps until end of rbtree OR !m
#define for_each_buffer(node, root, m) \
	for (nvgpu_rbtree_enum_start(0, &node, root); \
	     node && (uintptr_t)(m = mapped_buffer_from_rbtree_node(node)); \
	     nvgpu_rbtree_enum_next(&node, node))

// New, fast replacement for searching through every mapped buffer with the
// above macro to find a match
struct nvgpu_mapped_buf* dmabuf_to_mapped_buf(struct dma_buf *dmabuf) {
	struct list_head *nvmap_priv = nvmap_get_priv_list(dmabuf);
	struct nvgpu_mapped_buf *mapped_buffer;
	struct nvgpu_mapped_buf_priv *priv;

	if (IS_ERR(nvmap_priv))
		return ERR_PTR(-EOPNOTSUPP);

	priv = list_first_entry_or_null(nvmap_priv,
			struct nvgpu_mapped_buf_priv, nvmap_priv_entry);
	if (unlikely(!priv)) {
		printk(KERN_ERR "nvgpu: State tracking error for fast reverse lookups. Have unattached dmabuf!\n");
		return ERR_PTR(-ENOTRECOVERABLE);
	}
	mapped_buffer = container_of(priv, struct nvgpu_mapped_buf, os_priv);
	if (unlikely(mapped_buffer->os_priv.dmabuf != dmabuf)) {
		printk(KERN_ERR "nvgpu: dmabuf_to_mapped_buf mapping inconsistent! BUG!\n");
		return ERR_PTR(-ENOTRECOVERABLE);
	}
	if (!list_is_singular(&priv->nvmap_priv_entry)) {
		printk(KERN_WARNING "nvgpu: Requesting paging on memory with multiple mappings! Aborting...\n");
		return ERR_PTR(-EOPNOTSUPP);
	}
	return mapped_buffer;
}

int copy_all(struct vm_gk20a *vm) {
	struct nvgpu_rbtree_node *node;
	struct nvgpu_mapped_buf *m;

	for_each_buffer(node, vm->mapped_buffers, m) {
		// TODO
		continue;
	}
	return 0;
}

int copy_out(struct sg_table *sgt, struct nvgpu_mapped_buf *m) {
	return copy(sgt, REQ_OP_WRITE, m);
}

int copy_in(struct sg_table *sgt, struct nvgpu_mapped_buf *m) {
	return copy(sgt, REQ_OP_READ, m);
}
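
// Usage sketch (illustrative, not part of the driver): copy() submits its bio
// chain asynchronously and reports errors through complete_swap_io(), so a
// caller is expected to wait on the per-buffer completion before touching the
// data or reusing the sg_table. The helper below is a hypothetical example of
// that pattern; it assumes the caller has already built an sg_table describing
// the buffer's backing pages.
static inline int copy_out_sync(struct sg_table *sgt, struct nvgpu_mapped_buf *m) {
	int err = copy_out(sgt, m); // Queues the write bio chain and returns immediately
	if (err)
		return err;
	// Block until complete_swap_io() fires for the final bio in the chain
	wait_for_completion_io(&m->os_priv.swap_io_done);
	return 0;
}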