#include <linux/bio.h>
#include <linux/completion.h>
#include <linux/blkdev.h> // For SECTOR_SHIFT

// Next sector to assign a mapped_buf to. Skip first disk block
atomic64_t nvgpu_swap_next_sector = {4};

// Callback for completion of the I/O chain
// TODO: Error checking and handling
static void complete_swap_io(struct bio *bio) {
	struct nvgpu_mapped_buf *m = bio->bi_private;
	bio_put(bio);
	complete(&m->os_priv.swap_io_done);
}

// Queue a command to copy out an SGT to disk
// TODO: Cache bdev
// TODO: Track, allocate, and recycle individual swap buffers on disk instead
//       of only supporting a global reset
int copy(struct sg_table *sgt, int op, struct nvgpu_mapped_buf *m) {
	unsigned int i;
	struct scatterlist *sg;
	struct bio *bio;
	int err = 0;
	int sg_cnt = sgt->nents;
	sector_t sector = m->os_priv.swap_sector;
	// Find and open the block device
	struct block_device *bdev = blkdev_get_by_path("/dev/nvme0n1",
			FMODE_READ | FMODE_WRITE, copy);
	if (unlikely(IS_ERR(bdev))) {
		printk(KERN_WARNING "Unable to find `nvme0`, err %ld!\n",
				PTR_ERR(bdev));
		return -ENODEV;
	}
	// Assign a sector on-disk (0 indicates unassigned, we start at 4)
	if (sector == 0) {
		// Read block device size in sectors, and fail if we'd use more than 1/3rd
		// of the disk (to stay in SLC-emulation-mode).
		// TODO: Issue NVMe DSM commands to try to manage this better? Read-only
		//       regions should be able to be moved to TLC safely, whereas other
		//       data should be kept in the SLC cache to reduce wear.
		if (atomic64_read(&nvgpu_swap_next_sector) >= i_size_read(bdev->bd_inode)/3) {
			err = -ENOMEM;
			goto out_put;
		}
		// Hand out sectors sequentially, and statically
		// TODO: Intelligent sector allocation
		sector = atomic64_add_return(m->size >> SECTOR_SHIFT,
				&nvgpu_swap_next_sector);
		sector -= (m->size >> SECTOR_SHIFT);
		m->os_priv.swap_sector = sector;
	}
	// Reset the .done variable in the completion
	reinit_completion(&m->os_priv.swap_io_done);

	// bio_alloc() will never fail when allocating <= BIO_MAX_PAGES
	bio = bio_alloc(GFP_KERNEL, min(sg_cnt, BIO_MAX_PAGES));
	bio->bi_bdev = bdev; // Switch to bio_set_dev(bdev) in newer kernels
	bio->bi_iter.bi_sector = sector;
	bio_set_op_attrs(bio, op, REQ_SYNC); // REQ_SYNC is identical to WRITE_ODIRECT
	bio->bi_private = m;
	bio->bi_end_io = complete_swap_io;

	// Copy the scatter-gather table (sgt) into a block I/O vector (bio vec)
	// bio_chain() approach borrowed from drivers/nvme/target/io-cmd.c:nvmet_execute_rw()
	for_each_sg(sgt->sgl, sg, sgt->nents, i) {
		// On most iterations, this inner loop shouldn't happen at all. This loop
		// conditional only triggers if we fill up the bio and are unable to map
		// the full length of an SGL entry.
		while (bio_add_page(bio, sg_page(sg), sg_dma_len(sg), sg->offset) != sg_dma_len(sg)) {
			// Uh oh! We ran out of space in the bio. Allocate a new one and chain it...
			struct bio *prev = bio;
			bio = bio_alloc(GFP_KERNEL, min(sg_cnt, BIO_MAX_PAGES));
			bio->bi_bdev = bdev; // Switch to bio_set_dev(bdev) in newer kernels
			bio->bi_iter.bi_sector = sector;
			bio_set_op_attrs(bio, op, op == REQ_OP_WRITE ? WRITE_ODIRECT : 0);
			bio_chain(bio, prev);
			// Get the I/O started
			submit_bio(prev);
			// No need to call bio_put() as that's automatically managed for chained bios
		}
		sector += sg_dma_len(sg) >> 9;
		sg_cnt--;
	}

	// Async submit. Caller should wait_for_completion_io(&m->os_priv.swap_io_done);
	// Does not fail. Error reported via completion handler.
	submit_bio(bio);

out_put:
	// Release our block device handle
	blkdev_put(bdev, FMODE_WRITE | FMODE_READ); // Is this safe?
	return err;
}

// Patterned off how __nvgpu_vm_find_mapped_buf_reverse() works in vm.c
// Needs struct nvgpu_rbtree_node *node, struct nvgpu_rbtree_node *root,
// and struct nvgpu_mapped_buf *m.
// Steps until end of rbtree OR !m
#define for_each_buffer(node, root, m) \
	for (nvgpu_rbtree_enum_start(0, &node, root); \
	     node && (uintptr_t)(m = mapped_buffer_from_rbtree_node(node)); \
	     nvgpu_rbtree_enum_next(&node, node))

// New, fast replacement for searching through every mapped buffer with the
// above macro to find a match
struct nvgpu_mapped_buf* dmabuf_to_mapped_buf(struct dma_buf *dmabuf) {
	struct list_head *nvmap_priv = nvmap_get_priv_list(dmabuf);
	struct nvgpu_mapped_buf *mapped_buffer;
	struct nvgpu_mapped_buf_priv *priv;

	if (IS_ERR(nvmap_priv))
		return ERR_PTR(-EOPNOTSUPP);

	priv = list_first_entry_or_null(nvmap_priv,
			struct nvgpu_mapped_buf_priv, nvmap_priv_entry);
	if (unlikely(!priv)) {
		printk(KERN_ERR "nvgpu: State tracking error for fast reverse lookups. Have unattached dmabuf!\n");
		return ERR_PTR(-ENOTRECOVERABLE);
	}
	mapped_buffer = container_of(priv, struct nvgpu_mapped_buf, os_priv);
	if (unlikely(mapped_buffer->os_priv.dmabuf != dmabuf)) {
		printk(KERN_ERR "nvgpu: dmabuf_to_mapped_buf mapping inconsistent! BUG!\n");
		return ERR_PTR(-ENOTRECOVERABLE);
	}
	if (!list_is_singular(&priv->nvmap_priv_entry)) {
		printk(KERN_WARNING "nvgpu: Requesting paging on memory with multiple mappings! Aborting...\n");
		return ERR_PTR(-EOPNOTSUPP);
	}
	return mapped_buffer;
}

int copy_all(struct vm_gk20a *vm) {
	struct nvgpu_rbtree_node *node;
	struct nvgpu_mapped_buf *m;

	for_each_buffer(node, vm->mapped_buffers, m) {
		// TODO
		continue;
	}
	return 0;
}

int copy_out(struct sg_table *sgt, struct nvgpu_mapped_buf *m) {
	return copy(sgt, REQ_OP_WRITE, m);
}

int copy_in(struct sg_table *sgt, struct nvgpu_mapped_buf *m) {
	return copy(sgt, REQ_OP_READ, m);
}
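
// Usage sketch (illustrative, not part of the driver): copy() submits its bio
// chain asynchronously and reports errors through complete_swap_io(), so a
// caller is expected to wait on the per-buffer completion before touching the
// data or reusing the sg_table. The helper below is a hypothetical example of
// that pattern; it assumes the caller has already built an sg_table describing
// the buffer's backing pages.
static inline int copy_out_sync(struct sg_table *sgt, struct nvgpu_mapped_buf *m) {
	int err = copy_out(sgt, m); // Queues the write bio chain and returns immediately
	if (err)
		return err;
	// Block until complete_swap_io() fires for the final bio in the chain
	wait_for_completion_io(&m->os_priv.swap_io_done);
	return 0;
}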