path: root/drivers/gpu/nvgpu/os/linux/swap.h
#include <linux/scatterlist.h>
#include <linux/bio.h>
#include <linux/blkdev.h> // For SECTOR_SHIFT
#include <linux/completion.h> // For complete() and reinit_completion()
#include <linux/err.h> // For IS_ERR(), PTR_ERR(), and ERR_PTR()

// Next sector to assign a mapped_buf to. Skip the first disk block.
atomic64_t nvgpu_swap_next_sector = ATOMIC64_INIT(4);

// Callback for completion of the I/O chain
// TODO: Error checking and handling
static void complete_swap_io(struct bio *bio) {
  struct nvgpu_mapped_buf *m = bio->bi_private;
  bio_put(bio);
  complete(&m->os_priv.swap_io_done);
}

// Queue a command to copy an SGT to or from disk (op: REQ_OP_WRITE or REQ_OP_READ)
// TODO: Cache bdev
// TODO: Track, allocate, and recycle individual swap buffers on disk instead
//       of only supporting a global reset
int copy(struct sg_table *sgt, int op, struct nvgpu_mapped_buf *m) {
  unsigned int i;
  struct scatterlist *sg;
  struct bio *bio;
  int err = 0;
  int sg_cnt = sgt->nents;
  sector_t sector = m->os_priv.swap_sector;
  // Find and open the block device
  struct block_device *bdev = blkdev_get_by_path("/dev/nvme0n1", FMODE_READ | FMODE_WRITE, copy);
  if (unlikely(IS_ERR(bdev))) {
    printk(KERN_WARNING "Unabled to find `nvme0`, err %ld!\n", PTR_ERR(bdev));
    return -ENODEV;
  }
  // Assign a sector on-disk (0 indicates unassigned, we start at 4)
  if (sector == 0) {
    // Read block device size in sectors, and fail if we'd use more than 1/3rd
    // of the disk (to stay in SLC-emulation-mode).
    // TODO: Issue NVMe DSM commands to try to manage this better? Read-only
    //       regions should be able to be moved to TLC safely, whereas other
    //       data should be kept in the SLC cache to reduce wear.
    // Note: i_size_read() returns bytes; convert to sectors before comparing
    if (atomic64_read(&nvgpu_swap_next_sector) >=
        (i_size_read(bdev->bd_inode) >> SECTOR_SHIFT) / 3) {
      err = -ENOMEM;
      goto out_put;
    }
    // Hand out sectors sequentially, and statically
    // TODO: Intelligent sector allocation
    sector = atomic64_add_return(m->size >> SECTOR_SHIFT, &nvgpu_swap_next_sector);
    sector -= (m->size >> SECTOR_SHIFT);
    m->os_priv.swap_sector = sector;
  }
  // Reset the .done variable in the completion
  reinit_completion(&m->os_priv.swap_io_done);
  // bio_alloc() will never fail when allocating <= BIO_MAX_PAGES
  bio = bio_alloc(GFP_KERNEL, min(sg_cnt, BIO_MAX_PAGES));
  bio->bi_bdev = bdev; // Switch to bio_set_dev(bio, bdev) in newer kernels
  bio->bi_iter.bi_sector = sector;
  bio_set_op_attrs(bio, op, REQ_SYNC); // REQ_SYNC is identical to WRITE_ODIRECT
  bio->bi_private = m;
  bio->bi_end_io = complete_swap_io;
  // Copy the scatter-gather table (sgt) into a block I/O vector (bio vec)
  // bio_chain() approach borrowed from drivers/nvme/target/io-cmd.c:nvmet_execute_rw()
  for_each_sg(sgt->sgl, sg, sgt->nents, i) {
    // On most iterations, this inner loop shouldn't happen at all. This loop
    // conditional only triggers if we fill up the bio and are unable to map
    // the full length of an SGL entry.
    while (bio_add_page(bio, sg_page(sg), sg_dma_len(sg), sg->offset) != sg_dma_len(sg)) {
      // Uh oh! We ran out of space in the bio. Allocate a new one and chain it...
      struct bio *prev = bio;
      bio = bio_alloc(GFP_KERNEL, min(sg_cnt, BIO_MAX_PAGES));
      bio->bi_bdev = bdev; // Switch to bio_set_dev(bio, bdev) in newer kernels
      bio->bi_iter.bi_sector = sector;
      bio_set_op_attrs(bio, op, op == REQ_OP_WRITE ? WRITE_ODIRECT : 0);
      bio_chain(bio, prev);
      // Get the I/O started
      submit_bio(prev);
      // No need to call bio_put() as that's automatically managed for chained bios
    }
    sector += sg_dma_len(sg) >> SECTOR_SHIFT;
    sg_cnt--;
  }

  // Async submit. Caller should wait_for_completion_io(&m->os_priv.swap_io_done);
  // Does not fail. Error reported via completion handler.
  submit_bio(bio);

out_put:
  // Release our block device handle
  blkdev_put(bdev, FMODE_WRITE | FMODE_READ); // Is this safe?
  return err;
}

// Patterned off how __nvgpu_vm_find_mapped_buf_reverse() works in vm.c
// Needs struct nvgpu_rbtree_node *node, struct nvgpu_rbtree_node *root,
// and struct nvgpu_mapped_buf *m.
// Steps until end of rbtree OR !m
#define for_each_buffer(node, root, m) \
  for (nvgpu_rbtree_enum_start(0, &node, root); \
       node && (uintptr_t)(m = mapped_buffer_from_rbtree_node(node)); \
       nvgpu_rbtree_enum_next(&node, node))

// New, fast replacement for walking the rbtree with the above macro to find a match
struct nvgpu_mapped_buf* dmabuf_to_mapped_buf(struct dma_buf *dmabuf) {
  struct list_head *nvmap_priv = nvmap_get_priv_list(dmabuf);
  struct nvgpu_mapped_buf *mapped_buffer;
  struct nvgpu_mapped_buf_priv *priv;

  if (IS_ERR(nvmap_priv))
    return ERR_PTR(-EOPNOTSUPP);

  priv = list_first_entry_or_null(nvmap_priv, struct nvgpu_mapped_buf_priv, nvmap_priv_entry);
  if (unlikely(!priv)) {
    printk(KERN_ERR "nvgpu: State tracking error for fast reverse lookups. Have unattached dmabuf!");
    return ERR_PTR(-ENOTRECOVERABLE);
  }

  mapped_buffer = container_of(priv, struct nvgpu_mapped_buf, os_priv);
  if (unlikely(mapped_buffer->os_priv.dmabuf != dmabuf)) {
    printk(KERN_ERR "nvgpu: dmabuf_to_mapped_buf mapping inconsistent! BUG!\n");
    return ERR_PTR(-ENOTRECOVERABLE);
  }
  if (!list_is_singular(&priv->nvmap_priv_entry)) {
    printk(KERN_WARNING "nvgpu: Requesting paging on memory with multiple mappings! Aborting...\n");
    return ERR_PTR(-EOPNOTSUPP);
  }
  return mapped_buffer;
}
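
// Usage sketch (not in the original file): how a caller might resolve a
// dma_buf and handle the ERR_PTR()-encoded failures returned above. Any
// locking around the lookup is an assumption left to the caller's context.
//
//   struct nvgpu_mapped_buf *m = dmabuf_to_mapped_buf(dmabuf);
//   if (IS_ERR(m))
//     return PTR_ERR(m); // -EOPNOTSUPP: unsupported or multiply-mapped buffer
//                        // -ENOTRECOVERABLE: reverse-lookup state is corrupt
//   // ... safe to page m out/in from here ...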

int copy_all(struct vm_gk20a *vm) {
  struct nvgpu_rbtree_node *node;
  struct nvgpu_mapped_buf *m;

  for_each_buffer(node, vm->mapped_buffers, m) {
    // TODO
    continue;
  }
  return 0;
}

int copy_out(struct sg_table *sgt, struct nvgpu_mapped_buf *m) {
  return copy(sgt, REQ_OP_WRITE, m);
}

int copy_in(struct sg_table *sgt, struct nvgpu_mapped_buf *m) {
  return copy(sgt, REQ_OP_READ, m);
}
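
// Usage sketch (not in the original file): the intended call pattern for
// copy_out()/copy_in(), per the "Async submit" comment above the final
// submit_bio() in copy(). How the caller obtains `sgt` for the buffer is an
// assumption that depends on the rest of the driver.
//
//   err = copy_out(sgt, m);   // Queue the write-out asynchronously
//   if (!err)
//     wait_for_completion_io(&m->os_priv.swap_io_done); // Wait for the bio chain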