| author    | Linus Torvalds <torvalds@linux-foundation.org> | 2013-09-13 13:55:58 -0400 |
|-----------|------------------------------------------------|---------------------------|
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2013-09-13 13:55:58 -0400 |
| commit    | 9bf12df31f282e845b3dfaac1e5d5376a041da22       |                           |
| tree      | 10d7a21d34c7f2c47eff3e807f5efef46228d507       |                           |
| parent    | 399a946edbbe90bd03aec2e93ce58c9b3f18e70b       |                           |
| parent    | d9b2c8714aef102dea95544a8cd9372b21af463f       |                           |
Merge git://git.kvack.org/~bcrl/aio-next
Pull aio changes from Ben LaHaise:
"First off, sorry for this pull request being late in the merge window.
Al had raised a couple of concerns about 2 items in the series below.
I addressed the first issue (the race introduced by Gu's use of
mm_populate()), but he has not provided any further details on how he
wants to rework the anon_inode.c changes (which were sent out months
ago but have yet to be commented on).
The bulk of the changes have been sitting in the -next tree for a few
months, with all the issues raised being addressed"
* git://git.kvack.org/~bcrl/aio-next: (22 commits)
aio: rcu_read_lock protection for new rcu_dereference calls
aio: fix race in ring buffer page lookup introduced by page migration support
aio: fix rcu sparse warnings introduced by ioctx table lookup patch
aio: remove unnecessary debugging from aio_free_ring()
aio: table lookup: verify ctx pointer
staging/lustre: kiocb->ki_left is removed
aio: fix error handling and rcu usage in "convert the ioctx list to table lookup v3"
aio: be defensive to ensure request batching is non-zero instead of BUG_ON()
aio: convert the ioctx list to table lookup v3
aio: double aio_max_nr in calculations
aio: Kill ki_dtor
aio: Kill ki_users
aio: Kill unneeded kiocb members
aio: Kill aio_rw_vect_retry()
aio: Don't use ctx->tail unnecessarily
aio: io_cancel() no longer returns the io_event
aio: percpu ioctx refcount
aio: percpu reqs_available
aio: reqs_active -> reqs_available
aio: fix build when migration is disabled
...
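The series does not change the user-visible AIO system calls; it reworks how io_setup()/io_submit()/io_getevents() are implemented underneath. For orientation, a minimal user-space sketch of driving that interface with raw syscalls is shown below (illustrative only: the file path and queue depth are placeholders, and error handling is omitted).

```c
/* Minimal raw-syscall AIO example; placeholder path, no error checking. */
#include <fcntl.h>
#include <linux/aio_abi.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	aio_context_t ctx = 0;
	static char buf[4096];
	struct iocb cb;
	struct iocb *cbs[1] = { &cb };
	struct io_event ev;
	int fd = open("/etc/hostname", O_RDONLY);	/* placeholder file */

	syscall(SYS_io_setup, 128, &ctx);	/* kernel sets up the aio ring */

	memset(&cb, 0, sizeof(cb));
	cb.aio_lio_opcode = IOCB_CMD_PREAD;
	cb.aio_fildes = fd;
	cb.aio_buf = (unsigned long)buf;
	cb.aio_nbytes = sizeof(buf);		/* becomes kiocb->ki_nbytes in-kernel */
	cb.aio_offset = 0;

	syscall(SYS_io_submit, ctx, 1, cbs);
	syscall(SYS_io_getevents, ctx, 1, 1, &ev, NULL);
	printf("read %lld bytes\n", (long long)ev.res);

	syscall(SYS_io_destroy, ctx);
	close(fd);
	return 0;
}
```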
-rw-r--r--  drivers/staging/android/logger.c            |   2
-rw-r--r--  drivers/staging/lustre/lustre/llite/file.c  |   4
-rw-r--r--  drivers/usb/gadget/inode.c                  |   9
-rw-r--r--  fs/aio.c                                    | 726
-rw-r--r--  fs/anon_inodes.c                            |  66
-rw-r--r--  fs/block_dev.c                              |   2
-rw-r--r--  fs/nfs/direct.c                             |   1
-rw-r--r--  fs/ocfs2/file.c                             |   6
-rw-r--r--  fs/read_write.c                             |   3
-rw-r--r--  fs/udf/file.c                               |   2
-rw-r--r--  include/linux/aio.h                         |  21
-rw-r--r--  include/linux/anon_inodes.h                 |   3
-rw-r--r--  include/linux/migrate.h                     |   3
-rw-r--r--  include/linux/mm_types.h                    |   5
-rw-r--r--  kernel/fork.c                               |   2
-rw-r--r--  mm/migrate.c                                |   2
-rw-r--r--  mm/page_io.c                                |   1
-rw-r--r--  net/socket.c                                |  15
18 files changed, 561 insertions(+), 312 deletions(-)
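Most of the fs/aio.c churn below comes from two ideas: per-mm context lookup through a kioctx_table indexed by an id stored in the ring header, and a reqs_available counter split between a global atomic and per-CPU caches that move req_batch slots at a time. A self-contained user-space sketch of that batching pattern follows (C11 atomics, with a per-thread cache standing in for the per-CPU one; the names and batch size are illustrative, this is not the kernel code itself):

```c
#include <stdatomic.h>
#include <stdbool.h>

#define REQ_BATCH 16				/* illustrative batch size */

static atomic_int reqs_available;		/* global pool of free ring slots */
static _Thread_local int cached;		/* per-thread stand-in for the per-CPU cache */

/* Take one slot, refilling the local cache from the global pool a batch at a time. */
static bool get_req(void)
{
	if (!cached) {
		int avail = atomic_load(&reqs_available);

		do {
			if (avail < REQ_BATCH)
				return false;	/* not enough left to grab a whole batch */
		} while (!atomic_compare_exchange_weak(&reqs_available, &avail,
						       avail - REQ_BATCH));
		cached = REQ_BATCH;
	}
	cached--;
	return true;
}

/* Return slots, pushing surplus back to the global pool in batches. */
static void put_req(int nr)
{
	cached += nr;
	while (cached >= REQ_BATCH * 2) {
		cached -= REQ_BATCH;
		atomic_fetch_add(&reqs_available, REQ_BATCH);
	}
}
```

The kernel versions, get_reqs_available()/put_reqs_available() in the fs/aio.c hunks below, follow the same shape, with preemption disabled while the per-CPU cache is touched.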
diff --git a/drivers/staging/android/logger.c b/drivers/staging/android/logger.c
index a8c344422a77..d42f5785f098 100644
--- a/drivers/staging/android/logger.c
+++ b/drivers/staging/android/logger.c
@@ -481,7 +481,7 @@ static ssize_t logger_aio_write(struct kiocb *iocb, const struct iovec *iov,
 	header.sec = now.tv_sec;
 	header.nsec = now.tv_nsec;
 	header.euid = current_euid();
-	header.len = min_t(size_t, iocb->ki_left, LOGGER_ENTRY_MAX_PAYLOAD);
+	header.len = min_t(size_t, iocb->ki_nbytes, LOGGER_ENTRY_MAX_PAYLOAD);
 	header.hdr_size = sizeof(struct logger_entry);
 
 	/* null writes succeed, return zero */
diff --git a/drivers/staging/lustre/lustre/llite/file.c b/drivers/staging/lustre/lustre/llite/file.c
index 253f02688f4f..bc534db12431 100644
--- a/drivers/staging/lustre/lustre/llite/file.c
+++ b/drivers/staging/lustre/lustre/llite/file.c
@@ -1009,7 +1009,7 @@ static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
 	local_iov->iov_len = count;
 	init_sync_kiocb(kiocb, file);
 	kiocb->ki_pos = *ppos;
-	kiocb->ki_left = count;
+	kiocb->ki_nbytes = count;
 
 	result = ll_file_aio_read(kiocb, local_iov, 1, kiocb->ki_pos);
 	*ppos = kiocb->ki_pos;
@@ -1068,7 +1068,7 @@ static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
 	local_iov->iov_len = count;
 	init_sync_kiocb(kiocb, file);
 	kiocb->ki_pos = *ppos;
-	kiocb->ki_left = count;
+	kiocb->ki_nbytes = count;
 
 	result = ll_file_aio_write(kiocb, local_iov, 1, kiocb->ki_pos);
 	*ppos = kiocb->ki_pos;
diff --git a/drivers/usb/gadget/inode.c b/drivers/usb/gadget/inode.c
index 465ef8e2cc91..b94c049ab0d0 100644
--- a/drivers/usb/gadget/inode.c
+++ b/drivers/usb/gadget/inode.c
@@ -524,7 +524,7 @@ struct kiocb_priv {
 	unsigned	actual;
 };
 
-static int ep_aio_cancel(struct kiocb *iocb, struct io_event *e)
+static int ep_aio_cancel(struct kiocb *iocb)
 {
 	struct kiocb_priv *priv = iocb->private;
 	struct ep_data *epdata;
@@ -540,7 +540,6 @@ static int ep_aio_cancel(struct kiocb *iocb, struct io_event *e)
 	// spin_unlock(&epdata->dev->lock);
 	local_irq_enable();
 
-	aio_put_req(iocb);
 	return value;
 }
 
@@ -709,11 +708,11 @@ ep_aio_read(struct kiocb *iocb, const struct iovec *iov,
 	if (unlikely(usb_endpoint_dir_in(&epdata->desc)))
 		return -EINVAL;
 
-	buf = kmalloc(iocb->ki_left, GFP_KERNEL);
+	buf = kmalloc(iocb->ki_nbytes, GFP_KERNEL);
 	if (unlikely(!buf))
 		return -ENOMEM;
 
-	return ep_aio_rwtail(iocb, buf, iocb->ki_left, epdata, iov, nr_segs);
+	return ep_aio_rwtail(iocb, buf, iocb->ki_nbytes, epdata, iov, nr_segs);
 }
 
 static ssize_t
@@ -728,7 +727,7 @@ ep_aio_write(struct kiocb *iocb, const struct iovec *iov,
 	if (unlikely(!usb_endpoint_dir_in(&epdata->desc)))
 		return -EINVAL;
 
-	buf = kmalloc(iocb->ki_left, GFP_KERNEL);
+	buf = kmalloc(iocb->ki_nbytes, GFP_KERNEL);
 	if (unlikely(!buf))
 		return -ENOMEM;
 
diff --git a/fs/aio.c b/fs/aio.c
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -26,6 +26,7 @@
 #include <linux/mm.h>
 #include <linux/mman.h>
 #include <linux/mmu_context.h>
+#include <linux/percpu.h>
 #include <linux/slab.h>
 #include <linux/timer.h>
 #include <linux/aio.h>
@@ -35,6 +36,10 @@
 #include <linux/eventfd.h>
 #include <linux/blkdev.h>
 #include <linux/compat.h>
+#include <linux/anon_inodes.h>
+#include <linux/migrate.h>
+#include <linux/ramfs.h>
+#include <linux/percpu-refcount.h>
 
 #include <asm/kmap_types.h>
 #include <asm/uaccess.h>
@@ -61,14 +66,29 @@ struct aio_ring {
 
 #define AIO_RING_PAGES	8
 
+struct kioctx_table {
+	struct rcu_head	rcu;
+	unsigned	nr;
+	struct kioctx	*table[];
+};
+
+struct kioctx_cpu {
+	unsigned	reqs_available;
+};
+
 struct kioctx {
-	atomic_t		users;
+	struct percpu_ref	users;
 	atomic_t		dead;
 
-	/* This needs improving */
 	unsigned long		user_id;
-	struct hlist_node	list;
 
+	struct __percpu kioctx_cpu *cpu;
+
+	/*
+	 * For percpu reqs_available, number of slots we move to/from global
+	 * counter at a time:
+	 */
+	unsigned		req_batch;
 	/*
 	 * This is what userspace passed to io_setup(), it's not used for
 	 * anything but counting against the global max_reqs quota.
@@ -88,10 +108,18 @@ struct kioctx {
 	long			nr_pages;
 
 	struct rcu_head		rcu_head;
-	struct work_struct	rcu_work;
+	struct work_struct	free_work;
 
 	struct {
-		atomic_t	reqs_active;
+		/*
+		 * This counts the number of available slots in the ringbuffer,
+		 * so we avoid overflowing it: it's decremented (if positive)
+		 * when allocating a kiocb and incremented when the resulting
+		 * io_event is pulled off the ringbuffer.
+		 *
+		 * We batch accesses to it with a percpu version.
+		 */
+		atomic_t	reqs_available;
 	} ____cacheline_aligned_in_smp;
 
 	struct {
@@ -110,6 +138,9 @@ struct kioctx {
 	} ____cacheline_aligned_in_smp;
 
 	struct page		*internal_pages[AIO_RING_PAGES];
+	struct file		*aio_ring_file;
+
+	unsigned		id;
 };
 
 /*------ sysctl variables----*/
@@ -138,15 +169,77 @@ __initcall(aio_setup);
 
 static void aio_free_ring(struct kioctx *ctx)
 {
-	long i;
+	int i;
+	struct file *aio_ring_file = ctx->aio_ring_file;
 
-	for (i = 0; i < ctx->nr_pages; i++)
+	for (i = 0; i < ctx->nr_pages; i++) {
+		pr_debug("pid(%d) [%d] page->count=%d\n", current->pid, i,
+				page_count(ctx->ring_pages[i]));
 		put_page(ctx->ring_pages[i]);
+	}
 
 	if (ctx->ring_pages && ctx->ring_pages != ctx->internal_pages)
 		kfree(ctx->ring_pages);
+
+	if (aio_ring_file) {
+		truncate_setsize(aio_ring_file->f_inode, 0);
+		fput(aio_ring_file);
+		ctx->aio_ring_file = NULL;
+	}
+}
+
+static int aio_ring_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	vma->vm_ops = &generic_file_vm_ops;
+	return 0;
 }
 
+static const struct file_operations aio_ring_fops = {
+	.mmap = aio_ring_mmap,
+};
+
+static int aio_set_page_dirty(struct page *page)
+{
+	return 0;
+}
+
+#if IS_ENABLED(CONFIG_MIGRATION)
+static int aio_migratepage(struct address_space *mapping, struct page *new,
+			struct page *old, enum migrate_mode mode)
+{
+	struct kioctx *ctx = mapping->private_data;
+	unsigned long flags;
+	unsigned idx = old->index;
+	int rc;
+
+	/* Writeback must be complete */
+	BUG_ON(PageWriteback(old));
+	put_page(old);
+
+	rc = migrate_page_move_mapping(mapping, new, old, NULL, mode);
+	if (rc != MIGRATEPAGE_SUCCESS) {
+		get_page(old);
+		return rc;
+	}
+
+	get_page(new);
+
+	spin_lock_irqsave(&ctx->completion_lock, flags);
+	migrate_page_copy(new, old);
+	ctx->ring_pages[idx] = new;
+	spin_unlock_irqrestore(&ctx->completion_lock, flags);
+
+	return rc;
+}
+#endif
+
+static const struct address_space_operations aio_ctx_aops = {
+	.set_page_dirty = aio_set_page_dirty,
+#if IS_ENABLED(CONFIG_MIGRATION)
+	.migratepage	= aio_migratepage,
+#endif
+};
+
 static int aio_setup_ring(struct kioctx *ctx)
 {
 	struct aio_ring *ring;
@@ -154,20 +247,45 @@ static int aio_setup_ring(struct kioctx *ctx)
 	struct mm_struct *mm = current->mm;
 	unsigned long size, populate;
 	int nr_pages;
+	int i;
+	struct file *file;
 
 	/* Compensate for the ring buffer's head/tail overlap entry */
 	nr_events += 2;	/* 1 is required, 2 for good luck */
 
 	size = sizeof(struct aio_ring);
 	size += sizeof(struct io_event) * nr_events;
-	nr_pages = (size + PAGE_SIZE-1) >> PAGE_SHIFT;
 
+	nr_pages = PFN_UP(size);
 	if (nr_pages < 0)
 		return -EINVAL;
 
-	nr_events = (PAGE_SIZE * nr_pages - sizeof(struct aio_ring)) / sizeof(struct io_event);
+	file = anon_inode_getfile_private("[aio]", &aio_ring_fops, ctx, O_RDWR);
+	if (IS_ERR(file)) {
+		ctx->aio_ring_file = NULL;
+		return -EAGAIN;
+	}
+
+	file->f_inode->i_mapping->a_ops = &aio_ctx_aops;
+	file->f_inode->i_mapping->private_data = ctx;
+	file->f_inode->i_size = PAGE_SIZE * (loff_t)nr_pages;
+
+	for (i = 0; i < nr_pages; i++) {
+		struct page *page;
+		page = find_or_create_page(file->f_inode->i_mapping,
+					   i, GFP_HIGHUSER | __GFP_ZERO);
+		if (!page)
+			break;
+		pr_debug("pid(%d) page[%d]->count=%d\n",
+			 current->pid, i, page_count(page));
+		SetPageUptodate(page);
+		SetPageDirty(page);
+		unlock_page(page);
+	}
+	ctx->aio_ring_file = file;
+	nr_events = (PAGE_SIZE * nr_pages - sizeof(struct aio_ring))
+			/ sizeof(struct io_event);
 
-	ctx->nr_events = 0;
 	ctx->ring_pages = ctx->internal_pages;
 	if (nr_pages > AIO_RING_PAGES) {
 		ctx->ring_pages = kcalloc(nr_pages, sizeof(struct page *),
@@ -178,10 +296,11 @@ static int aio_setup_ring(struct kioctx *ctx)
 
 	ctx->mmap_size = nr_pages * PAGE_SIZE;
 	pr_debug("attempting mmap of %lu bytes\n", ctx->mmap_size);
+
 	down_write(&mm->mmap_sem);
-	ctx->mmap_base = do_mmap_pgoff(NULL, 0, ctx->mmap_size,
-				       PROT_READ|PROT_WRITE,
-				       MAP_ANONYMOUS|MAP_PRIVATE, 0, &populate);
+	ctx->mmap_base = do_mmap_pgoff(ctx->aio_ring_file, 0, ctx->mmap_size,
+				       PROT_READ | PROT_WRITE,
+				       MAP_SHARED | MAP_POPULATE, 0, &populate);
 	if (IS_ERR((void *)ctx->mmap_base)) {
 		up_write(&mm->mmap_sem);
 		ctx->mmap_size = 0;
@@ -190,23 +309,34 @@ static int aio_setup_ring(struct kioctx *ctx)
 	}
 
 	pr_debug("mmap address: 0x%08lx\n", ctx->mmap_base);
+
+	/* We must do this while still holding mmap_sem for write, as we
+	 * need to be protected against userspace attempting to mremap()
+	 * or munmap() the ring buffer.
+	 */
 	ctx->nr_pages = get_user_pages(current, mm, ctx->mmap_base, nr_pages,
 				       1, 0, ctx->ring_pages, NULL);
+
+	/* Dropping the reference here is safe as the page cache will hold
+	 * onto the pages for us. It is also required so that page migration
+	 * can unmap the pages and get the right reference count.
+	 */
+	for (i = 0; i < ctx->nr_pages; i++)
+		put_page(ctx->ring_pages[i]);
+
 	up_write(&mm->mmap_sem);
 
 	if (unlikely(ctx->nr_pages != nr_pages)) {
 		aio_free_ring(ctx);
 		return -EAGAIN;
 	}
-	if (populate)
-		mm_populate(ctx->mmap_base, populate);
 
 	ctx->user_id = ctx->mmap_base;
 	ctx->nr_events = nr_events; /* trusted copy */
 
 	ring = kmap_atomic(ctx->ring_pages[0]);
 	ring->nr = nr_events;	/* user copy */
-	ring->id = ctx->user_id;
+	ring->id = ~0U;
 	ring->head = ring->tail = 0;
 	ring->magic = AIO_RING_MAGIC;
 	ring->compat_features = AIO_RING_COMPAT_FEATURES;
@@ -238,11 +368,9 @@ void kiocb_set_cancel_fn(struct kiocb *req, kiocb_cancel_fn *cancel)
 }
 EXPORT_SYMBOL(kiocb_set_cancel_fn);
 
-static int kiocb_cancel(struct kioctx *ctx, struct kiocb *kiocb,
-			struct io_event *res)
+static int kiocb_cancel(struct kioctx *ctx, struct kiocb *kiocb)
 {
 	kiocb_cancel_fn *old, *cancel;
-	int ret = -EINVAL;
 
 	/*
 	 * Don't want to set kiocb->ki_cancel = KIOCB_CANCELLED unless it
@@ -252,28 +380,20 @@ static int kiocb_cancel(struct kioctx *ctx, struct kiocb *kiocb,
 	cancel = ACCESS_ONCE(kiocb->ki_cancel);
 	do {
 		if (!cancel || cancel == KIOCB_CANCELLED)
-			return ret;
+			return -EINVAL;
 
 		old = cancel;
 		cancel = cmpxchg(&kiocb->ki_cancel, old, KIOCB_CANCELLED);
 	} while (cancel != old);
 
-	atomic_inc(&kiocb->ki_users);
-	spin_unlock_irq(&ctx->ctx_lock);
-
-	memset(res, 0, sizeof(*res));
-	res->obj = (u64)(unsigned long)kiocb->ki_obj.user;
-	res->data = kiocb->ki_user_data;
-	ret = cancel(kiocb, res);
-
-	spin_lock_irq(&ctx->ctx_lock);
-
-	return ret;
+	return cancel(kiocb);
 }
 
 static void free_ioctx_rcu(struct rcu_head *head)
 {
 	struct kioctx *ctx = container_of(head, struct kioctx, rcu_head);
+
+	free_percpu(ctx->cpu);
 	kmem_cache_free(kioctx_cachep, ctx);
 }
 
@@ -282,12 +402,13 @@ static void free_ioctx_rcu(struct rcu_head *head)
  * and ctx->users has dropped to 0, so we know no more kiocbs can be submitted -
  * now it's safe to cancel any that need to be.
  */
-static void free_ioctx(struct kioctx *ctx)
+static void free_ioctx(struct work_struct *work)
 {
+	struct kioctx *ctx = container_of(work, struct kioctx, free_work);
 	struct aio_ring *ring;
-	struct io_event res;
 	struct kiocb *req;
-	unsigned head, avail;
+	unsigned cpu, avail;
+	DEFINE_WAIT(wait);
 
 	spin_lock_irq(&ctx->ctx_lock);
 
@@ -296,28 +417,38 @@ static void free_ioctx(struct kioctx *ctx)
 				       struct kiocb, ki_list);
 
 		list_del_init(&req->ki_list);
-		kiocb_cancel(ctx, req, &res);
+		kiocb_cancel(ctx, req);
 	}
 
 	spin_unlock_irq(&ctx->ctx_lock);
 
-	ring = kmap_atomic(ctx->ring_pages[0]);
-	head = ring->head;
-	kunmap_atomic(ring);
+	for_each_possible_cpu(cpu) {
+		struct kioctx_cpu *kcpu = per_cpu_ptr(ctx->cpu, cpu);
 
-	while (atomic_read(&ctx->reqs_active) > 0) {
-		wait_event(ctx->wait,
-			   head != ctx->tail ||
-			   atomic_read(&ctx->reqs_active) <= 0);
+		atomic_add(kcpu->reqs_available, &ctx->reqs_available);
+		kcpu->reqs_available = 0;
+	}
 
-		avail = (head <= ctx->tail ? ctx->tail : ctx->nr_events) - head;
+	while (1) {
+		prepare_to_wait(&ctx->wait, &wait, TASK_UNINTERRUPTIBLE);
 
-		atomic_sub(avail, &ctx->reqs_active);
-		head += avail;
-		head %= ctx->nr_events;
+		ring = kmap_atomic(ctx->ring_pages[0]);
+		avail = (ring->head <= ring->tail)
+			 ? ring->tail - ring->head
+			 : ctx->nr_events - ring->head + ring->tail;
+
+		atomic_add(avail, &ctx->reqs_available);
+		ring->head = ring->tail;
+		kunmap_atomic(ring);
+
+		if (atomic_read(&ctx->reqs_available) >= ctx->nr_events - 1)
+			break;
+
+		schedule();
 	}
+	finish_wait(&ctx->wait, &wait);
 
-	WARN_ON(atomic_read(&ctx->reqs_active) < 0);
+	WARN_ON(atomic_read(&ctx->reqs_available) > ctx->nr_events - 1);
 
 	aio_free_ring(ctx);
 
@@ -333,10 +464,68 @@ static void free_ioctx(struct kioctx *ctx)
 	call_rcu(&ctx->rcu_head, free_ioctx_rcu);
 }
 
-static void put_ioctx(struct kioctx *ctx)
+static void free_ioctx_ref(struct percpu_ref *ref)
 {
-	if (unlikely(atomic_dec_and_test(&ctx->users)))
-		free_ioctx(ctx);
+	struct kioctx *ctx = container_of(ref, struct kioctx, users);
+
+	INIT_WORK(&ctx->free_work, free_ioctx);
+	schedule_work(&ctx->free_work);
+}
+
+static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm)
+{
+	unsigned i, new_nr;
+	struct kioctx_table *table, *old;
+	struct aio_ring *ring;
+
+	spin_lock(&mm->ioctx_lock);
+	rcu_read_lock();
+	table = rcu_dereference(mm->ioctx_table);
+
+	while (1) {
+		if (table)
+			for (i = 0; i < table->nr; i++)
+				if (!table->table[i]) {
+					ctx->id = i;
+					table->table[i] = ctx;
+					rcu_read_unlock();
+					spin_unlock(&mm->ioctx_lock);
+
+					ring = kmap_atomic(ctx->ring_pages[0]);
+					ring->id = ctx->id;
+					kunmap_atomic(ring);
+					return 0;
+				}
+
+		new_nr = (table ? table->nr : 1) * 4;
+
+		rcu_read_unlock();
+		spin_unlock(&mm->ioctx_lock);
+
+		table = kzalloc(sizeof(*table) + sizeof(struct kioctx *) *
+				new_nr, GFP_KERNEL);
+		if (!table)
+			return -ENOMEM;
+
+		table->nr = new_nr;
+
+		spin_lock(&mm->ioctx_lock);
+		rcu_read_lock();
+		old = rcu_dereference(mm->ioctx_table);
+
+		if (!old) {
+			rcu_assign_pointer(mm->ioctx_table, table);
+		} else if (table->nr > old->nr) {
+			memcpy(table->table, old->table,
+			       old->nr * sizeof(struct kioctx *));
+
+			rcu_assign_pointer(mm->ioctx_table, table);
+			kfree_rcu(old, rcu);
+		} else {
+			kfree(table);
+			table = old;
+		}
+	}
 }
 
 /* ioctx_alloc
@@ -348,6 +537,18 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
 	struct kioctx *ctx;
 	int err = -ENOMEM;
 
+	/*
+	 * We keep track of the number of available ringbuffer slots, to prevent
+	 * overflow (reqs_available), and we also use percpu counters for this.
+	 *
+	 * So since up to half the slots might be on other cpu's percpu counters
+	 * and unavailable, double nr_events so userspace sees what they
+	 * expected: additionally, we move req_batch slots to/from percpu
+	 * counters at a time, so make sure that isn't 0:
+	 */
+	nr_events = max(nr_events, num_possible_cpus() * 4);
+	nr_events *= 2;
+
 	/* Prevent overflows */
 	if ((nr_events > (0x10000000U / sizeof(struct io_event))) ||
 	    (nr_events > (0x10000000U / sizeof(struct kiocb)))) {
@@ -355,7 +556,7 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
 		return ERR_PTR(-EINVAL);
 	}
 
-	if (!nr_events || (unsigned long)nr_events > aio_max_nr)
+	if (!nr_events || (unsigned long)nr_events > (aio_max_nr * 2UL))
 		return ERR_PTR(-EAGAIN);
 
 	ctx = kmem_cache_zalloc(kioctx_cachep, GFP_KERNEL);
@@ -364,8 +565,9 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
 
 	ctx->max_reqs = nr_events;
 
-	atomic_set(&ctx->users, 2);
-	atomic_set(&ctx->dead, 0);
+	if (percpu_ref_init(&ctx->users, free_ioctx_ref))
+		goto out_freectx;
+
 	spin_lock_init(&ctx->ctx_lock);
 	spin_lock_init(&ctx->completion_lock);
 	mutex_init(&ctx->ring_lock);
@@ -373,12 +575,21 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
 
 	INIT_LIST_HEAD(&ctx->active_reqs);
 
+	ctx->cpu = alloc_percpu(struct kioctx_cpu);
+	if (!ctx->cpu)
+		goto out_freeref;
+
 	if (aio_setup_ring(ctx) < 0)
-		goto out_freectx;
+		goto out_freepcpu;
+
+	atomic_set(&ctx->reqs_available, ctx->nr_events - 1);
+	ctx->req_batch = (ctx->nr_events - 1) / (num_possible_cpus() * 4);
+	if (ctx->req_batch < 1)
+		ctx->req_batch = 1;
 
 	/* limit the number of system wide aios */
 	spin_lock(&aio_nr_lock);
-	if (aio_nr + nr_events > aio_max_nr ||
+	if (aio_nr + nr_events > (aio_max_nr * 2UL) ||
 	    aio_nr + nr_events < aio_nr) {
 		spin_unlock(&aio_nr_lock);
 		goto out_cleanup;
@@ -386,49 +597,54 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
 	aio_nr += ctx->max_reqs;
 	spin_unlock(&aio_nr_lock);
 
-	/* now link into global list. */
-	spin_lock(&mm->ioctx_lock);
-	hlist_add_head_rcu(&ctx->list, &mm->ioctx_list);
-	spin_unlock(&mm->ioctx_lock);
+	percpu_ref_get(&ctx->users);	/* io_setup() will drop this ref */
+
+	err = ioctx_add_table(ctx, mm);
+	if (err)
+		goto out_cleanup_put;
 
 	pr_debug("allocated ioctx %p[%ld]: mm=%p mask=0x%x\n",
 		 ctx, ctx->user_id, mm, ctx->nr_events);
 	return ctx;
 
+out_cleanup_put:
+	percpu_ref_put(&ctx->users);
 out_cleanup:
 	err = -EAGAIN;
 	aio_free_ring(ctx);
+out_freepcpu:
+	free_percpu(ctx->cpu);
+out_freeref:
+	free_percpu(ctx->users.pcpu_count);
 out_freectx:
+	if (ctx->aio_ring_file)
+		fput(ctx->aio_ring_file);
 	kmem_cache_free(kioctx_cachep, ctx);
 	pr_debug("error allocating ioctx %d\n", err);
 	return ERR_PTR(err);
 }
 
-static void kill_ioctx_work(struct work_struct *work)
-{
-	struct kioctx *ctx = container_of(work, struct kioctx, rcu_work);
-
-	wake_up_all(&ctx->wait);
-	put_ioctx(ctx);
-}
-
-static void kill_ioctx_rcu(struct rcu_head *head)
-{
-	struct kioctx *ctx = container_of(head, struct kioctx, rcu_head);
-
-	INIT_WORK(&ctx->rcu_work, kill_ioctx_work);
-	schedule_work(&ctx->rcu_work);
-}
-
 /* kill_ioctx
  *	Cancels all outstanding aio requests on an aio context.  Used
  *	when the processes owning a context have all exited to encourage
  *	the rapid destruction of the kioctx.
  */
-static void kill_ioctx(struct kioctx *ctx)
+static void kill_ioctx(struct mm_struct *mm, struct kioctx *ctx)
 {
 	if (!atomic_xchg(&ctx->dead, 1)) {
-		hlist_del_rcu(&ctx->list);
+		struct kioctx_table *table;
+
+		spin_lock(&mm->ioctx_lock);
+		rcu_read_lock();
+		table = rcu_dereference(mm->ioctx_table);
+
+		WARN_ON(ctx != table->table[ctx->id]);
+		table->table[ctx->id] = NULL;
+		rcu_read_unlock();
+		spin_unlock(&mm->ioctx_lock);
+
+		/* percpu_ref_kill() will do the necessary call_rcu() */
+		wake_up_all(&ctx->wait);
 
 		/*
 		 * It'd be more correct to do this in free_ioctx(), after all
@@ -445,24 +661,23 @@ static void kill_ioctx(struct kioctx *ctx)
 		if (ctx->mmap_size)
 			vm_munmap(ctx->mmap_base, ctx->mmap_size);
 
-		/* Between hlist_del_rcu() and dropping the initial ref */
-		call_rcu(&ctx->rcu_head, kill_ioctx_rcu);
+		percpu_ref_kill(&ctx->users);
 	}
 }
 
 /* wait_on_sync_kiocb:
  *	Waits on the given sync kiocb to complete.
  */
-ssize_t wait_on_sync_kiocb(struct kiocb *iocb)
+ssize_t wait_on_sync_kiocb(struct kiocb *req)
 {
-	while (atomic_read(&iocb->ki_users)) {
+	while (!req->ki_ctx) {
 		set_current_state(TASK_UNINTERRUPTIBLE);
-		if (!atomic_read(&iocb->ki_users))
+		if (req->ki_ctx)
 			break;
 		io_schedule();
 	}
 	__set_current_state(TASK_RUNNING);
-	return iocb->ki_user_data;
+	return req->ki_user_data;
 }
 EXPORT_SYMBOL(wait_on_sync_kiocb);
 
@@ -476,16 +691,28 @@ EXPORT_SYMBOL(wait_on_sync_kiocb);
  */
 void exit_aio(struct mm_struct *mm)
 {
+	struct kioctx_table *table;
 	struct kioctx *ctx;
-	struct hlist_node *n;
+	unsigned i = 0;
 
-	hlist_for_each_entry_safe(ctx, n, &mm->ioctx_list, list) {
-		if (1 != atomic_read(&ctx->users))
-			printk(KERN_DEBUG
-				"exit_aio:ioctx still alive: %d %d %d\n",
-				atomic_read(&ctx->users),
-				atomic_read(&ctx->dead),
-				atomic_read(&ctx->reqs_active));
+	while (1) {
+		rcu_read_lock();
+		table = rcu_dereference(mm->ioctx_table);
+
+		do {
+			if (!table || i >= table->nr) {
+				rcu_read_unlock();
+				rcu_assign_pointer(mm->ioctx_table, NULL);
+				if (table)
+					kfree(table);
+				return;
+			}
+
+			ctx = table->table[i++];
+		} while (!ctx);
+
+		rcu_read_unlock();
+
 		/*
 		 * We don't need to bother with munmap() here -
 		 * exit_mmap(mm) is coming and it'll unmap everything.
@@ -496,40 +723,75 @@ void exit_aio(struct mm_struct *mm)
 		 */
 		ctx->mmap_size = 0;
 
-		kill_ioctx(ctx);
+		kill_ioctx(mm, ctx);
+	}
+}
+
+static void put_reqs_available(struct kioctx *ctx, unsigned nr)
+{
+	struct kioctx_cpu *kcpu;
+
+	preempt_disable();
+	kcpu = this_cpu_ptr(ctx->cpu);
+
+	kcpu->reqs_available += nr;
+	while (kcpu->reqs_available >= ctx->req_batch * 2) {
+		kcpu->reqs_available -= ctx->req_batch;
+		atomic_add(ctx->req_batch, &ctx->reqs_available);
+	}
+
+	preempt_enable();
+}
+
+static bool get_reqs_available(struct kioctx *ctx)
+{
+	struct kioctx_cpu *kcpu;
+	bool ret = false;
+
+	preempt_disable();
+	kcpu = this_cpu_ptr(ctx->cpu);
+
+	if (!kcpu->reqs_available) {
+		int old, avail = atomic_read(&ctx->reqs_available);
+
+		do {
+			if (avail < ctx->req_batch)
+				goto out;
+
+			old = avail;
+			avail = atomic_cmpxchg(&ctx->reqs_available,
+					       avail, avail - ctx->req_batch);
+		} while (avail != old);
+
+		kcpu->reqs_available += ctx->req_batch;
 	}
+
+	ret = true;
+	kcpu->reqs_available--;
+out:
+	preempt_enable();
+	return ret;
 }
 
 /* aio_get_req
- *	Allocate a slot for an aio request.  Increments the ki_users count
- *	of the kioctx so that the kioctx stays around until all requests are
- *	complete.  Returns NULL if no requests are free.
- *
- *	Returns with kiocb->ki_users set to 2.  The io submit code path holds
- *	an extra reference while submitting the i/o.
- *	This prevents races between the aio code path referencing the
- *	req (after submitting it) and aio_complete() freeing the req.
+ *	Allocate a slot for an aio request.
+ *	Returns NULL if no requests are free.
  */
 static inline struct kiocb *aio_get_req(struct kioctx *ctx)
 {
 	struct kiocb *req;
 
-	if (atomic_read(&ctx->reqs_active) >= ctx->nr_events)
+	if (!get_reqs_available(ctx))
 		return NULL;
 
-	if (atomic_inc_return(&ctx->reqs_active) > ctx->nr_events - 1)
-		goto out_put;
-
 	req = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL|__GFP_ZERO);
 	if (unlikely(!req))
 		goto out_put;
 
-	atomic_set(&req->ki_users, 2);
 	req->ki_ctx = ctx;
-
 	return req;
 out_put:
-	atomic_dec(&ctx->reqs_active);
+	put_reqs_available(ctx, 1);
 	return NULL;
 }
 
@@ -539,35 +801,32 @@ static void kiocb_free(struct kiocb *req)
 		fput(req->ki_filp);
 	if (req->ki_eventfd != NULL)
 		eventfd_ctx_put(req->ki_eventfd);
-	if (req->ki_dtor)
-		req->ki_dtor(req);
-	if (req->ki_iovec != &req->ki_inline_vec)
-		kfree(req->ki_iovec);
 	kmem_cache_free(kiocb_cachep, req);
 }
 
-void aio_put_req(struct kiocb *req)
-{
-	if (atomic_dec_and_test(&req->ki_users))
-		kiocb_free(req);
-}
-EXPORT_SYMBOL(aio_put_req);
-
 static struct kioctx *lookup_ioctx(unsigned long ctx_id)
 {
+	struct aio_ring __user *ring = (void __user *)ctx_id;
 	struct mm_struct *mm = current->mm;
 	struct kioctx *ctx, *ret = NULL;
+	struct kioctx_table *table;
+	unsigned id;
+
+	if (get_user(id, &ring->id))
+		return NULL;
 
 	rcu_read_lock();
+	table = rcu_dereference(mm->ioctx_table);
 
-	hlist_for_each_entry_rcu(ctx, &mm->ioctx_list, list) {
-		if (ctx->user_id == ctx_id) {
-			atomic_inc(&ctx->users);
-			ret = ctx;
-			break;
-		}
-	}
+	if (!table || id >= table->nr)
+		goto out;
 
+	ctx = table->table[id];
+	if (ctx && ctx->user_id == ctx_id) {
+		percpu_ref_get(&ctx->users);
+		ret = ctx;
+	}
+out:
 	rcu_read_unlock();
 	return ret;
 }
@@ -591,16 +850,16 @@ void aio_complete(struct kiocb *iocb, long res, long res2)
 	 *  - the sync task helpfully left a reference to itself in the iocb
 	 */
 	if (is_sync_kiocb(iocb)) {
-		BUG_ON(atomic_read(&iocb->ki_users) != 1);
 		iocb->ki_user_data = res;
-		atomic_set(&iocb->ki_users, 0);
+		smp_wmb();
+		iocb->ki_ctx = ERR_PTR(-EXDEV);
 		wake_up_process(iocb->ki_obj.tsk);
 		return;
 	}
 
 	/*
 	 * Take rcu_read_lock() in case the kioctx is being destroyed, as we
-	 * need to issue a wakeup after decrementing reqs_active.
+	 * need to issue a wakeup after incrementing reqs_available.
 	 */
 	rcu_read_lock();
 
@@ -613,17 +872,6 @@ void aio_complete(struct kiocb *iocb, long res, long res2)
 	}
 
 	/*
-	 * cancelled requests don't get events, userland was given one
-	 * when the event got cancelled.
-	 */
-	if (unlikely(xchg(&iocb->ki_cancel,
-			  KIOCB_CANCELLED) == KIOCB_CANCELLED)) {
-		atomic_dec(&ctx->reqs_active);
-		/* Still need the wake_up in case free_ioctx is waiting */
-		goto put_rq;
-	}
-
-	/*
 	 * Add a completion event to the ring buffer. Must be done holding
 	 * ctx->completion_lock to prevent other code from messing with the tail
 	 * pointer since we might be called from irq context.
@@ -675,9 +923,8 @@ void aio_complete(struct kiocb *iocb, long res, long res2)
 	if (iocb->ki_eventfd != NULL)
 		eventfd_signal(iocb->ki_eventfd, 1);
 
-put_rq:
 	/* everything turned out well, dispose of the aiocb. */
-	aio_put_req(iocb);
+	kiocb_free(iocb);
 
 	/*
 	 * We have to order our ring_info tail store above and test
@@ -702,7 +949,7 @@ static long aio_read_events_ring(struct kioctx *ctx,
 				 struct io_event __user *event, long nr)
 {
 	struct aio_ring *ring;
-	unsigned head, pos;
+	unsigned head, tail, pos;
 	long ret = 0;
 	int copy_ret;
 
@@ -710,11 +957,12 @@ static long aio_read_events_ring(struct kioctx *ctx,
 
 	ring = kmap_atomic(ctx->ring_pages[0]);
 	head = ring->head;
+	tail = ring->tail;
 	kunmap_atomic(ring);
 
-	pr_debug("h%u t%u m%u\n", head, ctx->tail, ctx->nr_events);
+	pr_debug("h%u t%u m%u\n", head, tail, ctx->nr_events);
 
-	if (head == ctx->tail)
+	if (head == tail)
 		goto out;
 
 	while (ret < nr) {
@@ -722,8 +970,8 @@ static long aio_read_events_ring(struct kioctx *ctx,
 		struct io_event *ev;
 		struct page *page;
 
-		avail = (head <= ctx->tail ? ctx->tail : ctx->nr_events) - head;
-		if (head == ctx->tail)
+		avail = (head <= tail ? tail : ctx->nr_events) - head;
+		if (head == tail)
 			break;
 
 		avail = min(avail, nr - ret);
@@ -754,9 +1002,9 @@ static long aio_read_events_ring(struct kioctx *ctx,
 	kunmap_atomic(ring);
 	flush_dcache_page(ctx->ring_pages[0]);
 
-	pr_debug("%li h%u t%u\n", ret, head, ctx->tail);
+	pr_debug("%li h%u t%u\n", ret, head, tail);
 
-	atomic_sub(ret, &ctx->reqs_active);
+	put_reqs_available(ctx, ret);
 out:
 	mutex_unlock(&ctx->ring_lock);
 
@@ -854,8 +1102,8 @@ SYSCALL_DEFINE2(io_setup, unsigned, nr_events, aio_context_t __user *, ctxp)
 	if (!IS_ERR(ioctx)) {
 		ret = put_user(ioctx->user_id, ctxp);
 		if (ret)
-			kill_ioctx(ioctx);
-		put_ioctx(ioctx);
+			kill_ioctx(current->mm, ioctx);
+		percpu_ref_put(&ioctx->users);
 	}
 
 out:
@@ -872,101 +1120,37 @@ SYSCALL_DEFINE1(io_destroy, aio_context_t, ctx)
 {
 	struct kioctx *ioctx = lookup_ioctx(ctx);
 	if (likely(NULL != ioctx)) {
-		kill_ioctx(ioctx);
-		put_ioctx(ioctx);
+		kill_ioctx(current->mm, ioctx);
+		percpu_ref_put(&ioctx->users);
 		return 0;
 	}
 	pr_debug("EINVAL: io_destroy: invalid context id\n");
 	return -EINVAL;
 }
 
-static void aio_advance_iovec(struct kiocb *iocb, ssize_t ret)
-{
-	struct iovec *iov = &iocb->ki_iovec[iocb->ki_cur_seg];
-
-	BUG_ON(ret <= 0);
-
-	while (iocb->ki_cur_seg < iocb->ki_nr_segs && ret > 0) {
-		ssize_t this = min((ssize_t)iov->iov_len, ret);
-		iov->iov_base += this;
-		iov->iov_len -= this;
-		iocb->ki_left -= this;
-		ret -= this;
-		if (iov->iov_len == 0) {
-			iocb->ki_cur_seg++;
-			iov++;
-		}
-	}
-
-	/* the caller should not have done more io than what fit in
-	 * the remaining iovecs */
-	BUG_ON(ret > 0 && iocb->ki_left == 0);
-}
-
 typedef ssize_t (aio_rw_op)(struct kiocb *, const struct iovec *,
 			    unsigned long, loff_t);
 
-static ssize_t aio_rw_vect_retry(struct kiocb *iocb, int rw, aio_rw_op *rw_op)
-{
-	struct file *file = iocb->ki_filp;
-	struct address_space *mapping = file->f_mapping;
-	struct inode *inode = mapping->host;
-	ssize_t ret = 0;
-
-	/* This matches the pread()/pwrite() logic */
-	if (iocb->ki_pos < 0)
-		return -EINVAL;
-
-	if (rw == WRITE)
-		file_start_write(file);
-	do {
-		ret = rw_op(iocb, &iocb->ki_iovec[iocb->ki_cur_seg],
-			    iocb->ki_nr_segs - iocb->ki_cur_seg,
-			    iocb->ki_pos);
-		if (ret > 0)
-			aio_advance_iovec(iocb, ret);
-
-	/* retry all partial writes.  retry partial reads as long as its a
-	 * regular file. */
-	} while (ret > 0 && iocb->ki_left > 0 &&
-		 (rw == WRITE ||
-		  (!S_ISFIFO(inode->i_mode) && !S_ISSOCK(inode->i_mode))));
-	if (rw == WRITE)
-		file_end_write(file);
-
-	/* This means we must have transferred all that we could */
-	/* No need to retry anymore */
-	if ((ret == 0) || (iocb->ki_left == 0))
-		ret = iocb->ki_nbytes - iocb->ki_left;
-
-	/* If we managed to write some out we return that, rather than
-	 * the eventual error. */
-	if (rw == WRITE
-	    && ret < 0 && ret != -EIOCBQUEUED
-	    && iocb->ki_nbytes - iocb->ki_left)
-		ret = iocb->ki_nbytes - iocb->ki_left;
-
-	return ret;
-}
-
-static ssize_t aio_setup_vectored_rw(int rw, struct kiocb *kiocb, bool compat)
+static ssize_t aio_setup_vectored_rw(struct kiocb *kiocb,
+				     int rw, char __user *buf,
+				     unsigned long *nr_segs,
+				     struct iovec **iovec,
+				     bool compat)
 {
 	ssize_t ret;
 
-	kiocb->ki_nr_segs = kiocb->ki_nbytes;
+	*nr_segs = kiocb->ki_nbytes;
 
 #ifdef CONFIG_COMPAT
 	if (compat)
 		ret = compat_rw_copy_check_uvector(rw,
-				(struct compat_iovec __user *)kiocb->ki_buf,
-				kiocb->ki_nr_segs, 1, &kiocb->ki_inline_vec,
-				&kiocb->ki_iovec);
+				(struct compat_iovec __user *)buf,
+				*nr_segs, 1, *iovec, iovec);
 	else
 #endif
 		ret = rw_copy_check_uvector(rw,
-				(struct iovec __user *)kiocb->ki_buf,
-				kiocb->ki_nr_segs, 1, &kiocb->ki_inline_vec,
-				&kiocb->ki_iovec);
+				(struct iovec __user *)buf,
+				*nr_segs, 1, *iovec, iovec);
 	if (ret < 0)
 		return ret;
 
@@ -975,15 +1159,17 @@ static ssize_t aio_setup_vectored_rw(int rw, struct kiocb *kiocb, bool compat)
 	return 0;
 }
 
-static ssize_t aio_setup_single_vector(int rw, struct kiocb *kiocb)
+static ssize_t aio_setup_single_vector(struct kiocb *kiocb,
+				       int rw, char __user *buf,
+				       unsigned long *nr_segs,
+				       struct iovec *iovec)
 {
-	if (unlikely(!access_ok(!rw, kiocb->ki_buf, kiocb->ki_nbytes)))
+	if (unlikely(!access_ok(!rw, buf, kiocb->ki_nbytes)))
 		return -EFAULT;
 
-	kiocb->ki_iovec = &kiocb->ki_inline_vec;
-	kiocb->ki_iovec->iov_base = kiocb->ki_buf;
-	kiocb->ki_iovec->iov_len = kiocb->ki_nbytes;
-	kiocb->ki_nr_segs = 1;
+	iovec->iov_base = buf;
+	iovec->iov_len = kiocb->ki_nbytes;
+	*nr_segs = 1;
 	return 0;
 }
 
@@ -992,15 +1178,18 @@ static ssize_t aio_setup_single_vector(int rw, struct kiocb *kiocb)
  *	Performs the initial checks and aio retry method
  *	setup for the kiocb at the time of io submission.
  */
-static ssize_t aio_run_iocb(struct kiocb *req, bool compat)
+static ssize_t aio_run_iocb(struct kiocb *req, unsigned opcode,
+			    char __user *buf, bool compat)
 {
 	struct file *file = req->ki_filp;
 	ssize_t ret;
+	unsigned long nr_segs;
 	int rw;
 	fmode_t mode;
 	aio_rw_op *rw_op;
+	struct iovec inline_vec, *iovec = &inline_vec;
 
-	switch (req->ki_opcode) {
+	switch (opcode) {
 	case IOCB_CMD_PREAD:
 	case IOCB_CMD_PREADV:
 		mode = FMODE_READ;
@@ -1021,21 +1210,38 @@ rw_common:
 		if (!rw_op)
 			return -EINVAL;
 
-		ret = (req->ki_opcode == IOCB_CMD_PREADV ||
-		       req->ki_opcode == IOCB_CMD_PWRITEV)
-			? aio_setup_vectored_rw(rw, req, compat)
-			: aio_setup_single_vector(rw, req);
+		ret = (opcode == IOCB_CMD_PREADV ||
+		       opcode == IOCB_CMD_PWRITEV)
+			? aio_setup_vectored_rw(req, rw, buf, &nr_segs,
+						&iovec, compat)
+			: aio_setup_single_vector(req, rw, buf, &nr_segs,
+						  iovec);
 		if (ret)
 			return ret;
 
 		ret = rw_verify_area(rw, file, &req->ki_pos, req->ki_nbytes);
-		if (ret < 0)
+		if (ret < 0) {
+			if (iovec != &inline_vec)
+				kfree(iovec);
 			return ret;
+		}
 
 		req->ki_nbytes = ret;
-		req->ki_left = ret;
 
-		ret = aio_rw_vect_retry(req, rw, rw_op);
+		/* XXX: move/kill - rw_verify_area()? */
+		/* This matches the pread()/pwrite() logic */
+		if (req->ki_pos < 0) {
+			ret = -EINVAL;
+			break;
+		}
+
+		if (rw == WRITE)
+			file_start_write(file);
+
+		ret = rw_op(req, iovec, nr_segs, req->ki_pos);
+
+		if (rw == WRITE)
+			file_end_write(file);
 		break;
 
 	case IOCB_CMD_FDSYNC:
@@ -1057,6 +1263,9 @@ rw_common:
 		return -EINVAL;
 	}
 
+	if (iovec != &inline_vec)
+		kfree(iovec);
+
 	if (ret != -EIOCBQUEUED) {
 		/*
 		 * There's no easy way to restart the syscall since other AIO's
| @@ -1128,21 +1337,18 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, | |||
| 1128 | req->ki_obj.user = user_iocb; | 1337 | req->ki_obj.user = user_iocb; |
| 1129 | req->ki_user_data = iocb->aio_data; | 1338 | req->ki_user_data = iocb->aio_data; |
| 1130 | req->ki_pos = iocb->aio_offset; | 1339 | req->ki_pos = iocb->aio_offset; |
| 1340 | req->ki_nbytes = iocb->aio_nbytes; | ||
| 1131 | 1341 | ||
| 1132 | req->ki_buf = (char __user *)(unsigned long)iocb->aio_buf; | 1342 | ret = aio_run_iocb(req, iocb->aio_lio_opcode, |
| 1133 | req->ki_left = req->ki_nbytes = iocb->aio_nbytes; | 1343 | (char __user *)(unsigned long)iocb->aio_buf, |
| 1134 | req->ki_opcode = iocb->aio_lio_opcode; | 1344 | compat); |
| 1135 | |||
| 1136 | ret = aio_run_iocb(req, compat); | ||
| 1137 | if (ret) | 1345 | if (ret) |
| 1138 | goto out_put_req; | 1346 | goto out_put_req; |
| 1139 | 1347 | ||
| 1140 | aio_put_req(req); /* drop extra ref to req */ | ||
| 1141 | return 0; | 1348 | return 0; |
| 1142 | out_put_req: | 1349 | out_put_req: |
| 1143 | atomic_dec(&ctx->reqs_active); | 1350 | put_reqs_available(ctx, 1); |
| 1144 | aio_put_req(req); /* drop extra ref to req */ | 1351 | kiocb_free(req); |
| 1145 | aio_put_req(req); /* drop i/o ref to req */ | ||
| 1146 | return ret; | 1352 | return ret; |
| 1147 | } | 1353 | } |
| 1148 | 1354 | ||
| @@ -1195,7 +1401,7 @@ long do_io_submit(aio_context_t ctx_id, long nr, | |||
| 1195 | } | 1401 | } |
| 1196 | blk_finish_plug(&plug); | 1402 | blk_finish_plug(&plug); |
| 1197 | 1403 | ||
| 1198 | put_ioctx(ctx); | 1404 | percpu_ref_put(&ctx->users); |
| 1199 | return i ? i : ret; | 1405 | return i ? i : ret; |
| 1200 | } | 1406 | } |
| 1201 | 1407 | ||
| @@ -1252,7 +1458,6 @@ static struct kiocb *lookup_kiocb(struct kioctx *ctx, struct iocb __user *iocb, | |||
| 1252 | SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb, | 1458 | SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb, |
| 1253 | struct io_event __user *, result) | 1459 | struct io_event __user *, result) |
| 1254 | { | 1460 | { |
| 1255 | struct io_event res; | ||
| 1256 | struct kioctx *ctx; | 1461 | struct kioctx *ctx; |
| 1257 | struct kiocb *kiocb; | 1462 | struct kiocb *kiocb; |
| 1258 | u32 key; | 1463 | u32 key; |
| @@ -1270,21 +1475,22 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb, | |||
| 1270 | 1475 | ||
| 1271 | kiocb = lookup_kiocb(ctx, iocb, key); | 1476 | kiocb = lookup_kiocb(ctx, iocb, key); |
| 1272 | if (kiocb) | 1477 | if (kiocb) |
| 1273 | ret = kiocb_cancel(ctx, kiocb, &res); | 1478 | ret = kiocb_cancel(ctx, kiocb); |
| 1274 | else | 1479 | else |
| 1275 | ret = -EINVAL; | 1480 | ret = -EINVAL; |
| 1276 | 1481 | ||
| 1277 | spin_unlock_irq(&ctx->ctx_lock); | 1482 | spin_unlock_irq(&ctx->ctx_lock); |
| 1278 | 1483 | ||
| 1279 | if (!ret) { | 1484 | if (!ret) { |
| 1280 | /* Cancellation succeeded -- copy the result | 1485 | /* |
| 1281 | * into the user's buffer. | 1486 | * The result argument is no longer used - the io_event is |
| 1487 | * always delivered via the ring buffer. -EINPROGRESS indicates | ||
| 1488 | * cancellation is in progress. | ||
| 1282 | */ | 1489 | */ |
| 1283 | if (copy_to_user(result, &res, sizeof(res))) | 1490 | ret = -EINPROGRESS; |
| 1284 | ret = -EFAULT; | ||
| 1285 | } | 1491 | } |
| 1286 | 1492 | ||
| 1287 | put_ioctx(ctx); | 1493 | percpu_ref_put(&ctx->users); |
| 1288 | 1494 | ||
| 1289 | return ret; | 1495 | return ret; |
| 1290 | } | 1496 | } |
| @@ -1313,7 +1519,7 @@ SYSCALL_DEFINE5(io_getevents, aio_context_t, ctx_id, | |||
| 1313 | if (likely(ioctx)) { | 1519 | if (likely(ioctx)) { |
| 1314 | if (likely(min_nr <= nr && min_nr >= 0)) | 1520 | if (likely(min_nr <= nr && min_nr >= 0)) |
| 1315 | ret = read_events(ioctx, min_nr, nr, events, timeout); | 1521 | ret = read_events(ioctx, min_nr, nr, events, timeout); |
| 1316 | put_ioctx(ioctx); | 1522 | percpu_ref_put(&ioctx->users); |
| 1317 | } | 1523 | } |
| 1318 | return ret; | 1524 | return ret; |
| 1319 | } | 1525 | } |
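
The fs/aio.c hunks above change io_cancel()'s user-visible contract: the io_event result argument is never filled in any more, a successful cancellation reports -EINPROGRESS, and the event for the cancelled iocb arrives through the ring like any other completion. A minimal userspace sketch of the new convention follows; the raw syscalls and the <linux/aio_abi.h> types are real, but the file path is a placeholder and, since most requests complete (or lack a cancel method) before they can be cancelled, -EINVAL is the common outcome in practice.

```c
#include <linux/aio_abi.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <fcntl.h>
#include <string.h>
#include <stdio.h>
#include <errno.h>

int main(void)
{
	aio_context_t ctx = 0;
	struct iocb cb, *cbs[1] = { &cb };
	struct io_event unused, ev;
	char buf[512];
	int fd = open("/etc/hostname", O_RDONLY);	/* placeholder path */

	if (fd < 0 || syscall(__NR_io_setup, 8, &ctx) < 0)
		return 1;

	memset(&cb, 0, sizeof(cb));
	cb.aio_fildes = fd;
	cb.aio_lio_opcode = IOCB_CMD_PREAD;
	cb.aio_buf = (unsigned long)buf;
	cb.aio_nbytes = sizeof(buf);

	if (syscall(__NR_io_submit, ctx, 1L, cbs) != 1)
		return 1;

	/* 'unused' is never written to by the kernel any more. */
	if (syscall(__NR_io_cancel, ctx, &cb, &unused) < 0) {
		if (errno == EINPROGRESS)
			printf("cancel accepted; reap the event via io_getevents()\n");
		else
			printf("cancel failed: errno=%d (EINVAL is typical)\n", errno);
	}

	/* The cancelled (or completed) iocb's event still comes from the ring. */
	syscall(__NR_io_getevents, ctx, 1L, 1L, &ev, NULL);
	syscall(__NR_io_destroy, ctx);
	return 0;
}
```
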
diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c index 47a65df8c871..85c961849953 100644 --- a/fs/anon_inodes.c +++ b/fs/anon_inodes.c | |||
| @@ -109,6 +109,72 @@ static struct file_system_type anon_inode_fs_type = { | |||
| 109 | }; | 109 | }; |
| 110 | 110 | ||
| 111 | /** | 111 | /** |
| 112 | * anon_inode_getfile_private - creates a new file instance by hooking it up to an | ||
| 113 | * anonymous inode, and a dentry that describes the "class" | ||
| 114 | * of the file | ||
| 115 | * | ||
| 116 | * @name: [in] name of the "class" of the new file | ||
| 117 | * @fops: [in] file operations for the new file | ||
| 118 | * @priv: [in] private data for the new file (will be file's private_data) | ||
| 119 | * @flags: [in] flags | ||
| 120 | * | ||
| 121 | * | ||
| 122 | * Similar to anon_inode_getfile, but each file holds a single inode. | ||
| 123 | * | ||
| 124 | */ | ||
| 125 | struct file *anon_inode_getfile_private(const char *name, | ||
| 126 | const struct file_operations *fops, | ||
| 127 | void *priv, int flags) | ||
| 128 | { | ||
| 129 | struct qstr this; | ||
| 130 | struct path path; | ||
| 131 | struct file *file; | ||
| 132 | struct inode *inode; | ||
| 133 | |||
| 134 | if (fops->owner && !try_module_get(fops->owner)) | ||
| 135 | return ERR_PTR(-ENOENT); | ||
| 136 | |||
| 137 | inode = anon_inode_mkinode(anon_inode_mnt->mnt_sb); | ||
| 138 | if (IS_ERR(inode)) { | ||
| 139 | file = ERR_PTR(-ENOMEM); | ||
| 140 | goto err_module; | ||
| 141 | } | ||
| 142 | |||
| 143 | /* | ||
| 144 | * Link the inode to a directory entry by creating a unique name | ||
| 145 | * using the inode sequence number. | ||
| 146 | */ | ||
| 147 | file = ERR_PTR(-ENOMEM); | ||
| 148 | this.name = name; | ||
| 149 | this.len = strlen(name); | ||
| 150 | this.hash = 0; | ||
| 151 | path.dentry = d_alloc_pseudo(anon_inode_mnt->mnt_sb, &this); | ||
| 152 | if (!path.dentry) | ||
| 153 | goto err_module; | ||
| 154 | |||
| 155 | path.mnt = mntget(anon_inode_mnt); | ||
| 156 | |||
| 157 | d_instantiate(path.dentry, inode); | ||
| 158 | |||
| 159 | file = alloc_file(&path, OPEN_FMODE(flags), fops); | ||
| 160 | if (IS_ERR(file)) | ||
| 161 | goto err_dput; | ||
| 162 | |||
| 163 | file->f_mapping = inode->i_mapping; | ||
| 164 | file->f_flags = flags & (O_ACCMODE | O_NONBLOCK); | ||
| 165 | file->private_data = priv; | ||
| 166 | |||
| 167 | return file; | ||
| 168 | |||
| 169 | err_dput: | ||
| 170 | path_put(&path); | ||
| 171 | err_module: | ||
| 172 | module_put(fops->owner); | ||
| 173 | return file; | ||
| 174 | } | ||
| 175 | EXPORT_SYMBOL_GPL(anon_inode_getfile_private); | ||
| 176 | |||
| 177 | /** | ||
| 112 | * anon_inode_getfile - creates a new file instance by hooking it up to an | 178 | * anon_inode_getfile - creates a new file instance by hooking it up to an |
| 113 | * anonymous inode, and a dentry that describe the "class" | 179 | * anonymous inode, and a dentry that describe the "class" |
| 114 | * of the file | 180 | * of the file |
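
The new anon_inode_getfile_private() above differs from anon_inode_getfile() in that each call allocates its own inode, so the returned file gets a private f_mapping. A hedged caller sketch follows; the "[my_ring]" name, my_ring_fops and my_ring_file_create() are hypothetical, and only the helper's signature comes from this diff.

```c
#include <linux/anon_inodes.h>
#include <linux/module.h>
#include <linux/fs.h>
#include <linux/err.h>

static const struct file_operations my_ring_fops = {
	.owner = THIS_MODULE,
	/* .mmap, .release, ... as the driver requires */
};

static struct file *my_ring_file_create(void *ring_ctx)
{
	struct file *filp;

	filp = anon_inode_getfile_private("[my_ring]", &my_ring_fops,
					  ring_ctx, O_RDWR);
	if (IS_ERR(filp))
		return filp;

	/*
	 * filp->f_mapping belongs to a freshly allocated inode, so per-file
	 * address_space_operations (e.g. a ->migratepage hook) can be
	 * installed here without affecting other anon-inode users.
	 */
	return filp;
}
```
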
diff --git a/fs/block_dev.c b/fs/block_dev.c index 1173a4ee0830..c3549ed58038 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c | |||
| @@ -1542,7 +1542,7 @@ static ssize_t blkdev_aio_read(struct kiocb *iocb, const struct iovec *iov, | |||
| 1542 | return 0; | 1542 | return 0; |
| 1543 | 1543 | ||
| 1544 | size -= pos; | 1544 | size -= pos; |
| 1545 | if (size < iocb->ki_left) | 1545 | if (size < iocb->ki_nbytes) |
| 1546 | nr_segs = iov_shorten((struct iovec *)iov, nr_segs, size); | 1546 | nr_segs = iov_shorten((struct iovec *)iov, nr_segs, size); |
| 1547 | return generic_file_aio_read(iocb, iov, nr_segs, pos); | 1547 | return generic_file_aio_read(iocb, iov, nr_segs, pos); |
| 1548 | } | 1548 | } |
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 0bd7a55a5f07..91ff089d3412 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c | |||
| @@ -130,7 +130,6 @@ ssize_t nfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, loff_ | |||
| 130 | 130 | ||
| 131 | return -EINVAL; | 131 | return -EINVAL; |
| 132 | #else | 132 | #else |
| 133 | VM_BUG_ON(iocb->ki_left != PAGE_SIZE); | ||
| 134 | VM_BUG_ON(iocb->ki_nbytes != PAGE_SIZE); | 133 | VM_BUG_ON(iocb->ki_nbytes != PAGE_SIZE); |
| 135 | 134 | ||
| 136 | if (rw == READ || rw == KERNEL_READ) | 135 | if (rw == READ || rw == KERNEL_READ) |
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 4f8197caa487..d71903c6068b 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c | |||
| @@ -2242,7 +2242,7 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb, | |||
| 2242 | file->f_path.dentry->d_name.name, | 2242 | file->f_path.dentry->d_name.name, |
| 2243 | (unsigned int)nr_segs); | 2243 | (unsigned int)nr_segs); |
| 2244 | 2244 | ||
| 2245 | if (iocb->ki_left == 0) | 2245 | if (iocb->ki_nbytes == 0) |
| 2246 | return 0; | 2246 | return 0; |
| 2247 | 2247 | ||
| 2248 | appending = file->f_flags & O_APPEND ? 1 : 0; | 2248 | appending = file->f_flags & O_APPEND ? 1 : 0; |
| @@ -2293,7 +2293,7 @@ relock: | |||
| 2293 | 2293 | ||
| 2294 | can_do_direct = direct_io; | 2294 | can_do_direct = direct_io; |
| 2295 | ret = ocfs2_prepare_inode_for_write(file, ppos, | 2295 | ret = ocfs2_prepare_inode_for_write(file, ppos, |
| 2296 | iocb->ki_left, appending, | 2296 | iocb->ki_nbytes, appending, |
| 2297 | &can_do_direct, &has_refcount); | 2297 | &can_do_direct, &has_refcount); |
| 2298 | if (ret < 0) { | 2298 | if (ret < 0) { |
| 2299 | mlog_errno(ret); | 2299 | mlog_errno(ret); |
| @@ -2301,7 +2301,7 @@ relock: | |||
| 2301 | } | 2301 | } |
| 2302 | 2302 | ||
| 2303 | if (direct_io && !is_sync_kiocb(iocb)) | 2303 | if (direct_io && !is_sync_kiocb(iocb)) |
| 2304 | unaligned_dio = ocfs2_is_io_unaligned(inode, iocb->ki_left, | 2304 | unaligned_dio = ocfs2_is_io_unaligned(inode, iocb->ki_nbytes, |
| 2305 | *ppos); | 2305 | *ppos); |
| 2306 | 2306 | ||
| 2307 | /* | 2307 | /* |
diff --git a/fs/read_write.c b/fs/read_write.c index 122a3846d9e1..e3cd280b158c 100644 --- a/fs/read_write.c +++ b/fs/read_write.c | |||
| @@ -367,7 +367,6 @@ ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *pp | |||
| 367 | 367 | ||
| 368 | init_sync_kiocb(&kiocb, filp); | 368 | init_sync_kiocb(&kiocb, filp); |
| 369 | kiocb.ki_pos = *ppos; | 369 | kiocb.ki_pos = *ppos; |
| 370 | kiocb.ki_left = len; | ||
| 371 | kiocb.ki_nbytes = len; | 370 | kiocb.ki_nbytes = len; |
| 372 | 371 | ||
| 373 | ret = filp->f_op->aio_read(&kiocb, &iov, 1, kiocb.ki_pos); | 372 | ret = filp->f_op->aio_read(&kiocb, &iov, 1, kiocb.ki_pos); |
| @@ -417,7 +416,6 @@ ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, lof | |||
| 417 | 416 | ||
| 418 | init_sync_kiocb(&kiocb, filp); | 417 | init_sync_kiocb(&kiocb, filp); |
| 419 | kiocb.ki_pos = *ppos; | 418 | kiocb.ki_pos = *ppos; |
| 420 | kiocb.ki_left = len; | ||
| 421 | kiocb.ki_nbytes = len; | 419 | kiocb.ki_nbytes = len; |
| 422 | 420 | ||
| 423 | ret = filp->f_op->aio_write(&kiocb, &iov, 1, kiocb.ki_pos); | 421 | ret = filp->f_op->aio_write(&kiocb, &iov, 1, kiocb.ki_pos); |
| @@ -599,7 +597,6 @@ static ssize_t do_sync_readv_writev(struct file *filp, const struct iovec *iov, | |||
| 599 | 597 | ||
| 600 | init_sync_kiocb(&kiocb, filp); | 598 | init_sync_kiocb(&kiocb, filp); |
| 601 | kiocb.ki_pos = *ppos; | 599 | kiocb.ki_pos = *ppos; |
| 602 | kiocb.ki_left = len; | ||
| 603 | kiocb.ki_nbytes = len; | 600 | kiocb.ki_nbytes = len; |
| 604 | 601 | ||
| 605 | ret = fn(&kiocb, iov, nr_segs, kiocb.ki_pos); | 602 | ret = fn(&kiocb, iov, nr_segs, kiocb.ki_pos); |
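
With ki_left removed, the sync wrappers in fs/read_write.c (and the swap-out path in mm/page_io.c below) initialise only ki_pos and ki_nbytes. A minimal sketch of that pattern, modelled on do_sync_read(); my_sync_read() is a hypothetical stand-in and assumes the file implements ->aio_read.

```c
#include <linux/fs.h>
#include <linux/aio.h>
#include <linux/uio.h>
#include <linux/errno.h>

/* Hypothetical stand-in for do_sync_read() after the ki_left removal. */
static ssize_t my_sync_read(struct file *filp, char __user *buf,
			    size_t len, loff_t *ppos)
{
	struct iovec iov = { .iov_base = buf, .iov_len = len };
	struct kiocb kiocb;
	ssize_t ret;

	init_sync_kiocb(&kiocb, filp);
	kiocb.ki_pos = *ppos;
	kiocb.ki_nbytes = len;		/* the ki_left assignment is simply gone */

	/* Assumes filp->f_op->aio_read is non-NULL (vfs_read checks this). */
	ret = filp->f_op->aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
	if (ret == -EIOCBQUEUED)
		ret = wait_on_sync_kiocb(&kiocb);
	*ppos = kiocb.ki_pos;
	return ret;
}
```
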
diff --git a/fs/udf/file.c b/fs/udf/file.c index 29569dd08168..c02a27a19c6d 100644 --- a/fs/udf/file.c +++ b/fs/udf/file.c | |||
| @@ -141,7 +141,7 @@ static ssize_t udf_file_aio_write(struct kiocb *iocb, const struct iovec *iov, | |||
| 141 | struct file *file = iocb->ki_filp; | 141 | struct file *file = iocb->ki_filp; |
| 142 | struct inode *inode = file_inode(file); | 142 | struct inode *inode = file_inode(file); |
| 143 | int err, pos; | 143 | int err, pos; |
| 144 | size_t count = iocb->ki_left; | 144 | size_t count = iocb->ki_nbytes; |
| 145 | struct udf_inode_info *iinfo = UDF_I(inode); | 145 | struct udf_inode_info *iinfo = UDF_I(inode); |
| 146 | 146 | ||
| 147 | down_write(&iinfo->i_data_sem); | 147 | down_write(&iinfo->i_data_sem); |
diff --git a/include/linux/aio.h b/include/linux/aio.h index 1bdf965339f9..d9c92daa3944 100644 --- a/include/linux/aio.h +++ b/include/linux/aio.h | |||
| @@ -27,15 +27,13 @@ struct kiocb; | |||
| 27 | */ | 27 | */ |
| 28 | #define KIOCB_CANCELLED ((void *) (~0ULL)) | 28 | #define KIOCB_CANCELLED ((void *) (~0ULL)) |
| 29 | 29 | ||
| 30 | typedef int (kiocb_cancel_fn)(struct kiocb *, struct io_event *); | 30 | typedef int (kiocb_cancel_fn)(struct kiocb *); |
| 31 | 31 | ||
| 32 | struct kiocb { | 32 | struct kiocb { |
| 33 | atomic_t ki_users; | ||
| 34 | |||
| 35 | struct file *ki_filp; | 33 | struct file *ki_filp; |
| 36 | struct kioctx *ki_ctx; /* NULL for sync ops */ | 34 | struct kioctx *ki_ctx; /* NULL for sync ops */ |
| 37 | kiocb_cancel_fn *ki_cancel; | 35 | kiocb_cancel_fn *ki_cancel; |
| 38 | void (*ki_dtor)(struct kiocb *); | 36 | void *private; |
| 39 | 37 | ||
| 40 | union { | 38 | union { |
| 41 | void __user *user; | 39 | void __user *user; |
| @@ -44,17 +42,7 @@ struct kiocb { | |||
| 44 | 42 | ||
| 45 | __u64 ki_user_data; /* user's data for completion */ | 43 | __u64 ki_user_data; /* user's data for completion */ |
| 46 | loff_t ki_pos; | 44 | loff_t ki_pos; |
| 47 | 45 | size_t ki_nbytes; /* copy of iocb->aio_nbytes */ | |
| 48 | void *private; | ||
| 49 | /* State that we remember to be able to restart/retry */ | ||
| 50 | unsigned short ki_opcode; | ||
| 51 | size_t ki_nbytes; /* copy of iocb->aio_nbytes */ | ||
| 52 | char __user *ki_buf; /* remaining iocb->aio_buf */ | ||
| 53 | size_t ki_left; /* remaining bytes */ | ||
| 54 | struct iovec ki_inline_vec; /* inline vector */ | ||
| 55 | struct iovec *ki_iovec; | ||
| 56 | unsigned long ki_nr_segs; | ||
| 57 | unsigned long ki_cur_seg; | ||
| 58 | 46 | ||
| 59 | struct list_head ki_list; /* the aio core uses this | 47 | struct list_head ki_list; /* the aio core uses this |
| 60 | * for cancellation */ | 48 | * for cancellation */ |
| @@ -74,7 +62,6 @@ static inline bool is_sync_kiocb(struct kiocb *kiocb) | |||
| 74 | static inline void init_sync_kiocb(struct kiocb *kiocb, struct file *filp) | 62 | static inline void init_sync_kiocb(struct kiocb *kiocb, struct file *filp) |
| 75 | { | 63 | { |
| 76 | *kiocb = (struct kiocb) { | 64 | *kiocb = (struct kiocb) { |
| 77 | .ki_users = ATOMIC_INIT(1), | ||
| 78 | .ki_ctx = NULL, | 65 | .ki_ctx = NULL, |
| 79 | .ki_filp = filp, | 66 | .ki_filp = filp, |
| 80 | .ki_obj.tsk = current, | 67 | .ki_obj.tsk = current, |
| @@ -84,7 +71,6 @@ static inline void init_sync_kiocb(struct kiocb *kiocb, struct file *filp) | |||
| 84 | /* prototypes */ | 71 | /* prototypes */ |
| 85 | #ifdef CONFIG_AIO | 72 | #ifdef CONFIG_AIO |
| 86 | extern ssize_t wait_on_sync_kiocb(struct kiocb *iocb); | 73 | extern ssize_t wait_on_sync_kiocb(struct kiocb *iocb); |
| 87 | extern void aio_put_req(struct kiocb *iocb); | ||
| 88 | extern void aio_complete(struct kiocb *iocb, long res, long res2); | 74 | extern void aio_complete(struct kiocb *iocb, long res, long res2); |
| 89 | struct mm_struct; | 75 | struct mm_struct; |
| 90 | extern void exit_aio(struct mm_struct *mm); | 76 | extern void exit_aio(struct mm_struct *mm); |
| @@ -93,7 +79,6 @@ extern long do_io_submit(aio_context_t ctx_id, long nr, | |||
| 93 | void kiocb_set_cancel_fn(struct kiocb *req, kiocb_cancel_fn *cancel); | 79 | void kiocb_set_cancel_fn(struct kiocb *req, kiocb_cancel_fn *cancel); |
| 94 | #else | 80 | #else |
| 95 | static inline ssize_t wait_on_sync_kiocb(struct kiocb *iocb) { return 0; } | 81 | static inline ssize_t wait_on_sync_kiocb(struct kiocb *iocb) { return 0; } |
| 96 | static inline void aio_put_req(struct kiocb *iocb) { } | ||
| 97 | static inline void aio_complete(struct kiocb *iocb, long res, long res2) { } | 82 | static inline void aio_complete(struct kiocb *iocb, long res, long res2) { } |
| 98 | struct mm_struct; | 83 | struct mm_struct; |
| 99 | static inline void exit_aio(struct mm_struct *mm) { } | 84 | static inline void exit_aio(struct mm_struct *mm) { } |
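
With ki_users, ki_dtor and the io_event argument gone, a driver that supports cancellation now registers a plain int (*)(struct kiocb *) via kiocb_set_cancel_fn() and reports completion through aio_complete() from its normal completion path. A hedged sketch follows; all my_* names are hypothetical and my_hw_abort() stands in for whatever abort mechanism the driver has.

```c
#include <linux/aio.h>

struct my_req {
	struct kiocb	*iocb;
	/* ... driver state ... */
};

static void my_hw_abort(struct my_req *req)
{
	/* Hypothetical: tell the device/queue to abort this request. */
}

static int my_cancel(struct kiocb *iocb)
{
	struct my_req *req = iocb->private;

	/*
	 * Kick off the abort, but do not call aio_complete() here: the core
	 * invokes this callback with ctx->ctx_lock held, so the completion
	 * must come from the normal completion path below.
	 */
	my_hw_abort(req);
	return 0;			/* 0: cancellation has been initiated */
}

static void my_complete(struct my_req *req, long res)
{
	/* Runs for normal and aborted requests alike, e.g. from irq context. */
	aio_complete(req->iocb, res, 0);
}

static void my_submit(struct my_req *req, struct kiocb *iocb)
{
	req->iocb = iocb;
	iocb->private = req;
	kiocb_set_cancel_fn(iocb, my_cancel);	/* new: no io_event argument */
}
```
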
diff --git a/include/linux/anon_inodes.h b/include/linux/anon_inodes.h index 8013a45242fe..cf573c22b81e 100644 --- a/include/linux/anon_inodes.h +++ b/include/linux/anon_inodes.h | |||
| @@ -13,6 +13,9 @@ struct file_operations; | |||
| 13 | struct file *anon_inode_getfile(const char *name, | 13 | struct file *anon_inode_getfile(const char *name, |
| 14 | const struct file_operations *fops, | 14 | const struct file_operations *fops, |
| 15 | void *priv, int flags); | 15 | void *priv, int flags); |
| 16 | struct file *anon_inode_getfile_private(const char *name, | ||
| 17 | const struct file_operations *fops, | ||
| 18 | void *priv, int flags); | ||
| 16 | int anon_inode_getfd(const char *name, const struct file_operations *fops, | 19 | int anon_inode_getfd(const char *name, const struct file_operations *fops, |
| 17 | void *priv, int flags); | 20 | void *priv, int flags); |
| 18 | 21 | ||
diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 6fe521420631..8d3c57fdf221 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h | |||
| @@ -53,6 +53,9 @@ extern int migrate_vmas(struct mm_struct *mm, | |||
| 53 | extern void migrate_page_copy(struct page *newpage, struct page *page); | 53 | extern void migrate_page_copy(struct page *newpage, struct page *page); |
| 54 | extern int migrate_huge_page_move_mapping(struct address_space *mapping, | 54 | extern int migrate_huge_page_move_mapping(struct address_space *mapping, |
| 55 | struct page *newpage, struct page *page); | 55 | struct page *newpage, struct page *page); |
| 56 | extern int migrate_page_move_mapping(struct address_space *mapping, | ||
| 57 | struct page *newpage, struct page *page, | ||
| 58 | struct buffer_head *head, enum migrate_mode mode); | ||
| 56 | #else | 59 | #else |
| 57 | 60 | ||
| 58 | static inline void putback_lru_pages(struct list_head *l) {} | 61 | static inline void putback_lru_pages(struct list_head *l) {} |
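
migrate_page_move_mapping() is declared here (and un-static'ed in mm/migrate.c below) so that a filesystem's ->migratepage callback can reuse the core mapping-swap logic, which is what the aio ring buffer needs for its ring pages. A hedged sketch of such a callback follows; my_migratepage and my_aops are hypothetical, and a real user needs additional locking around its own references to the old page.

```c
#include <linux/migrate.h>
#include <linux/fs.h>

static int my_migratepage(struct address_space *mapping,
			  struct page *new, struct page *old,
			  enum migrate_mode mode)
{
	int rc;

	/* Swap 'old' for 'new' in the mapping's radix tree and move refs. */
	rc = migrate_page_move_mapping(mapping, new, old, NULL, mode);
	if (rc != MIGRATEPAGE_SUCCESS)
		return rc;

	/* Copy page contents and state (flags, dirtiness, ...) across. */
	migrate_page_copy(new, old);

	/* Update any private pointers that still reference 'old' (assumed). */

	return MIGRATEPAGE_SUCCESS;
}

static const struct address_space_operations my_aops = {
	.migratepage	= my_migratepage,
	/* other a_ops as needed */
};
```
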
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index faf4b7c1ad12..d9851eeb6e1d 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h | |||
| @@ -322,6 +322,7 @@ struct mm_rss_stat { | |||
| 322 | atomic_long_t count[NR_MM_COUNTERS]; | 322 | atomic_long_t count[NR_MM_COUNTERS]; |
| 323 | }; | 323 | }; |
| 324 | 324 | ||
| 325 | struct kioctx_table; | ||
| 325 | struct mm_struct { | 326 | struct mm_struct { |
| 326 | struct vm_area_struct * mmap; /* list of VMAs */ | 327 | struct vm_area_struct * mmap; /* list of VMAs */ |
| 327 | struct rb_root mm_rb; | 328 | struct rb_root mm_rb; |
| @@ -383,8 +384,8 @@ struct mm_struct { | |||
| 383 | 384 | ||
| 384 | struct core_state *core_state; /* coredumping support */ | 385 | struct core_state *core_state; /* coredumping support */ |
| 385 | #ifdef CONFIG_AIO | 386 | #ifdef CONFIG_AIO |
| 386 | spinlock_t ioctx_lock; | 387 | spinlock_t ioctx_lock; |
| 387 | struct hlist_head ioctx_list; | 388 | struct kioctx_table __rcu *ioctx_table; |
| 388 | #endif | 389 | #endif |
| 389 | #ifdef CONFIG_MM_OWNER | 390 | #ifdef CONFIG_MM_OWNER |
| 390 | /* | 391 | /* |
diff --git a/kernel/fork.c b/kernel/fork.c index 81ccb4f010c2..086fe73ad6bd 100644 --- a/kernel/fork.c +++ b/kernel/fork.c | |||
| @@ -519,7 +519,7 @@ static void mm_init_aio(struct mm_struct *mm) | |||
| 519 | { | 519 | { |
| 520 | #ifdef CONFIG_AIO | 520 | #ifdef CONFIG_AIO |
| 521 | spin_lock_init(&mm->ioctx_lock); | 521 | spin_lock_init(&mm->ioctx_lock); |
| 522 | INIT_HLIST_HEAD(&mm->ioctx_list); | 522 | mm->ioctx_table = NULL; |
| 523 | #endif | 523 | #endif |
| 524 | } | 524 | } |
| 525 | 525 | ||
diff --git a/mm/migrate.c b/mm/migrate.c index b7ded7eafe3a..9c8d5f59d30b 100644 --- a/mm/migrate.c +++ b/mm/migrate.c | |||
| @@ -311,7 +311,7 @@ static inline bool buffer_migrate_lock_buffers(struct buffer_head *head, | |||
| 311 | * 2 for pages with a mapping | 311 | * 2 for pages with a mapping |
| 312 | * 3 for pages with a mapping and PagePrivate/PagePrivate2 set. | 312 | * 3 for pages with a mapping and PagePrivate/PagePrivate2 set. |
| 313 | */ | 313 | */ |
| 314 | static int migrate_page_move_mapping(struct address_space *mapping, | 314 | int migrate_page_move_mapping(struct address_space *mapping, |
| 315 | struct page *newpage, struct page *page, | 315 | struct page *newpage, struct page *page, |
| 316 | struct buffer_head *head, enum migrate_mode mode) | 316 | struct buffer_head *head, enum migrate_mode mode) |
| 317 | { | 317 | { |
diff --git a/mm/page_io.c b/mm/page_io.c index ba05b64e5d8d..8c79a4764be0 100644 --- a/mm/page_io.c +++ b/mm/page_io.c | |||
| @@ -266,7 +266,6 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc, | |||
| 266 | 266 | ||
| 267 | init_sync_kiocb(&kiocb, swap_file); | 267 | init_sync_kiocb(&kiocb, swap_file); |
| 268 | kiocb.ki_pos = page_file_offset(page); | 268 | kiocb.ki_pos = page_file_offset(page); |
| 269 | kiocb.ki_left = PAGE_SIZE; | ||
| 270 | kiocb.ki_nbytes = PAGE_SIZE; | 269 | kiocb.ki_nbytes = PAGE_SIZE; |
| 271 | 270 | ||
| 272 | set_page_writeback(page); | 271 | set_page_writeback(page); |
diff --git a/net/socket.c b/net/socket.c index 0ceaa5cb9ead..ebed4b68f768 100644 --- a/net/socket.c +++ b/net/socket.c | |||
| @@ -854,11 +854,6 @@ int kernel_recvmsg(struct socket *sock, struct msghdr *msg, | |||
| 854 | } | 854 | } |
| 855 | EXPORT_SYMBOL(kernel_recvmsg); | 855 | EXPORT_SYMBOL(kernel_recvmsg); |
| 856 | 856 | ||
| 857 | static void sock_aio_dtor(struct kiocb *iocb) | ||
| 858 | { | ||
| 859 | kfree(iocb->private); | ||
| 860 | } | ||
| 861 | |||
| 862 | static ssize_t sock_sendpage(struct file *file, struct page *page, | 857 | static ssize_t sock_sendpage(struct file *file, struct page *page, |
| 863 | int offset, size_t size, loff_t *ppos, int more) | 858 | int offset, size_t size, loff_t *ppos, int more) |
| 864 | { | 859 | { |
| @@ -889,12 +884,8 @@ static ssize_t sock_splice_read(struct file *file, loff_t *ppos, | |||
| 889 | static struct sock_iocb *alloc_sock_iocb(struct kiocb *iocb, | 884 | static struct sock_iocb *alloc_sock_iocb(struct kiocb *iocb, |
| 890 | struct sock_iocb *siocb) | 885 | struct sock_iocb *siocb) |
| 891 | { | 886 | { |
| 892 | if (!is_sync_kiocb(iocb)) { | 887 | if (!is_sync_kiocb(iocb)) |
| 893 | siocb = kmalloc(sizeof(*siocb), GFP_KERNEL); | 888 | BUG(); |
| 894 | if (!siocb) | ||
| 895 | return NULL; | ||
| 896 | iocb->ki_dtor = sock_aio_dtor; | ||
| 897 | } | ||
| 898 | 889 | ||
| 899 | siocb->kiocb = iocb; | 890 | siocb->kiocb = iocb; |
| 900 | iocb->private = siocb; | 891 | iocb->private = siocb; |
| @@ -931,7 +922,7 @@ static ssize_t sock_aio_read(struct kiocb *iocb, const struct iovec *iov, | |||
| 931 | if (pos != 0) | 922 | if (pos != 0) |
| 932 | return -ESPIPE; | 923 | return -ESPIPE; |
| 933 | 924 | ||
| 934 | if (iocb->ki_left == 0) /* Match SYS5 behaviour */ | 925 | if (iocb->ki_nbytes == 0) /* Match SYS5 behaviour */ |
| 935 | return 0; | 926 | return 0; |
| 936 | 927 | ||
| 937 | 928 | ||
