author     Linus Torvalds <torvalds@linux-foundation.org>   2013-09-13 13:55:58 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>   2013-09-13 13:55:58 -0400
commit     9bf12df31f282e845b3dfaac1e5d5376a041da22 (patch)
tree       10d7a21d34c7f2c47eff3e807f5efef46228d507 /fs
parent     399a946edbbe90bd03aec2e93ce58c9b3f18e70b (diff)
parent     d9b2c8714aef102dea95544a8cd9372b21af463f (diff)
Merge git://git.kvack.org/~bcrl/aio-next
Pull aio changes from Ben LaHaise:
"First off, sorry for this pull request being late in the merge window.
Al had raised a couple of concerns about 2 items in the series below.
I addressed the first issue (the race introduced by Gu's use of
mm_populate()), but he has not provided any further details on how he
wants to rework the anon_inode.c changes (which were sent out months
ago but have yet to be commented on).
The bulk of the changes have been sitting in the -next tree for a few
months, with all the issues raised being addressed"
* git://git.kvack.org/~bcrl/aio-next: (22 commits)
aio: rcu_read_lock protection for new rcu_dereference calls
aio: fix race in ring buffer page lookup introduced by page migration support
aio: fix rcu sparse warnings introduced by ioctx table lookup patch
aio: remove unnecessary debugging from aio_free_ring()
aio: table lookup: verify ctx pointer
staging/lustre: kiocb->ki_left is removed
aio: fix error handling and rcu usage in "convert the ioctx list to table lookup v3"
aio: be defensive to ensure request batching is non-zero instead of BUG_ON()
aio: convert the ioctx list to table lookup v3
aio: double aio_max_nr in calculations
aio: Kill ki_dtor
aio: Kill ki_users
aio: Kill unneeded kiocb members
aio: Kill aio_rw_vect_retry()
aio: Don't use ctx->tail unnecessarily
aio: io_cancel() no longer returns the io_event
aio: percpu ioctx refcount
aio: percpu reqs_available
aio: reqs_active -> reqs_available
aio: fix build when migration is disabled
...
Diffstat (limited to 'fs')
-rw-r--r--   fs/aio.c         | 726
-rw-r--r--   fs/anon_inodes.c |  66
-rw-r--r--   fs/block_dev.c   |   2
-rw-r--r--   fs/nfs/direct.c  |   1
-rw-r--r--   fs/ocfs2/file.c  |   6
-rw-r--r--   fs/read_write.c  |   3
-rw-r--r--   fs/udf/file.c    |   2
7 files changed, 537 insertions, 269 deletions
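
The diff that follows is the fs/aio.c rework itself. For context, here is a minimal, hypothetical userspace sketch of the interface that code implements: io_setup() creates the kioctx and its ring buffer, io_submit() queues an iocb, and io_getevents() pulls completions back out of the ring. It is not part of the patch; the file name and sizes are arbitrary and error handling is omitted.

```c
/* Illustrative only: raw Linux AIO syscalls, not code from this series. */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <time.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/aio_abi.h>

static long io_setup(unsigned nr, aio_context_t *ctx)
{
	return syscall(__NR_io_setup, nr, ctx);
}
static long io_submit(aio_context_t ctx, long nr, struct iocb **iocbs)
{
	return syscall(__NR_io_submit, ctx, nr, iocbs);
}
static long io_getevents(aio_context_t ctx, long min_nr, long nr,
			 struct io_event *events, struct timespec *timeout)
{
	return syscall(__NR_io_getevents, ctx, min_nr, nr, events, timeout);
}
static long io_destroy(aio_context_t ctx)
{
	return syscall(__NR_io_destroy, ctx);
}

int main(void)
{
	aio_context_t ctx = 0;
	char buf[4096];
	struct iocb cb, *cbs[1] = { &cb };
	struct io_event ev;
	int fd = open("/etc/hostname", O_RDONLY);	/* arbitrary example file */

	io_setup(128, &ctx);		/* nr_events sizes the ring buffer */

	memset(&cb, 0, sizeof(cb));
	cb.aio_lio_opcode = IOCB_CMD_PREAD;
	cb.aio_fildes     = fd;
	cb.aio_buf        = (unsigned long)buf;
	cb.aio_nbytes     = sizeof(buf);
	cb.aio_offset     = 0;

	io_submit(ctx, 1, cbs);
	io_getevents(ctx, 1, 1, &ev, NULL);	/* completion comes off the ring */
	printf("read returned %lld\n", (long long)ev.res);

	io_destroy(ctx);
	close(fd);
	return 0;
}
```

The ctx value returned by io_setup() is the address of the ring mapping (ctx->user_id in the code below), which is why lookup_ioctx() in this series can read the context id straight out of the ring header with get_user().
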
@@ -26,6 +26,7 @@ | |||
26 | #include <linux/mm.h> | 26 | #include <linux/mm.h> |
27 | #include <linux/mman.h> | 27 | #include <linux/mman.h> |
28 | #include <linux/mmu_context.h> | 28 | #include <linux/mmu_context.h> |
29 | #include <linux/percpu.h> | ||
29 | #include <linux/slab.h> | 30 | #include <linux/slab.h> |
30 | #include <linux/timer.h> | 31 | #include <linux/timer.h> |
31 | #include <linux/aio.h> | 32 | #include <linux/aio.h> |
@@ -35,6 +36,10 @@ | |||
35 | #include <linux/eventfd.h> | 36 | #include <linux/eventfd.h> |
36 | #include <linux/blkdev.h> | 37 | #include <linux/blkdev.h> |
37 | #include <linux/compat.h> | 38 | #include <linux/compat.h> |
39 | #include <linux/anon_inodes.h> | ||
40 | #include <linux/migrate.h> | ||
41 | #include <linux/ramfs.h> | ||
42 | #include <linux/percpu-refcount.h> | ||
38 | 43 | ||
39 | #include <asm/kmap_types.h> | 44 | #include <asm/kmap_types.h> |
40 | #include <asm/uaccess.h> | 45 | #include <asm/uaccess.h> |
@@ -61,14 +66,29 @@ struct aio_ring { | |||
61 | 66 | ||
62 | #define AIO_RING_PAGES 8 | 67 | #define AIO_RING_PAGES 8 |
63 | 68 | ||
69 | struct kioctx_table { | ||
70 | struct rcu_head rcu; | ||
71 | unsigned nr; | ||
72 | struct kioctx *table[]; | ||
73 | }; | ||
74 | |||
75 | struct kioctx_cpu { | ||
76 | unsigned reqs_available; | ||
77 | }; | ||
78 | |||
64 | struct kioctx { | 79 | struct kioctx { |
65 | atomic_t users; | 80 | struct percpu_ref users; |
66 | atomic_t dead; | 81 | atomic_t dead; |
67 | 82 | ||
68 | /* This needs improving */ | ||
69 | unsigned long user_id; | 83 | unsigned long user_id; |
70 | struct hlist_node list; | ||
71 | 84 | ||
85 | struct __percpu kioctx_cpu *cpu; | ||
86 | |||
87 | /* | ||
88 | * For percpu reqs_available, number of slots we move to/from global | ||
89 | * counter at a time: | ||
90 | */ | ||
91 | unsigned req_batch; | ||
72 | /* | 92 | /* |
73 | * This is what userspace passed to io_setup(), it's not used for | 93 | * This is what userspace passed to io_setup(), it's not used for |
74 | * anything but counting against the global max_reqs quota. | 94 | * anything but counting against the global max_reqs quota. |
@@ -88,10 +108,18 @@ struct kioctx { | |||
88 | long nr_pages; | 108 | long nr_pages; |
89 | 109 | ||
90 | struct rcu_head rcu_head; | 110 | struct rcu_head rcu_head; |
91 | struct work_struct rcu_work; | 111 | struct work_struct free_work; |
92 | 112 | ||
93 | struct { | 113 | struct { |
94 | atomic_t reqs_active; | 114 | /* |
115 | * This counts the number of available slots in the ringbuffer, | ||
116 | * so we avoid overflowing it: it's decremented (if positive) | ||
117 | * when allocating a kiocb and incremented when the resulting | ||
118 | * io_event is pulled off the ringbuffer. | ||
119 | * | ||
120 | * We batch accesses to it with a percpu version. | ||
121 | */ | ||
122 | atomic_t reqs_available; | ||
95 | } ____cacheline_aligned_in_smp; | 123 | } ____cacheline_aligned_in_smp; |
96 | 124 | ||
97 | struct { | 125 | struct { |
@@ -110,6 +138,9 @@ struct kioctx { | |||
110 | } ____cacheline_aligned_in_smp; | 138 | } ____cacheline_aligned_in_smp; |
111 | 139 | ||
112 | struct page *internal_pages[AIO_RING_PAGES]; | 140 | struct page *internal_pages[AIO_RING_PAGES]; |
141 | struct file *aio_ring_file; | ||
142 | |||
143 | unsigned id; | ||
113 | }; | 144 | }; |
114 | 145 | ||
115 | /*------ sysctl variables----*/ | 146 | /*------ sysctl variables----*/ |
@@ -138,15 +169,77 @@ __initcall(aio_setup); | |||
138 | 169 | ||
139 | static void aio_free_ring(struct kioctx *ctx) | 170 | static void aio_free_ring(struct kioctx *ctx) |
140 | { | 171 | { |
141 | long i; | 172 | int i; |
173 | struct file *aio_ring_file = ctx->aio_ring_file; | ||
142 | 174 | ||
143 | for (i = 0; i < ctx->nr_pages; i++) | 175 | for (i = 0; i < ctx->nr_pages; i++) { |
176 | pr_debug("pid(%d) [%d] page->count=%d\n", current->pid, i, | ||
177 | page_count(ctx->ring_pages[i])); | ||
144 | put_page(ctx->ring_pages[i]); | 178 | put_page(ctx->ring_pages[i]); |
179 | } | ||
145 | 180 | ||
146 | if (ctx->ring_pages && ctx->ring_pages != ctx->internal_pages) | 181 | if (ctx->ring_pages && ctx->ring_pages != ctx->internal_pages) |
147 | kfree(ctx->ring_pages); | 182 | kfree(ctx->ring_pages); |
183 | |||
184 | if (aio_ring_file) { | ||
185 | truncate_setsize(aio_ring_file->f_inode, 0); | ||
186 | fput(aio_ring_file); | ||
187 | ctx->aio_ring_file = NULL; | ||
188 | } | ||
189 | } | ||
190 | |||
191 | static int aio_ring_mmap(struct file *file, struct vm_area_struct *vma) | ||
192 | { | ||
193 | vma->vm_ops = &generic_file_vm_ops; | ||
194 | return 0; | ||
148 | } | 195 | } |
149 | 196 | ||
197 | static const struct file_operations aio_ring_fops = { | ||
198 | .mmap = aio_ring_mmap, | ||
199 | }; | ||
200 | |||
201 | static int aio_set_page_dirty(struct page *page) | ||
202 | { | ||
203 | return 0; | ||
204 | } | ||
205 | |||
206 | #if IS_ENABLED(CONFIG_MIGRATION) | ||
207 | static int aio_migratepage(struct address_space *mapping, struct page *new, | ||
208 | struct page *old, enum migrate_mode mode) | ||
209 | { | ||
210 | struct kioctx *ctx = mapping->private_data; | ||
211 | unsigned long flags; | ||
212 | unsigned idx = old->index; | ||
213 | int rc; | ||
214 | |||
215 | /* Writeback must be complete */ | ||
216 | BUG_ON(PageWriteback(old)); | ||
217 | put_page(old); | ||
218 | |||
219 | rc = migrate_page_move_mapping(mapping, new, old, NULL, mode); | ||
220 | if (rc != MIGRATEPAGE_SUCCESS) { | ||
221 | get_page(old); | ||
222 | return rc; | ||
223 | } | ||
224 | |||
225 | get_page(new); | ||
226 | |||
227 | spin_lock_irqsave(&ctx->completion_lock, flags); | ||
228 | migrate_page_copy(new, old); | ||
229 | ctx->ring_pages[idx] = new; | ||
230 | spin_unlock_irqrestore(&ctx->completion_lock, flags); | ||
231 | |||
232 | return rc; | ||
233 | } | ||
234 | #endif | ||
235 | |||
236 | static const struct address_space_operations aio_ctx_aops = { | ||
237 | .set_page_dirty = aio_set_page_dirty, | ||
238 | #if IS_ENABLED(CONFIG_MIGRATION) | ||
239 | .migratepage = aio_migratepage, | ||
240 | #endif | ||
241 | }; | ||
242 | |||
150 | static int aio_setup_ring(struct kioctx *ctx) | 243 | static int aio_setup_ring(struct kioctx *ctx) |
151 | { | 244 | { |
152 | struct aio_ring *ring; | 245 | struct aio_ring *ring; |
@@ -154,20 +247,45 @@ static int aio_setup_ring(struct kioctx *ctx) | |||
154 | struct mm_struct *mm = current->mm; | 247 | struct mm_struct *mm = current->mm; |
155 | unsigned long size, populate; | 248 | unsigned long size, populate; |
156 | int nr_pages; | 249 | int nr_pages; |
250 | int i; | ||
251 | struct file *file; | ||
157 | 252 | ||
158 | /* Compensate for the ring buffer's head/tail overlap entry */ | 253 | /* Compensate for the ring buffer's head/tail overlap entry */ |
159 | nr_events += 2; /* 1 is required, 2 for good luck */ | 254 | nr_events += 2; /* 1 is required, 2 for good luck */ |
160 | 255 | ||
161 | size = sizeof(struct aio_ring); | 256 | size = sizeof(struct aio_ring); |
162 | size += sizeof(struct io_event) * nr_events; | 257 | size += sizeof(struct io_event) * nr_events; |
163 | nr_pages = (size + PAGE_SIZE-1) >> PAGE_SHIFT; | ||
164 | 258 | ||
259 | nr_pages = PFN_UP(size); | ||
165 | if (nr_pages < 0) | 260 | if (nr_pages < 0) |
166 | return -EINVAL; | 261 | return -EINVAL; |
167 | 262 | ||
168 | nr_events = (PAGE_SIZE * nr_pages - sizeof(struct aio_ring)) / sizeof(struct io_event); | 263 | file = anon_inode_getfile_private("[aio]", &aio_ring_fops, ctx, O_RDWR); |
264 | if (IS_ERR(file)) { | ||
265 | ctx->aio_ring_file = NULL; | ||
266 | return -EAGAIN; | ||
267 | } | ||
268 | |||
269 | file->f_inode->i_mapping->a_ops = &aio_ctx_aops; | ||
270 | file->f_inode->i_mapping->private_data = ctx; | ||
271 | file->f_inode->i_size = PAGE_SIZE * (loff_t)nr_pages; | ||
272 | |||
273 | for (i = 0; i < nr_pages; i++) { | ||
274 | struct page *page; | ||
275 | page = find_or_create_page(file->f_inode->i_mapping, | ||
276 | i, GFP_HIGHUSER | __GFP_ZERO); | ||
277 | if (!page) | ||
278 | break; | ||
279 | pr_debug("pid(%d) page[%d]->count=%d\n", | ||
280 | current->pid, i, page_count(page)); | ||
281 | SetPageUptodate(page); | ||
282 | SetPageDirty(page); | ||
283 | unlock_page(page); | ||
284 | } | ||
285 | ctx->aio_ring_file = file; | ||
286 | nr_events = (PAGE_SIZE * nr_pages - sizeof(struct aio_ring)) | ||
287 | / sizeof(struct io_event); | ||
169 | 288 | ||
170 | ctx->nr_events = 0; | ||
171 | ctx->ring_pages = ctx->internal_pages; | 289 | ctx->ring_pages = ctx->internal_pages; |
172 | if (nr_pages > AIO_RING_PAGES) { | 290 | if (nr_pages > AIO_RING_PAGES) { |
173 | ctx->ring_pages = kcalloc(nr_pages, sizeof(struct page *), | 291 | ctx->ring_pages = kcalloc(nr_pages, sizeof(struct page *), |
@@ -178,10 +296,11 @@ static int aio_setup_ring(struct kioctx *ctx) | |||
178 | 296 | ||
179 | ctx->mmap_size = nr_pages * PAGE_SIZE; | 297 | ctx->mmap_size = nr_pages * PAGE_SIZE; |
180 | pr_debug("attempting mmap of %lu bytes\n", ctx->mmap_size); | 298 | pr_debug("attempting mmap of %lu bytes\n", ctx->mmap_size); |
299 | |||
181 | down_write(&mm->mmap_sem); | 300 | down_write(&mm->mmap_sem); |
182 | ctx->mmap_base = do_mmap_pgoff(NULL, 0, ctx->mmap_size, | 301 | ctx->mmap_base = do_mmap_pgoff(ctx->aio_ring_file, 0, ctx->mmap_size, |
183 | PROT_READ|PROT_WRITE, | 302 | PROT_READ | PROT_WRITE, |
184 | MAP_ANONYMOUS|MAP_PRIVATE, 0, &populate); | 303 | MAP_SHARED | MAP_POPULATE, 0, &populate); |
185 | if (IS_ERR((void *)ctx->mmap_base)) { | 304 | if (IS_ERR((void *)ctx->mmap_base)) { |
186 | up_write(&mm->mmap_sem); | 305 | up_write(&mm->mmap_sem); |
187 | ctx->mmap_size = 0; | 306 | ctx->mmap_size = 0; |
@@ -190,23 +309,34 @@ static int aio_setup_ring(struct kioctx *ctx) | |||
190 | } | 309 | } |
191 | 310 | ||
192 | pr_debug("mmap address: 0x%08lx\n", ctx->mmap_base); | 311 | pr_debug("mmap address: 0x%08lx\n", ctx->mmap_base); |
312 | |||
313 | /* We must do this while still holding mmap_sem for write, as we | ||
314 | * need to be protected against userspace attempting to mremap() | ||
315 | * or munmap() the ring buffer. | ||
316 | */ | ||
193 | ctx->nr_pages = get_user_pages(current, mm, ctx->mmap_base, nr_pages, | 317 | ctx->nr_pages = get_user_pages(current, mm, ctx->mmap_base, nr_pages, |
194 | 1, 0, ctx->ring_pages, NULL); | 318 | 1, 0, ctx->ring_pages, NULL); |
319 | |||
320 | /* Dropping the reference here is safe as the page cache will hold | ||
321 | * onto the pages for us. It is also required so that page migration | ||
322 | * can unmap the pages and get the right reference count. | ||
323 | */ | ||
324 | for (i = 0; i < ctx->nr_pages; i++) | ||
325 | put_page(ctx->ring_pages[i]); | ||
326 | |||
195 | up_write(&mm->mmap_sem); | 327 | up_write(&mm->mmap_sem); |
196 | 328 | ||
197 | if (unlikely(ctx->nr_pages != nr_pages)) { | 329 | if (unlikely(ctx->nr_pages != nr_pages)) { |
198 | aio_free_ring(ctx); | 330 | aio_free_ring(ctx); |
199 | return -EAGAIN; | 331 | return -EAGAIN; |
200 | } | 332 | } |
201 | if (populate) | ||
202 | mm_populate(ctx->mmap_base, populate); | ||
203 | 333 | ||
204 | ctx->user_id = ctx->mmap_base; | 334 | ctx->user_id = ctx->mmap_base; |
205 | ctx->nr_events = nr_events; /* trusted copy */ | 335 | ctx->nr_events = nr_events; /* trusted copy */ |
206 | 336 | ||
207 | ring = kmap_atomic(ctx->ring_pages[0]); | 337 | ring = kmap_atomic(ctx->ring_pages[0]); |
208 | ring->nr = nr_events; /* user copy */ | 338 | ring->nr = nr_events; /* user copy */ |
209 | ring->id = ctx->user_id; | 339 | ring->id = ~0U; |
210 | ring->head = ring->tail = 0; | 340 | ring->head = ring->tail = 0; |
211 | ring->magic = AIO_RING_MAGIC; | 341 | ring->magic = AIO_RING_MAGIC; |
212 | ring->compat_features = AIO_RING_COMPAT_FEATURES; | 342 | ring->compat_features = AIO_RING_COMPAT_FEATURES; |
@@ -238,11 +368,9 @@ void kiocb_set_cancel_fn(struct kiocb *req, kiocb_cancel_fn *cancel) | |||
238 | } | 368 | } |
239 | EXPORT_SYMBOL(kiocb_set_cancel_fn); | 369 | EXPORT_SYMBOL(kiocb_set_cancel_fn); |
240 | 370 | ||
241 | static int kiocb_cancel(struct kioctx *ctx, struct kiocb *kiocb, | 371 | static int kiocb_cancel(struct kioctx *ctx, struct kiocb *kiocb) |
242 | struct io_event *res) | ||
243 | { | 372 | { |
244 | kiocb_cancel_fn *old, *cancel; | 373 | kiocb_cancel_fn *old, *cancel; |
245 | int ret = -EINVAL; | ||
246 | 374 | ||
247 | /* | 375 | /* |
248 | * Don't want to set kiocb->ki_cancel = KIOCB_CANCELLED unless it | 376 | * Don't want to set kiocb->ki_cancel = KIOCB_CANCELLED unless it |
@@ -252,28 +380,20 @@ static int kiocb_cancel(struct kioctx *ctx, struct kiocb *kiocb, | |||
252 | cancel = ACCESS_ONCE(kiocb->ki_cancel); | 380 | cancel = ACCESS_ONCE(kiocb->ki_cancel); |
253 | do { | 381 | do { |
254 | if (!cancel || cancel == KIOCB_CANCELLED) | 382 | if (!cancel || cancel == KIOCB_CANCELLED) |
255 | return ret; | 383 | return -EINVAL; |
256 | 384 | ||
257 | old = cancel; | 385 | old = cancel; |
258 | cancel = cmpxchg(&kiocb->ki_cancel, old, KIOCB_CANCELLED); | 386 | cancel = cmpxchg(&kiocb->ki_cancel, old, KIOCB_CANCELLED); |
259 | } while (cancel != old); | 387 | } while (cancel != old); |
260 | 388 | ||
261 | atomic_inc(&kiocb->ki_users); | 389 | return cancel(kiocb); |
262 | spin_unlock_irq(&ctx->ctx_lock); | ||
263 | |||
264 | memset(res, 0, sizeof(*res)); | ||
265 | res->obj = (u64)(unsigned long)kiocb->ki_obj.user; | ||
266 | res->data = kiocb->ki_user_data; | ||
267 | ret = cancel(kiocb, res); | ||
268 | |||
269 | spin_lock_irq(&ctx->ctx_lock); | ||
270 | |||
271 | return ret; | ||
272 | } | 390 | } |
273 | 391 | ||
274 | static void free_ioctx_rcu(struct rcu_head *head) | 392 | static void free_ioctx_rcu(struct rcu_head *head) |
275 | { | 393 | { |
276 | struct kioctx *ctx = container_of(head, struct kioctx, rcu_head); | 394 | struct kioctx *ctx = container_of(head, struct kioctx, rcu_head); |
395 | |||
396 | free_percpu(ctx->cpu); | ||
277 | kmem_cache_free(kioctx_cachep, ctx); | 397 | kmem_cache_free(kioctx_cachep, ctx); |
278 | } | 398 | } |
279 | 399 | ||
@@ -282,12 +402,13 @@ static void free_ioctx_rcu(struct rcu_head *head) | |||
282 | * and ctx->users has dropped to 0, so we know no more kiocbs can be submitted - | 402 | * and ctx->users has dropped to 0, so we know no more kiocbs can be submitted - |
283 | * now it's safe to cancel any that need to be. | 403 | * now it's safe to cancel any that need to be. |
284 | */ | 404 | */ |
285 | static void free_ioctx(struct kioctx *ctx) | 405 | static void free_ioctx(struct work_struct *work) |
286 | { | 406 | { |
407 | struct kioctx *ctx = container_of(work, struct kioctx, free_work); | ||
287 | struct aio_ring *ring; | 408 | struct aio_ring *ring; |
288 | struct io_event res; | ||
289 | struct kiocb *req; | 409 | struct kiocb *req; |
290 | unsigned head, avail; | 410 | unsigned cpu, avail; |
411 | DEFINE_WAIT(wait); | ||
291 | 412 | ||
292 | spin_lock_irq(&ctx->ctx_lock); | 413 | spin_lock_irq(&ctx->ctx_lock); |
293 | 414 | ||
@@ -296,28 +417,38 @@ static void free_ioctx(struct kioctx *ctx) | |||
296 | struct kiocb, ki_list); | 417 | struct kiocb, ki_list); |
297 | 418 | ||
298 | list_del_init(&req->ki_list); | 419 | list_del_init(&req->ki_list); |
299 | kiocb_cancel(ctx, req, &res); | 420 | kiocb_cancel(ctx, req); |
300 | } | 421 | } |
301 | 422 | ||
302 | spin_unlock_irq(&ctx->ctx_lock); | 423 | spin_unlock_irq(&ctx->ctx_lock); |
303 | 424 | ||
304 | ring = kmap_atomic(ctx->ring_pages[0]); | 425 | for_each_possible_cpu(cpu) { |
305 | head = ring->head; | 426 | struct kioctx_cpu *kcpu = per_cpu_ptr(ctx->cpu, cpu); |
306 | kunmap_atomic(ring); | ||
307 | 427 | ||
308 | while (atomic_read(&ctx->reqs_active) > 0) { | 428 | atomic_add(kcpu->reqs_available, &ctx->reqs_available); |
309 | wait_event(ctx->wait, | 429 | kcpu->reqs_available = 0; |
310 | head != ctx->tail || | 430 | } |
311 | atomic_read(&ctx->reqs_active) <= 0); | ||
312 | 431 | ||
313 | avail = (head <= ctx->tail ? ctx->tail : ctx->nr_events) - head; | 432 | while (1) { |
433 | prepare_to_wait(&ctx->wait, &wait, TASK_UNINTERRUPTIBLE); | ||
314 | 434 | ||
315 | atomic_sub(avail, &ctx->reqs_active); | 435 | ring = kmap_atomic(ctx->ring_pages[0]); |
316 | head += avail; | 436 | avail = (ring->head <= ring->tail) |
317 | head %= ctx->nr_events; | 437 | ? ring->tail - ring->head |
438 | : ctx->nr_events - ring->head + ring->tail; | ||
439 | |||
440 | atomic_add(avail, &ctx->reqs_available); | ||
441 | ring->head = ring->tail; | ||
442 | kunmap_atomic(ring); | ||
443 | |||
444 | if (atomic_read(&ctx->reqs_available) >= ctx->nr_events - 1) | ||
445 | break; | ||
446 | |||
447 | schedule(); | ||
318 | } | 448 | } |
449 | finish_wait(&ctx->wait, &wait); | ||
319 | 450 | ||
320 | WARN_ON(atomic_read(&ctx->reqs_active) < 0); | 451 | WARN_ON(atomic_read(&ctx->reqs_available) > ctx->nr_events - 1); |
321 | 452 | ||
322 | aio_free_ring(ctx); | 453 | aio_free_ring(ctx); |
323 | 454 | ||
@@ -333,10 +464,68 @@ static void free_ioctx(struct kioctx *ctx) | |||
333 | call_rcu(&ctx->rcu_head, free_ioctx_rcu); | 464 | call_rcu(&ctx->rcu_head, free_ioctx_rcu); |
334 | } | 465 | } |
335 | 466 | ||
336 | static void put_ioctx(struct kioctx *ctx) | 467 | static void free_ioctx_ref(struct percpu_ref *ref) |
337 | { | 468 | { |
338 | if (unlikely(atomic_dec_and_test(&ctx->users))) | 469 | struct kioctx *ctx = container_of(ref, struct kioctx, users); |
339 | free_ioctx(ctx); | 470 | |
471 | INIT_WORK(&ctx->free_work, free_ioctx); | ||
472 | schedule_work(&ctx->free_work); | ||
473 | } | ||
474 | |||
475 | static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm) | ||
476 | { | ||
477 | unsigned i, new_nr; | ||
478 | struct kioctx_table *table, *old; | ||
479 | struct aio_ring *ring; | ||
480 | |||
481 | spin_lock(&mm->ioctx_lock); | ||
482 | rcu_read_lock(); | ||
483 | table = rcu_dereference(mm->ioctx_table); | ||
484 | |||
485 | while (1) { | ||
486 | if (table) | ||
487 | for (i = 0; i < table->nr; i++) | ||
488 | if (!table->table[i]) { | ||
489 | ctx->id = i; | ||
490 | table->table[i] = ctx; | ||
491 | rcu_read_unlock(); | ||
492 | spin_unlock(&mm->ioctx_lock); | ||
493 | |||
494 | ring = kmap_atomic(ctx->ring_pages[0]); | ||
495 | ring->id = ctx->id; | ||
496 | kunmap_atomic(ring); | ||
497 | return 0; | ||
498 | } | ||
499 | |||
500 | new_nr = (table ? table->nr : 1) * 4; | ||
501 | |||
502 | rcu_read_unlock(); | ||
503 | spin_unlock(&mm->ioctx_lock); | ||
504 | |||
505 | table = kzalloc(sizeof(*table) + sizeof(struct kioctx *) * | ||
506 | new_nr, GFP_KERNEL); | ||
507 | if (!table) | ||
508 | return -ENOMEM; | ||
509 | |||
510 | table->nr = new_nr; | ||
511 | |||
512 | spin_lock(&mm->ioctx_lock); | ||
513 | rcu_read_lock(); | ||
514 | old = rcu_dereference(mm->ioctx_table); | ||
515 | |||
516 | if (!old) { | ||
517 | rcu_assign_pointer(mm->ioctx_table, table); | ||
518 | } else if (table->nr > old->nr) { | ||
519 | memcpy(table->table, old->table, | ||
520 | old->nr * sizeof(struct kioctx *)); | ||
521 | |||
522 | rcu_assign_pointer(mm->ioctx_table, table); | ||
523 | kfree_rcu(old, rcu); | ||
524 | } else { | ||
525 | kfree(table); | ||
526 | table = old; | ||
527 | } | ||
528 | } | ||
340 | } | 529 | } |
341 | 530 | ||
342 | /* ioctx_alloc | 531 | /* ioctx_alloc |
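
The ioctx_add_table() hunk above replaces the old per-mm hlist with a small table indexed by ctx->id, grown by a factor of four and republished under RCU when it fills up. As a rough illustration of that pattern only (not the kernel code), the sketch below uses a mutex for writers and a plain atomic pointer in place of RCU; it also leaks the old table where the kernel would use kfree_rcu(), and does not make the slot stores themselves atomic.

```c
/* Userspace analogue of the kioctx_table growth pattern; not kernel code. */
#include <pthread.h>
#include <stdatomic.h>
#include <stdlib.h>
#include <string.h>

struct obj;				/* stands in for struct kioctx */

struct table {
	unsigned nr;
	struct obj *slot[];		/* grows 4x when full */
};

static _Atomic(struct table *) cur_table;
static pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER;

/* Reader side: cheap indexed lookup, no lock taken. */
struct obj *table_lookup(unsigned id)
{
	struct table *t = atomic_load(&cur_table);

	return (t && id < t->nr) ? t->slot[id] : NULL;
}

/* Writer side: claim a free slot, growing the table when none is left. */
int table_add(struct obj *o, unsigned *id_out)
{
	pthread_mutex_lock(&table_lock);
	for (;;) {
		struct table *t = atomic_load(&cur_table);
		unsigned i, new_nr = t ? t->nr * 4 : 4;
		struct table *bigger;

		if (t) {
			for (i = 0; i < t->nr; i++) {
				if (!t->slot[i]) {
					t->slot[i] = o;
					*id_out = i;
					pthread_mutex_unlock(&table_lock);
					return 0;
				}
			}
		}

		/* No free slot: allocate a bigger table and publish it. */
		bigger = calloc(1, sizeof(*bigger) +
				   new_nr * sizeof(struct obj *));
		if (!bigger) {
			pthread_mutex_unlock(&table_lock);
			return -1;
		}
		bigger->nr = new_nr;
		if (t)
			memcpy(bigger->slot, t->slot,
			       t->nr * sizeof(struct obj *));
		atomic_store(&cur_table, bigger);
		/* old table leaked here; the kernel uses kfree_rcu() */
	}
}
```

In the kernel version the lock is also dropped around the allocation and retaken afterwards, which is why ioctx_add_table() re-checks whatever table it finds on each pass of its loop.
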
@@ -348,6 +537,18 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) | |||
348 | struct kioctx *ctx; | 537 | struct kioctx *ctx; |
349 | int err = -ENOMEM; | 538 | int err = -ENOMEM; |
350 | 539 | ||
540 | /* | ||
541 | * We keep track of the number of available ringbuffer slots, to prevent | ||
542 | * overflow (reqs_available), and we also use percpu counters for this. | ||
543 | * | ||
544 | * So since up to half the slots might be on other cpu's percpu counters | ||
545 | * and unavailable, double nr_events so userspace sees what they | ||
546 | * expected: additionally, we move req_batch slots to/from percpu | ||
547 | * counters at a time, so make sure that isn't 0: | ||
548 | */ | ||
549 | nr_events = max(nr_events, num_possible_cpus() * 4); | ||
550 | nr_events *= 2; | ||
551 | |||
351 | /* Prevent overflows */ | 552 | /* Prevent overflows */ |
352 | if ((nr_events > (0x10000000U / sizeof(struct io_event))) || | 553 | if ((nr_events > (0x10000000U / sizeof(struct io_event))) || |
353 | (nr_events > (0x10000000U / sizeof(struct kiocb)))) { | 554 | (nr_events > (0x10000000U / sizeof(struct kiocb)))) { |
@@ -355,7 +556,7 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) | |||
355 | return ERR_PTR(-EINVAL); | 556 | return ERR_PTR(-EINVAL); |
356 | } | 557 | } |
357 | 558 | ||
358 | if (!nr_events || (unsigned long)nr_events > aio_max_nr) | 559 | if (!nr_events || (unsigned long)nr_events > (aio_max_nr * 2UL)) |
359 | return ERR_PTR(-EAGAIN); | 560 | return ERR_PTR(-EAGAIN); |
360 | 561 | ||
361 | ctx = kmem_cache_zalloc(kioctx_cachep, GFP_KERNEL); | 562 | ctx = kmem_cache_zalloc(kioctx_cachep, GFP_KERNEL); |
@@ -364,8 +565,9 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) | |||
364 | 565 | ||
365 | ctx->max_reqs = nr_events; | 566 | ctx->max_reqs = nr_events; |
366 | 567 | ||
367 | atomic_set(&ctx->users, 2); | 568 | if (percpu_ref_init(&ctx->users, free_ioctx_ref)) |
368 | atomic_set(&ctx->dead, 0); | 569 | goto out_freectx; |
570 | |||
369 | spin_lock_init(&ctx->ctx_lock); | 571 | spin_lock_init(&ctx->ctx_lock); |
370 | spin_lock_init(&ctx->completion_lock); | 572 | spin_lock_init(&ctx->completion_lock); |
371 | mutex_init(&ctx->ring_lock); | 573 | mutex_init(&ctx->ring_lock); |
@@ -373,12 +575,21 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) | |||
373 | 575 | ||
374 | INIT_LIST_HEAD(&ctx->active_reqs); | 576 | INIT_LIST_HEAD(&ctx->active_reqs); |
375 | 577 | ||
578 | ctx->cpu = alloc_percpu(struct kioctx_cpu); | ||
579 | if (!ctx->cpu) | ||
580 | goto out_freeref; | ||
581 | |||
376 | if (aio_setup_ring(ctx) < 0) | 582 | if (aio_setup_ring(ctx) < 0) |
377 | goto out_freectx; | 583 | goto out_freepcpu; |
584 | |||
585 | atomic_set(&ctx->reqs_available, ctx->nr_events - 1); | ||
586 | ctx->req_batch = (ctx->nr_events - 1) / (num_possible_cpus() * 4); | ||
587 | if (ctx->req_batch < 1) | ||
588 | ctx->req_batch = 1; | ||
378 | 589 | ||
379 | /* limit the number of system wide aios */ | 590 | /* limit the number of system wide aios */ |
380 | spin_lock(&aio_nr_lock); | 591 | spin_lock(&aio_nr_lock); |
381 | if (aio_nr + nr_events > aio_max_nr || | 592 | if (aio_nr + nr_events > (aio_max_nr * 2UL) || |
382 | aio_nr + nr_events < aio_nr) { | 593 | aio_nr + nr_events < aio_nr) { |
383 | spin_unlock(&aio_nr_lock); | 594 | spin_unlock(&aio_nr_lock); |
384 | goto out_cleanup; | 595 | goto out_cleanup; |
@@ -386,49 +597,54 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) | |||
386 | aio_nr += ctx->max_reqs; | 597 | aio_nr += ctx->max_reqs; |
387 | spin_unlock(&aio_nr_lock); | 598 | spin_unlock(&aio_nr_lock); |
388 | 599 | ||
389 | /* now link into global list. */ | 600 | percpu_ref_get(&ctx->users); /* io_setup() will drop this ref */ |
390 | spin_lock(&mm->ioctx_lock); | 601 | |
391 | hlist_add_head_rcu(&ctx->list, &mm->ioctx_list); | 602 | err = ioctx_add_table(ctx, mm); |
392 | spin_unlock(&mm->ioctx_lock); | 603 | if (err) |
604 | goto out_cleanup_put; | ||
393 | 605 | ||
394 | pr_debug("allocated ioctx %p[%ld]: mm=%p mask=0x%x\n", | 606 | pr_debug("allocated ioctx %p[%ld]: mm=%p mask=0x%x\n", |
395 | ctx, ctx->user_id, mm, ctx->nr_events); | 607 | ctx, ctx->user_id, mm, ctx->nr_events); |
396 | return ctx; | 608 | return ctx; |
397 | 609 | ||
610 | out_cleanup_put: | ||
611 | percpu_ref_put(&ctx->users); | ||
398 | out_cleanup: | 612 | out_cleanup: |
399 | err = -EAGAIN; | 613 | err = -EAGAIN; |
400 | aio_free_ring(ctx); | 614 | aio_free_ring(ctx); |
615 | out_freepcpu: | ||
616 | free_percpu(ctx->cpu); | ||
617 | out_freeref: | ||
618 | free_percpu(ctx->users.pcpu_count); | ||
401 | out_freectx: | 619 | out_freectx: |
620 | if (ctx->aio_ring_file) | ||
621 | fput(ctx->aio_ring_file); | ||
402 | kmem_cache_free(kioctx_cachep, ctx); | 622 | kmem_cache_free(kioctx_cachep, ctx); |
403 | pr_debug("error allocating ioctx %d\n", err); | 623 | pr_debug("error allocating ioctx %d\n", err); |
404 | return ERR_PTR(err); | 624 | return ERR_PTR(err); |
405 | } | 625 | } |
406 | 626 | ||
407 | static void kill_ioctx_work(struct work_struct *work) | ||
408 | { | ||
409 | struct kioctx *ctx = container_of(work, struct kioctx, rcu_work); | ||
410 | |||
411 | wake_up_all(&ctx->wait); | ||
412 | put_ioctx(ctx); | ||
413 | } | ||
414 | |||
415 | static void kill_ioctx_rcu(struct rcu_head *head) | ||
416 | { | ||
417 | struct kioctx *ctx = container_of(head, struct kioctx, rcu_head); | ||
418 | |||
419 | INIT_WORK(&ctx->rcu_work, kill_ioctx_work); | ||
420 | schedule_work(&ctx->rcu_work); | ||
421 | } | ||
422 | |||
423 | /* kill_ioctx | 627 | /* kill_ioctx |
424 | * Cancels all outstanding aio requests on an aio context. Used | 628 | * Cancels all outstanding aio requests on an aio context. Used |
425 | * when the processes owning a context have all exited to encourage | 629 | * when the processes owning a context have all exited to encourage |
426 | * the rapid destruction of the kioctx. | 630 | * the rapid destruction of the kioctx. |
427 | */ | 631 | */ |
428 | static void kill_ioctx(struct kioctx *ctx) | 632 | static void kill_ioctx(struct mm_struct *mm, struct kioctx *ctx) |
429 | { | 633 | { |
430 | if (!atomic_xchg(&ctx->dead, 1)) { | 634 | if (!atomic_xchg(&ctx->dead, 1)) { |
431 | hlist_del_rcu(&ctx->list); | 635 | struct kioctx_table *table; |
636 | |||
637 | spin_lock(&mm->ioctx_lock); | ||
638 | rcu_read_lock(); | ||
639 | table = rcu_dereference(mm->ioctx_table); | ||
640 | |||
641 | WARN_ON(ctx != table->table[ctx->id]); | ||
642 | table->table[ctx->id] = NULL; | ||
643 | rcu_read_unlock(); | ||
644 | spin_unlock(&mm->ioctx_lock); | ||
645 | |||
646 | /* percpu_ref_kill() will do the necessary call_rcu() */ | ||
647 | wake_up_all(&ctx->wait); | ||
432 | 648 | ||
433 | /* | 649 | /* |
434 | * It'd be more correct to do this in free_ioctx(), after all | 650 | * It'd be more correct to do this in free_ioctx(), after all |
@@ -445,24 +661,23 @@ static void kill_ioctx(struct kioctx *ctx) | |||
445 | if (ctx->mmap_size) | 661 | if (ctx->mmap_size) |
446 | vm_munmap(ctx->mmap_base, ctx->mmap_size); | 662 | vm_munmap(ctx->mmap_base, ctx->mmap_size); |
447 | 663 | ||
448 | /* Between hlist_del_rcu() and dropping the initial ref */ | 664 | percpu_ref_kill(&ctx->users); |
449 | call_rcu(&ctx->rcu_head, kill_ioctx_rcu); | ||
450 | } | 665 | } |
451 | } | 666 | } |
452 | 667 | ||
453 | /* wait_on_sync_kiocb: | 668 | /* wait_on_sync_kiocb: |
454 | * Waits on the given sync kiocb to complete. | 669 | * Waits on the given sync kiocb to complete. |
455 | */ | 670 | */ |
456 | ssize_t wait_on_sync_kiocb(struct kiocb *iocb) | 671 | ssize_t wait_on_sync_kiocb(struct kiocb *req) |
457 | { | 672 | { |
458 | while (atomic_read(&iocb->ki_users)) { | 673 | while (!req->ki_ctx) { |
459 | set_current_state(TASK_UNINTERRUPTIBLE); | 674 | set_current_state(TASK_UNINTERRUPTIBLE); |
460 | if (!atomic_read(&iocb->ki_users)) | 675 | if (req->ki_ctx) |
461 | break; | 676 | break; |
462 | io_schedule(); | 677 | io_schedule(); |
463 | } | 678 | } |
464 | __set_current_state(TASK_RUNNING); | 679 | __set_current_state(TASK_RUNNING); |
465 | return iocb->ki_user_data; | 680 | return req->ki_user_data; |
466 | } | 681 | } |
467 | EXPORT_SYMBOL(wait_on_sync_kiocb); | 682 | EXPORT_SYMBOL(wait_on_sync_kiocb); |
468 | 683 | ||
@@ -476,16 +691,28 @@ EXPORT_SYMBOL(wait_on_sync_kiocb); | |||
476 | */ | 691 | */ |
477 | void exit_aio(struct mm_struct *mm) | 692 | void exit_aio(struct mm_struct *mm) |
478 | { | 693 | { |
694 | struct kioctx_table *table; | ||
479 | struct kioctx *ctx; | 695 | struct kioctx *ctx; |
480 | struct hlist_node *n; | 696 | unsigned i = 0; |
481 | 697 | ||
482 | hlist_for_each_entry_safe(ctx, n, &mm->ioctx_list, list) { | 698 | while (1) { |
483 | if (1 != atomic_read(&ctx->users)) | 699 | rcu_read_lock(); |
484 | printk(KERN_DEBUG | 700 | table = rcu_dereference(mm->ioctx_table); |
485 | "exit_aio:ioctx still alive: %d %d %d\n", | 701 | |
486 | atomic_read(&ctx->users), | 702 | do { |
487 | atomic_read(&ctx->dead), | 703 | if (!table || i >= table->nr) { |
488 | atomic_read(&ctx->reqs_active)); | 704 | rcu_read_unlock(); |
705 | rcu_assign_pointer(mm->ioctx_table, NULL); | ||
706 | if (table) | ||
707 | kfree(table); | ||
708 | return; | ||
709 | } | ||
710 | |||
711 | ctx = table->table[i++]; | ||
712 | } while (!ctx); | ||
713 | |||
714 | rcu_read_unlock(); | ||
715 | |||
489 | /* | 716 | /* |
490 | * We don't need to bother with munmap() here - | 717 | * We don't need to bother with munmap() here - |
491 | * exit_mmap(mm) is coming and it'll unmap everything. | 718 | * exit_mmap(mm) is coming and it'll unmap everything. |
@@ -496,40 +723,75 @@ void exit_aio(struct mm_struct *mm) | |||
496 | */ | 723 | */ |
497 | ctx->mmap_size = 0; | 724 | ctx->mmap_size = 0; |
498 | 725 | ||
499 | kill_ioctx(ctx); | 726 | kill_ioctx(mm, ctx); |
727 | } | ||
728 | } | ||
729 | |||
730 | static void put_reqs_available(struct kioctx *ctx, unsigned nr) | ||
731 | { | ||
732 | struct kioctx_cpu *kcpu; | ||
733 | |||
734 | preempt_disable(); | ||
735 | kcpu = this_cpu_ptr(ctx->cpu); | ||
736 | |||
737 | kcpu->reqs_available += nr; | ||
738 | while (kcpu->reqs_available >= ctx->req_batch * 2) { | ||
739 | kcpu->reqs_available -= ctx->req_batch; | ||
740 | atomic_add(ctx->req_batch, &ctx->reqs_available); | ||
741 | } | ||
742 | |||
743 | preempt_enable(); | ||
744 | } | ||
745 | |||
746 | static bool get_reqs_available(struct kioctx *ctx) | ||
747 | { | ||
748 | struct kioctx_cpu *kcpu; | ||
749 | bool ret = false; | ||
750 | |||
751 | preempt_disable(); | ||
752 | kcpu = this_cpu_ptr(ctx->cpu); | ||
753 | |||
754 | if (!kcpu->reqs_available) { | ||
755 | int old, avail = atomic_read(&ctx->reqs_available); | ||
756 | |||
757 | do { | ||
758 | if (avail < ctx->req_batch) | ||
759 | goto out; | ||
760 | |||
761 | old = avail; | ||
762 | avail = atomic_cmpxchg(&ctx->reqs_available, | ||
763 | avail, avail - ctx->req_batch); | ||
764 | } while (avail != old); | ||
765 | |||
766 | kcpu->reqs_available += ctx->req_batch; | ||
500 | } | 767 | } |
768 | |||
769 | ret = true; | ||
770 | kcpu->reqs_available--; | ||
771 | out: | ||
772 | preempt_enable(); | ||
773 | return ret; | ||
501 | } | 774 | } |
502 | 775 | ||
503 | /* aio_get_req | 776 | /* aio_get_req |
504 | * Allocate a slot for an aio request. Increments the ki_users count | 777 | * Allocate a slot for an aio request. |
505 | * of the kioctx so that the kioctx stays around until all requests are | 778 | * Returns NULL if no requests are free. |
506 | * complete. Returns NULL if no requests are free. | ||
507 | * | ||
508 | * Returns with kiocb->ki_users set to 2. The io submit code path holds | ||
509 | * an extra reference while submitting the i/o. | ||
510 | * This prevents races between the aio code path referencing the | ||
511 | * req (after submitting it) and aio_complete() freeing the req. | ||
512 | */ | 779 | */ |
513 | static inline struct kiocb *aio_get_req(struct kioctx *ctx) | 780 | static inline struct kiocb *aio_get_req(struct kioctx *ctx) |
514 | { | 781 | { |
515 | struct kiocb *req; | 782 | struct kiocb *req; |
516 | 783 | ||
517 | if (atomic_read(&ctx->reqs_active) >= ctx->nr_events) | 784 | if (!get_reqs_available(ctx)) |
518 | return NULL; | 785 | return NULL; |
519 | 786 | ||
520 | if (atomic_inc_return(&ctx->reqs_active) > ctx->nr_events - 1) | ||
521 | goto out_put; | ||
522 | |||
523 | req = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL|__GFP_ZERO); | 787 | req = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL|__GFP_ZERO); |
524 | if (unlikely(!req)) | 788 | if (unlikely(!req)) |
525 | goto out_put; | 789 | goto out_put; |
526 | 790 | ||
527 | atomic_set(&req->ki_users, 2); | ||
528 | req->ki_ctx = ctx; | 791 | req->ki_ctx = ctx; |
529 | |||
530 | return req; | 792 | return req; |
531 | out_put: | 793 | out_put: |
532 | atomic_dec(&ctx->reqs_active); | 794 | put_reqs_available(ctx, 1); |
533 | return NULL; | 795 | return NULL; |
534 | } | 796 | } |
535 | 797 | ||
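
put_reqs_available() and get_reqs_available() above are the heart of the percpu reqs_available change: each CPU keeps a private cache of ring slots and only touches the shared atomic counter in req_batch-sized chunks. The following is a loose userspace analogue, not the kernel code: it substitutes thread-local storage and C11 atomics for per-cpu data and preempt_disable(), and a fixed BATCH for ctx->req_batch.

```c
/* Userspace analogue of the reqs_available batching; not kernel code. */
#include <stdatomic.h>
#include <stdbool.h>

#define BATCH 32

static atomic_int reqs_available;		/* shared slot budget */
static _Thread_local int local_reqs;		/* this thread's cached slots */

bool get_req(void)
{
	if (!local_reqs) {
		/* Refill the local cache by carving BATCH off the shared pool. */
		int avail = atomic_load(&reqs_available);

		do {
			if (avail < BATCH)
				return false;	/* shared pool too low */
		} while (!atomic_compare_exchange_weak(&reqs_available,
						       &avail, avail - BATCH));

		local_reqs = BATCH;
	}

	local_reqs--;
	return true;
}

void put_req(void)
{
	local_reqs++;
	/* Don't hoard: once we hold two batches, give one back. */
	while (local_reqs >= BATCH * 2) {
		local_reqs -= BATCH;
		atomic_fetch_add(&reqs_available, BATCH);
	}
}
```

Seeding reqs_available with the ring size (as ioctx_alloc() does with nr_events - 1) and draining the per-thread caches back at teardown (as free_ioctx() does across all CPUs) are left out of the sketch.
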
@@ -539,35 +801,32 @@ static void kiocb_free(struct kiocb *req) | |||
539 | fput(req->ki_filp); | 801 | fput(req->ki_filp); |
540 | if (req->ki_eventfd != NULL) | 802 | if (req->ki_eventfd != NULL) |
541 | eventfd_ctx_put(req->ki_eventfd); | 803 | eventfd_ctx_put(req->ki_eventfd); |
542 | if (req->ki_dtor) | ||
543 | req->ki_dtor(req); | ||
544 | if (req->ki_iovec != &req->ki_inline_vec) | ||
545 | kfree(req->ki_iovec); | ||
546 | kmem_cache_free(kiocb_cachep, req); | 804 | kmem_cache_free(kiocb_cachep, req); |
547 | } | 805 | } |
548 | 806 | ||
549 | void aio_put_req(struct kiocb *req) | ||
550 | { | ||
551 | if (atomic_dec_and_test(&req->ki_users)) | ||
552 | kiocb_free(req); | ||
553 | } | ||
554 | EXPORT_SYMBOL(aio_put_req); | ||
555 | |||
556 | static struct kioctx *lookup_ioctx(unsigned long ctx_id) | 807 | static struct kioctx *lookup_ioctx(unsigned long ctx_id) |
557 | { | 808 | { |
809 | struct aio_ring __user *ring = (void __user *)ctx_id; | ||
558 | struct mm_struct *mm = current->mm; | 810 | struct mm_struct *mm = current->mm; |
559 | struct kioctx *ctx, *ret = NULL; | 811 | struct kioctx *ctx, *ret = NULL; |
812 | struct kioctx_table *table; | ||
813 | unsigned id; | ||
814 | |||
815 | if (get_user(id, &ring->id)) | ||
816 | return NULL; | ||
560 | 817 | ||
561 | rcu_read_lock(); | 818 | rcu_read_lock(); |
819 | table = rcu_dereference(mm->ioctx_table); | ||
562 | 820 | ||
563 | hlist_for_each_entry_rcu(ctx, &mm->ioctx_list, list) { | 821 | if (!table || id >= table->nr) |
564 | if (ctx->user_id == ctx_id) { | 822 | goto out; |
565 | atomic_inc(&ctx->users); | ||
566 | ret = ctx; | ||
567 | break; | ||
568 | } | ||
569 | } | ||
570 | 823 | ||
824 | ctx = table->table[id]; | ||
825 | if (ctx && ctx->user_id == ctx_id) { | ||
826 | percpu_ref_get(&ctx->users); | ||
827 | ret = ctx; | ||
828 | } | ||
829 | out: | ||
571 | rcu_read_unlock(); | 830 | rcu_read_unlock(); |
572 | return ret; | 831 | return ret; |
573 | } | 832 | } |
@@ -591,16 +850,16 @@ void aio_complete(struct kiocb *iocb, long res, long res2) | |||
591 | * - the sync task helpfully left a reference to itself in the iocb | 850 | * - the sync task helpfully left a reference to itself in the iocb |
592 | */ | 851 | */ |
593 | if (is_sync_kiocb(iocb)) { | 852 | if (is_sync_kiocb(iocb)) { |
594 | BUG_ON(atomic_read(&iocb->ki_users) != 1); | ||
595 | iocb->ki_user_data = res; | 853 | iocb->ki_user_data = res; |
596 | atomic_set(&iocb->ki_users, 0); | 854 | smp_wmb(); |
855 | iocb->ki_ctx = ERR_PTR(-EXDEV); | ||
597 | wake_up_process(iocb->ki_obj.tsk); | 856 | wake_up_process(iocb->ki_obj.tsk); |
598 | return; | 857 | return; |
599 | } | 858 | } |
600 | 859 | ||
601 | /* | 860 | /* |
602 | * Take rcu_read_lock() in case the kioctx is being destroyed, as we | 861 | * Take rcu_read_lock() in case the kioctx is being destroyed, as we |
603 | * need to issue a wakeup after decrementing reqs_active. | 862 | * need to issue a wakeup after incrementing reqs_available. |
604 | */ | 863 | */ |
605 | rcu_read_lock(); | 864 | rcu_read_lock(); |
606 | 865 | ||
@@ -613,17 +872,6 @@ void aio_complete(struct kiocb *iocb, long res, long res2) | |||
613 | } | 872 | } |
614 | 873 | ||
615 | /* | 874 | /* |
616 | * cancelled requests don't get events, userland was given one | ||
617 | * when the event got cancelled. | ||
618 | */ | ||
619 | if (unlikely(xchg(&iocb->ki_cancel, | ||
620 | KIOCB_CANCELLED) == KIOCB_CANCELLED)) { | ||
621 | atomic_dec(&ctx->reqs_active); | ||
622 | /* Still need the wake_up in case free_ioctx is waiting */ | ||
623 | goto put_rq; | ||
624 | } | ||
625 | |||
626 | /* | ||
627 | * Add a completion event to the ring buffer. Must be done holding | 875 | * Add a completion event to the ring buffer. Must be done holding |
628 | * ctx->completion_lock to prevent other code from messing with the tail | 876 | * ctx->completion_lock to prevent other code from messing with the tail |
629 | * pointer since we might be called from irq context. | 877 | * pointer since we might be called from irq context. |
@@ -675,9 +923,8 @@ void aio_complete(struct kiocb *iocb, long res, long res2) | |||
675 | if (iocb->ki_eventfd != NULL) | 923 | if (iocb->ki_eventfd != NULL) |
676 | eventfd_signal(iocb->ki_eventfd, 1); | 924 | eventfd_signal(iocb->ki_eventfd, 1); |
677 | 925 | ||
678 | put_rq: | ||
679 | /* everything turned out well, dispose of the aiocb. */ | 926 | /* everything turned out well, dispose of the aiocb. */ |
680 | aio_put_req(iocb); | 927 | kiocb_free(iocb); |
681 | 928 | ||
682 | /* | 929 | /* |
683 | * We have to order our ring_info tail store above and test | 930 | * We have to order our ring_info tail store above and test |
@@ -702,7 +949,7 @@ static long aio_read_events_ring(struct kioctx *ctx, | |||
702 | struct io_event __user *event, long nr) | 949 | struct io_event __user *event, long nr) |
703 | { | 950 | { |
704 | struct aio_ring *ring; | 951 | struct aio_ring *ring; |
705 | unsigned head, pos; | 952 | unsigned head, tail, pos; |
706 | long ret = 0; | 953 | long ret = 0; |
707 | int copy_ret; | 954 | int copy_ret; |
708 | 955 | ||
@@ -710,11 +957,12 @@ static long aio_read_events_ring(struct kioctx *ctx, | |||
710 | 957 | ||
711 | ring = kmap_atomic(ctx->ring_pages[0]); | 958 | ring = kmap_atomic(ctx->ring_pages[0]); |
712 | head = ring->head; | 959 | head = ring->head; |
960 | tail = ring->tail; | ||
713 | kunmap_atomic(ring); | 961 | kunmap_atomic(ring); |
714 | 962 | ||
715 | pr_debug("h%u t%u m%u\n", head, ctx->tail, ctx->nr_events); | 963 | pr_debug("h%u t%u m%u\n", head, tail, ctx->nr_events); |
716 | 964 | ||
717 | if (head == ctx->tail) | 965 | if (head == tail) |
718 | goto out; | 966 | goto out; |
719 | 967 | ||
720 | while (ret < nr) { | 968 | while (ret < nr) { |
@@ -722,8 +970,8 @@ static long aio_read_events_ring(struct kioctx *ctx, | |||
722 | struct io_event *ev; | 970 | struct io_event *ev; |
723 | struct page *page; | 971 | struct page *page; |
724 | 972 | ||
725 | avail = (head <= ctx->tail ? ctx->tail : ctx->nr_events) - head; | 973 | avail = (head <= tail ? tail : ctx->nr_events) - head; |
726 | if (head == ctx->tail) | 974 | if (head == tail) |
727 | break; | 975 | break; |
728 | 976 | ||
729 | avail = min(avail, nr - ret); | 977 | avail = min(avail, nr - ret); |
@@ -754,9 +1002,9 @@ static long aio_read_events_ring(struct kioctx *ctx, | |||
754 | kunmap_atomic(ring); | 1002 | kunmap_atomic(ring); |
755 | flush_dcache_page(ctx->ring_pages[0]); | 1003 | flush_dcache_page(ctx->ring_pages[0]); |
756 | 1004 | ||
757 | pr_debug("%li h%u t%u\n", ret, head, ctx->tail); | 1005 | pr_debug("%li h%u t%u\n", ret, head, tail); |
758 | 1006 | ||
759 | atomic_sub(ret, &ctx->reqs_active); | 1007 | put_reqs_available(ctx, ret); |
760 | out: | 1008 | out: |
761 | mutex_unlock(&ctx->ring_lock); | 1009 | mutex_unlock(&ctx->ring_lock); |
762 | 1010 | ||
@@ -854,8 +1102,8 @@ SYSCALL_DEFINE2(io_setup, unsigned, nr_events, aio_context_t __user *, ctxp) | |||
854 | if (!IS_ERR(ioctx)) { | 1102 | if (!IS_ERR(ioctx)) { |
855 | ret = put_user(ioctx->user_id, ctxp); | 1103 | ret = put_user(ioctx->user_id, ctxp); |
856 | if (ret) | 1104 | if (ret) |
857 | kill_ioctx(ioctx); | 1105 | kill_ioctx(current->mm, ioctx); |
858 | put_ioctx(ioctx); | 1106 | percpu_ref_put(&ioctx->users); |
859 | } | 1107 | } |
860 | 1108 | ||
861 | out: | 1109 | out: |
@@ -872,101 +1120,37 @@ SYSCALL_DEFINE1(io_destroy, aio_context_t, ctx) | |||
872 | { | 1120 | { |
873 | struct kioctx *ioctx = lookup_ioctx(ctx); | 1121 | struct kioctx *ioctx = lookup_ioctx(ctx); |
874 | if (likely(NULL != ioctx)) { | 1122 | if (likely(NULL != ioctx)) { |
875 | kill_ioctx(ioctx); | 1123 | kill_ioctx(current->mm, ioctx); |
876 | put_ioctx(ioctx); | 1124 | percpu_ref_put(&ioctx->users); |
877 | return 0; | 1125 | return 0; |
878 | } | 1126 | } |
879 | pr_debug("EINVAL: io_destroy: invalid context id\n"); | 1127 | pr_debug("EINVAL: io_destroy: invalid context id\n"); |
880 | return -EINVAL; | 1128 | return -EINVAL; |
881 | } | 1129 | } |
882 | 1130 | ||
883 | static void aio_advance_iovec(struct kiocb *iocb, ssize_t ret) | ||
884 | { | ||
885 | struct iovec *iov = &iocb->ki_iovec[iocb->ki_cur_seg]; | ||
886 | |||
887 | BUG_ON(ret <= 0); | ||
888 | |||
889 | while (iocb->ki_cur_seg < iocb->ki_nr_segs && ret > 0) { | ||
890 | ssize_t this = min((ssize_t)iov->iov_len, ret); | ||
891 | iov->iov_base += this; | ||
892 | iov->iov_len -= this; | ||
893 | iocb->ki_left -= this; | ||
894 | ret -= this; | ||
895 | if (iov->iov_len == 0) { | ||
896 | iocb->ki_cur_seg++; | ||
897 | iov++; | ||
898 | } | ||
899 | } | ||
900 | |||
901 | /* the caller should not have done more io than what fit in | ||
902 | * the remaining iovecs */ | ||
903 | BUG_ON(ret > 0 && iocb->ki_left == 0); | ||
904 | } | ||
905 | |||
906 | typedef ssize_t (aio_rw_op)(struct kiocb *, const struct iovec *, | 1131 | typedef ssize_t (aio_rw_op)(struct kiocb *, const struct iovec *, |
907 | unsigned long, loff_t); | 1132 | unsigned long, loff_t); |
908 | 1133 | ||
909 | static ssize_t aio_rw_vect_retry(struct kiocb *iocb, int rw, aio_rw_op *rw_op) | 1134 | static ssize_t aio_setup_vectored_rw(struct kiocb *kiocb, |
910 | { | 1135 | int rw, char __user *buf, |
911 | struct file *file = iocb->ki_filp; | 1136 | unsigned long *nr_segs, |
912 | struct address_space *mapping = file->f_mapping; | 1137 | struct iovec **iovec, |
913 | struct inode *inode = mapping->host; | 1138 | bool compat) |
914 | ssize_t ret = 0; | ||
915 | |||
916 | /* This matches the pread()/pwrite() logic */ | ||
917 | if (iocb->ki_pos < 0) | ||
918 | return -EINVAL; | ||
919 | |||
920 | if (rw == WRITE) | ||
921 | file_start_write(file); | ||
922 | do { | ||
923 | ret = rw_op(iocb, &iocb->ki_iovec[iocb->ki_cur_seg], | ||
924 | iocb->ki_nr_segs - iocb->ki_cur_seg, | ||
925 | iocb->ki_pos); | ||
926 | if (ret > 0) | ||
927 | aio_advance_iovec(iocb, ret); | ||
928 | |||
929 | /* retry all partial writes. retry partial reads as long as its a | ||
930 | * regular file. */ | ||
931 | } while (ret > 0 && iocb->ki_left > 0 && | ||
932 | (rw == WRITE || | ||
933 | (!S_ISFIFO(inode->i_mode) && !S_ISSOCK(inode->i_mode)))); | ||
934 | if (rw == WRITE) | ||
935 | file_end_write(file); | ||
936 | |||
937 | /* This means we must have transferred all that we could */ | ||
938 | /* No need to retry anymore */ | ||
939 | if ((ret == 0) || (iocb->ki_left == 0)) | ||
940 | ret = iocb->ki_nbytes - iocb->ki_left; | ||
941 | |||
942 | /* If we managed to write some out we return that, rather than | ||
943 | * the eventual error. */ | ||
944 | if (rw == WRITE | ||
945 | && ret < 0 && ret != -EIOCBQUEUED | ||
946 | && iocb->ki_nbytes - iocb->ki_left) | ||
947 | ret = iocb->ki_nbytes - iocb->ki_left; | ||
948 | |||
949 | return ret; | ||
950 | } | ||
951 | |||
952 | static ssize_t aio_setup_vectored_rw(int rw, struct kiocb *kiocb, bool compat) | ||
953 | { | 1139 | { |
954 | ssize_t ret; | 1140 | ssize_t ret; |
955 | 1141 | ||
956 | kiocb->ki_nr_segs = kiocb->ki_nbytes; | 1142 | *nr_segs = kiocb->ki_nbytes; |
957 | 1143 | ||
958 | #ifdef CONFIG_COMPAT | 1144 | #ifdef CONFIG_COMPAT |
959 | if (compat) | 1145 | if (compat) |
960 | ret = compat_rw_copy_check_uvector(rw, | 1146 | ret = compat_rw_copy_check_uvector(rw, |
961 | (struct compat_iovec __user *)kiocb->ki_buf, | 1147 | (struct compat_iovec __user *)buf, |
962 | kiocb->ki_nr_segs, 1, &kiocb->ki_inline_vec, | 1148 | *nr_segs, 1, *iovec, iovec); |
963 | &kiocb->ki_iovec); | ||
964 | else | 1149 | else |
965 | #endif | 1150 | #endif |
966 | ret = rw_copy_check_uvector(rw, | 1151 | ret = rw_copy_check_uvector(rw, |
967 | (struct iovec __user *)kiocb->ki_buf, | 1152 | (struct iovec __user *)buf, |
968 | kiocb->ki_nr_segs, 1, &kiocb->ki_inline_vec, | 1153 | *nr_segs, 1, *iovec, iovec); |
969 | &kiocb->ki_iovec); | ||
970 | if (ret < 0) | 1154 | if (ret < 0) |
971 | return ret; | 1155 | return ret; |
972 | 1156 | ||
@@ -975,15 +1159,17 @@ static ssize_t aio_setup_vectored_rw(int rw, struct kiocb *kiocb, bool compat) | |||
975 | return 0; | 1159 | return 0; |
976 | } | 1160 | } |
977 | 1161 | ||
978 | static ssize_t aio_setup_single_vector(int rw, struct kiocb *kiocb) | 1162 | static ssize_t aio_setup_single_vector(struct kiocb *kiocb, |
1163 | int rw, char __user *buf, | ||
1164 | unsigned long *nr_segs, | ||
1165 | struct iovec *iovec) | ||
979 | { | 1166 | { |
980 | if (unlikely(!access_ok(!rw, kiocb->ki_buf, kiocb->ki_nbytes))) | 1167 | if (unlikely(!access_ok(!rw, buf, kiocb->ki_nbytes))) |
981 | return -EFAULT; | 1168 | return -EFAULT; |
982 | 1169 | ||
983 | kiocb->ki_iovec = &kiocb->ki_inline_vec; | 1170 | iovec->iov_base = buf; |
984 | kiocb->ki_iovec->iov_base = kiocb->ki_buf; | 1171 | iovec->iov_len = kiocb->ki_nbytes; |
985 | kiocb->ki_iovec->iov_len = kiocb->ki_nbytes; | 1172 | *nr_segs = 1; |
986 | kiocb->ki_nr_segs = 1; | ||
987 | return 0; | 1173 | return 0; |
988 | } | 1174 | } |
989 | 1175 | ||
@@ -992,15 +1178,18 @@ static ssize_t aio_setup_single_vector(int rw, struct kiocb *kiocb) | |||
992 | * Performs the initial checks and aio retry method | 1178 | * Performs the initial checks and aio retry method |
993 | * setup for the kiocb at the time of io submission. | 1179 | * setup for the kiocb at the time of io submission. |
994 | */ | 1180 | */ |
995 | static ssize_t aio_run_iocb(struct kiocb *req, bool compat) | 1181 | static ssize_t aio_run_iocb(struct kiocb *req, unsigned opcode, |
1182 | char __user *buf, bool compat) | ||
996 | { | 1183 | { |
997 | struct file *file = req->ki_filp; | 1184 | struct file *file = req->ki_filp; |
998 | ssize_t ret; | 1185 | ssize_t ret; |
1186 | unsigned long nr_segs; | ||
999 | int rw; | 1187 | int rw; |
1000 | fmode_t mode; | 1188 | fmode_t mode; |
1001 | aio_rw_op *rw_op; | 1189 | aio_rw_op *rw_op; |
1190 | struct iovec inline_vec, *iovec = &inline_vec; | ||
1002 | 1191 | ||
1003 | switch (req->ki_opcode) { | 1192 | switch (opcode) { |
1004 | case IOCB_CMD_PREAD: | 1193 | case IOCB_CMD_PREAD: |
1005 | case IOCB_CMD_PREADV: | 1194 | case IOCB_CMD_PREADV: |
1006 | mode = FMODE_READ; | 1195 | mode = FMODE_READ; |
@@ -1021,21 +1210,38 @@ rw_common: | |||
1021 | if (!rw_op) | 1210 | if (!rw_op) |
1022 | return -EINVAL; | 1211 | return -EINVAL; |
1023 | 1212 | ||
1024 | ret = (req->ki_opcode == IOCB_CMD_PREADV || | 1213 | ret = (opcode == IOCB_CMD_PREADV || |
1025 | req->ki_opcode == IOCB_CMD_PWRITEV) | 1214 | opcode == IOCB_CMD_PWRITEV) |
1026 | ? aio_setup_vectored_rw(rw, req, compat) | 1215 | ? aio_setup_vectored_rw(req, rw, buf, &nr_segs, |
1027 | : aio_setup_single_vector(rw, req); | 1216 | &iovec, compat) |
1217 | : aio_setup_single_vector(req, rw, buf, &nr_segs, | ||
1218 | iovec); | ||
1028 | if (ret) | 1219 | if (ret) |
1029 | return ret; | 1220 | return ret; |
1030 | 1221 | ||
1031 | ret = rw_verify_area(rw, file, &req->ki_pos, req->ki_nbytes); | 1222 | ret = rw_verify_area(rw, file, &req->ki_pos, req->ki_nbytes); |
1032 | if (ret < 0) | 1223 | if (ret < 0) { |
1224 | if (iovec != &inline_vec) | ||
1225 | kfree(iovec); | ||
1033 | return ret; | 1226 | return ret; |
1227 | } | ||
1034 | 1228 | ||
1035 | req->ki_nbytes = ret; | 1229 | req->ki_nbytes = ret; |
1036 | req->ki_left = ret; | ||
1037 | 1230 | ||
1038 | ret = aio_rw_vect_retry(req, rw, rw_op); | 1231 | /* XXX: move/kill - rw_verify_area()? */ |
1232 | /* This matches the pread()/pwrite() logic */ | ||
1233 | if (req->ki_pos < 0) { | ||
1234 | ret = -EINVAL; | ||
1235 | break; | ||
1236 | } | ||
1237 | |||
1238 | if (rw == WRITE) | ||
1239 | file_start_write(file); | ||
1240 | |||
1241 | ret = rw_op(req, iovec, nr_segs, req->ki_pos); | ||
1242 | |||
1243 | if (rw == WRITE) | ||
1244 | file_end_write(file); | ||
1039 | break; | 1245 | break; |
1040 | 1246 | ||
1041 | case IOCB_CMD_FDSYNC: | 1247 | case IOCB_CMD_FDSYNC: |
@@ -1057,6 +1263,9 @@ rw_common: | |||
1057 | return -EINVAL; | 1263 | return -EINVAL; |
1058 | } | 1264 | } |
1059 | 1265 | ||
1266 | if (iovec != &inline_vec) | ||
1267 | kfree(iovec); | ||
1268 | |||
1060 | if (ret != -EIOCBQUEUED) { | 1269 | if (ret != -EIOCBQUEUED) { |
1061 | /* | 1270 | /* |
1062 | * There's no easy way to restart the syscall since other AIO's | 1271 | * There's no easy way to restart the syscall since other AIO's |
@@ -1128,21 +1337,18 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, | |||
1128 | req->ki_obj.user = user_iocb; | 1337 | req->ki_obj.user = user_iocb; |
1129 | req->ki_user_data = iocb->aio_data; | 1338 | req->ki_user_data = iocb->aio_data; |
1130 | req->ki_pos = iocb->aio_offset; | 1339 | req->ki_pos = iocb->aio_offset; |
1340 | req->ki_nbytes = iocb->aio_nbytes; | ||
1131 | 1341 | ||
1132 | req->ki_buf = (char __user *)(unsigned long)iocb->aio_buf; | 1342 | ret = aio_run_iocb(req, iocb->aio_lio_opcode, |
1133 | req->ki_left = req->ki_nbytes = iocb->aio_nbytes; | 1343 | (char __user *)(unsigned long)iocb->aio_buf, |
1134 | req->ki_opcode = iocb->aio_lio_opcode; | 1344 | compat); |
1135 | |||
1136 | ret = aio_run_iocb(req, compat); | ||
1137 | if (ret) | 1345 | if (ret) |
1138 | goto out_put_req; | 1346 | goto out_put_req; |
1139 | 1347 | ||
1140 | aio_put_req(req); /* drop extra ref to req */ | ||
1141 | return 0; | 1348 | return 0; |
1142 | out_put_req: | 1349 | out_put_req: |
1143 | atomic_dec(&ctx->reqs_active); | 1350 | put_reqs_available(ctx, 1); |
1144 | aio_put_req(req); /* drop extra ref to req */ | 1351 | kiocb_free(req); |
1145 | aio_put_req(req); /* drop i/o ref to req */ | ||
1146 | return ret; | 1352 | return ret; |
1147 | } | 1353 | } |
1148 | 1354 | ||
@@ -1195,7 +1401,7 @@ long do_io_submit(aio_context_t ctx_id, long nr, | |||
1195 | } | 1401 | } |
1196 | blk_finish_plug(&plug); | 1402 | blk_finish_plug(&plug); |
1197 | 1403 | ||
1198 | put_ioctx(ctx); | 1404 | percpu_ref_put(&ctx->users); |
1199 | return i ? i : ret; | 1405 | return i ? i : ret; |
1200 | } | 1406 | } |
1201 | 1407 | ||
@@ -1252,7 +1458,6 @@ static struct kiocb *lookup_kiocb(struct kioctx *ctx, struct iocb __user *iocb, | |||
1252 | SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb, | 1458 | SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb, |
1253 | struct io_event __user *, result) | 1459 | struct io_event __user *, result) |
1254 | { | 1460 | { |
1255 | struct io_event res; | ||
1256 | struct kioctx *ctx; | 1461 | struct kioctx *ctx; |
1257 | struct kiocb *kiocb; | 1462 | struct kiocb *kiocb; |
1258 | u32 key; | 1463 | u32 key; |
@@ -1270,21 +1475,22 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb, | |||
1270 | 1475 | ||
1271 | kiocb = lookup_kiocb(ctx, iocb, key); | 1476 | kiocb = lookup_kiocb(ctx, iocb, key); |
1272 | if (kiocb) | 1477 | if (kiocb) |
1273 | ret = kiocb_cancel(ctx, kiocb, &res); | 1478 | ret = kiocb_cancel(ctx, kiocb); |
1274 | else | 1479 | else |
1275 | ret = -EINVAL; | 1480 | ret = -EINVAL; |
1276 | 1481 | ||
1277 | spin_unlock_irq(&ctx->ctx_lock); | 1482 | spin_unlock_irq(&ctx->ctx_lock); |
1278 | 1483 | ||
1279 | if (!ret) { | 1484 | if (!ret) { |
1280 | /* Cancellation succeeded -- copy the result | 1485 | /* |
1281 | * into the user's buffer. | 1486 | * The result argument is no longer used - the io_event is |
1487 | * always delivered via the ring buffer. -EINPROGRESS indicates | ||
1488 | * cancellation is progress: | ||
1282 | */ | 1489 | */ |
1283 | if (copy_to_user(result, &res, sizeof(res))) | 1490 | ret = -EINPROGRESS; |
1284 | ret = -EFAULT; | ||
1285 | } | 1491 | } |
1286 | 1492 | ||
1287 | put_ioctx(ctx); | 1493 | percpu_ref_put(&ctx->users); |
1288 | 1494 | ||
1289 | return ret; | 1495 | return ret; |
1290 | } | 1496 | } |
@@ -1313,7 +1519,7 @@ SYSCALL_DEFINE5(io_getevents, aio_context_t, ctx_id, | |||
1313 | if (likely(ioctx)) { | 1519 | if (likely(ioctx)) { |
1314 | if (likely(min_nr <= nr && min_nr >= 0)) | 1520 | if (likely(min_nr <= nr && min_nr >= 0)) |
1315 | ret = read_events(ioctx, min_nr, nr, events, timeout); | 1521 | ret = read_events(ioctx, min_nr, nr, events, timeout); |
1316 | put_ioctx(ioctx); | 1522 | percpu_ref_put(&ioctx->users); |
1317 | } | 1523 | } |
1318 | return ret; | 1524 | return ret; |
1319 | } | 1525 | } |
diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c
index 47a65df8c871..85c961849953 100644
--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -109,6 +109,72 @@ static struct file_system_type anon_inode_fs_type = { | |||
109 | }; | 109 | }; |
110 | 110 | ||
111 | /** | 111 | /** |
112 | * anon_inode_getfile_private - creates a new file instance by hooking it up to an | ||
113 | * anonymous inode, and a dentry that describe the "class" | ||
114 | * of the file | ||
115 | * | ||
116 | * @name: [in] name of the "class" of the new file | ||
117 | * @fops: [in] file operations for the new file | ||
118 | * @priv: [in] private data for the new file (will be file's private_data) | ||
119 | * @flags: [in] flags | ||
120 | * | ||
121 | * | ||
122 | * Similar to anon_inode_getfile, but each file holds a single inode. | ||
123 | * | ||
124 | */ | ||
125 | struct file *anon_inode_getfile_private(const char *name, | ||
126 | const struct file_operations *fops, | ||
127 | void *priv, int flags) | ||
128 | { | ||
129 | struct qstr this; | ||
130 | struct path path; | ||
131 | struct file *file; | ||
132 | struct inode *inode; | ||
133 | |||
134 | if (fops->owner && !try_module_get(fops->owner)) | ||
135 | return ERR_PTR(-ENOENT); | ||
136 | |||
137 | inode = anon_inode_mkinode(anon_inode_mnt->mnt_sb); | ||
138 | if (IS_ERR(inode)) { | ||
139 | file = ERR_PTR(-ENOMEM); | ||
140 | goto err_module; | ||
141 | } | ||
142 | |||
143 | /* | ||
144 | * Link the inode to a directory entry by creating a unique name | ||
145 | * using the inode sequence number. | ||
146 | */ | ||
147 | file = ERR_PTR(-ENOMEM); | ||
148 | this.name = name; | ||
149 | this.len = strlen(name); | ||
150 | this.hash = 0; | ||
151 | path.dentry = d_alloc_pseudo(anon_inode_mnt->mnt_sb, &this); | ||
152 | if (!path.dentry) | ||
153 | goto err_module; | ||
154 | |||
155 | path.mnt = mntget(anon_inode_mnt); | ||
156 | |||
157 | d_instantiate(path.dentry, inode); | ||
158 | |||
159 | file = alloc_file(&path, OPEN_FMODE(flags), fops); | ||
160 | if (IS_ERR(file)) | ||
161 | goto err_dput; | ||
162 | |||
163 | file->f_mapping = inode->i_mapping; | ||
164 | file->f_flags = flags & (O_ACCMODE | O_NONBLOCK); | ||
165 | file->private_data = priv; | ||
166 | |||
167 | return file; | ||
168 | |||
169 | err_dput: | ||
170 | path_put(&path); | ||
171 | err_module: | ||
172 | module_put(fops->owner); | ||
173 | return file; | ||
174 | } | ||
175 | EXPORT_SYMBOL_GPL(anon_inode_getfile_private); | ||
176 | |||
177 | /** | ||
112 | * anon_inode_getfile - creates a new file instance by hooking it up to an | 178 | * anon_inode_getfile - creates a new file instance by hooking it up to an |
113 | * anonymous inode, and a dentry that describe the "class" | 179 | * anonymous inode, and a dentry that describe the "class" |
114 | * of the file | 180 | * of the file |
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 1173a4ee0830..c3549ed58038 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1542,7 +1542,7 @@ static ssize_t blkdev_aio_read(struct kiocb *iocb, const struct iovec *iov, | |||
1542 | return 0; | 1542 | return 0; |
1543 | 1543 | ||
1544 | size -= pos; | 1544 | size -= pos; |
1545 | if (size < iocb->ki_left) | 1545 | if (size < iocb->ki_nbytes) |
1546 | nr_segs = iov_shorten((struct iovec *)iov, nr_segs, size); | 1546 | nr_segs = iov_shorten((struct iovec *)iov, nr_segs, size); |
1547 | return generic_file_aio_read(iocb, iov, nr_segs, pos); | 1547 | return generic_file_aio_read(iocb, iov, nr_segs, pos); |
1548 | } | 1548 | } |
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 0bd7a55a5f07..91ff089d3412 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -130,7 +130,6 @@ ssize_t nfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, loff_ | |||
130 | 130 | ||
131 | return -EINVAL; | 131 | return -EINVAL; |
132 | #else | 132 | #else |
133 | VM_BUG_ON(iocb->ki_left != PAGE_SIZE); | ||
134 | VM_BUG_ON(iocb->ki_nbytes != PAGE_SIZE); | 133 | VM_BUG_ON(iocb->ki_nbytes != PAGE_SIZE); |
135 | 134 | ||
136 | if (rw == READ || rw == KERNEL_READ) | 135 | if (rw == READ || rw == KERNEL_READ) |
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 4f8197caa487..d71903c6068b 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -2242,7 +2242,7 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb, | |||
2242 | file->f_path.dentry->d_name.name, | 2242 | file->f_path.dentry->d_name.name, |
2243 | (unsigned int)nr_segs); | 2243 | (unsigned int)nr_segs); |
2244 | 2244 | ||
2245 | if (iocb->ki_left == 0) | 2245 | if (iocb->ki_nbytes == 0) |
2246 | return 0; | 2246 | return 0; |
2247 | 2247 | ||
2248 | appending = file->f_flags & O_APPEND ? 1 : 0; | 2248 | appending = file->f_flags & O_APPEND ? 1 : 0; |
@@ -2293,7 +2293,7 @@ relock: | |||
2293 | 2293 | ||
2294 | can_do_direct = direct_io; | 2294 | can_do_direct = direct_io; |
2295 | ret = ocfs2_prepare_inode_for_write(file, ppos, | 2295 | ret = ocfs2_prepare_inode_for_write(file, ppos, |
2296 | iocb->ki_left, appending, | 2296 | iocb->ki_nbytes, appending, |
2297 | &can_do_direct, &has_refcount); | 2297 | &can_do_direct, &has_refcount); |
2298 | if (ret < 0) { | 2298 | if (ret < 0) { |
2299 | mlog_errno(ret); | 2299 | mlog_errno(ret); |
@@ -2301,7 +2301,7 @@ relock: | |||
2301 | } | 2301 | } |
2302 | 2302 | ||
2303 | if (direct_io && !is_sync_kiocb(iocb)) | 2303 | if (direct_io && !is_sync_kiocb(iocb)) |
2304 | unaligned_dio = ocfs2_is_io_unaligned(inode, iocb->ki_left, | 2304 | unaligned_dio = ocfs2_is_io_unaligned(inode, iocb->ki_nbytes, |
2305 | *ppos); | 2305 | *ppos); |
2306 | 2306 | ||
2307 | /* | 2307 | /* |
diff --git a/fs/read_write.c b/fs/read_write.c
index 122a3846d9e1..e3cd280b158c 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -367,7 +367,6 @@ ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *pp | |||
367 | 367 | ||
368 | init_sync_kiocb(&kiocb, filp); | 368 | init_sync_kiocb(&kiocb, filp); |
369 | kiocb.ki_pos = *ppos; | 369 | kiocb.ki_pos = *ppos; |
370 | kiocb.ki_left = len; | ||
371 | kiocb.ki_nbytes = len; | 370 | kiocb.ki_nbytes = len; |
372 | 371 | ||
373 | ret = filp->f_op->aio_read(&kiocb, &iov, 1, kiocb.ki_pos); | 372 | ret = filp->f_op->aio_read(&kiocb, &iov, 1, kiocb.ki_pos); |
@@ -417,7 +416,6 @@ ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, lof | |||
417 | 416 | ||
418 | init_sync_kiocb(&kiocb, filp); | 417 | init_sync_kiocb(&kiocb, filp); |
419 | kiocb.ki_pos = *ppos; | 418 | kiocb.ki_pos = *ppos; |
420 | kiocb.ki_left = len; | ||
421 | kiocb.ki_nbytes = len; | 419 | kiocb.ki_nbytes = len; |
422 | 420 | ||
423 | ret = filp->f_op->aio_write(&kiocb, &iov, 1, kiocb.ki_pos); | 421 | ret = filp->f_op->aio_write(&kiocb, &iov, 1, kiocb.ki_pos); |
@@ -599,7 +597,6 @@ static ssize_t do_sync_readv_writev(struct file *filp, const struct iovec *iov, | |||
599 | 597 | ||
600 | init_sync_kiocb(&kiocb, filp); | 598 | init_sync_kiocb(&kiocb, filp); |
601 | kiocb.ki_pos = *ppos; | 599 | kiocb.ki_pos = *ppos; |
602 | kiocb.ki_left = len; | ||
603 | kiocb.ki_nbytes = len; | 600 | kiocb.ki_nbytes = len; |
604 | 601 | ||
605 | ret = fn(&kiocb, iov, nr_segs, kiocb.ki_pos); | 602 | ret = fn(&kiocb, iov, nr_segs, kiocb.ki_pos); |
diff --git a/fs/udf/file.c b/fs/udf/file.c
index 29569dd08168..c02a27a19c6d 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -141,7 +141,7 @@ static ssize_t udf_file_aio_write(struct kiocb *iocb, const struct iovec *iov, | |||
141 | struct file *file = iocb->ki_filp; | 141 | struct file *file = iocb->ki_filp; |
142 | struct inode *inode = file_inode(file); | 142 | struct inode *inode = file_inode(file); |
143 | int err, pos; | 143 | int err, pos; |
144 | size_t count = iocb->ki_left; | 144 | size_t count = iocb->ki_nbytes; |
145 | struct udf_inode_info *iinfo = UDF_I(inode); | 145 | struct udf_inode_info *iinfo = UDF_I(inode); |
146 | 146 | ||
147 | down_write(&iinfo->i_data_sem); | 147 | down_write(&iinfo->i_data_sem); |