Diffstat (limited to 'fs/aio.c')
-rw-r--r--	fs/aio.c	120
1 file changed, 67 insertions, 53 deletions
@@ -52,7 +52,8 @@
 struct aio_ring {
 	unsigned	id;	/* kernel internal index number */
 	unsigned	nr;	/* number of io_events */
-	unsigned	head;
+	unsigned	head;	/* Written to by userland or under ring_lock
+				 * mutex by aio_read_events_ring(). */
 	unsigned	tail;
 
 	unsigned	magic;
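The new comment on "head" documents the user-visible half of this design: the ring lives in a user mapping, the kernel produces completions at "tail", and the consumer index "head" is advanced either by userspace or by aio_read_events_ring() under the ring_lock mutex. As a rough illustration of the userspace side, the sketch below reaps events straight from that mapping. The structure is mirrored by hand from fs/aio.c (it is not exported in a UAPI header), user_reap() is an invented name, and a real implementation (libaio and fio carry such fast paths) would also verify ring->magic and ring->incompat_features before bypassing io_getevents(2).

#include <linux/aio_abi.h>	/* struct io_event, aio_context_t */

struct user_aio_ring {		/* hand-mirrored from struct aio_ring in fs/aio.c */
	unsigned id;
	unsigned nr;		/* number of io_event slots */
	unsigned head;		/* consumer index, advanced by userspace */
	unsigned tail;		/* producer index, advanced by the kernel */
	unsigned magic;
	unsigned compat_features;
	unsigned incompat_features;
	unsigned header_length;
	struct io_event io_events[];
};

/* Reap up to 'max' completed events without a syscall; 'ctx' is the value
 * io_setup(2) returned, which is the user address of the mapped ring. */
static int user_reap(aio_context_t ctx, struct io_event *out, int max)
{
	struct user_aio_ring *ring = (struct user_aio_ring *)ctx;
	unsigned head = ring->head;
	int n = 0;

	while (n < max) {
		/* Acquire pairs with the kernel's barrier before it moves tail. */
		unsigned tail = __atomic_load_n(&ring->tail, __ATOMIC_ACQUIRE);

		if (head == tail)
			break;			/* ring is empty */
		out[n++] = ring->io_events[head];
		head = (head + 1) % ring->nr;	/* indices stay in [0, nr) */
	}
	/* Publish the new head so the kernel can reuse the consumed slots. */
	__atomic_store_n(&ring->head, head, __ATOMIC_RELEASE);
	return n;
}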
@@ -243,6 +244,11 @@ static void aio_free_ring(struct kioctx *ctx)
 {
 	int i;
 
+	/* Disconnect the kiotx from the ring file.  This prevents future
+	 * accesses to the kioctx from page migration.
+	 */
+	put_aio_ring_file(ctx);
+
 	for (i = 0; i < ctx->nr_pages; i++) {
 		struct page *page;
 		pr_debug("pid(%d) [%d] page->count=%d\n", current->pid, i,
@@ -254,8 +260,6 @@ static void aio_free_ring(struct kioctx *ctx)
 		put_page(page);
 	}
 
-	put_aio_ring_file(ctx);
-
 	if (ctx->ring_pages && ctx->ring_pages != ctx->internal_pages) {
 		kfree(ctx->ring_pages);
 		ctx->ring_pages = NULL;
@@ -283,29 +287,38 @@ static int aio_migratepage(struct address_space *mapping, struct page *new,
 {
 	struct kioctx *ctx;
 	unsigned long flags;
+	pgoff_t idx;
 	int rc;
 
 	rc = 0;
 
-	/* Make sure the old page hasn't already been changed */
+	/* mapping->private_lock here protects against the kioctx teardown. */
 	spin_lock(&mapping->private_lock);
 	ctx = mapping->private_data;
-	if (ctx) {
-		pgoff_t idx;
-		spin_lock_irqsave(&ctx->completion_lock, flags);
-		idx = old->index;
-		if (idx < (pgoff_t)ctx->nr_pages) {
-			if (ctx->ring_pages[idx] != old)
-				rc = -EAGAIN;
-		} else
-			rc = -EINVAL;
-		spin_unlock_irqrestore(&ctx->completion_lock, flags);
+	if (!ctx) {
+		rc = -EINVAL;
+		goto out;
+	}
+
+	/* The ring_lock mutex.  The prevents aio_read_events() from writing
+	 * to the ring's head, and prevents page migration from mucking in
+	 * a partially initialized kiotx.
+	 */
+	if (!mutex_trylock(&ctx->ring_lock)) {
+		rc = -EAGAIN;
+		goto out;
+	}
+
+	idx = old->index;
+	if (idx < (pgoff_t)ctx->nr_pages) {
+		/* Make sure the old page hasn't already been changed */
+		if (ctx->ring_pages[idx] != old)
+			rc = -EAGAIN;
 	} else
 		rc = -EINVAL;
-	spin_unlock(&mapping->private_lock);
 
 	if (rc != 0)
-		return rc;
+		goto out_unlock;
 
 	/* Writeback must be complete */
 	BUG_ON(PageWriteback(old));
@@ -314,38 +327,26 @@ static int aio_migratepage(struct address_space *mapping, struct page *new,
 	rc = migrate_page_move_mapping(mapping, new, old, NULL, mode, 1);
 	if (rc != MIGRATEPAGE_SUCCESS) {
 		put_page(new);
-		return rc;
+		goto out_unlock;
 	}
 
-	/* We can potentially race against kioctx teardown here.  Use the
-	 * address_space's private data lock to protect the mapping's
-	 * private_data.
+	/* Take completion_lock to prevent other writes to the ring buffer
+	 * while the old page is copied to the new.  This prevents new
+	 * events from being lost.
 	 */
-	spin_lock(&mapping->private_lock);
-	ctx = mapping->private_data;
-	if (ctx) {
-		pgoff_t idx;
-		spin_lock_irqsave(&ctx->completion_lock, flags);
-		migrate_page_copy(new, old);
-		idx = old->index;
-		if (idx < (pgoff_t)ctx->nr_pages) {
-			/* And only do the move if things haven't changed */
-			if (ctx->ring_pages[idx] == old)
-				ctx->ring_pages[idx] = new;
-			else
-				rc = -EAGAIN;
-		} else
-			rc = -EINVAL;
-		spin_unlock_irqrestore(&ctx->completion_lock, flags);
-	} else
-		rc = -EBUSY;
-	spin_unlock(&mapping->private_lock);
+	spin_lock_irqsave(&ctx->completion_lock, flags);
+	migrate_page_copy(new, old);
+	BUG_ON(ctx->ring_pages[idx] != old);
+	ctx->ring_pages[idx] = new;
+	spin_unlock_irqrestore(&ctx->completion_lock, flags);
 
-	if (rc == MIGRATEPAGE_SUCCESS)
-		put_page(old);
-	else
-		put_page(new);
+	/* The old page is no longer accessible. */
+	put_page(old);
 
+out_unlock:
+	mutex_unlock(&ctx->ring_lock);
+out:
+	spin_unlock(&mapping->private_lock);
 	return rc;
 }
 #endif
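Read together, the two aio_migratepage() hunks give the migration path a strict lock nesting. The summary below is a reading aid condensed from the new-side lines above, not a comment taken from the tree:

/*
 * aio_migratepage() after this patch:
 *
 *   spin_lock(&mapping->private_lock)
 *       pins ctx against kioctx teardown (see the reordered
 *       put_aio_ring_file() in aio_free_ring() above)
 *
 *     mutex_trylock(&ctx->ring_lock)
 *         never blocks; bails out with -EAGAIN when aio_read_events_ring()
 *         holds the lock or the kioctx is still being set up
 *
 *       spin_lock_irqsave(&ctx->completion_lock, flags)
 *           held only around migrate_page_copy() and the
 *           ctx->ring_pages[idx] switch, so no completion event can be
 *           written while the page contents are copied
 *
 * Every exit funnels through the out_unlock/out labels, so each lock that
 * was taken is dropped exactly once.
 */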
@@ -380,7 +381,7 @@ static int aio_setup_ring(struct kioctx *ctx)
 	file = aio_private_file(ctx, nr_pages);
 	if (IS_ERR(file)) {
 		ctx->aio_ring_file = NULL;
-		return -EAGAIN;
+		return -ENOMEM;
 	}
 
 	ctx->aio_ring_file = file;
@@ -415,7 +416,7 @@ static int aio_setup_ring(struct kioctx *ctx)
 
 	if (unlikely(i != nr_pages)) {
 		aio_free_ring(ctx);
-		return -EAGAIN;
+		return -ENOMEM;
 	}
 
 	ctx->mmap_size = nr_pages * PAGE_SIZE;
@@ -429,7 +430,7 @@ static int aio_setup_ring(struct kioctx *ctx)
 	if (IS_ERR((void *)ctx->mmap_base)) {
 		ctx->mmap_size = 0;
 		aio_free_ring(ctx);
-		return -EAGAIN;
+		return -ENOMEM;
 	}
 
 	pr_debug("mmap address: 0x%08lx\n", ctx->mmap_base);
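With the three aio_setup_ring() hunks above, a failure to create or map the ring buffer is reported as -ENOMEM instead of -EAGAIN, so callers of io_setup(2) can tell a genuine resource shortage apart from EAGAIN, which the syscall reserves for exceeding event limits such as /proc/sys/fs/aio-max-nr. The snippet below is a hypothetical libaio caller showing that distinction; the event count of 128 and the messages are illustrative only (build with -laio):

#include <errno.h>
#include <libaio.h>		/* io_setup() returns 0 or a negative errno */
#include <stdio.h>
#include <string.h>

int main(void)
{
	io_context_t ctx = 0;
	int ret = io_setup(128, &ctx);	/* 128 events: illustrative value */

	if (ret == -EAGAIN)
		fprintf(stderr, "event limit exceeded (fs.aio-max-nr), retry later\n");
	else if (ret == -ENOMEM)
		fprintf(stderr, "kernel could not allocate the completion ring\n");
	else if (ret < 0)
		fprintf(stderr, "io_setup: %s\n", strerror(-ret));
	else
		io_destroy(ctx);
	return 0;
}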
@@ -556,6 +557,10 @@ static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm)
 					rcu_read_unlock();
 					spin_unlock(&mm->ioctx_lock);
 
+					/* While kioctx setup is in progress,
+					 * we are protected from page migration
+					 * changes ring_pages by ->ring_lock.
+					 */
 					ring = kmap_atomic(ctx->ring_pages[0]);
 					ring->id = ctx->id;
 					kunmap_atomic(ring);
@@ -640,24 +645,28 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
 
 	ctx->max_reqs = nr_events;
 
-	if (percpu_ref_init(&ctx->users, free_ioctx_users))
-		goto err;
-
-	if (percpu_ref_init(&ctx->reqs, free_ioctx_reqs))
-		goto err;
-
 	spin_lock_init(&ctx->ctx_lock);
 	spin_lock_init(&ctx->completion_lock);
 	mutex_init(&ctx->ring_lock);
+	/* Protect against page migration throughout kiotx setup by keeping
+	 * the ring_lock mutex held until setup is complete. */
+	mutex_lock(&ctx->ring_lock);
 	init_waitqueue_head(&ctx->wait);
 
 	INIT_LIST_HEAD(&ctx->active_reqs);
 
+	if (percpu_ref_init(&ctx->users, free_ioctx_users))
+		goto err;
+
+	if (percpu_ref_init(&ctx->reqs, free_ioctx_reqs))
+		goto err;
+
 	ctx->cpu = alloc_percpu(struct kioctx_cpu);
 	if (!ctx->cpu)
 		goto err;
 
-	if (aio_setup_ring(ctx) < 0)
+	err = aio_setup_ring(ctx);
+	if (err < 0)
 		goto err;
 
 	atomic_set(&ctx->reqs_available, ctx->nr_events - 1);
@@ -683,6 +692,9 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
 	if (err)
 		goto err_cleanup;
 
+	/* Release the ring_lock mutex now that all setup is complete. */
+	mutex_unlock(&ctx->ring_lock);
+
 	pr_debug("allocated ioctx %p[%ld]: mm=%p mask=0x%x\n",
 		 ctx, ctx->user_id, mm, ctx->nr_events);
 	return ctx;
@@ -692,6 +704,7 @@ err_cleanup:
 err_ctx:
 	aio_free_ring(ctx);
 err:
+	mutex_unlock(&ctx->ring_lock);
 	free_percpu(ctx->cpu);
 	free_percpu(ctx->reqs.pcpu_count);
 	free_percpu(ctx->users.pcpu_count);
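Taken together with the aio_migratepage() hunks, the ioctx_alloc()/ioctx_add_table() changes define a single protocol: setup keeps ring_lock held until the kioctx is fully published (and drops it on every error path), readers take the lock blocking, and page migration only ever trylocks, so it can never sleep on a half-initialized ring. Below is a minimal userspace analogue of that protocol built on POSIX mutexes; demo_ring, migrate_demo() and read_events_demo() are invented names and the whole sketch is an analogy, not kernel code (build with -pthread):

#include <errno.h>
#include <pthread.h>
#include <stdio.h>

struct demo_ring {
	pthread_mutex_t ring_lock;	/* stands in for ctx->ring_lock */
	int ready;			/* stands in for ctx->ring_pages being valid */
};

/* Analogue of aio_migratepage(): never blocks, backs off with -EAGAIN. */
static int migrate_demo(struct demo_ring *r)
{
	if (pthread_mutex_trylock(&r->ring_lock) != 0)
		return -EAGAIN;		/* setup or a reader owns the ring */
	/* ... swap a ring page while nobody can observe it ... */
	pthread_mutex_unlock(&r->ring_lock);
	return 0;
}

/* Analogue of aio_read_events_ring(): may sleep on the lock. */
static void read_events_demo(struct demo_ring *r)
{
	pthread_mutex_lock(&r->ring_lock);
	/* ... read head/tail and copy events out ... */
	pthread_mutex_unlock(&r->ring_lock);
}

int main(void)
{
	struct demo_ring r = { .ready = 0 };

	pthread_mutex_init(&r.ring_lock, NULL);

	/* Analogue of ioctx_alloc(): hold ring_lock across the whole setup. */
	pthread_mutex_lock(&r.ring_lock);
	r.ready = 1;				/* "aio_setup_ring()" */
	if (migrate_demo(&r) == -EAGAIN)	/* migration cannot see a half-built ring */
		printf("migration backs off during setup\n");
	pthread_mutex_unlock(&r.ring_lock);	/* setup complete, ring published */

	read_events_demo(&r);
	if (migrate_demo(&r) == 0)
		printf("migration proceeds once the ring is published\n");
	return 0;
}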
@@ -1024,6 +1037,7 @@ static long aio_read_events_ring(struct kioctx *ctx,
 
 	mutex_lock(&ctx->ring_lock);
 
+	/* Access to ->ring_pages here is protected by ctx->ring_lock. */
 	ring = kmap_atomic(ctx->ring_pages[0]);
 	head = ring->head;
 	tail = ring->tail;