author     Linus Torvalds <torvalds@linux-foundation.org>   2014-04-11 19:36:50 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>   2014-04-11 19:36:50 -0400
commit     a63b747b41d6f6c9116fb2260381a3c96fe5dc02
tree       22edfda79990a5b674646605ba6b723447032602  /fs/aio.c
parent     3123bca71993c2346a458875488863772c1d5dc4
parent     fa8a53c39f3fdde98c9eace6a9b412143f0f6ed6
Merge git://git.kvack.org/~bcrl/aio-next
Pull aio ctx->ring_pages migration serialization fix from Ben LaHaise.
* git://git.kvack.org/~bcrl/aio-next:
aio: v4 ensure access to ctx->ring_pages is correctly serialised for migration
Diffstat (limited to 'fs/aio.c')
 -rw-r--r--  fs/aio.c | 120
 1 file changed, 67 insertions(+), 53 deletions(-)
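The shape of the fix, in one sentence: ioctx_alloc() now holds ctx->ring_lock for the whole kioctx setup, aio_migratepage() only ever mutex_trylock()s it (bailing out with -EAGAIN rather than touching a half-built context), and the actual page copy happens under completion_lock. The userspace sketch below mirrors that pattern with pthreads purely for illustration; it is not kernel code, and every name in it (ring_ctx, try_migrate_page, NR_PAGES) is made up.

```c
/* Build: cc -std=c11 -pthread pattern.c -o pattern
 * Userspace sketch only -- not kernel code; all names are illustrative.
 * Setup holds ring_lock across the whole initialization (as ioctx_alloc()
 * now does), and the "migration" path merely trylock()s it (as
 * aio_migratepage() now does), backing off with -EAGAIN instead of ever
 * touching a half-initialized context. */
#include <errno.h>
#include <pthread.h>
#include <stdio.h>

#define NR_PAGES 8

struct ring_ctx {
	pthread_mutex_t ring_lock;	/* serializes ring_pages[] users */
	int nr_pages;			/* 0 until setup completes */
	void *ring_pages[NR_PAGES];
};

static int try_migrate_page(struct ring_ctx *ctx, int idx, void *newp)
{
	/* Never block, and never peek at a partly built context. */
	if (pthread_mutex_trylock(&ctx->ring_lock) != 0)
		return -EAGAIN;		/* setup (or a reader) owns the ring */
	if (idx >= ctx->nr_pages) {
		pthread_mutex_unlock(&ctx->ring_lock);
		return -EINVAL;
	}
	ctx->ring_pages[idx] = newp;	/* safe: we own ring_lock */
	pthread_mutex_unlock(&ctx->ring_lock);
	return 0;
}

int main(void)
{
	static char old_page[64], new_page[64];
	struct ring_ctx ctx = { .nr_pages = 0 };

	pthread_mutex_init(&ctx.ring_lock, NULL);

	/* "Setup": hold ring_lock until the context is fully initialized. */
	pthread_mutex_lock(&ctx.ring_lock);
	printf("during setup: %d (expect -EAGAIN=%d)\n",
	       try_migrate_page(&ctx, 3, new_page), -EAGAIN);
	for (int i = 0; i < NR_PAGES; i++)
		ctx.ring_pages[i] = old_page;
	ctx.nr_pages = NR_PAGES;
	pthread_mutex_unlock(&ctx.ring_lock);

	printf("after setup:  %d (expect 0)\n",
	       try_migrate_page(&ctx, 3, new_page));
	return 0;
}
```

The point of the trylock is that page migration is always free to give up and retry later, so it never has to wait on, or order itself against, the setup and teardown paths.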
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -52,7 +52,8 @@
 struct aio_ring {
 	unsigned	id;	/* kernel internal index number */
 	unsigned	nr;	/* number of io_events */
-	unsigned	head;
+	unsigned	head;	/* Written to by userland or under ring_lock
+				 * mutex by aio_read_events_ring(). */
 	unsigned	tail;
 
 	unsigned	magic;
@@ -243,6 +244,11 @@ static void aio_free_ring(struct kioctx *ctx)
 {
 	int i;
 
+	/* Disconnect the kiotx from the ring file.  This prevents future
+	 * accesses to the kioctx from page migration.
+	 */
+	put_aio_ring_file(ctx);
+
 	for (i = 0; i < ctx->nr_pages; i++) {
 		struct page *page;
 		pr_debug("pid(%d) [%d] page->count=%d\n", current->pid, i,
@@ -254,8 +260,6 @@ static void aio_free_ring(struct kioctx *ctx)
 		put_page(page);
 	}
 
-	put_aio_ring_file(ctx);
-
 	if (ctx->ring_pages && ctx->ring_pages != ctx->internal_pages) {
 		kfree(ctx->ring_pages);
 		ctx->ring_pages = NULL;
@@ -283,29 +287,38 @@ static int aio_migratepage(struct address_space *mapping, struct page *new,
 {
 	struct kioctx *ctx;
 	unsigned long flags;
+	pgoff_t idx;
 	int rc;
 
 	rc = 0;
 
-	/* Make sure the old page hasn't already been changed */
+	/* mapping->private_lock here protects against the kioctx teardown.  */
 	spin_lock(&mapping->private_lock);
 	ctx = mapping->private_data;
-	if (ctx) {
-		pgoff_t idx;
-		spin_lock_irqsave(&ctx->completion_lock, flags);
-		idx = old->index;
-		if (idx < (pgoff_t)ctx->nr_pages) {
-			if (ctx->ring_pages[idx] != old)
-				rc = -EAGAIN;
-		} else
-			rc = -EINVAL;
-		spin_unlock_irqrestore(&ctx->completion_lock, flags);
+	if (!ctx) {
+		rc = -EINVAL;
+		goto out;
+	}
+
+	/* The ring_lock mutex.  The prevents aio_read_events() from writing
+	 * to the ring's head, and prevents page migration from mucking in
+	 * a partially initialized kiotx.
+	 */
+	if (!mutex_trylock(&ctx->ring_lock)) {
+		rc = -EAGAIN;
+		goto out;
+	}
+
+	idx = old->index;
+	if (idx < (pgoff_t)ctx->nr_pages) {
+		/* Make sure the old page hasn't already been changed */
+		if (ctx->ring_pages[idx] != old)
+			rc = -EAGAIN;
 	} else
 		rc = -EINVAL;
-	spin_unlock(&mapping->private_lock);
 
 	if (rc != 0)
-		return rc;
+		goto out_unlock;
 
 	/* Writeback must be complete */
 	BUG_ON(PageWriteback(old));
@@ -314,38 +327,26 @@ static int aio_migratepage(struct address_space *mapping, struct page *new,
 	rc = migrate_page_move_mapping(mapping, new, old, NULL, mode, 1);
 	if (rc != MIGRATEPAGE_SUCCESS) {
 		put_page(new);
-		return rc;
+		goto out_unlock;
 	}
 
-	/* We can potentially race against kioctx teardown here.  Use the
-	 * address_space's private data lock to protect the mapping's
-	 * private_data.
+	/* Take completion_lock to prevent other writes to the ring buffer
+	 * while the old page is copied to the new.  This prevents new
+	 * events from being lost.
 	 */
-	spin_lock(&mapping->private_lock);
-	ctx = mapping->private_data;
-	if (ctx) {
-		pgoff_t idx;
-		spin_lock_irqsave(&ctx->completion_lock, flags);
-		migrate_page_copy(new, old);
-		idx = old->index;
-		if (idx < (pgoff_t)ctx->nr_pages) {
-			/* And only do the move if things haven't changed */
-			if (ctx->ring_pages[idx] == old)
-				ctx->ring_pages[idx] = new;
-			else
-				rc = -EAGAIN;
-		} else
-			rc = -EINVAL;
-		spin_unlock_irqrestore(&ctx->completion_lock, flags);
-	} else
-		rc = -EBUSY;
-	spin_unlock(&mapping->private_lock);
+	spin_lock_irqsave(&ctx->completion_lock, flags);
+	migrate_page_copy(new, old);
+	BUG_ON(ctx->ring_pages[idx] != old);
+	ctx->ring_pages[idx] = new;
+	spin_unlock_irqrestore(&ctx->completion_lock, flags);
 
-	if (rc == MIGRATEPAGE_SUCCESS)
-		put_page(old);
-	else
-		put_page(new);
+	/* The old page is no longer accessible. */
+	put_page(old);
 
+out_unlock:
+	mutex_unlock(&ctx->ring_lock);
+out:
+	spin_unlock(&mapping->private_lock);
 	return rc;
 }
 #endif
@@ -380,7 +381,7 @@ static int aio_setup_ring(struct kioctx *ctx)
 	file = aio_private_file(ctx, nr_pages);
 	if (IS_ERR(file)) {
 		ctx->aio_ring_file = NULL;
-		return -EAGAIN;
+		return -ENOMEM;
 	}
 
 	ctx->aio_ring_file = file;
@@ -415,7 +416,7 @@ static int aio_setup_ring(struct kioctx *ctx)
 
 	if (unlikely(i != nr_pages)) {
 		aio_free_ring(ctx);
-		return -EAGAIN;
+		return -ENOMEM;
 	}
 
 	ctx->mmap_size = nr_pages * PAGE_SIZE;
@@ -429,7 +430,7 @@ static int aio_setup_ring(struct kioctx *ctx)
 	if (IS_ERR((void *)ctx->mmap_base)) {
 		ctx->mmap_size = 0;
 		aio_free_ring(ctx);
-		return -EAGAIN;
+		return -ENOMEM;
 	}
 
 	pr_debug("mmap address: 0x%08lx\n", ctx->mmap_base);
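One user-visible effect of the three aio_setup_ring() hunks above: ring allocation or mapping failures now surface from io_setup(2) as ENOMEM instead of EAGAIN, leaving EAGAIN to mean "over the aio-max-nr limit". A hedged caller-side sketch using raw syscalls; the event count of 65536 is arbitrary and may itself trip the limit, which is fine for the demonstration:

```c
/* Build: cc -std=c11 errno_check.c -o errno_check
 * Hedged sketch only: distinguishes "over the aio-max-nr limit" (EAGAIN)
 * from "ring allocation/mapping failed" (ENOMEM after this patch; it was
 * EAGAIN before). */
#include <errno.h>
#include <linux/aio_abi.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	aio_context_t ctx = 0;

	if (syscall(SYS_io_setup, 65536, &ctx) == 0) {
		puts("io_setup succeeded");
		syscall(SYS_io_destroy, ctx);
	} else if (errno == EAGAIN) {
		puts("over /proc/sys/fs/aio-max-nr; back off and retry");
	} else if (errno == ENOMEM) {
		puts("kernel could not allocate or map the completion ring");
	} else {
		printf("io_setup: %s\n", strerror(errno));
	}
	return 0;
}
```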
@@ -556,6 +557,10 @@ static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm)
 				rcu_read_unlock();
 				spin_unlock(&mm->ioctx_lock);
 
+				/* While kioctx setup is in progress,
+				 * we are protected from page migration
+				 * changes ring_pages by ->ring_lock.
+				 */
 				ring = kmap_atomic(ctx->ring_pages[0]);
 				ring->id = ctx->id;
 				kunmap_atomic(ring);
@@ -640,24 +645,28 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
 
 	ctx->max_reqs = nr_events;
 
-	if (percpu_ref_init(&ctx->users, free_ioctx_users))
-		goto err;
-
-	if (percpu_ref_init(&ctx->reqs, free_ioctx_reqs))
-		goto err;
-
 	spin_lock_init(&ctx->ctx_lock);
 	spin_lock_init(&ctx->completion_lock);
 	mutex_init(&ctx->ring_lock);
+	/* Protect against page migration throughout kiotx setup by keeping
+	 * the ring_lock mutex held until setup is complete. */
+	mutex_lock(&ctx->ring_lock);
 	init_waitqueue_head(&ctx->wait);
 
 	INIT_LIST_HEAD(&ctx->active_reqs);
 
+	if (percpu_ref_init(&ctx->users, free_ioctx_users))
+		goto err;
+
+	if (percpu_ref_init(&ctx->reqs, free_ioctx_reqs))
+		goto err;
+
 	ctx->cpu = alloc_percpu(struct kioctx_cpu);
 	if (!ctx->cpu)
 		goto err;
 
-	if (aio_setup_ring(ctx) < 0)
+	err = aio_setup_ring(ctx);
+	if (err < 0)
 		goto err;
 
 	atomic_set(&ctx->reqs_available, ctx->nr_events - 1);
@@ -683,6 +692,9 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
 	if (err)
 		goto err_cleanup;
 
+	/* Release the ring_lock mutex now that all setup is complete. */
+	mutex_unlock(&ctx->ring_lock);
+
 	pr_debug("allocated ioctx %p[%ld]: mm=%p mask=0x%x\n",
 		 ctx, ctx->user_id, mm, ctx->nr_events);
 	return ctx;
@@ -692,6 +704,7 @@ err_cleanup:
 err_ctx:
 	aio_free_ring(ctx);
 err:
+	mutex_unlock(&ctx->ring_lock);
 	free_percpu(ctx->cpu);
 	free_percpu(ctx->reqs.pcpu_count);
 	free_percpu(ctx->users.pcpu_count);
@@ -1024,6 +1037,7 @@ static long aio_read_events_ring(struct kioctx *ctx,
 
 	mutex_lock(&ctx->ring_lock);
 
+	/* Access to ->ring_pages here is protected by ctx->ring_lock. */
 	ring = kmap_atomic(ctx->ring_pages[0]);
 	head = ring->head;
 	tail = ring->tail;
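For context on the reader that the final hunk annotates: the ring whose pages this patch protects is filled by the kernel and drained either via io_getevents(2), which ends up in aio_read_events_ring() under ring_lock, or directly from userspace by libaio, which is why the new comment on the head field says it is "written to by userland". A minimal, hedged consumer using raw syscalls; the file path and buffer size are arbitrary:

```c
/* Build: cc -std=c11 reap.c -o reap
 * Hedged sketch: submit one asynchronous read with io_submit(2) and reap it
 * with io_getevents(2), using raw syscalls rather than libaio. */
#include <fcntl.h>
#include <linux/aio_abi.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	aio_context_t ctx = 0;
	struct iocb cb;
	struct iocb *cbs[1] = { &cb };
	struct io_event ev;
	static char buf[4096];
	int fd = open("/etc/hostname", O_RDONLY);	/* any readable file */

	if (fd < 0 || syscall(SYS_io_setup, 8, &ctx) != 0) {
		perror("open/io_setup");
		return 1;
	}

	memset(&cb, 0, sizeof(cb));
	cb.aio_lio_opcode = IOCB_CMD_PREAD;
	cb.aio_fildes = fd;
	cb.aio_buf = (unsigned long)buf;
	cb.aio_nbytes = sizeof(buf);

	/* One submission, then block until its completion event is in the ring. */
	if (syscall(SYS_io_submit, ctx, 1, cbs) != 1 ||
	    syscall(SYS_io_getevents, ctx, 1, 1, &ev, NULL) != 1) {
		perror("io_submit/io_getevents");
		return 1;
	}
	printf("read %lld bytes\n", (long long)ev.res);

	syscall(SYS_io_destroy, ctx);
	close(fd);
	return 0;
}
```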
