Diffstat (limited to 'mm/filemap.c')
 -rw-r--r--  mm/filemap.c | 382
 1 file changed, 186 insertions(+), 196 deletions(-)
diff --git a/mm/filemap.c b/mm/filemap.c
index 126d3973b3d1..ef169f37156d 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -39,11 +39,10 @@
 /*
  * FIXME: remove all knowledge of the buffer layer from the core VM
  */
-#include <linux/buffer_head.h> /* for generic_osync_inode */
+#include <linux/buffer_head.h> /* for try_to_free_buffers */
 
 #include <asm/mman.h>
 
-
 /*
  * Shared mappings implemented 30.11.1994. It's not fully working yet,
  * though.
@@ -59,7 +58,7 @@
 /*
  * Lock ordering:
  *
- *  ->i_mmap_lock          (vmtruncate)
+ *  ->i_mmap_lock          (truncate_pagecache)
  *    ->private_lock       (__free_pte->__set_page_dirty_buffers)
  *      ->swap_lock        (exclusive_swap_page, others)
  *        ->mapping->tree_lock
@@ -105,6 +104,10 @@
  *
  *  ->task->proc_lock
  *    ->dcache_lock        (proc_pid_lookup)
+ *
+ * (code doesn't rely on that order, so you could switch it around)
+ * ->tasklist_lock    (memory_failure, collect_procs_ao)
+ *    ->i_mmap_lock
  */
 
 /*
@@ -120,8 +123,9 @@ void __remove_from_page_cache(struct page *page)
        page->mapping = NULL;
        mapping->nrpages--;
        __dec_zone_page_state(page, NR_FILE_PAGES);
+       if (PageSwapBacked(page))
+               __dec_zone_page_state(page, NR_SHMEM);
        BUG_ON(page_mapped(page));
-       mem_cgroup_uncharge_cache_page(page);
 
        /*
         * Some filesystems seem to re-dirty the page even after
@@ -145,6 +149,7 @@ void remove_from_page_cache(struct page *page)
        spin_lock_irq(&mapping->tree_lock);
        __remove_from_page_cache(page);
        spin_unlock_irq(&mapping->tree_lock);
+       mem_cgroup_uncharge_cache_page(page);
 }
 
 static int sync_page(void *word)
@@ -307,68 +312,24 @@ int wait_on_page_writeback_range(struct address_space *mapping,
 }
 
 /**
- * sync_page_range - write and wait on all pages in the passed range
- * @inode: target inode
- * @mapping: target address_space
- * @pos: beginning offset in pages to write
- * @count: number of bytes to write
- *
- * Write and wait upon all the pages in the passed range. This is a "data
- * integrity" operation. It waits upon in-flight writeout before starting and
- * waiting upon new writeout. If there was an IO error, return it.
+ * filemap_fdatawait_range - wait for all under-writeback pages to complete in a given range
+ * @mapping: address space structure to wait for
+ * @start: offset in bytes where the range starts
+ * @end: offset in bytes where the range ends (inclusive)
  *
- * We need to re-take i_mutex during the generic_osync_inode list walk because
- * it is otherwise livelockable.
- */
-int sync_page_range(struct inode *inode, struct address_space *mapping,
-                       loff_t pos, loff_t count)
-{
-       pgoff_t start = pos >> PAGE_CACHE_SHIFT;
-       pgoff_t end = (pos + count - 1) >> PAGE_CACHE_SHIFT;
-       int ret;
-
-       if (!mapping_cap_writeback_dirty(mapping) || !count)
-               return 0;
-       ret = filemap_fdatawrite_range(mapping, pos, pos + count - 1);
-       if (ret == 0) {
-               mutex_lock(&inode->i_mutex);
-               ret = generic_osync_inode(inode, mapping, OSYNC_METADATA);
-               mutex_unlock(&inode->i_mutex);
-       }
-       if (ret == 0)
-               ret = wait_on_page_writeback_range(mapping, start, end);
-       return ret;
-}
-EXPORT_SYMBOL(sync_page_range);
-
-/**
- * sync_page_range_nolock - write & wait on all pages in the passed range without locking
- * @inode: target inode
- * @mapping: target address_space
- * @pos: beginning offset in pages to write
- * @count: number of bytes to write
+ * Walk the list of under-writeback pages of the given address space
+ * in the given range and wait for all of them.
  *
- * Note: Holding i_mutex across sync_page_range_nolock() is not a good idea
- * as it forces O_SYNC writers to different parts of the same file
- * to be serialised right until io completion.
+ * This is just a simple wrapper so that callers don't have to convert offsets
+ * to page indexes themselves
 */
-int sync_page_range_nolock(struct inode *inode, struct address_space *mapping,
-                          loff_t pos, loff_t count)
+int filemap_fdatawait_range(struct address_space *mapping, loff_t start,
+                           loff_t end)
 {
-       pgoff_t start = pos >> PAGE_CACHE_SHIFT;
-       pgoff_t end = (pos + count - 1) >> PAGE_CACHE_SHIFT;
-       int ret;
-
-       if (!mapping_cap_writeback_dirty(mapping) || !count)
-               return 0;
-       ret = filemap_fdatawrite_range(mapping, pos, pos + count - 1);
-       if (ret == 0)
-               ret = generic_osync_inode(inode, mapping, OSYNC_METADATA);
-       if (ret == 0)
-               ret = wait_on_page_writeback_range(mapping, start, end);
-       return ret;
+       return wait_on_page_writeback_range(mapping, start >> PAGE_CACHE_SHIFT,
+                                           end >> PAGE_CACHE_SHIFT);
 }
-EXPORT_SYMBOL(sync_page_range_nolock);
+EXPORT_SYMBOL(filemap_fdatawait_range);
 
 /**
  * filemap_fdatawait - wait for all under-writeback pages to complete
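
Note: with sync_page_range() and sync_page_range_nolock() gone, a data-integrity flush of a byte range is built from the exported helpers instead: start writeback on the range, then wait on it. A minimal sketch of that pattern, not part of the patch (the name example_sync_range is illustrative; the prototypes are the 2.6.32-era ones shown in this diff):

static int example_sync_range(struct address_space *mapping,
                              loff_t pos, loff_t count)
{
        int err;

        /* start writeback for all dirty pages in [pos, pos + count - 1] */
        err = filemap_fdatawrite_range(mapping, pos, pos + count - 1);
        if (err)
                return err;
        /* then wait for that writeback (and any already in flight) */
        return filemap_fdatawait_range(mapping, pos, pos + count - 1);
}

filemap_write_and_wait_range(), exported a few hunks below, packages essentially the same sequence for callers.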
@@ -441,6 +402,7 @@ int filemap_write_and_wait_range(struct address_space *mapping,
        }
        return err;
 }
+EXPORT_SYMBOL(filemap_write_and_wait_range);
 
 /**
  * add_to_page_cache_locked - add a locked page to the pagecache
@@ -475,13 +437,15 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
                if (likely(!error)) {
                        mapping->nrpages++;
                        __inc_zone_page_state(page, NR_FILE_PAGES);
+                       if (PageSwapBacked(page))
+                               __inc_zone_page_state(page, NR_SHMEM);
+                       spin_unlock_irq(&mapping->tree_lock);
                } else {
                        page->mapping = NULL;
+                       spin_unlock_irq(&mapping->tree_lock);
                        mem_cgroup_uncharge_cache_page(page);
                        page_cache_release(page);
                }
-
-               spin_unlock_irq(&mapping->tree_lock);
                radix_tree_preload_end();
        } else
                mem_cgroup_uncharge_cache_page(page);
@@ -513,13 +477,14 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
        }
        return ret;
 }
+EXPORT_SYMBOL_GPL(add_to_page_cache_lru);
 
 #ifdef CONFIG_NUMA
 struct page *__page_cache_alloc(gfp_t gfp)
 {
        if (cpuset_do_page_mem_spread()) {
                int n = cpuset_mem_spread_node();
-               return alloc_pages_node(n, gfp, 0);
+               return alloc_pages_exact_node(n, gfp, 0);
        }
        return alloc_pages(gfp, 0);
 }
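
Note: the difference between the two allocators is only the nid < 0 fallback; cpuset_mem_spread_node() always returns a valid node, so the check is dead weight here. Paraphrased from include/linux/gfp.h of this era (not part of this diff, quoted from memory):

static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask,
                                            unsigned int order)
{
        /* Unknown node is current node */
        if (nid < 0)
                nid = numa_node_id();
        return __alloc_pages(gfp_mask, order, node_zonelist(nid, gfp_mask));
}

static inline struct page *alloc_pages_exact_node(int nid, gfp_t gfp_mask,
                                                  unsigned int order)
{
        VM_BUG_ON(nid < 0 || nid >= MAX_NUMNODES);
        return __alloc_pages(gfp_mask, order, node_zonelist(nid, gfp_mask));
}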
@@ -565,6 +530,24 @@ void wait_on_page_bit(struct page *page, int bit_nr)
 EXPORT_SYMBOL(wait_on_page_bit);
 
 /**
+ * add_page_wait_queue - Add an arbitrary waiter to a page's wait queue
+ * @page: Page defining the wait queue of interest
+ * @waiter: Waiter to add to the queue
+ *
+ * Add an arbitrary @waiter to the wait queue for the nominated @page.
+ */
+void add_page_wait_queue(struct page *page, wait_queue_t *waiter)
+{
+       wait_queue_head_t *q = page_waitqueue(page);
+       unsigned long flags;
+
+       spin_lock_irqsave(&q->lock, flags);
+       __add_wait_queue(q, waiter);
+       spin_unlock_irqrestore(&q->lock, flags);
+}
+EXPORT_SYMBOL_GPL(add_page_wait_queue);
+
+/**
  * unlock_page - unlock a locked page
  * @page: the page
  *
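
Note: add_page_wait_queue() lets code outside the VM be called back when a page's wait queue is woken, e.g. by unlock_page(); the FS-Cache/CacheFiles work is the intended user of this export. A hedged sketch of such a waiter, not part of the patch (the example_* names are illustrative; the callback runs under the waitqueue lock and must not sleep):

static int example_page_waker(wait_queue_t *wait, unsigned mode,
                              int sync, void *key)
{
        /* called for every wake-up on the page's (hashed) wait queue */
        list_del_init(&wait->task_list);        /* one-shot waiter */
        return 1;
}

static void example_monitor_page(struct page *page, wait_queue_t *wait)
{
        init_waitqueue_func_entry(wait, example_page_waker);
        add_page_wait_queue(page, wait);
}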
@@ -627,6 +610,7 @@ int __lock_page_killable(struct page *page)
        return __wait_on_bit_lock(page_waitqueue(page), &wait,
                                        sync_page_killable, TASK_KILLABLE);
 }
+EXPORT_SYMBOL_GPL(__lock_page_killable);
 
 /**
  * __lock_page_nosync - get a lock on the page, without calling sync_page()
@@ -983,9 +967,6 @@ EXPORT_SYMBOL(grab_cache_page_nowait);
 static void shrink_readahead_size_eio(struct file *filp,
                                        struct file_ra_state *ra)
 {
-       if (!ra->ra_pages)
-               return;
-
        ra->ra_pages /= 4;
 }
 
@@ -1369,8 +1350,7 @@ do_readahead(struct address_space *mapping, struct file *filp,
        if (!mapping || !mapping->a_ops || !mapping->a_ops->readpage)
                return -EINVAL;
 
-       force_page_cache_readahead(mapping, filp, index,
-                                       max_sane_readahead(nr));
+       force_page_cache_readahead(mapping, filp, index, nr);
        return 0;
 }
 
@@ -1436,6 +1416,73 @@ static int page_cache_read(struct file *file, pgoff_t offset)
 
 #define MMAP_LOTSAMISS  (100)
 
+/*
+ * Synchronous readahead happens when we don't even find
+ * a page in the page cache at all.
+ */
+static void do_sync_mmap_readahead(struct vm_area_struct *vma,
+                                  struct file_ra_state *ra,
+                                  struct file *file,
+                                  pgoff_t offset)
+{
+       unsigned long ra_pages;
+       struct address_space *mapping = file->f_mapping;
+
+       /* If we don't want any read-ahead, don't bother */
+       if (VM_RandomReadHint(vma))
+               return;
+
+       if (VM_SequentialReadHint(vma) ||
+                       offset - 1 == (ra->prev_pos >> PAGE_CACHE_SHIFT)) {
+               page_cache_sync_readahead(mapping, ra, file, offset,
+                                         ra->ra_pages);
+               return;
+       }
+
+       if (ra->mmap_miss < INT_MAX)
+               ra->mmap_miss++;
+
+       /*
+        * Do we miss much more than hit in this file? If so,
+        * stop bothering with read-ahead. It will only hurt.
+        */
+       if (ra->mmap_miss > MMAP_LOTSAMISS)
+               return;
+
+       /*
+        * mmap read-around
+        */
+       ra_pages = max_sane_readahead(ra->ra_pages);
+       if (ra_pages) {
+               ra->start = max_t(long, 0, offset - ra_pages/2);
+               ra->size = ra_pages;
+               ra->async_size = 0;
+               ra_submit(ra, mapping, file);
+       }
+}
+
+/*
+ * Asynchronous readahead happens when we find the page and PG_readahead,
+ * so we want to possibly extend the readahead further..
+ */
+static void do_async_mmap_readahead(struct vm_area_struct *vma,
+                                   struct file_ra_state *ra,
+                                   struct file *file,
+                                   struct page *page,
+                                   pgoff_t offset)
+{
+       struct address_space *mapping = file->f_mapping;
+
+       /* If we don't want any read-ahead, don't bother */
+       if (VM_RandomReadHint(vma))
+               return;
+       if (ra->mmap_miss > 0)
+               ra->mmap_miss--;
+       if (PageReadahead(page))
+               page_cache_async_readahead(mapping, ra, file,
+                                          page, offset, ra->ra_pages);
+}
+
 /**
  * filemap_fault - read in file data for page fault handling
  * @vma: vma in which the fault was taken
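
Note: the read-around window above centres on the faulting page and clamps at the start of the file: a fault at page offset 100 with a 32-page window gives start = max(0, 100 - 16) = 84, so pages 84..115 are submitted, with async_size = 0 (no readahead marker, since read-around is not a sequential pattern). A userspace illustration of just that arithmetic, not kernel code:

#include <stdio.h>

int main(void)
{
        long offset = 100, ra_pages = 32;
        long start = offset - ra_pages / 2;     /* ra->start */

        if (start < 0)                  /* the max_t(long, 0, ...) clamp */
                start = 0;
        printf("read-around: pages %ld..%ld\n", start, start + ra_pages - 1);
        return 0;
}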
@@ -1455,78 +1502,44 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
        struct address_space *mapping = file->f_mapping;
        struct file_ra_state *ra = &file->f_ra;
        struct inode *inode = mapping->host;
+       pgoff_t offset = vmf->pgoff;
        struct page *page;
        pgoff_t size;
-       int did_readaround = 0;
        int ret = 0;
 
        size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
-       if (vmf->pgoff >= size)
+       if (offset >= size)
                return VM_FAULT_SIGBUS;
 
-       /* If we don't want any read-ahead, don't bother */
-       if (VM_RandomReadHint(vma))
-               goto no_cached_page;
-
        /*
         * Do we have something in the page cache already?
         */
-retry_find:
-       page = find_lock_page(mapping, vmf->pgoff);
-       /*
-        * For sequential accesses, we use the generic readahead logic.
-        */
-       if (VM_SequentialReadHint(vma)) {
-               if (!page) {
-                       page_cache_sync_readahead(mapping, ra, file,
-                                                 vmf->pgoff, 1);
-                       page = find_lock_page(mapping, vmf->pgoff);
-                       if (!page)
-                               goto no_cached_page;
-               }
-               if (PageReadahead(page)) {
-                       page_cache_async_readahead(mapping, ra, file, page,
-                                                  vmf->pgoff, 1);
-               }
-       }
-
-       if (!page) {
-               unsigned long ra_pages;
-
-               ra->mmap_miss++;
-
+       page = find_get_page(mapping, offset);
+       if (likely(page)) {
                /*
-                * Do we miss much more than hit in this file? If so,
-                * stop bothering with read-ahead. It will only hurt.
+                * We found the page, so try async readahead before
+                * waiting for the lock.
                 */
-               if (ra->mmap_miss > MMAP_LOTSAMISS)
-                       goto no_cached_page;
+               do_async_mmap_readahead(vma, ra, file, page, offset);
+               lock_page(page);
 
-               /*
-                * To keep the pgmajfault counter straight, we need to
-                * check did_readaround, as this is an inner loop.
-                */
-               if (!did_readaround) {
-                       ret = VM_FAULT_MAJOR;
-                       count_vm_event(PGMAJFAULT);
-               }
-               did_readaround = 1;
-               ra_pages = max_sane_readahead(file->f_ra.ra_pages);
-               if (ra_pages) {
-                       pgoff_t start = 0;
-
-                       if (vmf->pgoff > ra_pages / 2)
-                               start = vmf->pgoff - ra_pages / 2;
-                       do_page_cache_readahead(mapping, file, start, ra_pages);
+               /* Did it get truncated? */
+               if (unlikely(page->mapping != mapping)) {
+                       unlock_page(page);
+                       put_page(page);
+                       goto no_cached_page;
                }
-               page = find_lock_page(mapping, vmf->pgoff);
+       } else {
+               /* No page in the page cache at all */
+               do_sync_mmap_readahead(vma, ra, file, offset);
+               count_vm_event(PGMAJFAULT);
+               ret = VM_FAULT_MAJOR;
+retry_find:
+               page = find_lock_page(mapping, offset);
                if (!page)
                        goto no_cached_page;
        }
 
-       if (!did_readaround)
-               ra->mmap_miss--;
-
        /*
        * We have a locked page in the page cache, now we need to check
        * that it's up-to-date. If not, it is going to be due to an error.
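
Note: the rewritten fast path takes its page reference with find_get_page() so the (possibly sleeping) readahead I/O is issued before lock_page(); but holding a reference without the lock means truncate can race, hence the page->mapping recheck after the lock is taken. The idiom in isolation, as a hedged sketch (example_get_locked_page is an illustrative name, not a kernel function):

static struct page *example_get_locked_page(struct address_space *mapping,
                                            pgoff_t offset)
{
        struct page *page = find_get_page(mapping, offset);

        if (!page)
                return NULL;
        lock_page(page);
        if (page->mapping != mapping) {         /* truncated meanwhile */
                unlock_page(page);
                page_cache_release(page);
                return NULL;
        }
        return page;
}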
@@ -1534,18 +1547,18 @@ retry_find:
        if (unlikely(!PageUptodate(page)))
                goto page_not_uptodate;
 
-       /* Must recheck i_size under page lock */
+       /*
+        * Found the page and have a reference on it.
+        * We must recheck i_size under page lock.
+        */
        size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
-       if (unlikely(vmf->pgoff >= size)) {
+       if (unlikely(offset >= size)) {
                unlock_page(page);
                page_cache_release(page);
                return VM_FAULT_SIGBUS;
        }
 
-       /*
-        * Found the page and have a reference on it.
-        */
-       ra->prev_pos = (loff_t)page->index << PAGE_CACHE_SHIFT;
+       ra->prev_pos = (loff_t)offset << PAGE_CACHE_SHIFT;
        vmf->page = page;
        return ret | VM_FAULT_LOCKED;
 
@@ -1554,7 +1567,7 @@ no_cached_page:
         * We're only likely to ever get here if MADV_RANDOM is in
         * effect.
         */
-       error = page_cache_read(file, vmf->pgoff);
+       error = page_cache_read(file, offset);
 
        /*
         * The page we want has now been added to the page cache.
@@ -1574,12 +1587,6 @@ no_cached_page:
        return VM_FAULT_SIGBUS;
 
 page_not_uptodate:
-       /* IO error path */
-       if (!did_readaround) {
-               ret = VM_FAULT_MAJOR;
-               count_vm_event(PGMAJFAULT);
-       }
-
        /*
         * Umm, take care of errors if the page isn't up-to-date.
         * Try to re-read it _once_. We do this synchronously,
@@ -1604,7 +1611,7 @@ page_not_uptodate:
 }
 EXPORT_SYMBOL(filemap_fault);
 
-struct vm_operations_struct generic_file_vm_ops = {
+const struct vm_operations_struct generic_file_vm_ops = {
        .fault = filemap_fault,
 };
 
@@ -2123,20 +2130,7 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
                }
                *ppos = end;
        }
-
-       /*
-        * Sync the fs metadata but not the minor inode changes and
-        * of course not the data as we did direct DMA for the IO.
-        * i_mutex is held, which protects generic_osync_inode() from
-        * livelocking. AIO O_DIRECT ops attempt to sync metadata here.
-        */
 out:
-       if ((written >= 0 || written == -EIOCBQUEUED) &&
-           ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
-               int err = generic_osync_inode(inode, mapping, OSYNC_METADATA);
-               if (err < 0)
-                       written = err;
-       }
        return written;
 }
 EXPORT_SYMBOL(generic_file_direct_write);
@@ -2228,6 +2222,7 @@ again:
                pagefault_enable();
                flush_dcache_page(page);
 
+               mark_page_accessed(page);
                status = a_ops->write_end(file, mapping, pos, bytes, copied,
                                                page, fsdata);
                if (unlikely(status < 0))
@@ -2267,8 +2262,6 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
 {
        struct file *file = iocb->ki_filp;
        struct address_space *mapping = file->f_mapping;
-       const struct address_space_operations *a_ops = mapping->a_ops;
-       struct inode *inode = mapping->host;
        ssize_t status;
        struct iov_iter i;
 
@@ -2278,16 +2271,6 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
        if (likely(status >= 0)) {
                written += status;
                *ppos = pos + status;
-
-               /*
-                * For now, when the user asks for O_SYNC, we'll actually give
-                * O_DSYNC
-                */
-               if (unlikely((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
-                       if (!a_ops->writepage || !is_sync_kiocb(iocb))
-                               status = generic_osync_inode(inode, mapping,
-                                               OSYNC_METADATA|OSYNC_DATA);
-               }
        }
 
        /*
@@ -2303,9 +2286,27 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
 }
 EXPORT_SYMBOL(generic_file_buffered_write);
 
-static ssize_t
-__generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov,
-                               unsigned long nr_segs, loff_t *ppos)
+/**
+ * __generic_file_aio_write - write data to a file
+ * @iocb: IO state structure (file, offset, etc.)
+ * @iov: vector with data to write
+ * @nr_segs: number of segments in the vector
+ * @ppos: position where to write
+ *
+ * This function does all the work needed for actually writing data to a
+ * file. It does all basic checks, removes SUID from the file, updates
+ * modification times and calls proper subroutines depending on whether we
+ * do direct IO or a standard buffered write.
+ *
+ * It expects i_mutex to be grabbed unless we work on a block device or similar
+ * object which does not need locking at all.
+ *
+ * This function does *not* take care of syncing data in case of O_SYNC write.
+ * A caller has to handle it. This is mainly due to the fact that we want to
+ * avoid syncing under i_mutex.
+ */
+ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
+                                unsigned long nr_segs, loff_t *ppos)
 {
        struct file *file = iocb->ki_filp;
        struct address_space * mapping = file->f_mapping;
@@ -2402,51 +2403,37 @@ out:
        current->backing_dev_info = NULL;
        return written ? written : err;
 }
+EXPORT_SYMBOL(__generic_file_aio_write);
 
-ssize_t generic_file_aio_write_nolock(struct kiocb *iocb,
-               const struct iovec *iov, unsigned long nr_segs, loff_t pos)
-{
-       struct file *file = iocb->ki_filp;
-       struct address_space *mapping = file->f_mapping;
-       struct inode *inode = mapping->host;
-       ssize_t ret;
-
-       BUG_ON(iocb->ki_pos != pos);
-
-       ret = __generic_file_aio_write_nolock(iocb, iov, nr_segs,
-                       &iocb->ki_pos);
-
-       if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
-               ssize_t err;
-
-               err = sync_page_range_nolock(inode, mapping, pos, ret);
-               if (err < 0)
-                       ret = err;
-       }
-       return ret;
-}
-EXPORT_SYMBOL(generic_file_aio_write_nolock);
-
+/**
+ * generic_file_aio_write - write data to a file
+ * @iocb: IO state structure
+ * @iov: vector with data to write
+ * @nr_segs: number of segments in the vector
+ * @pos: position in file where to write
+ *
+ * This is a wrapper around __generic_file_aio_write() to be used by most
+ * filesystems. It takes care of syncing the file in case of O_SYNC file
+ * and acquires i_mutex as needed.
+ */
 ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
                unsigned long nr_segs, loff_t pos)
 {
        struct file *file = iocb->ki_filp;
-       struct address_space *mapping = file->f_mapping;
-       struct inode *inode = mapping->host;
+       struct inode *inode = file->f_mapping->host;
        ssize_t ret;
 
        BUG_ON(iocb->ki_pos != pos);
 
        mutex_lock(&inode->i_mutex);
-       ret = __generic_file_aio_write_nolock(iocb, iov, nr_segs,
-                       &iocb->ki_pos);
+       ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
        mutex_unlock(&inode->i_mutex);
 
-       if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
+       if (ret > 0 || ret == -EIOCBQUEUED) {
                ssize_t err;
 
-               err = sync_page_range(inode, mapping, pos, ret);
-               if (err < 0)
+               err = generic_write_sync(file, pos, ret);
+               if (err < 0 && ret > 0)
                        ret = err;
        }
        return ret;
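
Note: the __generic_file_aio_write()/generic_write_sync() split exists so the O_SYNC flush can run without i_mutex held. A filesystem with its own locking can now follow the same shape as generic_file_aio_write() above; a hedged sketch (the myfs_* name is illustrative; generic_write_sync() returns 0 unless the file is O_SYNC or the inode is IS_SYNC):

static ssize_t myfs_aio_write(struct kiocb *iocb, const struct iovec *iov,
                              unsigned long nr_segs, loff_t pos)
{
        struct inode *inode = iocb->ki_filp->f_mapping->host;
        ssize_t ret;

        mutex_lock(&inode->i_mutex);
        /* heavy lifting under i_mutex: checks, SUID removal, the copy */
        ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
        mutex_unlock(&inode->i_mutex);

        /* sync after dropping i_mutex; a no-op for non-sync writes */
        if (ret > 0 || ret == -EIOCBQUEUED) {
                ssize_t err = generic_write_sync(iocb->ki_filp, pos, ret);
                if (err < 0 && ret > 0)
                        ret = err;
        }
        return ret;
}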
@@ -2463,6 +2450,9 @@ EXPORT_SYMBOL(generic_file_aio_write);
  * (presumably at page->private). If the release was successful, return `1'.
  * Otherwise return zero.
  *
+ * This may also be called if PG_fscache is set on a page, indicating that the
+ * page is known to the local caching routines.
+ *
  * The @gfp_mask argument specifies whether I/O may be performed to release
  * this page (__GFP_IO), and whether the call may block (__GFP_WAIT & __GFP_FS).
  *
