Diffstat (limited to 'mm/filemap.c')
-rw-r--r--  mm/filemap.c  382
1 file changed, 186 insertions(+), 196 deletions(-)
diff --git a/mm/filemap.c b/mm/filemap.c
index 126d3973b3d1..ef169f37156d 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -39,11 +39,10 @@
39/* 39/*
40 * FIXME: remove all knowledge of the buffer layer from the core VM 40 * FIXME: remove all knowledge of the buffer layer from the core VM
41 */ 41 */
42#include <linux/buffer_head.h> /* for generic_osync_inode */ 42#include <linux/buffer_head.h> /* for try_to_free_buffers */
43 43
44#include <asm/mman.h> 44#include <asm/mman.h>
45 45
46
47/* 46/*
48 * Shared mappings implemented 30.11.1994. It's not fully working yet, 47 * Shared mappings implemented 30.11.1994. It's not fully working yet,
49 * though. 48 * though.
@@ -59,7 +58,7 @@
59/* 58/*
60 * Lock ordering: 59 * Lock ordering:
61 * 60 *
62 * ->i_mmap_lock (vmtruncate) 61 * ->i_mmap_lock (truncate_pagecache)
63 * ->private_lock (__free_pte->__set_page_dirty_buffers) 62 * ->private_lock (__free_pte->__set_page_dirty_buffers)
64 * ->swap_lock (exclusive_swap_page, others) 63 * ->swap_lock (exclusive_swap_page, others)
65 * ->mapping->tree_lock 64 * ->mapping->tree_lock
@@ -105,6 +104,10 @@
105 * 104 *
106 * ->task->proc_lock 105 * ->task->proc_lock
107 * ->dcache_lock (proc_pid_lookup) 106 * ->dcache_lock (proc_pid_lookup)
107 *
108 * (code doesn't rely on that order, so you could switch it around)
109 * ->tasklist_lock (memory_failure, collect_procs_ao)
110 * ->i_mmap_lock
108 */ 111 */
109 112
110/* 113/*
@@ -120,8 +123,9 @@ void __remove_from_page_cache(struct page *page)
120 page->mapping = NULL; 123 page->mapping = NULL;
121 mapping->nrpages--; 124 mapping->nrpages--;
122 __dec_zone_page_state(page, NR_FILE_PAGES); 125 __dec_zone_page_state(page, NR_FILE_PAGES);
126 if (PageSwapBacked(page))
127 __dec_zone_page_state(page, NR_SHMEM);
123 BUG_ON(page_mapped(page)); 128 BUG_ON(page_mapped(page));
124 mem_cgroup_uncharge_cache_page(page);
125 129
126 /* 130 /*
127 * Some filesystems seem to re-dirty the page even after 131 * Some filesystems seem to re-dirty the page even after
@@ -145,6 +149,7 @@ void remove_from_page_cache(struct page *page)
145 spin_lock_irq(&mapping->tree_lock); 149 spin_lock_irq(&mapping->tree_lock);
146 __remove_from_page_cache(page); 150 __remove_from_page_cache(page);
147 spin_unlock_irq(&mapping->tree_lock); 151 spin_unlock_irq(&mapping->tree_lock);
152 mem_cgroup_uncharge_cache_page(page);
148} 153}
149 154
150static int sync_page(void *word) 155static int sync_page(void *word)
@@ -307,68 +312,24 @@ int wait_on_page_writeback_range(struct address_space *mapping,
307} 312}
308 313
309/** 314/**
310 * sync_page_range - write and wait on all pages in the passed range 315 * filemap_fdatawait_range - wait for all under-writeback pages to complete in a given range
311 * @inode: target inode 316 * @mapping: address space structure to wait for
312 * @mapping: target address_space 317 * @start: offset in bytes where the range starts
313 * @pos: beginning offset in pages to write 318 * @end: offset in bytes where the range ends (inclusive)
314 * @count: number of bytes to write
315 *
316 * Write and wait upon all the pages in the passed range. This is a "data
317 * integrity" operation. It waits upon in-flight writeout before starting and
318 * waiting upon new writeout. If there was an IO error, return it.
319 * 319 *
320 * We need to re-take i_mutex during the generic_osync_inode list walk because 320 * Walk the list of under-writeback pages of the given address space
321 * it is otherwise livelockable. 321 * in the given range and wait for all of them.
322 */
323int sync_page_range(struct inode *inode, struct address_space *mapping,
324 loff_t pos, loff_t count)
325{
326 pgoff_t start = pos >> PAGE_CACHE_SHIFT;
327 pgoff_t end = (pos + count - 1) >> PAGE_CACHE_SHIFT;
328 int ret;
329
330 if (!mapping_cap_writeback_dirty(mapping) || !count)
331 return 0;
332 ret = filemap_fdatawrite_range(mapping, pos, pos + count - 1);
333 if (ret == 0) {
334 mutex_lock(&inode->i_mutex);
335 ret = generic_osync_inode(inode, mapping, OSYNC_METADATA);
336 mutex_unlock(&inode->i_mutex);
337 }
338 if (ret == 0)
339 ret = wait_on_page_writeback_range(mapping, start, end);
340 return ret;
341}
342EXPORT_SYMBOL(sync_page_range);
343
344/**
345 * sync_page_range_nolock - write & wait on all pages in the passed range without locking
346 * @inode: target inode
347 * @mapping: target address_space
348 * @pos: beginning offset in pages to write
349 * @count: number of bytes to write
350 * 322 *
351 * Note: Holding i_mutex across sync_page_range_nolock() is not a good idea 323 * This is just a simple wrapper so that callers don't have to convert offsets
352 * as it forces O_SYNC writers to different parts of the same file 324 * to page indexes themselves
353 * to be serialised right until io completion.
354 */ 325 */
355int sync_page_range_nolock(struct inode *inode, struct address_space *mapping, 326int filemap_fdatawait_range(struct address_space *mapping, loff_t start,
356 loff_t pos, loff_t count) 327 loff_t end)
357{ 328{
358 pgoff_t start = pos >> PAGE_CACHE_SHIFT; 329 return wait_on_page_writeback_range(mapping, start >> PAGE_CACHE_SHIFT,
359 pgoff_t end = (pos + count - 1) >> PAGE_CACHE_SHIFT; 330 end >> PAGE_CACHE_SHIFT);
360 int ret;
361
362 if (!mapping_cap_writeback_dirty(mapping) || !count)
363 return 0;
364 ret = filemap_fdatawrite_range(mapping, pos, pos + count - 1);
365 if (ret == 0)
366 ret = generic_osync_inode(inode, mapping, OSYNC_METADATA);
367 if (ret == 0)
368 ret = wait_on_page_writeback_range(mapping, start, end);
369 return ret;
370} 331}
371EXPORT_SYMBOL(sync_page_range_nolock); 332EXPORT_SYMBOL(filemap_fdatawait_range);
372 333
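The hunk above drops the sync_page_range()/generic_osync_inode() pattern in favour of filemap_fdatawait_range(), leaving O_SYNC handling to callers. As a minimal sketch (not part of this patch; the function name is illustrative), a data-integrity flush of one byte range now looks like:

static int example_flush_range(struct address_space *mapping,
			       loff_t start, loff_t end)
{
	int err;

	/* Start writeback for the dirty pages in [start, end] (end inclusive). */
	err = filemap_fdatawrite_range(mapping, start, end);
	if (err)
		return err;

	/* Wait for that writeback, and any already in flight, to complete. */
	return filemap_fdatawait_range(mapping, start, end);
}
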
373/** 334/**
374 * filemap_fdatawait - wait for all under-writeback pages to complete 335 * filemap_fdatawait - wait for all under-writeback pages to complete
@@ -441,6 +402,7 @@ int filemap_write_and_wait_range(struct address_space *mapping,
441 } 402 }
442 return err; 403 return err;
443} 404}
405EXPORT_SYMBOL(filemap_write_and_wait_range);
444 406
445/** 407/**
446 * add_to_page_cache_locked - add a locked page to the pagecache 408 * add_to_page_cache_locked - add a locked page to the pagecache
@@ -475,13 +437,15 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
475 if (likely(!error)) { 437 if (likely(!error)) {
476 mapping->nrpages++; 438 mapping->nrpages++;
477 __inc_zone_page_state(page, NR_FILE_PAGES); 439 __inc_zone_page_state(page, NR_FILE_PAGES);
440 if (PageSwapBacked(page))
441 __inc_zone_page_state(page, NR_SHMEM);
442 spin_unlock_irq(&mapping->tree_lock);
478 } else { 443 } else {
479 page->mapping = NULL; 444 page->mapping = NULL;
445 spin_unlock_irq(&mapping->tree_lock);
480 mem_cgroup_uncharge_cache_page(page); 446 mem_cgroup_uncharge_cache_page(page);
481 page_cache_release(page); 447 page_cache_release(page);
482 } 448 }
483
484 spin_unlock_irq(&mapping->tree_lock);
485 radix_tree_preload_end(); 449 radix_tree_preload_end();
486 } else 450 } else
487 mem_cgroup_uncharge_cache_page(page); 451 mem_cgroup_uncharge_cache_page(page);
@@ -513,13 +477,14 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
513 } 477 }
514 return ret; 478 return ret;
515} 479}
480EXPORT_SYMBOL_GPL(add_to_page_cache_lru);
516 481
517#ifdef CONFIG_NUMA 482#ifdef CONFIG_NUMA
518struct page *__page_cache_alloc(gfp_t gfp) 483struct page *__page_cache_alloc(gfp_t gfp)
519{ 484{
520 if (cpuset_do_page_mem_spread()) { 485 if (cpuset_do_page_mem_spread()) {
521 int n = cpuset_mem_spread_node(); 486 int n = cpuset_mem_spread_node();
522 return alloc_pages_node(n, gfp, 0); 487 return alloc_pages_exact_node(n, gfp, 0);
523 } 488 }
524 return alloc_pages(gfp, 0); 489 return alloc_pages(gfp, 0);
525} 490}
@@ -565,6 +530,24 @@ void wait_on_page_bit(struct page *page, int bit_nr)
565EXPORT_SYMBOL(wait_on_page_bit); 530EXPORT_SYMBOL(wait_on_page_bit);
566 531
567/** 532/**
533 * add_page_wait_queue - Add an arbitrary waiter to a page's wait queue
534 * @page: Page defining the wait queue of interest
535 * @waiter: Waiter to add to the queue
536 *
537 * Add an arbitrary @waiter to the wait queue for the nominated @page.
538 */
539void add_page_wait_queue(struct page *page, wait_queue_t *waiter)
540{
541 wait_queue_head_t *q = page_waitqueue(page);
542 unsigned long flags;
543
544 spin_lock_irqsave(&q->lock, flags);
545 __add_wait_queue(q, waiter);
546 spin_unlock_irqrestore(&q->lock, flags);
547}
548EXPORT_SYMBOL_GPL(add_page_wait_queue);
549
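The newly exported add_page_wait_queue() lets code outside filemap.c attach its own waiter to a page's wait queue instead of sleeping in lock_page(). A hypothetical sketch of such a caller, using a custom wake function (all names below are made up for illustration; the callback fires whenever the page's wait queue is woken, e.g. from unlock_page()):

#include <linux/wait.h>
#include <linux/pagemap.h>

struct page_monitor {
	wait_queue_t wait;		/* embedded waiter */
	struct page *page;
};

/* Runs under the wait-queue spinlock, so it must stay short and atomic. */
static int page_monitor_wake(wait_queue_t *wait, unsigned mode, int sync,
			     void *key)
{
	struct page_monitor *pm = container_of(wait, struct page_monitor, wait);

	list_del_init(&wait->task_list);
	/* ... hand pm->page off to whatever processes the completion ... */
	return 0;
}

static void page_monitor_arm(struct page_monitor *pm, struct page *page)
{
	pm->page = page;
	init_waitqueue_func_entry(&pm->wait, page_monitor_wake);
	add_page_wait_queue(page, &pm->wait);
}
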
550/**
568 * unlock_page - unlock a locked page 551 * unlock_page - unlock a locked page
569 * @page: the page 552 * @page: the page
570 * 553 *
@@ -627,6 +610,7 @@ int __lock_page_killable(struct page *page)
627 return __wait_on_bit_lock(page_waitqueue(page), &wait, 610 return __wait_on_bit_lock(page_waitqueue(page), &wait,
628 sync_page_killable, TASK_KILLABLE); 611 sync_page_killable, TASK_KILLABLE);
629} 612}
613EXPORT_SYMBOL_GPL(__lock_page_killable);
630 614
631/** 615/**
632 * __lock_page_nosync - get a lock on the page, without calling sync_page() 616 * __lock_page_nosync - get a lock on the page, without calling sync_page()
@@ -983,9 +967,6 @@ EXPORT_SYMBOL(grab_cache_page_nowait);
983static void shrink_readahead_size_eio(struct file *filp, 967static void shrink_readahead_size_eio(struct file *filp,
984 struct file_ra_state *ra) 968 struct file_ra_state *ra)
985{ 969{
986 if (!ra->ra_pages)
987 return;
988
989 ra->ra_pages /= 4; 970 ra->ra_pages /= 4;
990} 971}
991 972
@@ -1369,8 +1350,7 @@ do_readahead(struct address_space *mapping, struct file *filp,
1369 if (!mapping || !mapping->a_ops || !mapping->a_ops->readpage) 1350 if (!mapping || !mapping->a_ops || !mapping->a_ops->readpage)
1370 return -EINVAL; 1351 return -EINVAL;
1371 1352
1372 force_page_cache_readahead(mapping, filp, index, 1353 force_page_cache_readahead(mapping, filp, index, nr);
1373 max_sane_readahead(nr));
1374 return 0; 1354 return 0;
1375} 1355}
1376 1356
@@ -1436,6 +1416,73 @@ static int page_cache_read(struct file *file, pgoff_t offset)
1436 1416
1437#define MMAP_LOTSAMISS (100) 1417#define MMAP_LOTSAMISS (100)
1438 1418
1419/*
1420 * Synchronous readahead happens when we don't even find
1421 * a page in the page cache at all.
1422 */
1423static void do_sync_mmap_readahead(struct vm_area_struct *vma,
1424 struct file_ra_state *ra,
1425 struct file *file,
1426 pgoff_t offset)
1427{
1428 unsigned long ra_pages;
1429 struct address_space *mapping = file->f_mapping;
1430
1431 /* If we don't want any read-ahead, don't bother */
1432 if (VM_RandomReadHint(vma))
1433 return;
1434
1435 if (VM_SequentialReadHint(vma) ||
1436 offset - 1 == (ra->prev_pos >> PAGE_CACHE_SHIFT)) {
1437 page_cache_sync_readahead(mapping, ra, file, offset,
1438 ra->ra_pages);
1439 return;
1440 }
1441
1442 if (ra->mmap_miss < INT_MAX)
1443 ra->mmap_miss++;
1444
1445 /*
1446 * Do we miss much more than hit in this file? If so,
1447 * stop bothering with read-ahead. It will only hurt.
1448 */
1449 if (ra->mmap_miss > MMAP_LOTSAMISS)
1450 return;
1451
1452 /*
1453 * mmap read-around
1454 */
1455 ra_pages = max_sane_readahead(ra->ra_pages);
1456 if (ra_pages) {
1457 ra->start = max_t(long, 0, offset - ra_pages/2);
1458 ra->size = ra_pages;
1459 ra->async_size = 0;
1460 ra_submit(ra, mapping, file);
1461 }
1462}
1463
1464/*
1465 * Asynchronous readahead happens when we find the page and PG_readahead,
1466 * so we want to possibly extend the readahead further..
1467 */
1468static void do_async_mmap_readahead(struct vm_area_struct *vma,
1469 struct file_ra_state *ra,
1470 struct file *file,
1471 struct page *page,
1472 pgoff_t offset)
1473{
1474 struct address_space *mapping = file->f_mapping;
1475
1476 /* If we don't want any read-ahead, don't bother */
1477 if (VM_RandomReadHint(vma))
1478 return;
1479 if (ra->mmap_miss > 0)
1480 ra->mmap_miss--;
1481 if (PageReadahead(page))
1482 page_cache_async_readahead(mapping, ra, file,
1483 page, offset, ra->ra_pages);
1484}
1485
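To make the read-around arithmetic above concrete (numbers purely illustrative): with ra->ra_pages = 32 and a fault at offset 100, do_sync_mmap_readahead() submits a window of ra->start = 100 - 32/2 = 84, ra->size = 32 and ra->async_size = 0, i.e. 32 pages roughly centred on the faulting page; a fault near the start of the file, say offset 5, clamps to ra->start = 0 with the same size.
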
1439/** 1486/**
1440 * filemap_fault - read in file data for page fault handling 1487 * filemap_fault - read in file data for page fault handling
1441 * @vma: vma in which the fault was taken 1488 * @vma: vma in which the fault was taken
@@ -1455,78 +1502,44 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1455 struct address_space *mapping = file->f_mapping; 1502 struct address_space *mapping = file->f_mapping;
1456 struct file_ra_state *ra = &file->f_ra; 1503 struct file_ra_state *ra = &file->f_ra;
1457 struct inode *inode = mapping->host; 1504 struct inode *inode = mapping->host;
1505 pgoff_t offset = vmf->pgoff;
1458 struct page *page; 1506 struct page *page;
1459 pgoff_t size; 1507 pgoff_t size;
1460 int did_readaround = 0;
1461 int ret = 0; 1508 int ret = 0;
1462 1509
1463 size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 1510 size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1464 if (vmf->pgoff >= size) 1511 if (offset >= size)
1465 return VM_FAULT_SIGBUS; 1512 return VM_FAULT_SIGBUS;
1466 1513
1467 /* If we don't want any read-ahead, don't bother */
1468 if (VM_RandomReadHint(vma))
1469 goto no_cached_page;
1470
1471 /* 1514 /*
1472 * Do we have something in the page cache already? 1515 * Do we have something in the page cache already?
1473 */ 1516 */
1474retry_find: 1517 page = find_get_page(mapping, offset);
1475 page = find_lock_page(mapping, vmf->pgoff); 1518 if (likely(page)) {
1476 /*
1477 * For sequential accesses, we use the generic readahead logic.
1478 */
1479 if (VM_SequentialReadHint(vma)) {
1480 if (!page) {
1481 page_cache_sync_readahead(mapping, ra, file,
1482 vmf->pgoff, 1);
1483 page = find_lock_page(mapping, vmf->pgoff);
1484 if (!page)
1485 goto no_cached_page;
1486 }
1487 if (PageReadahead(page)) {
1488 page_cache_async_readahead(mapping, ra, file, page,
1489 vmf->pgoff, 1);
1490 }
1491 }
1492
1493 if (!page) {
1494 unsigned long ra_pages;
1495
1496 ra->mmap_miss++;
1497
1498 /* 1519 /*
1499 * Do we miss much more than hit in this file? If so, 1520 * We found the page, so try async readahead before
1500 * stop bothering with read-ahead. It will only hurt. 1521 * waiting for the lock.
1501 */ 1522 */
1502 if (ra->mmap_miss > MMAP_LOTSAMISS) 1523 do_async_mmap_readahead(vma, ra, file, page, offset);
1503 goto no_cached_page; 1524 lock_page(page);
1504 1525
1505 /* 1526 /* Did it get truncated? */
1506 * To keep the pgmajfault counter straight, we need to 1527 if (unlikely(page->mapping != mapping)) {
1507 * check did_readaround, as this is an inner loop. 1528 unlock_page(page);
1508 */ 1529 put_page(page);
1509 if (!did_readaround) { 1530 goto no_cached_page;
1510 ret = VM_FAULT_MAJOR;
1511 count_vm_event(PGMAJFAULT);
1512 }
1513 did_readaround = 1;
1514 ra_pages = max_sane_readahead(file->f_ra.ra_pages);
1515 if (ra_pages) {
1516 pgoff_t start = 0;
1517
1518 if (vmf->pgoff > ra_pages / 2)
1519 start = vmf->pgoff - ra_pages / 2;
1520 do_page_cache_readahead(mapping, file, start, ra_pages);
1521 } 1531 }
1522 page = find_lock_page(mapping, vmf->pgoff); 1532 } else {
1533 /* No page in the page cache at all */
1534 do_sync_mmap_readahead(vma, ra, file, offset);
1535 count_vm_event(PGMAJFAULT);
1536 ret = VM_FAULT_MAJOR;
1537retry_find:
1538 page = find_lock_page(mapping, offset);
1523 if (!page) 1539 if (!page)
1524 goto no_cached_page; 1540 goto no_cached_page;
1525 } 1541 }
1526 1542
1527 if (!did_readaround)
1528 ra->mmap_miss--;
1529
1530 /* 1543 /*
1531 * We have a locked page in the page cache, now we need to check 1544 * We have a locked page in the page cache, now we need to check
1532 * that it's up-to-date. If not, it is going to be due to an error. 1545 * that it's up-to-date. If not, it is going to be due to an error.
@@ -1534,18 +1547,18 @@ retry_find:
1534 if (unlikely(!PageUptodate(page))) 1547 if (unlikely(!PageUptodate(page)))
1535 goto page_not_uptodate; 1548 goto page_not_uptodate;
1536 1549
1537 /* Must recheck i_size under page lock */ 1550 /*
1551 * Found the page and have a reference on it.
1552 * We must recheck i_size under page lock.
1553 */
1538 size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 1554 size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1539 if (unlikely(vmf->pgoff >= size)) { 1555 if (unlikely(offset >= size)) {
1540 unlock_page(page); 1556 unlock_page(page);
1541 page_cache_release(page); 1557 page_cache_release(page);
1542 return VM_FAULT_SIGBUS; 1558 return VM_FAULT_SIGBUS;
1543 } 1559 }
1544 1560
1545 /* 1561 ra->prev_pos = (loff_t)offset << PAGE_CACHE_SHIFT;
1546 * Found the page and have a reference on it.
1547 */
1548 ra->prev_pos = (loff_t)page->index << PAGE_CACHE_SHIFT;
1549 vmf->page = page; 1562 vmf->page = page;
1550 return ret | VM_FAULT_LOCKED; 1563 return ret | VM_FAULT_LOCKED;
1551 1564
@@ -1554,7 +1567,7 @@ no_cached_page:
1554 * We're only likely to ever get here if MADV_RANDOM is in 1567 * We're only likely to ever get here if MADV_RANDOM is in
1555 * effect. 1568 * effect.
1556 */ 1569 */
1557 error = page_cache_read(file, vmf->pgoff); 1570 error = page_cache_read(file, offset);
1558 1571
1559 /* 1572 /*
1560 * The page we want has now been added to the page cache. 1573 * The page we want has now been added to the page cache.
@@ -1574,12 +1587,6 @@ no_cached_page:
1574 return VM_FAULT_SIGBUS; 1587 return VM_FAULT_SIGBUS;
1575 1588
1576page_not_uptodate: 1589page_not_uptodate:
1577 /* IO error path */
1578 if (!did_readaround) {
1579 ret = VM_FAULT_MAJOR;
1580 count_vm_event(PGMAJFAULT);
1581 }
1582
1583 /* 1590 /*
1584 * Umm, take care of errors if the page isn't up-to-date. 1591 * Umm, take care of errors if the page isn't up-to-date.
1585 * Try to re-read it _once_. We do this synchronously, 1592 * Try to re-read it _once_. We do this synchronously,
@@ -1604,7 +1611,7 @@ page_not_uptodate:
1604} 1611}
1605EXPORT_SYMBOL(filemap_fault); 1612EXPORT_SYMBOL(filemap_fault);
1606 1613
1607struct vm_operations_struct generic_file_vm_ops = { 1614const struct vm_operations_struct generic_file_vm_ops = {
1608 .fault = filemap_fault, 1615 .fault = filemap_fault,
1609}; 1616};
1610 1617
@@ -2123,20 +2130,7 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
2123 } 2130 }
2124 *ppos = end; 2131 *ppos = end;
2125 } 2132 }
2126
2127 /*
2128 * Sync the fs metadata but not the minor inode changes and
2129 * of course not the data as we did direct DMA for the IO.
2130 * i_mutex is held, which protects generic_osync_inode() from
2131 * livelocking. AIO O_DIRECT ops attempt to sync metadata here.
2132 */
2133out: 2133out:
2134 if ((written >= 0 || written == -EIOCBQUEUED) &&
2135 ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
2136 int err = generic_osync_inode(inode, mapping, OSYNC_METADATA);
2137 if (err < 0)
2138 written = err;
2139 }
2140 return written; 2134 return written;
2141} 2135}
2142EXPORT_SYMBOL(generic_file_direct_write); 2136EXPORT_SYMBOL(generic_file_direct_write);
@@ -2228,6 +2222,7 @@ again:
2228 pagefault_enable(); 2222 pagefault_enable();
2229 flush_dcache_page(page); 2223 flush_dcache_page(page);
2230 2224
2225 mark_page_accessed(page);
2231 status = a_ops->write_end(file, mapping, pos, bytes, copied, 2226 status = a_ops->write_end(file, mapping, pos, bytes, copied,
2232 page, fsdata); 2227 page, fsdata);
2233 if (unlikely(status < 0)) 2228 if (unlikely(status < 0))
@@ -2267,8 +2262,6 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
2267{ 2262{
2268 struct file *file = iocb->ki_filp; 2263 struct file *file = iocb->ki_filp;
2269 struct address_space *mapping = file->f_mapping; 2264 struct address_space *mapping = file->f_mapping;
2270 const struct address_space_operations *a_ops = mapping->a_ops;
2271 struct inode *inode = mapping->host;
2272 ssize_t status; 2265 ssize_t status;
2273 struct iov_iter i; 2266 struct iov_iter i;
2274 2267
@@ -2278,16 +2271,6 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
2278 if (likely(status >= 0)) { 2271 if (likely(status >= 0)) {
2279 written += status; 2272 written += status;
2280 *ppos = pos + status; 2273 *ppos = pos + status;
2281
2282 /*
2283 * For now, when the user asks for O_SYNC, we'll actually give
2284 * O_DSYNC
2285 */
2286 if (unlikely((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
2287 if (!a_ops->writepage || !is_sync_kiocb(iocb))
2288 status = generic_osync_inode(inode, mapping,
2289 OSYNC_METADATA|OSYNC_DATA);
2290 }
2291 } 2274 }
2292 2275
2293 /* 2276 /*
@@ -2303,9 +2286,27 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
2303} 2286}
2304EXPORT_SYMBOL(generic_file_buffered_write); 2287EXPORT_SYMBOL(generic_file_buffered_write);
2305 2288
2306static ssize_t 2289/**
2307__generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov, 2290 * __generic_file_aio_write - write data to a file
2308 unsigned long nr_segs, loff_t *ppos) 2291 * @iocb: IO state structure (file, offset, etc.)
2292 * @iov: vector with data to write
2293 * @nr_segs: number of segments in the vector
2294 * @ppos: position where to write
2295 *
2296 * This function does all the work needed for actually writing data to a
2297 * file. It does all basic checks, removes SUID from the file, updates
2298 * modification times and calls proper subroutines depending on whether we
2299 * do direct IO or a standard buffered write.
2300 *
2301 * It expects i_mutex to be grabbed unless we work on a block device or similar
2302 * object which does not need locking at all.
2303 *
2304 * This function does *not* take care of syncing data in case of O_SYNC write.
2305 * A caller has to handle it. This is mainly due to the fact that we want to
2306 * avoid syncing under i_mutex.
2307 */
2308ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
2309 unsigned long nr_segs, loff_t *ppos)
2309{ 2310{
2310 struct file *file = iocb->ki_filp; 2311 struct file *file = iocb->ki_filp;
2311 struct address_space * mapping = file->f_mapping; 2312 struct address_space * mapping = file->f_mapping;
@@ -2402,51 +2403,37 @@ out:
2402 current->backing_dev_info = NULL; 2403 current->backing_dev_info = NULL;
2403 return written ? written : err; 2404 return written ? written : err;
2404} 2405}
2406EXPORT_SYMBOL(__generic_file_aio_write);
2405 2407
2406ssize_t generic_file_aio_write_nolock(struct kiocb *iocb, 2408/**
2407 const struct iovec *iov, unsigned long nr_segs, loff_t pos) 2409 * generic_file_aio_write - write data to a file
2408{ 2410 * @iocb: IO state structure
2409 struct file *file = iocb->ki_filp; 2411 * @iov: vector with data to write
2410 struct address_space *mapping = file->f_mapping; 2412 * @nr_segs: number of segments in the vector
2411 struct inode *inode = mapping->host; 2413 * @pos: position in file where to write
2412 ssize_t ret; 2414 *
2413 2415 * This is a wrapper around __generic_file_aio_write() to be used by most
2414 BUG_ON(iocb->ki_pos != pos); 2416 * filesystems. It takes care of syncing the file in case of O_SYNC file
2415 2417 * and acquires i_mutex as needed.
2416 ret = __generic_file_aio_write_nolock(iocb, iov, nr_segs, 2418 */
2417 &iocb->ki_pos);
2418
2419 if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
2420 ssize_t err;
2421
2422 err = sync_page_range_nolock(inode, mapping, pos, ret);
2423 if (err < 0)
2424 ret = err;
2425 }
2426 return ret;
2427}
2428EXPORT_SYMBOL(generic_file_aio_write_nolock);
2429
2430ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, 2419ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
2431 unsigned long nr_segs, loff_t pos) 2420 unsigned long nr_segs, loff_t pos)
2432{ 2421{
2433 struct file *file = iocb->ki_filp; 2422 struct file *file = iocb->ki_filp;
2434 struct address_space *mapping = file->f_mapping; 2423 struct inode *inode = file->f_mapping->host;
2435 struct inode *inode = mapping->host;
2436 ssize_t ret; 2424 ssize_t ret;
2437 2425
2438 BUG_ON(iocb->ki_pos != pos); 2426 BUG_ON(iocb->ki_pos != pos);
2439 2427
2440 mutex_lock(&inode->i_mutex); 2428 mutex_lock(&inode->i_mutex);
2441 ret = __generic_file_aio_write_nolock(iocb, iov, nr_segs, 2429 ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
2442 &iocb->ki_pos);
2443 mutex_unlock(&inode->i_mutex); 2430 mutex_unlock(&inode->i_mutex);
2444 2431
2445 if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { 2432 if (ret > 0 || ret == -EIOCBQUEUED) {
2446 ssize_t err; 2433 ssize_t err;
2447 2434
2448 err = sync_page_range(inode, mapping, pos, ret); 2435 err = generic_write_sync(file, pos, ret);
2449 if (err < 0) 2436 if (err < 0 && ret > 0)
2450 ret = err; 2437 ret = err;
2451 } 2438 }
2452 return ret; 2439 return ret;
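Because __generic_file_aio_write() no longer syncs for O_SYNC, a filesystem with its own ->aio_write can follow the same pattern as the generic wrapper above: write under i_mutex, sync after dropping it. A minimal sketch (the myfs_* name is hypothetical; filesystem-specific work is elided):

static ssize_t myfs_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
				   unsigned long nr_segs, loff_t pos)
{
	struct file *file = iocb->ki_filp;
	struct inode *inode = file->f_mapping->host;
	ssize_t ret;

	BUG_ON(iocb->ki_pos != pos);

	mutex_lock(&inode->i_mutex);
	/* ... filesystem-specific preparation could go here ... */
	ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
	mutex_unlock(&inode->i_mutex);

	/* Sync outside i_mutex, as the comment on __generic_file_aio_write asks. */
	if (ret > 0 || ret == -EIOCBQUEUED) {
		ssize_t err = generic_write_sync(file, pos, ret);
		if (err < 0 && ret > 0)
			ret = err;
	}
	return ret;
}
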
@@ -2463,6 +2450,9 @@ EXPORT_SYMBOL(generic_file_aio_write);
2463 * (presumably at page->private). If the release was successful, return `1'. 2450 * (presumably at page->private). If the release was successful, return `1'.
2464 * Otherwise return zero. 2451 * Otherwise return zero.
2465 * 2452 *
2453 * This may also be called if PG_fscache is set on a page, indicating that the
2454 * page is known to the local caching routines.
2455 *
2466 * The @gfp_mask argument specifies whether I/O may be performed to release 2456 * The @gfp_mask argument specifies whether I/O may be performed to release
2467 * this page (__GFP_IO), and whether the call may block (__GFP_WAIT & __GFP_FS). 2457 * this page (__GFP_IO), and whether the call may block (__GFP_WAIT & __GFP_FS).
2468 * 2458 *