diff options
author | Nick Piggin <npiggin@suse.de> | 2007-10-16 04:25:01 -0400 |
---|---|---|
committer | Linus Torvalds <torvalds@woody.linux-foundation.org> | 2007-10-16 12:42:55 -0400 |
commit | afddba49d18f346e5cc2938b6ed7c512db18ca68 (patch) | |
tree | 4726e3d3b0e9e8e5b5d3b2b0cccb36446bbdf3ca /fs | |
parent | 637aff46f94a754207c80c8c64bf1b74f24b967d (diff) |
fs: introduce write_begin, write_end, and perform_write aops
These are intended to replace prepare_write and commit_write with more
flexible alternatives that are also able to avoid the buffered write
deadlock problems efficiently (which prepare_write is unable to do).
[mark.fasheh@oracle.com: API design contributions, code review and fixes]
[akpm@linux-foundation.org: various fixes]
[dmonakhov@sw.ru: new aop block_write_begin fix]
Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
Signed-off-by: Dmitriy Monakhov <dmonakhov@openvz.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'fs')
-rw-r--r-- | fs/buffer.c | 201 | ||||
-rw-r--r-- | fs/libfs.c | 44 | ||||
-rw-r--r-- | fs/namei.c | 46 | ||||
-rw-r--r-- | fs/splice.c | 69 |
4 files changed, 231 insertions, 129 deletions
diff --git a/fs/buffer.c b/fs/buffer.c
index 9ece6c2086d0..68b8fbdc1b28 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -1770,6 +1770,48 @@ recover: | |||
1770 | goto done; | 1770 | goto done; |
1771 | } | 1771 | } |
1772 | 1772 | ||
1773 | /* | ||
1774 | * If a page has any new buffers, zero them out here, and mark them uptodate | ||
1775 | * and dirty so they'll be written out (in order to prevent uninitialised | ||
1776 | * block data from leaking). And clear the new bit. | ||
1777 | */ | ||
1778 | void page_zero_new_buffers(struct page *page, unsigned from, unsigned to) | ||
1779 | { | ||
1780 | unsigned int block_start, block_end; | ||
1781 | struct buffer_head *head, *bh; | ||
1782 | |||
1783 | BUG_ON(!PageLocked(page)); | ||
1784 | if (!page_has_buffers(page)) | ||
1785 | return; | ||
1786 | |||
1787 | bh = head = page_buffers(page); | ||
1788 | block_start = 0; | ||
1789 | do { | ||
1790 | block_end = block_start + bh->b_size; | ||
1791 | |||
1792 | if (buffer_new(bh)) { | ||
1793 | if (block_end > from && block_start < to) { | ||
1794 | if (!PageUptodate(page)) { | ||
1795 | unsigned start, size; | ||
1796 | |||
1797 | start = max(from, block_start); | ||
1798 | size = min(to, block_end) - start; | ||
1799 | |||
1800 | zero_user_page(page, start, size, KM_USER0); | ||
1801 | set_buffer_uptodate(bh); | ||
1802 | } | ||
1803 | |||
1804 | clear_buffer_new(bh); | ||
1805 | mark_buffer_dirty(bh); | ||
1806 | } | ||
1807 | } | ||
1808 | |||
1809 | block_start = block_end; | ||
1810 | bh = bh->b_this_page; | ||
1811 | } while (bh != head); | ||
1812 | } | ||
1813 | EXPORT_SYMBOL(page_zero_new_buffers); | ||
1814 | |||
1773 | static int __block_prepare_write(struct inode *inode, struct page *page, | 1815 | static int __block_prepare_write(struct inode *inode, struct page *page, |
1774 | unsigned from, unsigned to, get_block_t *get_block) | 1816 | unsigned from, unsigned to, get_block_t *get_block) |
1775 | { | 1817 | { |
@@ -1854,38 +1896,8 @@ static int __block_prepare_write(struct inode *inode, struct page *page, | |||
1854 | if (!buffer_uptodate(*wait_bh)) | 1896 | if (!buffer_uptodate(*wait_bh)) |
1855 | err = -EIO; | 1897 | err = -EIO; |
1856 | } | 1898 | } |
1857 | if (!err) { | 1899 | if (unlikely(err)) |
1858 | bh = head; | 1900 | page_zero_new_buffers(page, from, to); |
1859 | do { | ||
1860 | if (buffer_new(bh)) | ||
1861 | clear_buffer_new(bh); | ||
1862 | } while ((bh = bh->b_this_page) != head); | ||
1863 | return 0; | ||
1864 | } | ||
1865 | /* Error case: */ | ||
1866 | /* | ||
1867 | * Zero out any newly allocated blocks to avoid exposing stale | ||
1868 | * data. If BH_New is set, we know that the block was newly | ||
1869 | * allocated in the above loop. | ||
1870 | */ | ||
1871 | bh = head; | ||
1872 | block_start = 0; | ||
1873 | do { | ||
1874 | block_end = block_start+blocksize; | ||
1875 | if (block_end <= from) | ||
1876 | goto next_bh; | ||
1877 | if (block_start >= to) | ||
1878 | break; | ||
1879 | if (buffer_new(bh)) { | ||
1880 | clear_buffer_new(bh); | ||
1881 | zero_user_page(page, block_start, bh->b_size, KM_USER0); | ||
1882 | set_buffer_uptodate(bh); | ||
1883 | mark_buffer_dirty(bh); | ||
1884 | } | ||
1885 | next_bh: | ||
1886 | block_start = block_end; | ||
1887 | bh = bh->b_this_page; | ||
1888 | } while (bh != head); | ||
1889 | return err; | 1901 | return err; |
1890 | } | 1902 | } |
1891 | 1903 | ||
@@ -1910,6 +1922,7 @@ static int __block_commit_write(struct inode *inode, struct page *page, | |||
1910 | set_buffer_uptodate(bh); | 1922 | set_buffer_uptodate(bh); |
1911 | mark_buffer_dirty(bh); | 1923 | mark_buffer_dirty(bh); |
1912 | } | 1924 | } |
1925 | clear_buffer_new(bh); | ||
1913 | } | 1926 | } |
1914 | 1927 | ||
1915 | /* | 1928 | /* |
@@ -1924,6 +1937,130 @@ static int __block_commit_write(struct inode *inode, struct page *page, | |||
1924 | } | 1937 | } |
1925 | 1938 | ||
1926 | /* | 1939 | /* |
1940 | * block_write_begin takes care of the basic task of block allocation and | ||
1941 | * bringing partial write blocks uptodate first. | ||
1942 | * | ||
1943 | * If *pagep is not NULL, then block_write_begin uses the locked page | ||
1944 | * at *pagep rather than allocating its own. In this case, the page will | ||
1945 | * not be unlocked or deallocated on failure. | ||
1946 | */ | ||
1947 | int block_write_begin(struct file *file, struct address_space *mapping, | ||
1948 | loff_t pos, unsigned len, unsigned flags, | ||
1949 | struct page **pagep, void **fsdata, | ||
1950 | get_block_t *get_block) | ||
1951 | { | ||
1952 | struct inode *inode = mapping->host; | ||
1953 | int status = 0; | ||
1954 | struct page *page; | ||
1955 | pgoff_t index; | ||
1956 | unsigned start, end; | ||
1957 | int ownpage = 0; | ||
1958 | |||
1959 | index = pos >> PAGE_CACHE_SHIFT; | ||
1960 | start = pos & (PAGE_CACHE_SIZE - 1); | ||
1961 | end = start + len; | ||
1962 | |||
1963 | page = *pagep; | ||
1964 | if (page == NULL) { | ||
1965 | ownpage = 1; | ||
1966 | page = __grab_cache_page(mapping, index); | ||
1967 | if (!page) { | ||
1968 | status = -ENOMEM; | ||
1969 | goto out; | ||
1970 | } | ||
1971 | *pagep = page; | ||
1972 | } else | ||
1973 | BUG_ON(!PageLocked(page)); | ||
1974 | |||
1975 | status = __block_prepare_write(inode, page, start, end, get_block); | ||
1976 | if (unlikely(status)) { | ||
1977 | ClearPageUptodate(page); | ||
1978 | |||
1979 | if (ownpage) { | ||
1980 | unlock_page(page); | ||
1981 | page_cache_release(page); | ||
1982 | *pagep = NULL; | ||
1983 | |||
1984 | /* | ||
1985 | * prepare_write() may have instantiated a few blocks | ||
1986 | * outside i_size. Trim these off again. Don't need | ||
1987 | * i_size_read because we hold i_mutex. | ||
1988 | */ | ||
1989 | if (pos + len > inode->i_size) | ||
1990 | vmtruncate(inode, inode->i_size); | ||
1991 | } | ||
1992 | goto out; | ||
1993 | } | ||
1994 | |||
1995 | out: | ||
1996 | return status; | ||
1997 | } | ||
1998 | EXPORT_SYMBOL(block_write_begin); | ||
1999 | |||
2000 | int block_write_end(struct file *file, struct address_space *mapping, | ||
2001 | loff_t pos, unsigned len, unsigned copied, | ||
2002 | struct page *page, void *fsdata) | ||
2003 | { | ||
2004 | struct inode *inode = mapping->host; | ||
2005 | unsigned start; | ||
2006 | |||
2007 | start = pos & (PAGE_CACHE_SIZE - 1); | ||
2008 | |||
2009 | if (unlikely(copied < len)) { | ||
2010 | /* | ||
2011 | * The buffers that were written will now be uptodate, so we | ||
2012 | * don't have to worry about a readpage reading them and | ||
2013 | * overwriting a partial write. However if we have encountered | ||
2014 | * a short write and only partially written into a buffer, it | ||
2015 | * will not be marked uptodate, so a readpage might come in and | ||
2016 | * destroy our partial write. | ||
2017 | * | ||
2018 | * Do the simplest thing, and just treat any short write to a | ||
2019 | * non uptodate page as a zero-length write, and force the | ||
2020 | * caller to redo the whole thing. | ||
2021 | */ | ||
2022 | if (!PageUptodate(page)) | ||
2023 | copied = 0; | ||
2024 | |||
2025 | page_zero_new_buffers(page, start+copied, start+len); | ||
2026 | } | ||
2027 | flush_dcache_page(page); | ||
2028 | |||
2029 | /* This could be a short (even 0-length) commit */ | ||
2030 | __block_commit_write(inode, page, start, start+copied); | ||
2031 | |||
2032 | return copied; | ||
2033 | } | ||
2034 | EXPORT_SYMBOL(block_write_end); | ||
2035 | |||
2036 | int generic_write_end(struct file *file, struct address_space *mapping, | ||
2037 | loff_t pos, unsigned len, unsigned copied, | ||
2038 | struct page *page, void *fsdata) | ||
2039 | { | ||
2040 | struct inode *inode = mapping->host; | ||
2041 | |||
2042 | copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); | ||
2043 | |||
2044 | /* | ||
2045 | * No need to use i_size_read() here, the i_size | ||
2046 | * cannot change under us because we hold i_mutex. | ||
2047 | * | ||
2048 | * But it's important to update i_size while still holding page lock: | ||
2049 | * page writeout could otherwise come in and zero beyond i_size. | ||
2050 | */ | ||
2051 | if (pos+copied > inode->i_size) { | ||
2052 | i_size_write(inode, pos+copied); | ||
2053 | mark_inode_dirty(inode); | ||
2054 | } | ||
2055 | |||
2056 | unlock_page(page); | ||
2057 | page_cache_release(page); | ||
2058 | |||
2059 | return copied; | ||
2060 | } | ||
2061 | EXPORT_SYMBOL(generic_write_end); | ||
2062 | |||
2063 | /* | ||
1927 | * Generic "read page" function for block devices that have the normal | 2064 | * Generic "read page" function for block devices that have the normal |
1928 | * get_block functionality. This is most of the block device filesystems. | 2065 | * get_block functionality. This is most of the block device filesystems. |
1929 | * Reads the page asynchronously --- the unlock_buffer() and | 2066 | * Reads the page asynchronously --- the unlock_buffer() and |
diff --git a/fs/libfs.c b/fs/libfs.c
index 5294de1f40c4..f2b32d3a9093 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -351,6 +351,26 @@ int simple_prepare_write(struct file *file, struct page *page, | |||
351 | return 0; | 351 | return 0; |
352 | } | 352 | } |
353 | 353 | ||
354 | int simple_write_begin(struct file *file, struct address_space *mapping, | ||
355 | loff_t pos, unsigned len, unsigned flags, | ||
356 | struct page **pagep, void **fsdata) | ||
357 | { | ||
358 | struct page *page; | ||
359 | pgoff_t index; | ||
360 | unsigned from; | ||
361 | |||
362 | index = pos >> PAGE_CACHE_SHIFT; | ||
363 | from = pos & (PAGE_CACHE_SIZE - 1); | ||
364 | |||
365 | page = __grab_cache_page(mapping, index); | ||
366 | if (!page) | ||
367 | return -ENOMEM; | ||
368 | |||
369 | *pagep = page; | ||
370 | |||
371 | return simple_prepare_write(file, page, from, from+len); | ||
372 | } | ||
373 | |||
354 | int simple_commit_write(struct file *file, struct page *page, | 374 | int simple_commit_write(struct file *file, struct page *page, |
355 | unsigned from, unsigned to) | 375 | unsigned from, unsigned to) |
356 | { | 376 | { |
@@ -369,6 +389,28 @@ int simple_commit_write(struct file *file, struct page *page, | |||
369 | return 0; | 389 | return 0; |
370 | } | 390 | } |
371 | 391 | ||
392 | int simple_write_end(struct file *file, struct address_space *mapping, | ||
393 | loff_t pos, unsigned len, unsigned copied, | ||
394 | struct page *page, void *fsdata) | ||
395 | { | ||
396 | unsigned from = pos & (PAGE_CACHE_SIZE - 1); | ||
397 | |||
398 | /* zero the stale part of the page if we did a short copy */ | ||
399 | if (copied < len) { | ||
400 | void *kaddr = kmap_atomic(page, KM_USER0); | ||
401 | memset(kaddr + from + copied, 0, len - copied); | ||
402 | flush_dcache_page(page); | ||
403 | kunmap_atomic(kaddr, KM_USER0); | ||
404 | } | ||
405 | |||
406 | simple_commit_write(file, page, from, from+copied); | ||
407 | |||
408 | unlock_page(page); | ||
409 | page_cache_release(page); | ||
410 | |||
411 | return copied; | ||
412 | } | ||
413 | |||
372 | /* | 414 | /* |
373 | * the inodes created here are not hashed. If you use iunique to generate | 415 | * the inodes created here are not hashed. If you use iunique to generate |
374 | * unique inode values later for this filesystem, then you must take care | 416 | * unique inode values later for this filesystem, then you must take care |
@@ -642,6 +684,8 @@ EXPORT_SYMBOL(dcache_dir_open); | |||
642 | EXPORT_SYMBOL(dcache_readdir); | 684 | EXPORT_SYMBOL(dcache_readdir); |
643 | EXPORT_SYMBOL(generic_read_dir); | 685 | EXPORT_SYMBOL(generic_read_dir); |
644 | EXPORT_SYMBOL(get_sb_pseudo); | 686 | EXPORT_SYMBOL(get_sb_pseudo); |
687 | EXPORT_SYMBOL(simple_write_begin); | ||
688 | EXPORT_SYMBOL(simple_write_end); | ||
645 | EXPORT_SYMBOL(simple_commit_write); | 689 | EXPORT_SYMBOL(simple_commit_write); |
646 | EXPORT_SYMBOL(simple_dir_inode_operations); | 690 | EXPORT_SYMBOL(simple_dir_inode_operations); |
647 | EXPORT_SYMBOL(simple_dir_operations); | 691 | EXPORT_SYMBOL(simple_dir_operations); |
diff --git a/fs/namei.c b/fs/namei.c
index a83160acd748..b40b8084eefc 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2729,53 +2729,29 @@ int __page_symlink(struct inode *inode, const char *symname, int len, | |||
2729 | { | 2729 | { |
2730 | struct address_space *mapping = inode->i_mapping; | 2730 | struct address_space *mapping = inode->i_mapping; |
2731 | struct page *page; | 2731 | struct page *page; |
2732 | void *fsdata; | ||
2732 | int err; | 2733 | int err; |
2733 | char *kaddr; | 2734 | char *kaddr; |
2734 | 2735 | ||
2735 | retry: | 2736 | retry: |
2736 | err = -ENOMEM; | 2737 | err = pagecache_write_begin(NULL, mapping, 0, len-1, |
2737 | page = find_or_create_page(mapping, 0, gfp_mask); | 2738 | AOP_FLAG_UNINTERRUPTIBLE, &page, &fsdata); |
2738 | if (!page) | ||
2739 | goto fail; | ||
2740 | err = mapping->a_ops->prepare_write(NULL, page, 0, len-1); | ||
2741 | if (err == AOP_TRUNCATED_PAGE) { | ||
2742 | page_cache_release(page); | ||
2743 | goto retry; | ||
2744 | } | ||
2745 | if (err) | 2739 | if (err) |
2746 | goto fail_map; | 2740 | goto fail; |
2741 | |||
2747 | kaddr = kmap_atomic(page, KM_USER0); | 2742 | kaddr = kmap_atomic(page, KM_USER0); |
2748 | memcpy(kaddr, symname, len-1); | 2743 | memcpy(kaddr, symname, len-1); |
2749 | kunmap_atomic(kaddr, KM_USER0); | 2744 | kunmap_atomic(kaddr, KM_USER0); |
2750 | err = mapping->a_ops->commit_write(NULL, page, 0, len-1); | 2745 | |
2751 | if (err == AOP_TRUNCATED_PAGE) { | 2746 | err = pagecache_write_end(NULL, mapping, 0, len-1, len-1, |
2752 | page_cache_release(page); | 2747 | page, fsdata); |
2753 | goto retry; | ||
2754 | } | ||
2755 | if (err) | ||
2756 | goto fail_map; | ||
2757 | /* | ||
2758 | * Notice that we are _not_ going to block here - end of page is | ||
2759 | * unmapped, so this will only try to map the rest of page, see | ||
2760 | * that it is unmapped (typically even will not look into inode - | ||
2761 | * ->i_size will be enough for everything) and zero it out. | ||
2762 | * OTOH it's obviously correct and should make the page up-to-date. | ||
2763 | */ | ||
2764 | if (!PageUptodate(page)) { | ||
2765 | err = mapping->a_ops->readpage(NULL, page); | ||
2766 | if (err != AOP_TRUNCATED_PAGE) | ||
2767 | wait_on_page_locked(page); | ||
2768 | } else { | ||
2769 | unlock_page(page); | ||
2770 | } | ||
2771 | page_cache_release(page); | ||
2772 | if (err < 0) | 2748 | if (err < 0) |
2773 | goto fail; | 2749 | goto fail; |
2750 | if (err < len-1) | ||
2751 | goto retry; | ||
2752 | |||
2774 | mark_inode_dirty(inode); | 2753 | mark_inode_dirty(inode); |
2775 | return 0; | 2754 | return 0; |
2776 | fail_map: | ||
2777 | unlock_page(page); | ||
2778 | page_cache_release(page); | ||
2779 | fail: | 2755 | fail: |
2780 | return err; | 2756 | return err; |
2781 | } | 2757 | } |
diff --git a/fs/splice.c b/fs/splice.c
index 2df6be43c667..a7568bcc0f99 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -563,7 +563,7 @@ static int pipe_to_file(struct pipe_inode_info *pipe, struct pipe_buffer *buf, | |||
563 | struct address_space *mapping = file->f_mapping; | 563 | struct address_space *mapping = file->f_mapping; |
564 | unsigned int offset, this_len; | 564 | unsigned int offset, this_len; |
565 | struct page *page; | 565 | struct page *page; |
566 | pgoff_t index; | 566 | void *fsdata; |
567 | int ret; | 567 | int ret; |
568 | 568 | ||
569 | /* | 569 | /* |
@@ -573,49 +573,16 @@ static int pipe_to_file(struct pipe_inode_info *pipe, struct pipe_buffer *buf, | |||
573 | if (unlikely(ret)) | 573 | if (unlikely(ret)) |
574 | return ret; | 574 | return ret; |
575 | 575 | ||
576 | index = sd->pos >> PAGE_CACHE_SHIFT; | ||
577 | offset = sd->pos & ~PAGE_CACHE_MASK; | 576 | offset = sd->pos & ~PAGE_CACHE_MASK; |
578 | 577 | ||
579 | this_len = sd->len; | 578 | this_len = sd->len; |
580 | if (this_len + offset > PAGE_CACHE_SIZE) | 579 | if (this_len + offset > PAGE_CACHE_SIZE) |
581 | this_len = PAGE_CACHE_SIZE - offset; | 580 | this_len = PAGE_CACHE_SIZE - offset; |
582 | 581 | ||
583 | find_page: | 582 | ret = pagecache_write_begin(file, mapping, sd->pos, this_len, |
584 | page = find_lock_page(mapping, index); | 583 | AOP_FLAG_UNINTERRUPTIBLE, &page, &fsdata); |
585 | if (!page) { | 584 | if (unlikely(ret)) |
586 | ret = -ENOMEM; | 585 | goto out; |
587 | page = page_cache_alloc_cold(mapping); | ||
588 | if (unlikely(!page)) | ||
589 | goto out_ret; | ||
590 | |||
591 | /* | ||
592 | * This will also lock the page | ||
593 | */ | ||
594 | ret = add_to_page_cache_lru(page, mapping, index, | ||
595 | GFP_KERNEL); | ||
596 | if (unlikely(ret)) | ||
597 | goto out_release; | ||
598 | } | ||
599 | |||
600 | ret = mapping->a_ops->prepare_write(file, page, offset, offset+this_len); | ||
601 | if (unlikely(ret)) { | ||
602 | loff_t isize = i_size_read(mapping->host); | ||
603 | |||
604 | if (ret != AOP_TRUNCATED_PAGE) | ||
605 | unlock_page(page); | ||
606 | page_cache_release(page); | ||
607 | if (ret == AOP_TRUNCATED_PAGE) | ||
608 | goto find_page; | ||
609 | |||
610 | /* | ||
611 | * prepare_write() may have instantiated a few blocks | ||
612 | * outside i_size. Trim these off again. | ||
613 | */ | ||
614 | if (sd->pos + this_len > isize) | ||
615 | vmtruncate(mapping->host, isize); | ||
616 | |||
617 | goto out_ret; | ||
618 | } | ||
619 | 586 | ||
620 | if (buf->page != page) { | 587 | if (buf->page != page) { |
621 | /* | 588 | /* |
@@ -629,31 +596,9 @@ find_page: | |||
629 | kunmap_atomic(dst, KM_USER1); | 596 | kunmap_atomic(dst, KM_USER1); |
630 | buf->ops->unmap(pipe, buf, src); | 597 | buf->ops->unmap(pipe, buf, src); |
631 | } | 598 | } |
632 | 599 | ret = pagecache_write_end(file, mapping, sd->pos, this_len, this_len, | |
633 | ret = mapping->a_ops->commit_write(file, page, offset, offset+this_len); | 600 | page, fsdata); |
634 | if (ret) { | ||
635 | if (ret == AOP_TRUNCATED_PAGE) { | ||
636 | page_cache_release(page); | ||
637 | goto find_page; | ||
638 | } | ||
639 | if (ret < 0) | ||
640 | goto out; | ||
641 | /* | ||
642 | * Partial write has happened, so 'ret' already initialized by | ||
643 | * number of bytes written, Where is nothing we have to do here. | ||
644 | */ | ||
645 | } else | ||
646 | ret = this_len; | ||
647 | /* | ||
648 | * Return the number of bytes written and mark page as | ||
649 | * accessed, we are now done! | ||
650 | */ | ||
651 | mark_page_accessed(page); | ||
652 | out: | 601 | out: |
653 | unlock_page(page); | ||
654 | out_release: | ||
655 | page_cache_release(page); | ||
656 | out_ret: | ||
657 | return ret; | 602 | return ret; |
658 | } | 603 | } |
659 | 604 | ||