author	Nick Piggin <npiggin@suse.de>	2007-10-16 04:25:01 -0400
committer	Linus Torvalds <torvalds@woody.linux-foundation.org>	2007-10-16 12:42:55 -0400
commit	afddba49d18f346e5cc2938b6ed7c512db18ca68 (patch)
tree	4726e3d3b0e9e8e5b5d3b2b0cccb36446bbdf3ca /fs
parent	637aff46f94a754207c80c8c64bf1b74f24b967d (diff)
fs: introduce write_begin, write_end, and perform_write aops
These are intended to replace prepare_write and commit_write with more
flexible alternatives that are also able to avoid the buffered write
deadlock problems efficiently (which prepare_write is unable to do).

[mark.fasheh@oracle.com: API design contributions, code review and fixes]
[akpm@linux-foundation.org: various fixes]
[dmonakhov@sw.ru: new aop block_write_begin fix]
Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
Signed-off-by: Dmitriy Monakhov <dmonakhov@openvz.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
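For orientation: the new pair splits a buffered write into a setup phase that returns a locked pagecache page, a caller-controlled copy that is allowed to come up short, and a commit phase that is told how many bytes actually arrived. A rough sketch of the calling convention, simplified from the generic per-page write loop this series adds in mm/filemap.c (not visible in the fs-only diffstat below; i is the iov_iter describing the user buffer, and error handling is elided):

	/* sketch of one iteration of the generic write loop */
	status = a_ops->write_begin(file, mapping, pos, bytes, flags,
					&page, &fsdata);
	if (status)
		break;
	/* copy from userspace with pagefaults disabled; may be short */
	copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);
	flush_dcache_page(page);
	/* unlocks and releases the page; returns bytes committed */
	status = a_ops->write_end(file, mapping, pos, bytes, copied,
					page, fsdata);

Because the copy is atomic and a short copy is legal, the writer never faults on the source buffer while holding the destination page lock, which is the deadlock scenario the description above says prepare_write/commit_write could not handle efficiently.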
Diffstat (limited to 'fs')
-rw-r--r--	fs/buffer.c	201
-rw-r--r--	fs/libfs.c	44
-rw-r--r--	fs/namei.c	46
-rw-r--r--	fs/splice.c	69
4 files changed, 231 insertions, 129 deletions
diff --git a/fs/buffer.c b/fs/buffer.c
index 9ece6c2086d0..68b8fbdc1b28 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -1770,6 +1770,48 @@ recover:
 	goto done;
 }
 
+/*
+ * If a page has any new buffers, zero them out here, and mark them uptodate
+ * and dirty so they'll be written out (in order to prevent uninitialised
+ * block data from leaking). And clear the new bit.
+ */
+void page_zero_new_buffers(struct page *page, unsigned from, unsigned to)
+{
+	unsigned int block_start, block_end;
+	struct buffer_head *head, *bh;
+
+	BUG_ON(!PageLocked(page));
+	if (!page_has_buffers(page))
+		return;
+
+	bh = head = page_buffers(page);
+	block_start = 0;
+	do {
+		block_end = block_start + bh->b_size;
+
+		if (buffer_new(bh)) {
+			if (block_end > from && block_start < to) {
+				if (!PageUptodate(page)) {
+					unsigned start, size;
+
+					start = max(from, block_start);
+					size = min(to, block_end) - start;
+
+					zero_user_page(page, start, size, KM_USER0);
+					set_buffer_uptodate(bh);
+				}
+
+				clear_buffer_new(bh);
+				mark_buffer_dirty(bh);
+			}
+		}
+
+		block_start = block_end;
+		bh = bh->b_this_page;
+	} while (bh != head);
+}
+EXPORT_SYMBOL(page_zero_new_buffers);
+
 static int __block_prepare_write(struct inode *inode, struct page *page,
 		unsigned from, unsigned to, get_block_t *get_block)
 {
@@ -1854,38 +1896,8 @@ static int __block_prepare_write(struct inode *inode, struct page *page,
 		if (!buffer_uptodate(*wait_bh))
 			err = -EIO;
 	}
-	if (!err) {
-		bh = head;
-		do {
-			if (buffer_new(bh))
-				clear_buffer_new(bh);
-		} while ((bh = bh->b_this_page) != head);
-		return 0;
-	}
-	/* Error case: */
-	/*
-	 * Zero out any newly allocated blocks to avoid exposing stale
-	 * data. If BH_New is set, we know that the block was newly
-	 * allocated in the above loop.
-	 */
-	bh = head;
-	block_start = 0;
-	do {
-		block_end = block_start+blocksize;
-		if (block_end <= from)
-			goto next_bh;
-		if (block_start >= to)
-			break;
-		if (buffer_new(bh)) {
-			clear_buffer_new(bh);
-			zero_user_page(page, block_start, bh->b_size, KM_USER0);
-			set_buffer_uptodate(bh);
-			mark_buffer_dirty(bh);
-		}
-next_bh:
-		block_start = block_end;
-		bh = bh->b_this_page;
-	} while (bh != head);
+	if (unlikely(err))
+		page_zero_new_buffers(page, from, to);
 	return err;
 }
 
@@ -1910,6 +1922,7 @@ static int __block_commit_write(struct inode *inode, struct page *page,
 			set_buffer_uptodate(bh);
 			mark_buffer_dirty(bh);
 		}
+		clear_buffer_new(bh);
 	}
 
 	/*
@@ -1924,6 +1937,130 @@ static int __block_commit_write(struct inode *inode, struct page *page,
 }
 
 /*
+ * block_write_begin takes care of the basic task of block allocation and
+ * bringing partial write blocks uptodate first.
+ *
+ * If *pagep is not NULL, then block_write_begin uses the locked page
+ * at *pagep rather than allocating its own. In this case, the page will
+ * not be unlocked or deallocated on failure.
+ */
+int block_write_begin(struct file *file, struct address_space *mapping,
+			loff_t pos, unsigned len, unsigned flags,
+			struct page **pagep, void **fsdata,
+			get_block_t *get_block)
+{
+	struct inode *inode = mapping->host;
+	int status = 0;
+	struct page *page;
+	pgoff_t index;
+	unsigned start, end;
+	int ownpage = 0;
+
+	index = pos >> PAGE_CACHE_SHIFT;
+	start = pos & (PAGE_CACHE_SIZE - 1);
+	end = start + len;
+
+	page = *pagep;
+	if (page == NULL) {
+		ownpage = 1;
+		page = __grab_cache_page(mapping, index);
+		if (!page) {
+			status = -ENOMEM;
+			goto out;
+		}
+		*pagep = page;
+	} else
+		BUG_ON(!PageLocked(page));
+
+	status = __block_prepare_write(inode, page, start, end, get_block);
+	if (unlikely(status)) {
+		ClearPageUptodate(page);
+
+		if (ownpage) {
+			unlock_page(page);
+			page_cache_release(page);
+			*pagep = NULL;
+
+			/*
+			 * prepare_write() may have instantiated a few blocks
+			 * outside i_size. Trim these off again. Don't need
+			 * i_size_read because we hold i_mutex.
+			 */
+			if (pos + len > inode->i_size)
+				vmtruncate(inode, inode->i_size);
+		}
+		goto out;
+	}
+
+out:
+	return status;
+}
+EXPORT_SYMBOL(block_write_begin);
+
+int block_write_end(struct file *file, struct address_space *mapping,
+			loff_t pos, unsigned len, unsigned copied,
+			struct page *page, void *fsdata)
+{
+	struct inode *inode = mapping->host;
+	unsigned start;
+
+	start = pos & (PAGE_CACHE_SIZE - 1);
+
+	if (unlikely(copied < len)) {
+		/*
+		 * The buffers that were written will now be uptodate, so we
+		 * don't have to worry about a readpage reading them and
+		 * overwriting a partial write. However if we have encountered
+		 * a short write and only partially written into a buffer, it
+		 * will not be marked uptodate, so a readpage might come in and
+		 * destroy our partial write.
+		 *
+		 * Do the simplest thing, and just treat any short write to a
+		 * non uptodate page as a zero-length write, and force the
+		 * caller to redo the whole thing.
+		 */
+		if (!PageUptodate(page))
+			copied = 0;
+
+		page_zero_new_buffers(page, start+copied, start+len);
+	}
+	flush_dcache_page(page);
+
+	/* This could be a short (even 0-length) commit */
+	__block_commit_write(inode, page, start, start+copied);
+
+	return copied;
+}
+EXPORT_SYMBOL(block_write_end);
+
+int generic_write_end(struct file *file, struct address_space *mapping,
+			loff_t pos, unsigned len, unsigned copied,
+			struct page *page, void *fsdata)
+{
+	struct inode *inode = mapping->host;
+
+	copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
+
+	/*
+	 * No need to use i_size_read() here, the i_size
+	 * cannot change under us because we hold i_mutex.
+	 *
+	 * But it's important to update i_size while still holding page lock:
+	 * page writeout could otherwise come in and zero beyond i_size.
+	 */
+	if (pos+copied > inode->i_size) {
+		i_size_write(inode, pos+copied);
+		mark_inode_dirty(inode);
+	}
+
+	unlock_page(page);
+	page_cache_release(page);
+
+	return copied;
+}
+EXPORT_SYMBOL(generic_write_end);
+
+/*
  * Generic "read page" function for block devices that have the normal
  * get_block functionality. This is most of the block device filesystems.
  * Reads the page asynchronously --- the unlock_buffer() and
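With the helpers above in place, a conventional block-backed filesystem needs very little glue to adopt the new interface. A hypothetical sketch (myfs_get_block and myfs_readpage stand in for a filesystem's existing helpers; the write_begin/write_end members are the ones this commit adds to struct address_space_operations in include/linux/fs.h, outside this fs-only view):

	static int myfs_write_begin(struct file *file,
			struct address_space *mapping,
			loff_t pos, unsigned len, unsigned flags,
			struct page **pagep, void **fsdata)
	{
		*pagep = NULL;	/* ask block_write_begin to grab the page */
		return block_write_begin(file, mapping, pos, len, flags,
					pagep, fsdata, myfs_get_block);
	}

	static const struct address_space_operations myfs_aops = {
		.readpage	= myfs_readpage,
		.write_begin	= myfs_write_begin,
		.write_end	= generic_write_end,
	};

Passing in *pagep == NULL lets block_write_begin allocate and lock the page itself; generic_write_end then commits the copied range, updates i_size while the page is still locked, and unlocks and releases it.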
diff --git a/fs/libfs.c b/fs/libfs.c
index 5294de1f40c4..f2b32d3a9093 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -351,6 +351,26 @@ int simple_prepare_write(struct file *file, struct page *page,
 	return 0;
 }
 
+int simple_write_begin(struct file *file, struct address_space *mapping,
+			loff_t pos, unsigned len, unsigned flags,
+			struct page **pagep, void **fsdata)
+{
+	struct page *page;
+	pgoff_t index;
+	unsigned from;
+
+	index = pos >> PAGE_CACHE_SHIFT;
+	from = pos & (PAGE_CACHE_SIZE - 1);
+
+	page = __grab_cache_page(mapping, index);
+	if (!page)
+		return -ENOMEM;
+
+	*pagep = page;
+
+	return simple_prepare_write(file, page, from, from+len);
+}
+
 int simple_commit_write(struct file *file, struct page *page,
 			unsigned from, unsigned to)
 {
@@ -369,6 +389,28 @@ int simple_commit_write(struct file *file, struct page *page,
 	return 0;
 }
 
+int simple_write_end(struct file *file, struct address_space *mapping,
+			loff_t pos, unsigned len, unsigned copied,
+			struct page *page, void *fsdata)
+{
+	unsigned from = pos & (PAGE_CACHE_SIZE - 1);
+
+	/* zero the stale part of the page if we did a short copy */
+	if (copied < len) {
+		void *kaddr = kmap_atomic(page, KM_USER0);
+		memset(kaddr + from + copied, 0, len - copied);
+		flush_dcache_page(page);
+		kunmap_atomic(kaddr, KM_USER0);
+	}
+
+	simple_commit_write(file, page, from, from+copied);
+
+	unlock_page(page);
+	page_cache_release(page);
+
+	return copied;
+}
+
 /*
  * the inodes created here are not hashed. If you use iunique to generate
  * unique inode values later for this filesystem, then you must take care
@@ -642,6 +684,8 @@ EXPORT_SYMBOL(dcache_dir_open);
 EXPORT_SYMBOL(dcache_readdir);
 EXPORT_SYMBOL(generic_read_dir);
 EXPORT_SYMBOL(get_sb_pseudo);
+EXPORT_SYMBOL(simple_write_begin);
+EXPORT_SYMBOL(simple_write_end);
 EXPORT_SYMBOL(simple_commit_write);
 EXPORT_SYMBOL(simple_dir_inode_operations);
 EXPORT_SYMBOL(simple_dir_operations);
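The simple_* pair gives in-memory filesystems an equally small conversion. A hedged sketch of the wiring (simple_readpage is the existing libfs helper; the aops name here is made up):

	static const struct address_space_operations myramfs_aops = {
		.readpage	= simple_readpage,
		.write_begin	= simple_write_begin,
		.write_end	= simple_write_end,
	};

Note that simple_write_end zeroes the stale part of the page when the copy came up short (per the comment in the code above), so uninitialised data never becomes visible beyond the bytes actually copied.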
diff --git a/fs/namei.c b/fs/namei.c
index a83160acd748..b40b8084eefc 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2729,53 +2729,29 @@ int __page_symlink(struct inode *inode, const char *symname, int len,
 {
 	struct address_space *mapping = inode->i_mapping;
 	struct page *page;
+	void *fsdata;
 	int err;
 	char *kaddr;
 
 retry:
-	err = -ENOMEM;
-	page = find_or_create_page(mapping, 0, gfp_mask);
-	if (!page)
-		goto fail;
-	err = mapping->a_ops->prepare_write(NULL, page, 0, len-1);
-	if (err == AOP_TRUNCATED_PAGE) {
-		page_cache_release(page);
-		goto retry;
-	}
+	err = pagecache_write_begin(NULL, mapping, 0, len-1,
+				AOP_FLAG_UNINTERRUPTIBLE, &page, &fsdata);
 	if (err)
-		goto fail_map;
+		goto fail;
+
 	kaddr = kmap_atomic(page, KM_USER0);
 	memcpy(kaddr, symname, len-1);
 	kunmap_atomic(kaddr, KM_USER0);
-	err = mapping->a_ops->commit_write(NULL, page, 0, len-1);
-	if (err == AOP_TRUNCATED_PAGE) {
-		page_cache_release(page);
-		goto retry;
-	}
-	if (err)
-		goto fail_map;
-	/*
-	 * Notice that we are _not_ going to block here - end of page is
-	 * unmapped, so this will only try to map the rest of page, see
-	 * that it is unmapped (typically even will not look into inode -
-	 * ->i_size will be enough for everything) and zero it out.
-	 * OTOH it's obviously correct and should make the page up-to-date.
-	 */
-	if (!PageUptodate(page)) {
-		err = mapping->a_ops->readpage(NULL, page);
-		if (err != AOP_TRUNCATED_PAGE)
-			wait_on_page_locked(page);
-	} else {
-		unlock_page(page);
-	}
-	page_cache_release(page);
+
+	err = pagecache_write_end(NULL, mapping, 0, len-1, len-1,
+							page, fsdata);
 	if (err < 0)
 		goto fail;
+	if (err < len-1)
+		goto retry;
+
 	mark_inode_dirty(inode);
 	return 0;
-fail_map:
-	unlock_page(page);
-	page_cache_release(page);
 fail:
 	return err;
 }
diff --git a/fs/splice.c b/fs/splice.c
index 2df6be43c667..a7568bcc0f99 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -563,7 +563,7 @@ static int pipe_to_file(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
 	struct address_space *mapping = file->f_mapping;
 	unsigned int offset, this_len;
 	struct page *page;
-	pgoff_t index;
+	void *fsdata;
 	int ret;
 
 	/*
@@ -573,49 +573,16 @@ static int pipe_to_file(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
 	if (unlikely(ret))
 		return ret;
 
-	index = sd->pos >> PAGE_CACHE_SHIFT;
 	offset = sd->pos & ~PAGE_CACHE_MASK;
 
 	this_len = sd->len;
 	if (this_len + offset > PAGE_CACHE_SIZE)
 		this_len = PAGE_CACHE_SIZE - offset;
 
-find_page:
-	page = find_lock_page(mapping, index);
-	if (!page) {
-		ret = -ENOMEM;
-		page = page_cache_alloc_cold(mapping);
-		if (unlikely(!page))
-			goto out_ret;
-
-		/*
-		 * This will also lock the page
-		 */
-		ret = add_to_page_cache_lru(page, mapping, index,
-						GFP_KERNEL);
-		if (unlikely(ret))
-			goto out_release;
-	}
-
-	ret = mapping->a_ops->prepare_write(file, page, offset, offset+this_len);
-	if (unlikely(ret)) {
-		loff_t isize = i_size_read(mapping->host);
-
-		if (ret != AOP_TRUNCATED_PAGE)
-			unlock_page(page);
-		page_cache_release(page);
-		if (ret == AOP_TRUNCATED_PAGE)
-			goto find_page;
-
-		/*
-		 * prepare_write() may have instantiated a few blocks
-		 * outside i_size. Trim these off again.
-		 */
-		if (sd->pos + this_len > isize)
-			vmtruncate(mapping->host, isize);
-
-		goto out_ret;
-	}
+	ret = pagecache_write_begin(file, mapping, sd->pos, this_len,
+				AOP_FLAG_UNINTERRUPTIBLE, &page, &fsdata);
+	if (unlikely(ret))
+		goto out;
 
 	if (buf->page != page) {
 		/*
@@ -629,31 +596,9 @@ find_page:
 		kunmap_atomic(dst, KM_USER1);
 		buf->ops->unmap(pipe, buf, src);
 	}
-
-	ret = mapping->a_ops->commit_write(file, page, offset, offset+this_len);
-	if (ret) {
-		if (ret == AOP_TRUNCATED_PAGE) {
-			page_cache_release(page);
-			goto find_page;
-		}
-		if (ret < 0)
-			goto out;
-		/*
-		 * Partial write has happened, so 'ret' already initialized by
-		 * number of bytes written, Where is nothing we have to do here.
-		 */
-	} else
-		ret = this_len;
-	/*
-	 * Return the number of bytes written and mark page as
-	 * accessed, we are now done!
-	 */
-	mark_page_accessed(page);
+	ret = pagecache_write_end(file, mapping, sd->pos, this_len, this_len,
+				page, fsdata);
 out:
-	unlock_page(page);
-out_release:
-	page_cache_release(page);
-out_ret:
 	return ret;
 }
 