From 3b5e6454aaf6b4439b19400d8365e2ec2d24e411 Mon Sep 17 00:00:00 2001 From: Gioh Kim Date: Thu, 4 Sep 2014 22:04:42 -0400 Subject: fs/buffer.c: support buffer cache allocations with gfp modifiers A buffer cache is allocated from movable area because it is referred for a while and released soon. But some filesystems are taking buffer cache for a long time and it can disturb page migration. New APIs are introduced to allocate buffer cache with user specific flag. *_gfp APIs are for user want to set page allocation flag for page cache allocation. And *_unmovable APIs are for the user wants to allocate page cache from non-movable area. Signed-off-by: Gioh Kim Signed-off-by: Theodore Ts'o Reviewed-by: Jan Kara --- fs/buffer.c | 45 ++++++++++++++++++++++++++------------------- 1 file changed, 26 insertions(+), 19 deletions(-) (limited to 'fs/buffer.c') diff --git a/fs/buffer.c b/fs/buffer.c index 8f05111bbb8b..9a6029e0dd71 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -993,7 +993,7 @@ init_page_buffers(struct page *page, struct block_device *bdev, */ static int grow_dev_page(struct block_device *bdev, sector_t block, - pgoff_t index, int size, int sizebits) + pgoff_t index, int size, int sizebits, gfp_t gfp) { struct inode *inode = bdev->bd_inode; struct page *page; @@ -1002,8 +1002,8 @@ grow_dev_page(struct block_device *bdev, sector_t block, int ret = 0; /* Will call free_more_memory() */ gfp_t gfp_mask; - gfp_mask = mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS; - gfp_mask |= __GFP_MOVABLE; + gfp_mask = (mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS) | gfp; + /* * XXX: __getblk_slow() can not really deal with failure and * will endlessly loop on improvised global reclaim. Prefer @@ -1058,7 +1058,7 @@ failed: * that page was dirty, the buffers are set dirty also. */ static int -grow_buffers(struct block_device *bdev, sector_t block, int size) +grow_buffers(struct block_device *bdev, sector_t block, int size, gfp_t gfp) { pgoff_t index; int sizebits; @@ -1085,11 +1085,12 @@ grow_buffers(struct block_device *bdev, sector_t block, int size) } /* Create a page with the proper size buffers.. */ - return grow_dev_page(bdev, block, index, size, sizebits); + return grow_dev_page(bdev, block, index, size, sizebits, gfp); } -static struct buffer_head * -__getblk_slow(struct block_device *bdev, sector_t block, int size) +struct buffer_head * +__getblk_slow(struct block_device *bdev, sector_t block, + unsigned size, gfp_t gfp) { /* Size must be multiple of hard sectorsize */ if (unlikely(size & (bdev_logical_block_size(bdev)-1) || @@ -1111,13 +1112,14 @@ __getblk_slow(struct block_device *bdev, sector_t block, int size) if (bh) return bh; - ret = grow_buffers(bdev, block, size); + ret = grow_buffers(bdev, block, size, gfp); if (ret < 0) return NULL; if (ret == 0) free_more_memory(); } } +EXPORT_SYMBOL(__getblk_slow); /* * The relationship between dirty buffers and dirty pages: @@ -1371,24 +1373,25 @@ __find_get_block(struct block_device *bdev, sector_t block, unsigned size) EXPORT_SYMBOL(__find_get_block); /* - * __getblk will locate (and, if necessary, create) the buffer_head + * __getblk_gfp() will locate (and, if necessary, create) the buffer_head * which corresponds to the passed block_device, block and size. The * returned buffer has its reference count incremented. * - * __getblk() will lock up the machine if grow_dev_page's try_to_free_buffers() - * attempt is failing. FIXME, perhaps? + * __getblk_gfp() will lock up the machine if grow_dev_page's + * try_to_free_buffers() attempt is failing. FIXME, perhaps? */ struct buffer_head * -__getblk(struct block_device *bdev, sector_t block, unsigned size) +__getblk_gfp(struct block_device *bdev, sector_t block, + unsigned size, gfp_t gfp) { struct buffer_head *bh = __find_get_block(bdev, block, size); might_sleep(); if (bh == NULL) - bh = __getblk_slow(bdev, block, size); + bh = __getblk_slow(bdev, block, size, gfp); return bh; } -EXPORT_SYMBOL(__getblk); +EXPORT_SYMBOL(__getblk_gfp); /* * Do async read-ahead on a buffer.. @@ -1404,24 +1407,28 @@ void __breadahead(struct block_device *bdev, sector_t block, unsigned size) EXPORT_SYMBOL(__breadahead); /** - * __bread() - reads a specified block and returns the bh + * __bread_gfp() - reads a specified block and returns the bh * @bdev: the block_device to read from * @block: number of block * @size: size (in bytes) to read - * + * @gfp: page allocation flag + * * Reads a specified block, and returns buffer head that contains it. + * The page cache can be allocated from non-movable area + * not to prevent page migration if you set gfp to zero. * It returns NULL if the block was unreadable. */ struct buffer_head * -__bread(struct block_device *bdev, sector_t block, unsigned size) +__bread_gfp(struct block_device *bdev, sector_t block, + unsigned size, gfp_t gfp) { - struct buffer_head *bh = __getblk(bdev, block, size); + struct buffer_head *bh = __getblk_gfp(bdev, block, size, gfp); if (likely(bh) && !buffer_uptodate(bh)) bh = __bread_slow(bh); return bh; } -EXPORT_SYMBOL(__bread); +EXPORT_SYMBOL(__bread_gfp); /* * invalidate_bh_lrus() is called rarely - but not only at unmount. -- cgit v1.2.2 From 90a8020278c1598fafd071736a0846b38510309c Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 1 Oct 2014 21:49:18 -0400 Subject: vfs: fix data corruption when blocksize < pagesize for mmaped data ->page_mkwrite() is used by filesystems to allocate blocks under a page which is becoming writeably mmapped in some process' address space. This allows a filesystem to return a page fault if there is not enough space available, user exceeds quota or similar problem happens, rather than silently discarding data later when writepage is called. However VFS fails to call ->page_mkwrite() in all the cases where filesystems need it when blocksize < pagesize. For example when blocksize = 1024, pagesize = 4096 the following is problematic: ftruncate(fd, 0); pwrite(fd, buf, 1024, 0); map = mmap(NULL, 1024, PROT_WRITE, MAP_SHARED, fd, 0); map[0] = 'a'; ----> page_mkwrite() for index 0 is called ftruncate(fd, 10000); /* or even pwrite(fd, buf, 1, 10000) */ mremap(map, 1024, 10000, 0); map[4095] = 'a'; ----> no page_mkwrite() called At the moment ->page_mkwrite() is called, filesystem can allocate only one block for the page because i_size == 1024. Otherwise it would create blocks beyond i_size which is generally undesirable. But later at ->writepage() time, we also need to store data at offset 4095 but we don't have block allocated for it. This patch introduces a helper function filesystems can use to have ->page_mkwrite() called at all the necessary moments. Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o Cc: stable@vger.kernel.org --- fs/buffer.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs/buffer.c') diff --git a/fs/buffer.c b/fs/buffer.c index 9a6029e0dd71..6dc1475dcb2d 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -2087,6 +2087,7 @@ int generic_write_end(struct file *file, struct address_space *mapping, struct page *page, void *fsdata) { struct inode *inode = mapping->host; + loff_t old_size = inode->i_size; int i_size_changed = 0; copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); @@ -2106,6 +2107,8 @@ int generic_write_end(struct file *file, struct address_space *mapping, unlock_page(page); page_cache_release(page); + if (old_size < pos) + pagecache_isize_extended(inode, old_size, pos); /* * Don't mark the inode dirty under page lock. First, it unnecessarily * makes the holding time of page lock longer. Second, it forces lock -- cgit v1.2.2 From b744c2ac4bbc040794efb33207d6ebc14f88ea2e Mon Sep 17 00:00:00 2001 From: Robert Elliott Date: Tue, 21 Oct 2014 13:55:09 -0600 Subject: fs: merge I/O error prints into one line buffer.c uses two printk calls to print these messages: [67353.422338] Buffer I/O error on device sdr, logical block 212868488 [67353.422338] lost page write due to I/O error on sdr In a busy system, they may be interleaved with other prints, losing the context for the second message. Merge them into one line with one printk call so the prints are atomic. Also, differentiate between async page writes, sync page writes, and async page reads. Also, shorten "device" to "dev" to match the block layer prints: [67353.467906] blk_update_request: critical target error, dev sdr, sector 1707107328 Also, use %llu rather than %Lu. Resulting prints look like: [ 1356.437006] blk_update_request: critical target error, dev sdr, sector 1719693992 [ 1361.383522] quiet_error: 659876 callbacks suppressed [ 1361.385816] Buffer I/O error on dev sdr, logical block 256902912, lost async page write [ 1361.385819] Buffer I/O error on dev sdr, logical block 256903644, lost async page write Signed-off-by: Robert Elliott Reviewed-by: Webb Scales Signed-off-by: Jens Axboe --- fs/buffer.c | 27 ++++++++------------------- 1 file changed, 8 insertions(+), 19 deletions(-) (limited to 'fs/buffer.c') diff --git a/fs/buffer.c b/fs/buffer.c index 6c48f20eddd4..9d1da1d314a2 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -137,12 +137,12 @@ static int quiet_error(struct buffer_head *bh) } -static void buffer_io_error(struct buffer_head *bh) +static void buffer_io_error(struct buffer_head *bh, char *msg) { char b[BDEVNAME_SIZE]; - printk(KERN_ERR "Buffer I/O error on device %s, logical block %Lu\n", + printk(KERN_ERR "Buffer I/O error on dev %s, logical block %llu%s\n", bdevname(bh->b_bdev, b), - (unsigned long long)bh->b_blocknr); + (unsigned long long)bh->b_blocknr, msg); } /* @@ -177,17 +177,11 @@ EXPORT_SYMBOL(end_buffer_read_sync); void end_buffer_write_sync(struct buffer_head *bh, int uptodate) { - char b[BDEVNAME_SIZE]; - if (uptodate) { set_buffer_uptodate(bh); } else { - if (!quiet_error(bh)) { - buffer_io_error(bh); - printk(KERN_WARNING "lost page write due to " - "I/O error on %s\n", - bdevname(bh->b_bdev, b)); - } + if (!quiet_error(bh)) + buffer_io_error(bh, ", lost sync page write"); set_buffer_write_io_error(bh); clear_buffer_uptodate(bh); } @@ -305,7 +299,7 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate) } else { clear_buffer_uptodate(bh); if (!quiet_error(bh)) - buffer_io_error(bh); + buffer_io_error(bh, ", async page read"); SetPageError(page); } @@ -353,7 +347,6 @@ still_busy: */ void end_buffer_async_write(struct buffer_head *bh, int uptodate) { - char b[BDEVNAME_SIZE]; unsigned long flags; struct buffer_head *first; struct buffer_head *tmp; @@ -365,12 +358,8 @@ void end_buffer_async_write(struct buffer_head *bh, int uptodate) if (uptodate) { set_buffer_uptodate(bh); } else { - if (!quiet_error(bh)) { - buffer_io_error(bh); - printk(KERN_WARNING "lost page write due to " - "I/O error on %s\n", - bdevname(bh->b_bdev, b)); - } + if (!quiet_error(bh)) + buffer_io_error(bh, ", lost async page write"); set_bit(AS_EIO, &page->mapping->flags); set_buffer_write_io_error(bh); clear_buffer_uptodate(bh); -- cgit v1.2.2 From 432f16e64f50fd4999a476543d04dd52f7a2d753 Mon Sep 17 00:00:00 2001 From: Robert Elliott Date: Tue, 21 Oct 2014 13:55:11 -0600 Subject: fs: clarify rate limit suppressed buffer I/O errors When quiet_error applies rate limiting to buffer_io_error calls, what the they apply to is unclear because the name is so generic, particularly if the messages are interleaved with others: [ 1936.063572] quiet_error: 664293 callbacks suppressed [ 1936.065297] Buffer I/O error on dev sdr, logical block 257429952, lost async page write [ 1936.067814] Buffer I/O error on dev sdr, logical block 257429953, lost async page write Also, the function uses printk_ratelimit(), although printk.h includes a comment advising "Please don't use... Instead use printk_ratelimited()." Change buffer_io_error to check the BH_Quiet bit itself, drop the printk_ratelimit call, and print using printk_ratelimited. This makes the messages look like: [ 387.208839] buffer_io_error: 676394 callbacks suppressed [ 387.210693] Buffer I/O error on dev sdr, logical block 211291776, lost async page write [ 387.213432] Buffer I/O error on dev sdr, logical block 211291777, lost async page write Signed-off-by: Robert Elliott Reviewed-by: Webb Scales Signed-off-by: Jens Axboe --- fs/buffer.c | 23 +++++++---------------- 1 file changed, 7 insertions(+), 16 deletions(-) (limited to 'fs/buffer.c') diff --git a/fs/buffer.c b/fs/buffer.c index 9d1da1d314a2..20805db2c987 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -128,19 +128,13 @@ __clear_page_buffers(struct page *page) page_cache_release(page); } - -static int quiet_error(struct buffer_head *bh) -{ - if (!test_bit(BH_Quiet, &bh->b_state) && printk_ratelimit()) - return 0; - return 1; -} - - static void buffer_io_error(struct buffer_head *bh, char *msg) { char b[BDEVNAME_SIZE]; - printk(KERN_ERR "Buffer I/O error on dev %s, logical block %llu%s\n", + + if (!test_bit(BH_Quiet, &bh->b_state)) + printk_ratelimited(KERN_ERR + "Buffer I/O error on dev %s, logical block %llu%s\n", bdevname(bh->b_bdev, b), (unsigned long long)bh->b_blocknr, msg); } @@ -180,8 +174,7 @@ void end_buffer_write_sync(struct buffer_head *bh, int uptodate) if (uptodate) { set_buffer_uptodate(bh); } else { - if (!quiet_error(bh)) - buffer_io_error(bh, ", lost sync page write"); + buffer_io_error(bh, ", lost sync page write"); set_buffer_write_io_error(bh); clear_buffer_uptodate(bh); } @@ -298,8 +291,7 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate) set_buffer_uptodate(bh); } else { clear_buffer_uptodate(bh); - if (!quiet_error(bh)) - buffer_io_error(bh, ", async page read"); + buffer_io_error(bh, ", async page read"); SetPageError(page); } @@ -358,8 +350,7 @@ void end_buffer_async_write(struct buffer_head *bh, int uptodate) if (uptodate) { set_buffer_uptodate(bh); } else { - if (!quiet_error(bh)) - buffer_io_error(bh, ", lost async page write"); + buffer_io_error(bh, ", lost async page write"); set_bit(AS_EIO, &page->mapping->flags); set_buffer_write_io_error(bh); clear_buffer_uptodate(bh); -- cgit v1.2.2