aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
authorGu Zheng <guz.fnst@cn.fujitsu.com>2014-11-06 04:46:21 -0500
committerBenjamin LaHaise <bcrl@kvack.org>2014-11-06 14:27:19 -0500
commit835f252c6debd204fcd607c79975089b1ecd3472 (patch)
tree7ca7a2fd65540709e3a02df5638ad79912ff698c /fs
parent6098b45b32e6baeacc04790773ced9340601d511 (diff)
aio: fix uncorrent dirty pages accouting when truncating AIO ring buffer
https://bugzilla.kernel.org/show_bug.cgi?id=86831 Markus reported that when shutting down mysqld (with AIO support, on a ext3 formatted Harddrive) leads to a negative number of dirty pages (underrun to the counter). The negative number results in a drastic reduction of the write performance because the page cache is not used, because the kernel thinks it is still 2 ^ 32 dirty pages open. Add a warn trace in __dec_zone_state will catch this easily: static inline void __dec_zone_state(struct zone *zone, enum zone_stat_item item) { atomic_long_dec(&zone->vm_stat[item]); + WARN_ON_ONCE(item == NR_FILE_DIRTY && atomic_long_read(&zone->vm_stat[item]) < 0); atomic_long_dec(&vm_stat[item]); } [ 21.341632] ------------[ cut here ]------------ [ 21.346294] WARNING: CPU: 0 PID: 309 at include/linux/vmstat.h:242 cancel_dirty_page+0x164/0x224() [ 21.355296] Modules linked in: wutbox_cp sata_mv [ 21.359968] CPU: 0 PID: 309 Comm: kworker/0:1 Not tainted 3.14.21-WuT #80 [ 21.366793] Workqueue: events free_ioctx [ 21.370760] [<c0016a64>] (unwind_backtrace) from [<c0012f88>] (show_stack+0x20/0x24) [ 21.378562] [<c0012f88>] (show_stack) from [<c03f8ccc>] (dump_stack+0x24/0x28) [ 21.385840] [<c03f8ccc>] (dump_stack) from [<c0023ae4>] (warn_slowpath_common+0x84/0x9c) [ 21.393976] [<c0023ae4>] (warn_slowpath_common) from [<c0023bb8>] (warn_slowpath_null+0x2c/0x34) [ 21.402800] [<c0023bb8>] (warn_slowpath_null) from [<c00c0688>] (cancel_dirty_page+0x164/0x224) [ 21.411524] [<c00c0688>] (cancel_dirty_page) from [<c00c080c>] (truncate_inode_page+0x8c/0x158) [ 21.420272] [<c00c080c>] (truncate_inode_page) from [<c00c0a94>] (truncate_inode_pages_range+0x11c/0x53c) [ 21.429890] [<c00c0a94>] (truncate_inode_pages_range) from [<c00c0f6c>] (truncate_pagecache+0x88/0xac) [ 21.439252] [<c00c0f6c>] (truncate_pagecache) from [<c00c0fec>] (truncate_setsize+0x5c/0x74) [ 21.447731] [<c00c0fec>] (truncate_setsize) from [<c013b3a8>] (put_aio_ring_file.isra.14+0x34/0x90) [ 21.456826] [<c013b3a8>] (put_aio_ring_file.isra.14) from [<c013b424>] (aio_free_ring+0x20/0xcc) [ 21.465660] [<c013b424>] (aio_free_ring) from [<c013b4f4>] (free_ioctx+0x24/0x44) [ 21.473190] [<c013b4f4>] (free_ioctx) from [<c003d8d8>] (process_one_work+0x134/0x47c) [ 21.481132] [<c003d8d8>] (process_one_work) from [<c003e988>] (worker_thread+0x130/0x414) [ 21.489350] [<c003e988>] (worker_thread) from [<c00448ac>] (kthread+0xd4/0xec) [ 21.496621] [<c00448ac>] (kthread) from [<c000ec18>] (ret_from_fork+0x14/0x20) [ 21.503884] ---[ end trace 79c4bf42c038c9a1 ]--- The cause is that we set the aio ring file pages as *DIRTY* via SetPageDirty (bypasses the VFS dirty pages increment) when init, and aio fs uses *default_backing_dev_info* as the backing dev, which does not disable the dirty pages accounting capability. So truncating aio ring file will contribute to accounting dirty pages (VFS dirty pages decrement), then error occurs. The original goal is keeping these pages in memory (can not be reclaimed or swapped) in life-time via marking it dirty. But thinking more, we have already pinned pages via elevating the page's refcount, which can already achieve the goal, so the SetPageDirty seems unnecessary. In order to fix the issue, using the __set_page_dirty_no_writeback instead of the nop .set_page_dirty, and dropped the SetPageDirty (don't manually set the dirty flags, don't disable set_page_dirty(), rely on default behaviour). With the above change, the dirty pages accounting can work well. But as we known, aio fs is an anonymous one, which should never cause any real write-back, we can ignore the dirty pages (write back) accounting by disabling the dirty pages (write back) accounting capability. So we introduce an aio private backing dev info (disabled the ACCT_DIRTY/WRITEBACK/ACCT_WB capabilities) to replace the default one. Reported-by: Markus Königshaus <m.koenigshaus@wut.de> Signed-off-by: Gu Zheng <guz.fnst@cn.fujitsu.com> Cc: stable <stable@vger.kernel.org> Acked-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Benjamin LaHaise <bcrl@kvack.org>
Diffstat (limited to 'fs')
-rw-r--r--fs/aio.c21
1 files changed, 14 insertions, 7 deletions
diff --git a/fs/aio.c b/fs/aio.c
index 733750096b71..0ff7c464a478 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -165,6 +165,15 @@ static struct vfsmount *aio_mnt;
165static const struct file_operations aio_ring_fops; 165static const struct file_operations aio_ring_fops;
166static const struct address_space_operations aio_ctx_aops; 166static const struct address_space_operations aio_ctx_aops;
167 167
168/* Backing dev info for aio fs.
169 * -no dirty page accounting or writeback happens
170 */
171static struct backing_dev_info aio_fs_backing_dev_info = {
172 .name = "aiofs",
173 .state = 0,
174 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_MAP_COPY,
175};
176
168static struct file *aio_private_file(struct kioctx *ctx, loff_t nr_pages) 177static struct file *aio_private_file(struct kioctx *ctx, loff_t nr_pages)
169{ 178{
170 struct qstr this = QSTR_INIT("[aio]", 5); 179 struct qstr this = QSTR_INIT("[aio]", 5);
@@ -176,6 +185,7 @@ static struct file *aio_private_file(struct kioctx *ctx, loff_t nr_pages)
176 185
177 inode->i_mapping->a_ops = &aio_ctx_aops; 186 inode->i_mapping->a_ops = &aio_ctx_aops;
178 inode->i_mapping->private_data = ctx; 187 inode->i_mapping->private_data = ctx;
188 inode->i_mapping->backing_dev_info = &aio_fs_backing_dev_info;
179 inode->i_size = PAGE_SIZE * nr_pages; 189 inode->i_size = PAGE_SIZE * nr_pages;
180 190
181 path.dentry = d_alloc_pseudo(aio_mnt->mnt_sb, &this); 191 path.dentry = d_alloc_pseudo(aio_mnt->mnt_sb, &this);
@@ -220,6 +230,9 @@ static int __init aio_setup(void)
220 if (IS_ERR(aio_mnt)) 230 if (IS_ERR(aio_mnt))
221 panic("Failed to create aio fs mount."); 231 panic("Failed to create aio fs mount.");
222 232
233 if (bdi_init(&aio_fs_backing_dev_info))
234 panic("Failed to init aio fs backing dev info.");
235
223 kiocb_cachep = KMEM_CACHE(kiocb, SLAB_HWCACHE_ALIGN|SLAB_PANIC); 236 kiocb_cachep = KMEM_CACHE(kiocb, SLAB_HWCACHE_ALIGN|SLAB_PANIC);
224 kioctx_cachep = KMEM_CACHE(kioctx,SLAB_HWCACHE_ALIGN|SLAB_PANIC); 237 kioctx_cachep = KMEM_CACHE(kioctx,SLAB_HWCACHE_ALIGN|SLAB_PANIC);
225 238
@@ -281,11 +294,6 @@ static const struct file_operations aio_ring_fops = {
281 .mmap = aio_ring_mmap, 294 .mmap = aio_ring_mmap,
282}; 295};
283 296
284static int aio_set_page_dirty(struct page *page)
285{
286 return 0;
287}
288
289#if IS_ENABLED(CONFIG_MIGRATION) 297#if IS_ENABLED(CONFIG_MIGRATION)
290static int aio_migratepage(struct address_space *mapping, struct page *new, 298static int aio_migratepage(struct address_space *mapping, struct page *new,
291 struct page *old, enum migrate_mode mode) 299 struct page *old, enum migrate_mode mode)
@@ -357,7 +365,7 @@ out:
357#endif 365#endif
358 366
359static const struct address_space_operations aio_ctx_aops = { 367static const struct address_space_operations aio_ctx_aops = {
360 .set_page_dirty = aio_set_page_dirty, 368 .set_page_dirty = __set_page_dirty_no_writeback,
361#if IS_ENABLED(CONFIG_MIGRATION) 369#if IS_ENABLED(CONFIG_MIGRATION)
362 .migratepage = aio_migratepage, 370 .migratepage = aio_migratepage,
363#endif 371#endif
@@ -412,7 +420,6 @@ static int aio_setup_ring(struct kioctx *ctx)
412 pr_debug("pid(%d) page[%d]->count=%d\n", 420 pr_debug("pid(%d) page[%d]->count=%d\n",
413 current->pid, i, page_count(page)); 421 current->pid, i, page_count(page));
414 SetPageUptodate(page); 422 SetPageUptodate(page);
415 SetPageDirty(page);
416 unlock_page(page); 423 unlock_page(page);
417 424
418 ctx->ring_pages[i] = page; 425 ctx->ring_pages[i] = page;