aboutsummaryrefslogtreecommitdiffstats
path: root/block
diff options
context:
space:
mode:
authorGabriel Krisman Bertazi <krisman@linux.vnet.ibm.com>2016-12-06 10:31:44 -0500
committerGreg Kroah-Hartman <gregkh@linuxfoundation.org>2017-04-18 01:11:49 -0400
commitd7045cbf4a061ca38a3080fbbfcf6a021f8a49c7 (patch)
tree28a997195d6f3d2d0d7cce2f196ad0b7cd576e21 /block
parent16fc98c2479f5477f2df220acd9cb53686e33f4c (diff)
blk-mq: Avoid memory reclaim when remapping queues
commit 36e1f3d107867b25c616c2fd294f5a1c9d4e5d09 upstream. While stressing memory and IO at the same time we changed SMT settings, we were able to consistently trigger deadlocks in the mm system, which froze the entire machine. I think that under memory stress conditions, the large allocations performed by blk_mq_init_rq_map may trigger a reclaim, which stalls waiting on the block layer remmaping completion, thus deadlocking the system. The trace below was collected after the machine stalled, waiting for the hotplug event completion. The simplest fix for this is to make allocations in this path non-reclaimable, with GFP_NOIO. With this patch, We couldn't hit the issue anymore. This should apply on top of Jens's for-next branch cleanly. Changes since v1: - Use GFP_NOIO instead of GFP_NOWAIT. Call Trace: [c000000f0160aaf0] [c000000f0160ab50] 0xc000000f0160ab50 (unreliable) [c000000f0160acc0] [c000000000016624] __switch_to+0x2e4/0x430 [c000000f0160ad20] [c000000000b1a880] __schedule+0x310/0x9b0 [c000000f0160ae00] [c000000000b1af68] schedule+0x48/0xc0 [c000000f0160ae30] [c000000000b1b4b0] schedule_preempt_disabled+0x20/0x30 [c000000f0160ae50] [c000000000b1d4fc] __mutex_lock_slowpath+0xec/0x1f0 [c000000f0160aed0] [c000000000b1d678] mutex_lock+0x78/0xa0 [c000000f0160af00] [d000000019413cac] xfs_reclaim_inodes_ag+0x33c/0x380 [xfs] [c000000f0160b0b0] [d000000019415164] xfs_reclaim_inodes_nr+0x54/0x70 [xfs] [c000000f0160b0f0] [d0000000194297f8] xfs_fs_free_cached_objects+0x38/0x60 [xfs] [c000000f0160b120] [c0000000003172c8] super_cache_scan+0x1f8/0x210 [c000000f0160b190] [c00000000026301c] shrink_slab.part.13+0x21c/0x4c0 [c000000f0160b2d0] [c000000000268088] shrink_zone+0x2d8/0x3c0 [c000000f0160b380] [c00000000026834c] do_try_to_free_pages+0x1dc/0x520 [c000000f0160b450] [c00000000026876c] try_to_free_pages+0xdc/0x250 [c000000f0160b4e0] [c000000000251978] __alloc_pages_nodemask+0x868/0x10d0 [c000000f0160b6f0] [c000000000567030] blk_mq_init_rq_map+0x160/0x380 [c000000f0160b7a0] [c00000000056758c] blk_mq_map_swqueue+0x33c/0x360 [c000000f0160b820] [c000000000567904] blk_mq_queue_reinit+0x64/0xb0 [c000000f0160b850] [c00000000056a16c] blk_mq_queue_reinit_notify+0x19c/0x250 [c000000f0160b8a0] [c0000000000f5d38] notifier_call_chain+0x98/0x100 [c000000f0160b8f0] [c0000000000c5fb0] __cpu_notify+0x70/0xe0 [c000000f0160b930] [c0000000000c63c4] notify_prepare+0x44/0xb0 [c000000f0160b9b0] [c0000000000c52f4] cpuhp_invoke_callback+0x84/0x250 [c000000f0160ba10] [c0000000000c570c] cpuhp_up_callbacks+0x5c/0x120 [c000000f0160ba60] [c0000000000c7cb8] _cpu_up+0xf8/0x1d0 [c000000f0160bac0] [c0000000000c7eb0] do_cpu_up+0x120/0x150 [c000000f0160bb40] [c0000000006fe024] cpu_subsys_online+0x64/0xe0 [c000000f0160bb90] [c0000000006f5124] device_online+0xb4/0x120 [c000000f0160bbd0] [c0000000006f5244] online_store+0xb4/0xc0 [c000000f0160bc20] [c0000000006f0a68] dev_attr_store+0x68/0xa0 [c000000f0160bc60] [c0000000003ccc30] sysfs_kf_write+0x80/0xb0 [c000000f0160bca0] [c0000000003cbabc] kernfs_fop_write+0x17c/0x250 [c000000f0160bcf0] [c00000000030fe6c] __vfs_write+0x6c/0x1e0 [c000000f0160bd90] [c000000000311490] vfs_write+0xd0/0x270 [c000000f0160bde0] [c0000000003131fc] SyS_write+0x6c/0x110 [c000000f0160be30] [c000000000009204] system_call+0x38/0xec Signed-off-by: Gabriel Krisman Bertazi <krisman@linux.vnet.ibm.com> Cc: Brian King <brking@linux.vnet.ibm.com> Cc: Douglas Miller <dougmill@linux.vnet.ibm.com> Cc: linux-block@vger.kernel.org Cc: linux-scsi@vger.kernel.org Signed-off-by: Jens Axboe <axboe@fb.com> Signed-off-by: Sumit Semwal <sumit.semwal@linaro.org> Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Diffstat (limited to 'block')
-rw-r--r--block/blk-mq.c6
1 files changed, 3 insertions, 3 deletions
diff --git a/block/blk-mq.c b/block/blk-mq.c
index ee54ad01f7ac..7b597ec4e9c5 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1474,7 +1474,7 @@ static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set,
1474 INIT_LIST_HEAD(&tags->page_list); 1474 INIT_LIST_HEAD(&tags->page_list);
1475 1475
1476 tags->rqs = kzalloc_node(set->queue_depth * sizeof(struct request *), 1476 tags->rqs = kzalloc_node(set->queue_depth * sizeof(struct request *),
1477 GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY, 1477 GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY,
1478 set->numa_node); 1478 set->numa_node);
1479 if (!tags->rqs) { 1479 if (!tags->rqs) {
1480 blk_mq_free_tags(tags); 1480 blk_mq_free_tags(tags);
@@ -1500,7 +1500,7 @@ static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set,
1500 1500
1501 do { 1501 do {
1502 page = alloc_pages_node(set->numa_node, 1502 page = alloc_pages_node(set->numa_node,
1503 GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY | __GFP_ZERO, 1503 GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY | __GFP_ZERO,
1504 this_order); 1504 this_order);
1505 if (page) 1505 if (page)
1506 break; 1506 break;
@@ -1521,7 +1521,7 @@ static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set,
1521 * Allow kmemleak to scan these pages as they contain pointers 1521 * Allow kmemleak to scan these pages as they contain pointers
1522 * to additional allocations like via ops->init_request(). 1522 * to additional allocations like via ops->init_request().
1523 */ 1523 */
1524 kmemleak_alloc(p, order_to_size(this_order), 1, GFP_KERNEL); 1524 kmemleak_alloc(p, order_to_size(this_order), 1, GFP_NOIO);
1525 entries_per_page = order_to_size(this_order) / rq_size; 1525 entries_per_page = order_to_size(this_order) / rq_size;
1526 to_do = min(entries_per_page, set->queue_depth - i); 1526 to_do = min(entries_per_page, set->queue_depth - i);
1527 left -= to_do * rq_size; 1527 left -= to_do * rq_size;