aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorGabriel Krisman Bertazi <krisman@linux.vnet.ibm.com>2016-12-06 10:31:44 -0500
committerJens Axboe <axboe@fb.com>2016-12-14 10:15:25 -0500
commit36e1f3d107867b25c616c2fd294f5a1c9d4e5d09 (patch)
tree7d7b233866fc76151d070706316bbe2d8c7f9c6d
parent7a62a52333f8b5b82753d9529c11b3404bc5c183 (diff)
blk-mq: Avoid memory reclaim when remapping queues
While stressing memory and IO at the same time we changed SMT settings, we were able to consistently trigger deadlocks in the mm system, which froze the entire machine. I think that under memory stress conditions, the large allocations performed by blk_mq_init_rq_map may trigger a reclaim, which stalls waiting on the block layer remmaping completion, thus deadlocking the system. The trace below was collected after the machine stalled, waiting for the hotplug event completion. The simplest fix for this is to make allocations in this path non-reclaimable, with GFP_NOIO. With this patch, We couldn't hit the issue anymore. This should apply on top of Jens's for-next branch cleanly. Changes since v1: - Use GFP_NOIO instead of GFP_NOWAIT. Call Trace: [c000000f0160aaf0] [c000000f0160ab50] 0xc000000f0160ab50 (unreliable) [c000000f0160acc0] [c000000000016624] __switch_to+0x2e4/0x430 [c000000f0160ad20] [c000000000b1a880] __schedule+0x310/0x9b0 [c000000f0160ae00] [c000000000b1af68] schedule+0x48/0xc0 [c000000f0160ae30] [c000000000b1b4b0] schedule_preempt_disabled+0x20/0x30 [c000000f0160ae50] [c000000000b1d4fc] __mutex_lock_slowpath+0xec/0x1f0 [c000000f0160aed0] [c000000000b1d678] mutex_lock+0x78/0xa0 [c000000f0160af00] [d000000019413cac] xfs_reclaim_inodes_ag+0x33c/0x380 [xfs] [c000000f0160b0b0] [d000000019415164] xfs_reclaim_inodes_nr+0x54/0x70 [xfs] [c000000f0160b0f0] [d0000000194297f8] xfs_fs_free_cached_objects+0x38/0x60 [xfs] [c000000f0160b120] [c0000000003172c8] super_cache_scan+0x1f8/0x210 [c000000f0160b190] [c00000000026301c] shrink_slab.part.13+0x21c/0x4c0 [c000000f0160b2d0] [c000000000268088] shrink_zone+0x2d8/0x3c0 [c000000f0160b380] [c00000000026834c] do_try_to_free_pages+0x1dc/0x520 [c000000f0160b450] [c00000000026876c] try_to_free_pages+0xdc/0x250 [c000000f0160b4e0] [c000000000251978] __alloc_pages_nodemask+0x868/0x10d0 [c000000f0160b6f0] [c000000000567030] blk_mq_init_rq_map+0x160/0x380 [c000000f0160b7a0] [c00000000056758c] blk_mq_map_swqueue+0x33c/0x360 [c000000f0160b820] [c000000000567904] blk_mq_queue_reinit+0x64/0xb0 [c000000f0160b850] [c00000000056a16c] blk_mq_queue_reinit_notify+0x19c/0x250 [c000000f0160b8a0] [c0000000000f5d38] notifier_call_chain+0x98/0x100 [c000000f0160b8f0] [c0000000000c5fb0] __cpu_notify+0x70/0xe0 [c000000f0160b930] [c0000000000c63c4] notify_prepare+0x44/0xb0 [c000000f0160b9b0] [c0000000000c52f4] cpuhp_invoke_callback+0x84/0x250 [c000000f0160ba10] [c0000000000c570c] cpuhp_up_callbacks+0x5c/0x120 [c000000f0160ba60] [c0000000000c7cb8] _cpu_up+0xf8/0x1d0 [c000000f0160bac0] [c0000000000c7eb0] do_cpu_up+0x120/0x150 [c000000f0160bb40] [c0000000006fe024] cpu_subsys_online+0x64/0xe0 [c000000f0160bb90] [c0000000006f5124] device_online+0xb4/0x120 [c000000f0160bbd0] [c0000000006f5244] online_store+0xb4/0xc0 [c000000f0160bc20] [c0000000006f0a68] dev_attr_store+0x68/0xa0 [c000000f0160bc60] [c0000000003ccc30] sysfs_kf_write+0x80/0xb0 [c000000f0160bca0] [c0000000003cbabc] kernfs_fop_write+0x17c/0x250 [c000000f0160bcf0] [c00000000030fe6c] __vfs_write+0x6c/0x1e0 [c000000f0160bd90] [c000000000311490] vfs_write+0xd0/0x270 [c000000f0160bde0] [c0000000003131fc] SyS_write+0x6c/0x110 [c000000f0160be30] [c000000000009204] system_call+0x38/0xec Signed-off-by: Gabriel Krisman Bertazi <krisman@linux.vnet.ibm.com> Cc: Brian King <brking@linux.vnet.ibm.com> Cc: Douglas Miller <dougmill@linux.vnet.ibm.com> Cc: linux-block@vger.kernel.org Cc: linux-scsi@vger.kernel.org Signed-off-by: Jens Axboe <axboe@fb.com>
-rw-r--r--block/blk-mq.c6
1 files changed, 3 insertions, 3 deletions
diff --git a/block/blk-mq.c b/block/blk-mq.c
index d79fdc11b1ee..7ad7c11fe01d 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1605,7 +1605,7 @@ static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set,
1605 INIT_LIST_HEAD(&tags->page_list); 1605 INIT_LIST_HEAD(&tags->page_list);
1606 1606
1607 tags->rqs = kzalloc_node(set->queue_depth * sizeof(struct request *), 1607 tags->rqs = kzalloc_node(set->queue_depth * sizeof(struct request *),
1608 GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY, 1608 GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY,
1609 set->numa_node); 1609 set->numa_node);
1610 if (!tags->rqs) { 1610 if (!tags->rqs) {
1611 blk_mq_free_tags(tags); 1611 blk_mq_free_tags(tags);
@@ -1631,7 +1631,7 @@ static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set,
1631 1631
1632 do { 1632 do {
1633 page = alloc_pages_node(set->numa_node, 1633 page = alloc_pages_node(set->numa_node,
1634 GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY | __GFP_ZERO, 1634 GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY | __GFP_ZERO,
1635 this_order); 1635 this_order);
1636 if (page) 1636 if (page)
1637 break; 1637 break;
@@ -1652,7 +1652,7 @@ static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set,
1652 * Allow kmemleak to scan these pages as they contain pointers 1652 * Allow kmemleak to scan these pages as they contain pointers
1653 * to additional allocations like via ops->init_request(). 1653 * to additional allocations like via ops->init_request().
1654 */ 1654 */
1655 kmemleak_alloc(p, order_to_size(this_order), 1, GFP_KERNEL); 1655 kmemleak_alloc(p, order_to_size(this_order), 1, GFP_NOIO);
1656 entries_per_page = order_to_size(this_order) / rq_size; 1656 entries_per_page = order_to_size(this_order) / rq_size;
1657 to_do = min(entries_per_page, set->queue_depth - i); 1657 to_do = min(entries_per_page, set->queue_depth - i);
1658 left -= to_do * rq_size; 1658 left -= to_do * rq_size;