diff options
author | Hugh Dickins <hughd@google.com> | 2012-03-21 19:33:53 -0400 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2012-03-21 20:54:56 -0400 |
commit | 8575ec29f61da83a2bf382c8c490499dc022101e (patch) | |
tree | bba641ceec47b52ce2a91bdf117aed01d476ea9f /mm | |
parent | aff622495c9a0b56148192e53bdec539f5e147f2 (diff) |
compact_pgdat: workaround lockdep warning in kswapd
I get this lockdep warning from swapping load on linux-next, due to
"vmscan: kswapd carefully call compaction".
=================================
[ INFO: inconsistent lock state ]
3.3.0-rc2-next-20120201 #5 Not tainted
---------------------------------
inconsistent {RECLAIM_FS-ON-W} -> {IN-RECLAIM_FS-W} usage.
kswapd0/28 [HC0[0]:SC0[0]:HE1:SE1] takes:
(pcpu_alloc_mutex){+.+.?.}, at: [<ffffffff810d6684>] pcpu_alloc+0x67/0x325
{RECLAIM_FS-ON-W} state was registered at:
[<ffffffff81099b75>] mark_held_locks+0xd7/0x103
[<ffffffff8109a13c>] lockdep_trace_alloc+0x85/0x9e
[<ffffffff810f6bdc>] __kmalloc+0x6c/0x14b
[<ffffffff810d57fd>] pcpu_mem_zalloc+0x59/0x62
[<ffffffff810d5d16>] pcpu_extend_area_map+0x26/0xb1
[<ffffffff810d679f>] pcpu_alloc+0x182/0x325
[<ffffffff810d694d>] __alloc_percpu+0xb/0xd
[<ffffffff8142ebfd>] snmp_mib_init+0x1e/0x2e
[<ffffffff8185cd8d>] ipv4_mib_init_net+0x7a/0x184
[<ffffffff813dc963>] ops_init.clone.0+0x6b/0x73
[<ffffffff813dc9cc>] register_pernet_operations+0x61/0xa0
[<ffffffff813dca8e>] register_pernet_subsys+0x29/0x42
[<ffffffff8185d044>] inet_init+0x1ad/0x252
[<ffffffff810002e3>] do_one_initcall+0x7a/0x12f
[<ffffffff81832bc5>] kernel_init+0x9d/0x11e
[<ffffffff814e51e4>] kernel_thread_helper+0x4/0x10
irq event stamp: 656613
hardirqs last enabled at (656613): [<ffffffff814e0ddc>] __mutex_unlock_slowpath+0x104/0x128
hardirqs last disabled at (656612): [<ffffffff814e0d34>] __mutex_unlock_slowpath+0x5c/0x128
softirqs last enabled at (655568): [<ffffffff8105b4a5>] __do_softirq+0x120/0x136
softirqs last disabled at (654757): [<ffffffff814e52dc>] call_softirq+0x1c/0x30
other info that might help us debug this:
Possible unsafe locking scenario:
CPU0
----
lock(pcpu_alloc_mutex);
<Interrupt>
lock(pcpu_alloc_mutex);
*** DEADLOCK ***
no locks held by kswapd0/28.
stack backtrace:
Pid: 28, comm: kswapd0 Not tainted 3.3.0-rc2-next-20120201 #5
Call Trace:
[<ffffffff810981f4>] print_usage_bug+0x1bf/0x1d0
[<ffffffff81096c3e>] ? print_irq_inversion_bug+0x1d9/0x1d9
[<ffffffff810982c0>] mark_lock_irq+0xbb/0x22e
[<ffffffff810c5399>] ? free_hot_cold_page+0x13d/0x14f
[<ffffffff81098684>] mark_lock+0x251/0x331
[<ffffffff81098893>] mark_irqflags+0x12f/0x141
[<ffffffff81098e32>] __lock_acquire+0x58d/0x753
[<ffffffff810d6684>] ? pcpu_alloc+0x67/0x325
[<ffffffff81099433>] lock_acquire+0x54/0x6a
[<ffffffff810d6684>] ? pcpu_alloc+0x67/0x325
[<ffffffff8107a5b8>] ? add_preempt_count+0xa9/0xae
[<ffffffff814e0a21>] mutex_lock_nested+0x5e/0x315
[<ffffffff810d6684>] ? pcpu_alloc+0x67/0x325
[<ffffffff81098f81>] ? __lock_acquire+0x6dc/0x753
[<ffffffff810c9fb0>] ? __pagevec_release+0x2c/0x2c
[<ffffffff810d6684>] pcpu_alloc+0x67/0x325
[<ffffffff810c9fb0>] ? __pagevec_release+0x2c/0x2c
[<ffffffff810d694d>] __alloc_percpu+0xb/0xd
[<ffffffff8106c35e>] schedule_on_each_cpu+0x23/0x110
[<ffffffff810c9fcb>] lru_add_drain_all+0x10/0x12
[<ffffffff810f126f>] __compact_pgdat+0x20/0x182
[<ffffffff810f15c2>] compact_pgdat+0x27/0x29
[<ffffffff810c306b>] ? zone_watermark_ok+0x1a/0x1c
[<ffffffff810cdf6f>] balance_pgdat+0x732/0x751
[<ffffffff810ce0ed>] kswapd+0x15f/0x178
[<ffffffff810cdf8e>] ? balance_pgdat+0x751/0x751
[<ffffffff8106fd11>] kthread+0x84/0x8c
[<ffffffff814e51e4>] kernel_thread_helper+0x4/0x10
[<ffffffff810787ed>] ? finish_task_switch+0x85/0xea
[<ffffffff814e3861>] ? retint_restore_args+0xe/0xe
[<ffffffff8106fc8d>] ? __init_kthread_worker+0x56/0x56
[<ffffffff814e51e0>] ? gs_change+0xb/0xb
The RECLAIM_FS notations indicate that it's doing the GFP_FS checking that
Nick hacked into lockdep a while back: I think we're intended to read that
"<Interrupt>" in the DEADLOCK scenario as "<Direct reclaim>".
I'm hazy, I have not reached any conclusion as to whether it's right to
complain or not; but I believe it's uneasy about kswapd now doing the
mutex_lock(&pcpu_alloc_mutex) which lru_add_drain_all() entails. Nor have
I reached any conclusion as to whether it's important for kswapd to do
that draining or not.
But so as not to get blocked on this, with lockdep disabled from giving
further reports, here's a patch which removes the lru_add_drain_all() from
kswapd's callpath (and calls it only once from compact_nodes(), instead of
once per node).
Signed-off-by: Hugh Dickins <hughd@google.com>
Cc: Rik van Riel <riel@redhat.com>
Acked-by: Mel Gorman <mel@csn.ul.ie>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm')
-rw-r--r-- | mm/compaction.c | 22 |
1 files changed, 12 insertions, 10 deletions
diff --git a/mm/compaction.c b/mm/compaction.c index c4b344a95032..a08bf219f88c 100644 --- a/mm/compaction.c +++ b/mm/compaction.c | |||
@@ -680,9 +680,6 @@ static int __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc) | |||
680 | int zoneid; | 680 | int zoneid; |
681 | struct zone *zone; | 681 | struct zone *zone; |
682 | 682 | ||
683 | /* Flush pending updates to the LRU lists */ | ||
684 | lru_add_drain_all(); | ||
685 | |||
686 | for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) { | 683 | for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) { |
687 | 684 | ||
688 | zone = &pgdat->node_zones[zoneid]; | 685 | zone = &pgdat->node_zones[zoneid]; |
@@ -727,17 +724,12 @@ int compact_pgdat(pg_data_t *pgdat, int order) | |||
727 | 724 | ||
728 | static int compact_node(int nid) | 725 | static int compact_node(int nid) |
729 | { | 726 | { |
730 | pg_data_t *pgdat; | ||
731 | struct compact_control cc = { | 727 | struct compact_control cc = { |
732 | .order = -1, | 728 | .order = -1, |
733 | .sync = true, | 729 | .sync = true, |
734 | }; | 730 | }; |
735 | 731 | ||
736 | if (nid < 0 || nid >= nr_node_ids || !node_online(nid)) | 732 | return __compact_pgdat(NODE_DATA(nid), &cc); |
737 | return -EINVAL; | ||
738 | pgdat = NODE_DATA(nid); | ||
739 | |||
740 | return __compact_pgdat(pgdat, &cc); | ||
741 | } | 733 | } |
742 | 734 | ||
743 | /* Compact all nodes in the system */ | 735 | /* Compact all nodes in the system */ |
@@ -745,6 +737,9 @@ static int compact_nodes(void) | |||
745 | { | 737 | { |
746 | int nid; | 738 | int nid; |
747 | 739 | ||
740 | /* Flush pending updates to the LRU lists */ | ||
741 | lru_add_drain_all(); | ||
742 | |||
748 | for_each_online_node(nid) | 743 | for_each_online_node(nid) |
749 | compact_node(nid); | 744 | compact_node(nid); |
750 | 745 | ||
@@ -777,7 +772,14 @@ ssize_t sysfs_compact_node(struct device *dev, | |||
777 | struct device_attribute *attr, | 772 | struct device_attribute *attr, |
778 | const char *buf, size_t count) | 773 | const char *buf, size_t count) |
779 | { | 774 | { |
780 | compact_node(dev->id); | 775 | int nid = dev->id; |
776 | |||
777 | if (nid >= 0 && nid < nr_node_ids && node_online(nid)) { | ||
778 | /* Flush pending updates to the LRU lists */ | ||
779 | lru_add_drain_all(); | ||
780 | |||
781 | compact_node(nid); | ||
782 | } | ||
781 | 783 | ||
782 | return count; | 784 | return count; |
783 | } | 785 | } |