aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorMel Gorman <mgorman@suse.de>2012-11-26 19:29:45 -0500
committerLinus Torvalds <torvalds@linux-foundation.org>2012-11-26 20:41:24 -0500
commit82b212f40059bffd6808c07266a942d444d5558a (patch)
treebf0910ed6dade9445f2a8a7fc9d351565e0a45b1
parent05f564849d49499ced97913a0914b5950577d07d (diff)
Revert "mm: remove __GFP_NO_KSWAPD"
With "mm: vmscan: scale number of pages reclaimed by reclaim/compaction based on failures" reverted, Zdenek Kabelac reported the following: Hmm, so it's just took longer to hit the problem and observe kswapd0 spinning on my CPU again - it's not as endless like before - but still it easily eats minutes - it helps to turn off Firefox or TB (memory hungry apps) so kswapd0 stops soon - and restart those apps again. (And I still have like >1GB of cached memory) kswapd0 R running task 0 30 2 0x00000000 Call Trace: preempt_schedule+0x42/0x60 _raw_spin_unlock+0x55/0x60 put_super+0x31/0x40 drop_super+0x22/0x30 prune_super+0x149/0x1b0 shrink_slab+0xba/0x510 The sysrq+m indicates the system has no swap so it'll never reclaim anonymous pages as part of reclaim/compaction. That is one part of the problem but not the root cause as file-backed pages could also be reclaimed. The likely underlying problem is that kswapd is woken up or kept awake for each THP allocation request in the page allocator slow path. If compaction fails for the requesting process then compaction will be deferred for a time and direct reclaim is avoided. However, if there is a storm of THP requests that are simply rejected, it will still be the case that kswapd is awake for a prolonged period of time as pgdat->kswapd_max_order is updated each time. This is noticed by the main kswapd() loop and it will not call kswapd_try_to_sleep(). Instead it will loop, shrinking a small number of pages and calling shrink_slab() on each iteration. The temptation is to supply a patch that checks if kswapd was woken for THP and if so ignore pgdat->kswapd_max_order but it'll be a hack and not backed up by proper testing. As 3.7 is very close to release and this is not a bug we should release with, a safer path is to revert "mm: remove __GFP_NO_KSWAPD" for now and revisit it with the view to ironing out the balance_pgdat() logic in general. 
Signed-off-by: Mel Gorman <mgorman@suse.de> Cc: Zdenek Kabelac <zkabelac@redhat.com> Cc: Seth Jennings <sjenning@linux.vnet.ibm.com> Cc: Valdis Kletnieks <Valdis.Kletnieks@vt.edu> Cc: Jiri Slaby <jirislaby@gmail.com> Cc: Rik van Riel <riel@redhat.com> Cc: Robert Jennings <rcj@linux.vnet.ibm.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r--drivers/mtd/mtdcore.c6
-rw-r--r--include/linux/gfp.h5
-rw-r--r--include/trace/events/gfpflags.h1
-rw-r--r--mm/page_alloc.c7
4 files changed, 13 insertions, 6 deletions
diff --git a/drivers/mtd/mtdcore.c b/drivers/mtd/mtdcore.c
index 374c46dff7dd..ec794a72975d 100644
--- a/drivers/mtd/mtdcore.c
+++ b/drivers/mtd/mtdcore.c
@@ -1077,7 +1077,8 @@ EXPORT_SYMBOL_GPL(mtd_writev);
1077 * until the request succeeds or until the allocation size falls below 1077 * until the request succeeds or until the allocation size falls below
1078 * the system page size. This attempts to make sure it does not adversely 1078 * the system page size. This attempts to make sure it does not adversely
1079 * impact system performance, so when allocating more than one page, we 1079 * impact system performance, so when allocating more than one page, we
1080 * ask the memory allocator to avoid re-trying. 1080 * ask the memory allocator to avoid re-trying, swapping, writing back
1081 * or performing I/O.
1081 * 1082 *
1082 * Note, this function also makes sure that the allocated buffer is aligned to 1083 * Note, this function also makes sure that the allocated buffer is aligned to
1083 * the MTD device's min. I/O unit, i.e. the "mtd->writesize" value. 1084 * the MTD device's min. I/O unit, i.e. the "mtd->writesize" value.
@@ -1091,7 +1092,8 @@ EXPORT_SYMBOL_GPL(mtd_writev);
1091 */ 1092 */
1092void *mtd_kmalloc_up_to(const struct mtd_info *mtd, size_t *size) 1093void *mtd_kmalloc_up_to(const struct mtd_info *mtd, size_t *size)
1093{ 1094{
1094 gfp_t flags = __GFP_NOWARN | __GFP_WAIT | __GFP_NORETRY; 1095 gfp_t flags = __GFP_NOWARN | __GFP_WAIT |
1096 __GFP_NORETRY | __GFP_NO_KSWAPD;
1095 size_t min_alloc = max_t(size_t, mtd->writesize, PAGE_SIZE); 1097 size_t min_alloc = max_t(size_t, mtd->writesize, PAGE_SIZE);
1096 void *kbuf; 1098 void *kbuf;
1097 1099
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 02c1c9710be0..d0a79678f169 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -31,6 +31,7 @@ struct vm_area_struct;
31#define ___GFP_THISNODE 0x40000u 31#define ___GFP_THISNODE 0x40000u
32#define ___GFP_RECLAIMABLE 0x80000u 32#define ___GFP_RECLAIMABLE 0x80000u
33#define ___GFP_NOTRACK 0x200000u 33#define ___GFP_NOTRACK 0x200000u
34#define ___GFP_NO_KSWAPD 0x400000u
34#define ___GFP_OTHER_NODE 0x800000u 35#define ___GFP_OTHER_NODE 0x800000u
35#define ___GFP_WRITE 0x1000000u 36#define ___GFP_WRITE 0x1000000u
36 37
@@ -85,6 +86,7 @@ struct vm_area_struct;
85#define __GFP_RECLAIMABLE ((__force gfp_t)___GFP_RECLAIMABLE) /* Page is reclaimable */ 86#define __GFP_RECLAIMABLE ((__force gfp_t)___GFP_RECLAIMABLE) /* Page is reclaimable */
86#define __GFP_NOTRACK ((__force gfp_t)___GFP_NOTRACK) /* Don't track with kmemcheck */ 87#define __GFP_NOTRACK ((__force gfp_t)___GFP_NOTRACK) /* Don't track with kmemcheck */
87 88
89#define __GFP_NO_KSWAPD ((__force gfp_t)___GFP_NO_KSWAPD)
88#define __GFP_OTHER_NODE ((__force gfp_t)___GFP_OTHER_NODE) /* On behalf of other node */ 90#define __GFP_OTHER_NODE ((__force gfp_t)___GFP_OTHER_NODE) /* On behalf of other node */
89#define __GFP_WRITE ((__force gfp_t)___GFP_WRITE) /* Allocator intends to dirty page */ 91#define __GFP_WRITE ((__force gfp_t)___GFP_WRITE) /* Allocator intends to dirty page */
90 92
@@ -114,7 +116,8 @@ struct vm_area_struct;
114 __GFP_MOVABLE) 116 __GFP_MOVABLE)
115#define GFP_IOFS (__GFP_IO | __GFP_FS) 117#define GFP_IOFS (__GFP_IO | __GFP_FS)
116#define GFP_TRANSHUGE (GFP_HIGHUSER_MOVABLE | __GFP_COMP | \ 118#define GFP_TRANSHUGE (GFP_HIGHUSER_MOVABLE | __GFP_COMP | \
117 __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN) 119 __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN | \
120 __GFP_NO_KSWAPD)
118 121
119#ifdef CONFIG_NUMA 122#ifdef CONFIG_NUMA
120#define GFP_THISNODE (__GFP_THISNODE | __GFP_NOWARN | __GFP_NORETRY) 123#define GFP_THISNODE (__GFP_THISNODE | __GFP_NOWARN | __GFP_NORETRY)
diff --git a/include/trace/events/gfpflags.h b/include/trace/events/gfpflags.h
index 9391706e9254..d6fd8e5b14b7 100644
--- a/include/trace/events/gfpflags.h
+++ b/include/trace/events/gfpflags.h
@@ -36,6 +36,7 @@
36 {(unsigned long)__GFP_RECLAIMABLE, "GFP_RECLAIMABLE"}, \ 36 {(unsigned long)__GFP_RECLAIMABLE, "GFP_RECLAIMABLE"}, \
37 {(unsigned long)__GFP_MOVABLE, "GFP_MOVABLE"}, \ 37 {(unsigned long)__GFP_MOVABLE, "GFP_MOVABLE"}, \
38 {(unsigned long)__GFP_NOTRACK, "GFP_NOTRACK"}, \ 38 {(unsigned long)__GFP_NOTRACK, "GFP_NOTRACK"}, \
39 {(unsigned long)__GFP_NO_KSWAPD, "GFP_NO_KSWAPD"}, \
39 {(unsigned long)__GFP_OTHER_NODE, "GFP_OTHER_NODE"} \ 40 {(unsigned long)__GFP_OTHER_NODE, "GFP_OTHER_NODE"} \
40 ) : "GFP_NOWAIT" 41 ) : "GFP_NOWAIT"
41 42
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index bcb72c6e2b2d..92871579cbee 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2416,8 +2416,9 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
2416 goto nopage; 2416 goto nopage;
2417 2417
2418restart: 2418restart:
2419 wake_all_kswapd(order, zonelist, high_zoneidx, 2419 if (!(gfp_mask & __GFP_NO_KSWAPD))
2420 zone_idx(preferred_zone)); 2420 wake_all_kswapd(order, zonelist, high_zoneidx,
2421 zone_idx(preferred_zone));
2421 2422
2422 /* 2423 /*
2423 * OK, we're below the kswapd watermark and have kicked background 2424 * OK, we're below the kswapd watermark and have kicked background
@@ -2494,7 +2495,7 @@ rebalance:
2494 * system then fail the allocation instead of entering direct reclaim. 2495 * system then fail the allocation instead of entering direct reclaim.
2495 */ 2496 */
2496 if ((deferred_compaction || contended_compaction) && 2497 if ((deferred_compaction || contended_compaction) &&
2497 (gfp_mask & (__GFP_MOVABLE|__GFP_REPEAT)) == __GFP_MOVABLE) 2498 (gfp_mask & __GFP_NO_KSWAPD))
2498 goto nopage; 2499 goto nopage;
2499 2500
2500 /* Try direct reclaim and then allocating */ 2501 /* Try direct reclaim and then allocating */