From 88f5acf88ae6a9778f6d25d0d5d7ec2d57764a97 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 13 Jan 2011 15:45:41 -0800 Subject: mm: page allocator: adjust the per-cpu counter threshold when memory is low Commit aa45484 ("calculate a better estimate of NR_FREE_PAGES when memory is low") noted that watermarks were based on the vmstat NR_FREE_PAGES. To avoid synchronization overhead, these counters are maintained on a per-cpu basis and drained both periodically and when a threshold is above a threshold. On large CPU systems, the difference between the estimate and real value of NR_FREE_PAGES can be very high. The system can get into a case where pages are allocated far below the min watermark potentially causing livelock issues. The commit solved the problem by taking a better reading of NR_FREE_PAGES when memory was low. Unfortately, as reported by Shaohua Li this accurate reading can consume a large amount of CPU time on systems with many sockets due to cache line bouncing. This patch takes a different approach. For large machines where counter drift might be unsafe and while kswapd is awake, the per-cpu thresholds for the target pgdat are reduced to limit the level of drift to what should be a safe level. This incurs a performance penalty in heavy memory pressure by a factor that depends on the workload and the machine but the machine should function correctly without accidentally exhausting all memory on a node. There is an additional cost when kswapd wakes and sleeps but the event is not expected to be frequent - in Shaohua's test case, there was one recorded sleep and wake event at least. To ensure that kswapd wakes up, a safe version of zone_watermark_ok() is introduced that takes a more accurate reading of NR_FREE_PAGES when called from wakeup_kswapd, when deciding whether it is really safe to go back to sleep in sleeping_prematurely() and when deciding if a zone is really balanced or not in balance_pgdat(). We are still using an expensive function but limiting how often it is called. When the test case is reproduced, the time spent in the watermark functions is reduced. The following report is on the percentage of time spent cumulatively spent in the functions zone_nr_free_pages(), zone_watermark_ok(), __zone_watermark_ok(), zone_watermark_ok_safe(), zone_page_state_snapshot(), zone_page_state(). vanilla 11.6615% disable-threshold 0.2584% David said: : We had to pull aa454840 "mm: page allocator: calculate a better estimate : of NR_FREE_PAGES when memory is low and kswapd is awake" from 2.6.36 : internally because tests showed that it would cause the machine to stall : as the result of heavy kswapd activity. I merged it back with this fix as : it is pending in the -mm tree and it solves the issue we were seeing, so I : definitely think this should be pushed to -stable (and I would seriously : consider it for 2.6.37 inclusion even at this late date). Signed-off-by: Mel Gorman Reported-by: Shaohua Li Reviewed-by: Christoph Lameter Tested-by: Nicolas Bareil Cc: David Rientjes Cc: Kyle McMartin Cc: [2.6.37.1, 2.6.36.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vmstat.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux/vmstat.h') diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index eaaea37b3b75..e4cc21cf5870 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -254,6 +254,8 @@ extern void dec_zone_state(struct zone *, enum zone_stat_item); extern void __dec_zone_state(struct zone *, enum zone_stat_item); void refresh_cpu_vm_stats(int); +void reduce_pgdat_percpu_threshold(pg_data_t *pgdat); +void restore_pgdat_percpu_threshold(pg_data_t *pgdat); #else /* CONFIG_SMP */ /* @@ -298,6 +300,9 @@ static inline void __dec_zone_page_state(struct page *page, #define dec_zone_page_state __dec_zone_page_state #define mod_zone_page_state __mod_zone_page_state +static inline void reduce_pgdat_percpu_threshold(pg_data_t *pgdat) { } +static inline void restore_pgdat_percpu_threshold(pg_data_t *pgdat) { } + static inline void refresh_cpu_vm_stats(int cpu) { } #endif -- cgit v1.2.2 From b44129b30652c8771db2265939bb8b463724043d Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 13 Jan 2011 15:45:43 -0800 Subject: mm: vmstat: use a single setter function and callback for adjusting percpu thresholds reduce_pgdat_percpu_threshold() and restore_pgdat_percpu_threshold() exist to adjust the per-cpu vmstat thresholds while kswapd is awake to avoid errors due to counter drift. The functions duplicate some code so this patch replaces them with a single set_pgdat_percpu_threshold() that takes a callback function to calculate the desired threshold as a parameter. [akpm@linux-foundation.org: readability tweak] [kosaki.motohiro@jp.fujitsu.com: set_pgdat_percpu_threshold(): don't use for_each_online_cpu] Signed-off-by: Mel Gorman Reviewed-by: Christoph Lameter Reviewed-by: KAMEZAWA Hiroyuki Signed-off-by: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vmstat.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'include/linux/vmstat.h') diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index e4cc21cf5870..833e676d6d92 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -254,8 +254,11 @@ extern void dec_zone_state(struct zone *, enum zone_stat_item); extern void __dec_zone_state(struct zone *, enum zone_stat_item); void refresh_cpu_vm_stats(int); -void reduce_pgdat_percpu_threshold(pg_data_t *pgdat); -void restore_pgdat_percpu_threshold(pg_data_t *pgdat); + +int calculate_pressure_threshold(struct zone *zone); +int calculate_normal_threshold(struct zone *zone); +void set_pgdat_percpu_threshold(pg_data_t *pgdat, + int (*calculate_pressure)(struct zone *)); #else /* CONFIG_SMP */ /* @@ -300,8 +303,7 @@ static inline void __dec_zone_page_state(struct page *page, #define dec_zone_page_state __dec_zone_page_state #define mod_zone_page_state __mod_zone_page_state -static inline void reduce_pgdat_percpu_threshold(pg_data_t *pgdat) { } -static inline void restore_pgdat_percpu_threshold(pg_data_t *pgdat) { } +#define set_pgdat_percpu_threshold(pgdat, callback) { } static inline void refresh_cpu_vm_stats(int cpu) { } #endif -- cgit v1.2.2 From 78afd5612deb8268bafc8b6507d72341d5ed9aac Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 22 Mar 2011 16:33:12 -0700 Subject: mm: add __GFP_OTHER_NODE flag Add a new __GFP_OTHER_NODE flag to tell the low level numa statistics in zone_statistics() that an allocation is on behalf of another thread. This way the local and remote counters can be still correct, even when background daemons like khugepaged are changing memory mappings. This only affects the accounting, but I think it's worth doing that right to avoid confusing users. I first tried to just pass down the right node, but this required a lot of changes to pass down this parameter and at least one addition of a 10th argument to a 9 argument function. Using the flag is a lot less intrusive. Open: should be also used for migration? [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Andi Kleen Cc: Andrea Arcangeli Reviewed-by: KAMEZAWA Hiroyuki Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vmstat.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux/vmstat.h') diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 833e676d6d92..461c0119664f 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -220,12 +220,12 @@ static inline unsigned long node_page_state(int node, zone_page_state(&zones[ZONE_MOVABLE], item); } -extern void zone_statistics(struct zone *, struct zone *); +extern void zone_statistics(struct zone *, struct zone *, gfp_t gfp); #else #define node_page_state(node, item) global_page_state(item) -#define zone_statistics(_zl,_z) do { } while (0) +#define zone_statistics(_zl, _z, gfp) do { } while (0) #endif /* CONFIG_NUMA */ -- cgit v1.2.2 From 81ab4201fb7d91d6b0cd9ad5b4b16776e4bed145 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 14 Apr 2011 15:22:06 -0700 Subject: mm: add VM counters for transparent hugepages I found it difficult to make sense of transparent huge pages without having any counters for its actions. Add some counters to vmstat for allocation of transparent hugepages and fallback to smaller pages. Optional patch, but useful for development and understanding the system. Contains improvements from Andrea Arcangeli and Johannes Weiner [akpm@linux-foundation.org: coding-style fixes] [hannes@cmpxchg.org: fix vmstat_text[] entries] Signed-off-by: Andi Kleen Acked-by: Andrea Arcangeli Reviewed-by: KAMEZAWA Hiroyuki Signed-off-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vmstat.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux/vmstat.h') diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 461c0119664f..2b3831b58aa4 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -58,6 +58,13 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, UNEVICTABLE_PGCLEARED, /* on COW, page truncate */ UNEVICTABLE_PGSTRANDED, /* unable to isolate on unlock */ UNEVICTABLE_MLOCKFREED, +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + THP_FAULT_ALLOC, + THP_FAULT_FALLBACK, + THP_COLLAPSE_ALLOC, + THP_COLLAPSE_ALLOC_FAILED, + THP_SPLIT, +#endif NR_VM_EVENT_ITEMS }; -- cgit v1.2.2 From fa25c503dfa203b921199ea42c0046c89f2ed49f Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Tue, 24 May 2011 17:11:28 -0700 Subject: mm: per-node vmstat: show proper vmstats commit 2ac390370a ("writeback: add /sys/devices/system/node//vmstat") added vmstat entry. But strangely it only show nr_written and nr_dirtied. # cat /sys/devices/system/node/node20/vmstat nr_written 0 nr_dirtied 0 Of course, It's not adequate. With this patch, the vmstat show all vm stastics as /proc/vmstat. # cat /sys/devices/system/node/node0/vmstat nr_free_pages 899224 nr_inactive_anon 201 nr_active_anon 17380 nr_inactive_file 31572 nr_active_file 28277 nr_unevictable 0 nr_mlock 0 nr_anon_pages 17321 nr_mapped 8640 nr_file_pages 60107 nr_dirty 33 nr_writeback 0 nr_slab_reclaimable 6850 nr_slab_unreclaimable 7604 nr_page_table_pages 3105 nr_kernel_stack 175 nr_unstable 0 nr_bounce 0 nr_vmscan_write 0 nr_writeback_temp 0 nr_isolated_anon 0 nr_isolated_file 0 nr_shmem 260 nr_dirtied 1050 nr_written 938 numa_hit 962872 numa_miss 0 numa_foreign 0 numa_interleave 8617 numa_local 962872 numa_other 0 nr_anon_transparent_hugepages 0 [akpm@linux-foundation.org: no externs in .c files] Signed-off-by: KOSAKI Motohiro Cc: Michael Rubin Cc: Wu Fengguang Acked-by: David Rientjes Cc: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vmstat.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux/vmstat.h') diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 2b3831b58aa4..e73d1030f2f7 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -313,6 +313,8 @@ static inline void __dec_zone_page_state(struct page *page, #define set_pgdat_percpu_threshold(pgdat, callback) { } static inline void refresh_cpu_vm_stats(int cpu) { } -#endif +#endif /* CONFIG_SMP */ + +extern const char * const vmstat_text[]; #endif /* _LINUX_VMSTAT_H */ -- cgit v1.2.2 From a6cccdc36c966e51fd969560d870cfd37afbfa9c Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Tue, 24 May 2011 17:11:33 -0700 Subject: mm, mem-hotplug: update pcp->stat_threshold when memory hotplug occur Currently, cpu hotplug updates pcp->stat_threshold, but memory hotplug doesn't. There is no reason for this. [akpm@linux-foundation.org: fix CONFIG_SMP=n build] Signed-off-by: KOSAKI Motohiro Reviewed-by: KAMEZAWA Hiroyuki Acked-by: Mel Gorman Acked-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vmstat.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux/vmstat.h') diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index e73d1030f2f7..51359837511a 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -261,6 +261,7 @@ extern void dec_zone_state(struct zone *, enum zone_stat_item); extern void __dec_zone_state(struct zone *, enum zone_stat_item); void refresh_cpu_vm_stats(int); +void refresh_zone_stat_thresholds(void); int calculate_pressure_threshold(struct zone *zone); int calculate_normal_threshold(struct zone *zone); @@ -313,6 +314,8 @@ static inline void __dec_zone_page_state(struct page *page, #define set_pgdat_percpu_threshold(pgdat, callback) { } static inline void refresh_cpu_vm_stats(int cpu) { } +static inline void refresh_zone_stat_thresholds(void) { } + #endif /* CONFIG_SMP */ extern const char * const vmstat_text[]; -- cgit v1.2.2 From f042e707ee671e4beb5389abeb9a1819a2cf5532 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 26 May 2011 16:25:24 -0700 Subject: mm: move enum vm_event_item into a standalone header file enums are problematic because they cannot be forward-declared: akpm2:/home/akpm> cat t.c enum foo; static inline void bar(enum foo f) { } akpm2:/home/akpm> gcc -c t.c t.c:4: error: parameter 1 ('f') has incomplete type So move the enum's definition into a standalone header file which can be used wherever its definition is needed. Cc: Ying Han Cc: KAMEZAWA Hiroyuki Cc: KOSAKI Motohiro Cc: Minchan Kim Cc: Daisuke Nishimura Cc: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vmstat.h | 62 +------------------------------------------------- 1 file changed, 1 insertion(+), 61 deletions(-) (limited to 'include/linux/vmstat.h') diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 51359837511a..bcd942fa611c 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -5,69 +5,9 @@ #include #include #include +#include #include -#ifdef CONFIG_ZONE_DMA -#define DMA_ZONE(xx) xx##_DMA, -#else -#define DMA_ZONE(xx) -#endif - -#ifdef CONFIG_ZONE_DMA32 -#define DMA32_ZONE(xx) xx##_DMA32, -#else -#define DMA32_ZONE(xx) -#endif - -#ifdef CONFIG_HIGHMEM -#define HIGHMEM_ZONE(xx) , xx##_HIGH -#else -#define HIGHMEM_ZONE(xx) -#endif - - -#define FOR_ALL_ZONES(xx) DMA_ZONE(xx) DMA32_ZONE(xx) xx##_NORMAL HIGHMEM_ZONE(xx) , xx##_MOVABLE - -enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, - FOR_ALL_ZONES(PGALLOC), - PGFREE, PGACTIVATE, PGDEACTIVATE, - PGFAULT, PGMAJFAULT, - FOR_ALL_ZONES(PGREFILL), - FOR_ALL_ZONES(PGSTEAL), - FOR_ALL_ZONES(PGSCAN_KSWAPD), - FOR_ALL_ZONES(PGSCAN_DIRECT), -#ifdef CONFIG_NUMA - PGSCAN_ZONE_RECLAIM_FAILED, -#endif - PGINODESTEAL, SLABS_SCANNED, KSWAPD_STEAL, KSWAPD_INODESTEAL, - KSWAPD_LOW_WMARK_HIT_QUICKLY, KSWAPD_HIGH_WMARK_HIT_QUICKLY, - KSWAPD_SKIP_CONGESTION_WAIT, - PAGEOUTRUN, ALLOCSTALL, PGROTATED, -#ifdef CONFIG_COMPACTION - COMPACTBLOCKS, COMPACTPAGES, COMPACTPAGEFAILED, - COMPACTSTALL, COMPACTFAIL, COMPACTSUCCESS, -#endif -#ifdef CONFIG_HUGETLB_PAGE - HTLB_BUDDY_PGALLOC, HTLB_BUDDY_PGALLOC_FAIL, -#endif - UNEVICTABLE_PGCULLED, /* culled to noreclaim list */ - UNEVICTABLE_PGSCANNED, /* scanned for reclaimability */ - UNEVICTABLE_PGRESCUED, /* rescued from noreclaim list */ - UNEVICTABLE_PGMLOCKED, - UNEVICTABLE_PGMUNLOCKED, - UNEVICTABLE_PGCLEARED, /* on COW, page truncate */ - UNEVICTABLE_PGSTRANDED, /* unable to isolate on unlock */ - UNEVICTABLE_MLOCKFREED, -#ifdef CONFIG_TRANSPARENT_HUGEPAGE - THP_FAULT_ALLOC, - THP_FAULT_FALLBACK, - THP_COLLAPSE_ALLOC, - THP_COLLAPSE_ALLOC_FAILED, - THP_SPLIT, -#endif - NR_VM_EVENT_ITEMS -}; - extern int sysctl_stat_interval; #ifdef CONFIG_VM_EVENT_COUNTERS -- cgit v1.2.2