aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> 2010-08-09 20:19:27 -0400
committer Linus Torvalds <torvalds@linux-foundation.org> 2010-08-09 23:45:00 -0400
commit 25edde0332916ae706ccf83de688be57bcc844b7 (patch)
tree 35a5b0e651f9cdb48d9a55a748970339c4f681bc
parent b898cc70019ce1835bbf6c47bdf978adc36faa42 (diff)
vmscan: kill prev_priority completely
Since 2.6.28 zone->prev_priority has been unused, so it can be removed safely. This reduces stack usage slightly. Now I have to say that I'm sorry. Two years ago, I thought prev_priority could be integrated again and would be useful, but four (or more) attempts have not produced good performance numbers. Thus I am giving up on that approach. The rest of this changelog is notes on prev_priority, why it existed in the first place and why it might not be necessary any more. This information is based heavily on discussions between Andrew Morton, Rik van Riel and Kosaki Motohiro, who is heavily quoted. Historically prev_priority was important because it determined when the VM would start unmapping PTE pages, i.e. there are no balances of note within the VM, Anon vs File and Mapped vs Unmapped. Without prev_priority, there is a potential risk of unnecessarily increasing minor faults, as a large amount of read activity of use-once pages could push mapped pages to the end of the LRU and get them unmapped. There is no proof this is still a problem, but currently it is not considered to be one. Active files are not deactivated if the active file list is smaller than the inactive list, reducing the likelihood that file-mapped pages are pushed off the LRU, and referenced executable pages are kept on the active list to avoid them getting pushed out by read activity. Even if it is a problem, prev_priority wouldn't work nowadays. First of all, current vmscan still has a lot of UP-centric code, which exposes some weaknesses on machines with some dozens of CPUs. I think we need more and more improvement. The problem is that current vmscan mixes up per-system pressure, per-zone pressure and per-task pressure a bit. For example, prev_priority tries to boost the priority of other concurrent tasks, but if another task has a mempolicy restriction, this is not only unnecessary but also causes needless large latency and excessive reclaim. Per-task based priority + prev_priority adjustment emulates per-system pressure. 
But it has two issues: 1) it is too rough and brutal an emulation; 2) we need per-zone pressure, not per-system. Another example: currently DEF_PRIORITY is 12. This means the LRU rotates about 2 cycles (1/4096 + 1/2048 + 1/1024 + .. + 1) before invoking the OOM killer. But if 10,000 threads enter DEF_PRIORITY reclaim at the same time, the system has higher memory pressure than priority==0 (1/4096*10,000 > 2). prev_priority can't solve such multi-threaded workload issues. In other words, the prev_priority concept assumes the system doesn't have lots of threads." Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> Signed-off-by: Mel Gorman <mel@csn.ul.ie> Reviewed-by: Johannes Weiner <hannes@cmpxchg.org> Reviewed-by: Rik van Riel <riel@redhat.com> Cc: Dave Chinner <david@fromorbit.com> Cc: Chris Mason <chris.mason@oracle.com> Cc: Nick Piggin <npiggin@suse.de> Cc: Rik van Riel <riel@redhat.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: Christoph Hellwig <hch@infradead.org> Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> Cc: Andrea Arcangeli <aarcange@redhat.com> Cc: Michael Rubin <mrubin@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r-- include/linux/memcontrol.h 5
-rw-r--r-- include/linux/mmzone.h 15
-rw-r--r-- mm/memcontrol.c 31
-rw-r--r-- mm/page_alloc.c 2
-rw-r--r-- mm/vmscan.c 57
-rw-r--r-- mm/vmstat.c 2
6 files changed, 0 insertions, 112 deletions
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 9411d32840b0..9f1afd361583 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -98,11 +98,6 @@ extern void mem_cgroup_end_migration(struct mem_cgroup *mem,
98/* 98/*
99 * For memory reclaim. 99 * For memory reclaim.
100 */ 100 */
101extern int mem_cgroup_get_reclaim_priority(struct mem_cgroup *mem);
102extern void mem_cgroup_note_reclaim_priority(struct mem_cgroup *mem,
103 int priority);
104extern void mem_cgroup_record_reclaim_priority(struct mem_cgroup *mem,
105 int priority);
106int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg); 101int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg);
107int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg); 102int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg);
108unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg, 103unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg,
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 9ed9c459b14c..6e6e62648a4d 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -348,21 +348,6 @@ struct zone {
348 atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS]; 348 atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS];
349 349
350 /* 350 /*
351 * prev_priority holds the scanning priority for this zone. It is
352 * defined as the scanning priority at which we achieved our reclaim
353 * target at the previous try_to_free_pages() or balance_pgdat()
354 * invocation.
355 *
356 * We use prev_priority as a measure of how much stress page reclaim is
357 * under - it drives the swappiness decision: whether to unmap mapped
358 * pages.
359 *
360 * Access to both this field is quite racy even on uniprocessor. But
361 * it is expected to average out OK.
362 */
363 int prev_priority;
364
365 /*
366 * The target ratio of ACTIVE_ANON to INACTIVE_ANON pages on 351 * The target ratio of ACTIVE_ANON to INACTIVE_ANON pages on
367 * this zone's LRU. Maintained by the pageout code. 352 * this zone's LRU. Maintained by the pageout code.
368 */ 353 */
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 20a8193a7af8..31abd1c2c0c5 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -211,8 +211,6 @@ struct mem_cgroup {
211 */ 211 */
212 spinlock_t reclaim_param_lock; 212 spinlock_t reclaim_param_lock;
213 213
214 int prev_priority; /* for recording reclaim priority */
215
216 /* 214 /*
217 * While reclaiming in a hierarchy, we cache the last child we 215 * While reclaiming in a hierarchy, we cache the last child we
218 * reclaimed from. 216 * reclaimed from.
@@ -858,35 +856,6 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
858 return ret; 856 return ret;
859} 857}
860 858
861/*
862 * prev_priority control...this will be used in memory reclaim path.
863 */
864int mem_cgroup_get_reclaim_priority(struct mem_cgroup *mem)
865{
866 int prev_priority;
867
868 spin_lock(&mem->reclaim_param_lock);
869 prev_priority = mem->prev_priority;
870 spin_unlock(&mem->reclaim_param_lock);
871
872 return prev_priority;
873}
874
875void mem_cgroup_note_reclaim_priority(struct mem_cgroup *mem, int priority)
876{
877 spin_lock(&mem->reclaim_param_lock);
878 if (priority < mem->prev_priority)
879 mem->prev_priority = priority;
880 spin_unlock(&mem->reclaim_param_lock);
881}
882
883void mem_cgroup_record_reclaim_priority(struct mem_cgroup *mem, int priority)
884{
885 spin_lock(&mem->reclaim_param_lock);
886 mem->prev_priority = priority;
887 spin_unlock(&mem->reclaim_param_lock);
888}
889
890static int calc_inactive_ratio(struct mem_cgroup *memcg, unsigned long *present_pages) 859static int calc_inactive_ratio(struct mem_cgroup *memcg, unsigned long *present_pages)
891{ 860{
892 unsigned long active; 861 unsigned long active;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 33c6b4c1277b..a9649f4b261e 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4100,8 +4100,6 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
4100 zone_seqlock_init(zone); 4100 zone_seqlock_init(zone);
4101 zone->zone_pgdat = pgdat; 4101 zone->zone_pgdat = pgdat;
4102 4102
4103 zone->prev_priority = DEF_PRIORITY;
4104
4105 zone_pcp_init(zone); 4103 zone_pcp_init(zone);
4106 for_each_lru(l) { 4104 for_each_lru(l) {
4107 INIT_LIST_HEAD(&zone->lru[l].list); 4105 INIT_LIST_HEAD(&zone->lru[l].list);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index b7a4e6a3cf89..594eba8a44c0 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1290,20 +1290,6 @@ done:
1290} 1290}
1291 1291
1292/* 1292/*
1293 * We are about to scan this zone at a certain priority level. If that priority
1294 * level is smaller (ie: more urgent) than the previous priority, then note
1295 * that priority level within the zone. This is done so that when the next
1296 * process comes in to scan this zone, it will immediately start out at this
1297 * priority level rather than having to build up its own scanning priority.
1298 * Here, this priority affects only the reclaim-mapped threshold.
1299 */
1300static inline void note_zone_scanning_priority(struct zone *zone, int priority)
1301{
1302 if (priority < zone->prev_priority)
1303 zone->prev_priority = priority;
1304}
1305
1306/*
1307 * This moves pages from the active list to the inactive list. 1293 * This moves pages from the active list to the inactive list.
1308 * 1294 *
1309 * We move them the other way if the page is referenced by one or more 1295 * We move them the other way if the page is referenced by one or more
@@ -1766,17 +1752,8 @@ static bool shrink_zones(int priority, struct zonelist *zonelist,
1766 if (scanning_global_lru(sc)) { 1752 if (scanning_global_lru(sc)) {
1767 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) 1753 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
1768 continue; 1754 continue;
1769 note_zone_scanning_priority(zone, priority);
1770
1771 if (zone->all_unreclaimable && priority != DEF_PRIORITY) 1755 if (zone->all_unreclaimable && priority != DEF_PRIORITY)
1772 continue; /* Let kswapd poll it */ 1756 continue; /* Let kswapd poll it */
1773 } else {
1774 /*
1775 * Ignore cpuset limitation here. We just want to reduce
1776 * # of used pages by us regardless of memory shortage.
1777 */
1778 mem_cgroup_note_reclaim_priority(sc->mem_cgroup,
1779 priority);
1780 } 1757 }
1781 1758
1782 shrink_zone(priority, zone, sc); 1759 shrink_zone(priority, zone, sc);
@@ -1877,17 +1854,6 @@ out:
1877 if (priority < 0) 1854 if (priority < 0)
1878 priority = 0; 1855 priority = 0;
1879 1856
1880 if (scanning_global_lru(sc)) {
1881 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
1882
1883 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
1884 continue;
1885
1886 zone->prev_priority = priority;
1887 }
1888 } else
1889 mem_cgroup_record_reclaim_priority(sc->mem_cgroup, priority);
1890
1891 delayacct_freepages_end(); 1857 delayacct_freepages_end();
1892 put_mems_allowed(); 1858 put_mems_allowed();
1893 1859
@@ -2053,22 +2019,12 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order)
2053 .order = order, 2019 .order = order,
2054 .mem_cgroup = NULL, 2020 .mem_cgroup = NULL,
2055 }; 2021 };
2056 /*
2057 * temp_priority is used to remember the scanning priority at which
2058 * this zone was successfully refilled to
2059 * free_pages == high_wmark_pages(zone).
2060 */
2061 int temp_priority[MAX_NR_ZONES];
2062
2063loop_again: 2022loop_again:
2064 total_scanned = 0; 2023 total_scanned = 0;
2065 sc.nr_reclaimed = 0; 2024 sc.nr_reclaimed = 0;
2066 sc.may_writepage = !laptop_mode; 2025 sc.may_writepage = !laptop_mode;
2067 count_vm_event(PAGEOUTRUN); 2026 count_vm_event(PAGEOUTRUN);
2068 2027
2069 for (i = 0; i < pgdat->nr_zones; i++)
2070 temp_priority[i] = DEF_PRIORITY;
2071
2072 for (priority = DEF_PRIORITY; priority >= 0; priority--) { 2028 for (priority = DEF_PRIORITY; priority >= 0; priority--) {
2073 int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ 2029 int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */
2074 unsigned long lru_pages = 0; 2030 unsigned long lru_pages = 0;
@@ -2136,9 +2092,7 @@ loop_again:
2136 if (zone->all_unreclaimable && priority != DEF_PRIORITY) 2092 if (zone->all_unreclaimable && priority != DEF_PRIORITY)
2137 continue; 2093 continue;
2138 2094
2139 temp_priority[i] = priority;
2140 sc.nr_scanned = 0; 2095 sc.nr_scanned = 0;
2141 note_zone_scanning_priority(zone, priority);
2142 2096
2143 nid = pgdat->node_id; 2097 nid = pgdat->node_id;
2144 zid = zone_idx(zone); 2098 zid = zone_idx(zone);
@@ -2211,16 +2165,6 @@ loop_again:
2211 break; 2165 break;
2212 } 2166 }
2213out: 2167out:
2214 /*
2215 * Note within each zone the priority level at which this zone was
2216 * brought into a happy state. So that the next thread which scans this
2217 * zone will start out at that priority level.
2218 */
2219 for (i = 0; i < pgdat->nr_zones; i++) {
2220 struct zone *zone = pgdat->node_zones + i;
2221
2222 zone->prev_priority = temp_priority[i];
2223 }
2224 if (!all_zones_ok) { 2168 if (!all_zones_ok) {
2225 cond_resched(); 2169 cond_resched();
2226 2170
@@ -2639,7 +2583,6 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
2639 */ 2583 */
2640 priority = ZONE_RECLAIM_PRIORITY; 2584 priority = ZONE_RECLAIM_PRIORITY;
2641 do { 2585 do {
2642 note_zone_scanning_priority(zone, priority);
2643 shrink_zone(priority, zone, &sc); 2586 shrink_zone(priority, zone, &sc);
2644 priority--; 2587 priority--;
2645 } while (priority >= 0 && sc.nr_reclaimed < nr_pages); 2588 } while (priority >= 0 && sc.nr_reclaimed < nr_pages);
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 15a14b16e176..f389168f9a83 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -853,11 +853,9 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
853 } 853 }
854 seq_printf(m, 854 seq_printf(m,
855 "\n all_unreclaimable: %u" 855 "\n all_unreclaimable: %u"
856 "\n prev_priority: %i"
857 "\n start_pfn: %lu" 856 "\n start_pfn: %lu"
858 "\n inactive_ratio: %u", 857 "\n inactive_ratio: %u",
859 zone->all_unreclaimable, 858 zone->all_unreclaimable,
860 zone->prev_priority,
861 zone->zone_start_pfn, 859 zone->zone_start_pfn,
862 zone->inactive_ratio); 860 zone->inactive_ratio);
863 seq_putc(m, '\n'); 861 seq_putc(m, '\n');