path: root/mm
author	Christoph Lameter <clameter@sgi.com>	2007-02-10 04:43:01 -0500
committer	Linus Torvalds <torvalds@woody.linux-foundation.org>	2007-02-11 13:51:17 -0500
commitc878538598d1e7ab41ecc0de8894e34e2fdef630 (patch)
treed22e73fddef75521e287c3e7754a1d3224c348d9 /mm
parentc3704ceb4ad055b489b143f4e37c57d128908012 (diff)
[PATCH] Use ZVC for inactive and active counts
The determination of the dirty ratio used to control writeback behavior is currently based on the total number of pages in the system.

However, not all pages in the system may be dirtied. Thus the ratio is always too low and can never reach 100%. The ratio may be particularly skewed if large hugepage allocations, slab allocations or device driver buffers make large sections of memory unavailable. In that case we may get into a situation in which e.g. the background writeback ratio of 40% can no longer be reached, which leads to undesired writeback behavior.

This patchset fixes that issue by determining the ratio based on the pages that may actually become dirty: the pages on the active and inactive lists plus the free pages.

The problem with those counts has so far been that they are expensive to calculate, because counts from multiple nodes and multiple zones have to be summed up. This patchset makes these counters ZVC counters, meaning that a current sum per zone, per node and for the whole system is always available via global variables and is no longer expensive to calculate.

The patchset has some other good side effects:

- Removal of the various functions that sum up free, active and inactive page counts

- Cleanup of the functions that display information via the proc filesystem

This patch:

The use of a ZVC for nr_inactive and nr_active allows a simplification of some counter operations. More ZVC functionality is used for sums etc. in the following patches.

[akpm@osdl.org: UP build fix]
Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
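For readers not familiar with ZVC (the zoned VM counter framework in mm/vmstat.c): the conversion makes the active/inactive counts cheap to read because each CPU batches small differentials locally and only folds them into the shared per-zone and global totals when a threshold is crossed, so zone_page_state() and global_page_state() become simple reads instead of sums over all zones. The following is a minimal, self-contained userspace model of that idea, not the kernel code itself; the names (toy_zone, zvc_mod, ZVC_THRESHOLD) are illustrative stand-ins.

/*
 * Minimal userspace model of a zoned VM counter (ZVC).  Each "CPU" keeps a
 * small signed differential per counter and folds it into the shared zone
 * and global totals only when it crosses a threshold, so reading a total
 * never requires walking zones or CPUs.  All names here are illustrative.
 */
#include <stdio.h>

#define NR_CPUS       4
#define ZVC_THRESHOLD 8		/* fold per-CPU deltas after ~8 events */

enum zvc_item { TOY_NR_ACTIVE, TOY_NR_INACTIVE, NR_TOY_ITEMS };

struct toy_zone {
	long vm_stat[NR_TOY_ITEMS];				/* cheap-to-read totals */
	signed char vm_stat_diff[NR_CPUS][NR_TOY_ITEMS];	/* per-CPU deltas */
};

static long toy_global_stat[NR_TOY_ITEMS];

/* Writer side: cheap per-CPU update, occasional fold into the totals. */
static void zvc_mod(struct toy_zone *z, int cpu, enum zvc_item item, int delta)
{
	signed char *p = &z->vm_stat_diff[cpu][item];
	int v = *p + delta;

	if (v > ZVC_THRESHOLD || v < -ZVC_THRESHOLD) {
		z->vm_stat[item] += v;
		toy_global_stat[item] += v;
		v = 0;
	}
	*p = (signed char)v;
}

/* Reader side: a single load replaces the old sum over all zones. */
static long zvc_zone_state(struct toy_zone *z, enum zvc_item item)
{
	return z->vm_stat[item];
}

static long zvc_global_state(enum zvc_item item)
{
	return toy_global_stat[item];
}

int main(void)
{
	static struct toy_zone zone;	/* zero-initialized */
	int i;

	for (i = 0; i < 1000; i++)	/* 1000 pages become active */
		zvc_mod(&zone, i % NR_CPUS, TOY_NR_ACTIVE, 1);
	for (i = 0; i < 300; i++)	/* 300 of them are deactivated */
		zvc_mod(&zone, i % NR_CPUS, TOY_NR_ACTIVE, -1);

	printf("zone active (approx):   %ld\n", zvc_zone_state(&zone, TOY_NR_ACTIVE));
	printf("global active (approx): %ld\n", zvc_global_state(TOY_NR_ACTIVE));
	return 0;
}

In the real kernel the per-CPU differential lives in struct per_cpu_pageset (vm_stat_diff, visible in the __inc_zone_state/__dec_zone_state hunks below), and a fold updates both the zone's and the global vm_stat totals, which is what makes the totals slightly approximate but always available.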
Diffstat (limited to 'mm')
-rw-r--r--	mm/page_alloc.c	 6
-rw-r--r--	mm/vmscan.c	51
-rw-r--r--	mm/vmstat.c	28
3 files changed, 43 insertions(+), 42 deletions(-)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index f26fdc94393e..07c954e53270 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1616,8 +1616,8 @@ void show_free_areas(void)
 			K(zone->pages_min),
 			K(zone->pages_low),
 			K(zone->pages_high),
-			K(zone->nr_active),
-			K(zone->nr_inactive),
+			K(zone_page_state(zone, NR_ACTIVE)),
+			K(zone_page_state(zone, NR_INACTIVE)),
 			K(zone->present_pages),
 			zone->pages_scanned,
 			(zone->all_unreclaimable ? "yes" : "no")
@@ -2684,8 +2684,6 @@ static void __meminit free_area_init_core(struct pglist_data *pgdat,
 		INIT_LIST_HEAD(&zone->inactive_list);
 		zone->nr_scan_active = 0;
 		zone->nr_scan_inactive = 0;
-		zone->nr_active = 0;
-		zone->nr_inactive = 0;
 		zap_zone_vm_stats(zone);
 		atomic_set(&zone->reclaim_in_progress, 0);
 		if (!size)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 7430df68cb64..0655d5fe73e8 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -679,7 +679,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
 		nr_taken = isolate_lru_pages(sc->swap_cluster_max,
 					     &zone->inactive_list,
 					     &page_list, &nr_scan);
-		zone->nr_inactive -= nr_taken;
+		__mod_zone_page_state(zone, NR_INACTIVE, -nr_taken);
 		zone->pages_scanned += nr_scan;
 		spin_unlock_irq(&zone->lru_lock);

@@ -740,7 +740,8 @@ static inline void note_zone_scanning_priority(struct zone *zone, int priority)

 static inline int zone_is_near_oom(struct zone *zone)
 {
-	return zone->pages_scanned >= (zone->nr_active + zone->nr_inactive)*3;
+	return zone->pages_scanned >= (zone_page_state(zone, NR_ACTIVE)
+				+ zone_page_state(zone, NR_INACTIVE))*3;
 }

 /*
@@ -825,7 +826,7 @@ force_reclaim_mapped:
 	pgmoved = isolate_lru_pages(nr_pages, &zone->active_list,
 				    &l_hold, &pgscanned);
 	zone->pages_scanned += pgscanned;
-	zone->nr_active -= pgmoved;
+	__mod_zone_page_state(zone, NR_ACTIVE, -pgmoved);
 	spin_unlock_irq(&zone->lru_lock);

 	while (!list_empty(&l_hold)) {
@@ -857,7 +858,7 @@ force_reclaim_mapped:
 		list_move(&page->lru, &zone->inactive_list);
 		pgmoved++;
 		if (!pagevec_add(&pvec, page)) {
-			zone->nr_inactive += pgmoved;
+			__mod_zone_page_state(zone, NR_INACTIVE, pgmoved);
 			spin_unlock_irq(&zone->lru_lock);
 			pgdeactivate += pgmoved;
 			pgmoved = 0;
@@ -867,7 +868,7 @@ force_reclaim_mapped:
 			spin_lock_irq(&zone->lru_lock);
 		}
 	}
-	zone->nr_inactive += pgmoved;
+	__mod_zone_page_state(zone, NR_INACTIVE, pgmoved);
 	pgdeactivate += pgmoved;
 	if (buffer_heads_over_limit) {
 		spin_unlock_irq(&zone->lru_lock);
@@ -885,14 +886,14 @@ force_reclaim_mapped:
 		list_move(&page->lru, &zone->active_list);
 		pgmoved++;
 		if (!pagevec_add(&pvec, page)) {
-			zone->nr_active += pgmoved;
+			__mod_zone_page_state(zone, NR_ACTIVE, pgmoved);
 			pgmoved = 0;
 			spin_unlock_irq(&zone->lru_lock);
 			__pagevec_release(&pvec);
 			spin_lock_irq(&zone->lru_lock);
 		}
 	}
-	zone->nr_active += pgmoved;
+	__mod_zone_page_state(zone, NR_ACTIVE, pgmoved);

 	__count_zone_vm_events(PGREFILL, zone, pgscanned);
 	__count_vm_events(PGDEACTIVATE, pgdeactivate);
@@ -918,14 +919,16 @@ static unsigned long shrink_zone(int priority, struct zone *zone,
 	 * Add one to `nr_to_scan' just to make sure that the kernel will
 	 * slowly sift through the active list.
 	 */
-	zone->nr_scan_active += (zone->nr_active >> priority) + 1;
+	zone->nr_scan_active +=
+		(zone_page_state(zone, NR_ACTIVE) >> priority) + 1;
 	nr_active = zone->nr_scan_active;
 	if (nr_active >= sc->swap_cluster_max)
 		zone->nr_scan_active = 0;
 	else
 		nr_active = 0;

-	zone->nr_scan_inactive += (zone->nr_inactive >> priority) + 1;
+	zone->nr_scan_inactive +=
+		(zone_page_state(zone, NR_INACTIVE) >> priority) + 1;
 	nr_inactive = zone->nr_scan_inactive;
 	if (nr_inactive >= sc->swap_cluster_max)
 		zone->nr_scan_inactive = 0;
@@ -1037,7 +1040,8 @@ unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
 		if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
 			continue;

-		lru_pages += zone->nr_active + zone->nr_inactive;
+		lru_pages += zone_page_state(zone, NR_ACTIVE)
+			+ zone_page_state(zone, NR_INACTIVE);
 	}

 	for (priority = DEF_PRIORITY; priority >= 0; priority--) {
@@ -1182,7 +1186,8 @@ loop_again:
 		for (i = 0; i <= end_zone; i++) {
 			struct zone *zone = pgdat->node_zones + i;

-			lru_pages += zone->nr_active + zone->nr_inactive;
+			lru_pages += zone_page_state(zone, NR_ACTIVE)
+				+ zone_page_state(zone, NR_INACTIVE);
 		}

 		/*
@@ -1219,8 +1224,9 @@ loop_again:
 			if (zone->all_unreclaimable)
 				continue;
 			if (nr_slab == 0 && zone->pages_scanned >=
-				(zone->nr_active + zone->nr_inactive) * 6)
+				(zone_page_state(zone, NR_ACTIVE)
+				+ zone_page_state(zone, NR_INACTIVE)) * 6)
 				zone->all_unreclaimable = 1;
 			/*
 			 * If we've done a decent amount of scanning and
 			 * the reclaim ratio is low, start doing writepage
@@ -1385,18 +1391,22 @@ static unsigned long shrink_all_zones(unsigned long nr_pages, int prio,

 		/* For pass = 0 we don't shrink the active list */
 		if (pass > 0) {
-			zone->nr_scan_active += (zone->nr_active >> prio) + 1;
+			zone->nr_scan_active +=
+				(zone_page_state(zone, NR_ACTIVE) >> prio) + 1;
 			if (zone->nr_scan_active >= nr_pages || pass > 3) {
 				zone->nr_scan_active = 0;
-				nr_to_scan = min(nr_pages, zone->nr_active);
+				nr_to_scan = min(nr_pages,
+					zone_page_state(zone, NR_ACTIVE));
 				shrink_active_list(nr_to_scan, zone, sc, prio);
 			}
 		}

-		zone->nr_scan_inactive += (zone->nr_inactive >> prio) + 1;
+		zone->nr_scan_inactive +=
+			(zone_page_state(zone, NR_INACTIVE) >> prio) + 1;
 		if (zone->nr_scan_inactive >= nr_pages || pass > 3) {
 			zone->nr_scan_inactive = 0;
-			nr_to_scan = min(nr_pages, zone->nr_inactive);
+			nr_to_scan = min(nr_pages,
+				zone_page_state(zone, NR_INACTIVE));
 			ret += shrink_inactive_list(nr_to_scan, zone, sc);
 			if (ret >= nr_pages)
 				return ret;
@@ -1408,12 +1418,7 @@ static unsigned long shrink_all_zones(unsigned long nr_pages, int prio,

 static unsigned long count_lru_pages(void)
 {
-	struct zone *zone;
-	unsigned long ret = 0;
-
-	for_each_zone(zone)
-		ret += zone->nr_active + zone->nr_inactive;
-	return ret;
+	return global_page_state(NR_ACTIVE) + global_page_state(NR_INACTIVE);
 }

 /*
diff --git a/mm/vmstat.c b/mm/vmstat.c
index bf62a8232100..5462106725d7 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -19,12 +19,10 @@ void __get_zone_counts(unsigned long *active, unsigned long *inactive,
 	struct zone *zones = pgdat->node_zones;
 	int i;

-	*active = 0;
-	*inactive = 0;
+	*active = node_page_state(pgdat->node_id, NR_ACTIVE);
+	*inactive = node_page_state(pgdat->node_id, NR_INACTIVE);
 	*free = 0;
 	for (i = 0; i < MAX_NR_ZONES; i++) {
-		*active += zones[i].nr_active;
-		*inactive += zones[i].nr_inactive;
 		*free += zones[i].free_pages;
 	}
 }
@@ -34,14 +32,12 @@ void get_zone_counts(unsigned long *active,
 {
 	struct pglist_data *pgdat;

-	*active = 0;
-	*inactive = 0;
+	*active = global_page_state(NR_ACTIVE);
+	*inactive = global_page_state(NR_INACTIVE);
 	*free = 0;
 	for_each_online_pgdat(pgdat) {
 		unsigned long l, m, n;
 		__get_zone_counts(&l, &m, &n, pgdat);
-		*active += l;
-		*inactive += m;
 		*free += n;
 	}
 }
@@ -239,7 +235,7 @@ EXPORT_SYMBOL(mod_zone_page_state);
  * in between and therefore the atomicity vs. interrupt cannot be exploited
  * in a useful way here.
  */
-static void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
+void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
 {
 	struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id());
 	s8 *p = pcp->vm_stat_diff + item;
@@ -260,9 +256,8 @@ void __inc_zone_page_state(struct page *page, enum zone_stat_item item)
 }
 EXPORT_SYMBOL(__inc_zone_page_state);

-void __dec_zone_page_state(struct page *page, enum zone_stat_item item)
+void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
 {
-	struct zone *zone = page_zone(page);
 	struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id());
 	s8 *p = pcp->vm_stat_diff + item;

@@ -275,6 +270,11 @@ void __dec_zone_page_state(struct page *page, enum zone_stat_item item)
 		*p = overstep;
 	}
 }
+
+void __dec_zone_page_state(struct page *page, enum zone_stat_item item)
+{
+	__dec_zone_state(page_zone(page), item);
+}
 EXPORT_SYMBOL(__dec_zone_page_state);

 void inc_zone_state(struct zone *zone, enum zone_stat_item item)
@@ -454,6 +454,8 @@ const struct seq_operations fragmentation_op = {

 static const char * const vmstat_text[] = {
 	/* Zoned VM counters */
+	"nr_active",
+	"nr_inactive",
 	"nr_anon_pages",
 	"nr_mapped",
 	"nr_file_pages",
@@ -529,8 +531,6 @@ static int zoneinfo_show(struct seq_file *m, void *arg)
529 "\n min %lu" 531 "\n min %lu"
530 "\n low %lu" 532 "\n low %lu"
531 "\n high %lu" 533 "\n high %lu"
532 "\n active %lu"
533 "\n inactive %lu"
534 "\n scanned %lu (a: %lu i: %lu)" 534 "\n scanned %lu (a: %lu i: %lu)"
535 "\n spanned %lu" 535 "\n spanned %lu"
536 "\n present %lu", 536 "\n present %lu",
@@ -538,8 +538,6 @@ static int zoneinfo_show(struct seq_file *m, void *arg)
 		   zone->pages_min,
 		   zone->pages_low,
 		   zone->pages_high,
-		   zone->nr_active,
-		   zone->nr_inactive,
 		   zone->pages_scanned,
 		   zone->nr_scan_active, zone->nr_scan_inactive,
 		   zone->spanned_pages,