aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorChristoph Lameter <clameter@sgi.com>2006-06-30 04:55:33 -0400
committerLinus Torvalds <torvalds@g5.osdl.org>2006-06-30 14:25:34 -0400
commit2244b95a7bcf8d24196f8a3a44187ba5dfff754c (patch)
tree771ef8eae45c2794fd73f870109c74d67c28888a
parentf6ac2354d791195ca40822b84d73d48a4e8b7f2b (diff)
[PATCH] zoned vm counters: basic ZVC (zoned vm counter) implementation
Per zone counter infrastructure The counters that we currently have for the VM are split per processor. The processor however has not much to do with the zone these pages belong to. We cannot tell f.e. how many ZONE_DMA pages are dirty. So we are blind to potentially inbalances in the usage of memory in various zones. F.e. in a NUMA system we cannot tell how many pages are dirty on a particular node. If we knew then we could put measures into the VM to balance the use of memory between different zones and different nodes in a NUMA system. For example it would be possible to limit the dirty pages per node so that fast local memory is kept available even if a process is dirtying huge amounts of pages. Another example is zone reclaim. We do not know how many unmapped pages exist per zone. So we just have to try to reclaim. If it is not working then we pause and try again later. It would be better if we knew when it makes sense to reclaim unmapped pages from a zone. This patchset allows the determination of the number of unmapped pages per zone. We can remove the zone reclaim interval with the counters introduced here. Futhermore the ability to have various usage statistics available will allow the development of new NUMA balancing algorithms that may be able to improve the decision making in the scheduler of when to move a process to another node and hopefully will also enable automatic page migration through a user space program that can analyse the memory load distribution and then rebalance memory use in order to increase performance. The counter framework here implements differential counters for each processor in struct zone. The differential counters are consolidated when a threshold is exceeded (like done in the current implementation for nr_pageache), when slab reaping occurs or when a consolidation function is called. Consolidation uses atomic operations and accumulates counters per zone in the zone structure and also globally in the vm_stat array. 
VM functions can access the counts by simply indexing a global or zone specific array. The arrangement of counters in an array also simplifies processing when output has to be generated for /proc/*. Counters can be updated by calling inc/dec_zone_page_state or __inc/dec_zone_page_state analogous to *_page_state. The second group of functions can be called if it is known that interrupts are disabled. Special optimized increment and decrement functions are provided. These can avoid certain checks and use increment or decrement instructions that an architecture may provide. We also add a new CONFIG_DMA_IS_NORMAL that signifies that an architecture can do DMA to all memory and therefore ZONE_NORMAL will not be populated. This is only currently set for IA64 SGI SN2 and currently only affects node_page_state(). In the best case node_page_state can be reduced to retrieving a single counter for the one zone on the node. [akpm@osdl.org: cleanups] [akpm@osdl.org: export vm_stat[] for filesystems] Signed-off-by: Christoph Lameter <clameter@sgi.com> Cc: Trond Myklebust <trond.myklebust@fys.uio.no> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
-rw-r--r--arch/ia64/Kconfig5
-rw-r--r--include/linux/mmzone.h9
-rw-r--r--include/linux/vmstat.h129
-rw-r--r--mm/page_alloc.c2
-rw-r--r--mm/slab.c1
-rw-r--r--mm/vmstat.c218
6 files changed, 359 insertions, 5 deletions
diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig
index b487e227a1f7..47de9ee6bcd6 100644
--- a/arch/ia64/Kconfig
+++ b/arch/ia64/Kconfig
@@ -70,6 +70,11 @@ config DMA_IS_DMA32
70 bool 70 bool
71 default y 71 default y
72 72
73config DMA_IS_NORMAL
74 bool
75 depends on IA64_SGI_SN2
76 default y
77
73choice 78choice
74 prompt "System type" 79 prompt "System type"
75 default IA64_GENERIC 80 default IA64_GENERIC
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index d6120fa69116..543f9e411563 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -46,6 +46,9 @@ struct zone_padding {
46#define ZONE_PADDING(name) 46#define ZONE_PADDING(name)
47#endif 47#endif
48 48
49enum zone_stat_item {
50 NR_VM_ZONE_STAT_ITEMS };
51
49struct per_cpu_pages { 52struct per_cpu_pages {
50 int count; /* number of pages in the list */ 53 int count; /* number of pages in the list */
51 int high; /* high watermark, emptying needed */ 54 int high; /* high watermark, emptying needed */
@@ -55,6 +58,10 @@ struct per_cpu_pages {
55 58
56struct per_cpu_pageset { 59struct per_cpu_pageset {
57 struct per_cpu_pages pcp[2]; /* 0: hot. 1: cold */ 60 struct per_cpu_pages pcp[2]; /* 0: hot. 1: cold */
61#ifdef CONFIG_SMP
62 s8 vm_stat_diff[NR_VM_ZONE_STAT_ITEMS];
63#endif
64
58#ifdef CONFIG_NUMA 65#ifdef CONFIG_NUMA
59 unsigned long numa_hit; /* allocated in intended node */ 66 unsigned long numa_hit; /* allocated in intended node */
60 unsigned long numa_miss; /* allocated in non intended node */ 67 unsigned long numa_miss; /* allocated in non intended node */
@@ -165,6 +172,8 @@ struct zone {
165 /* A count of how many reclaimers are scanning this zone */ 172 /* A count of how many reclaimers are scanning this zone */
166 atomic_t reclaim_in_progress; 173 atomic_t reclaim_in_progress;
167 174
175 /* Zone statistics */
176 atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS];
168 /* 177 /*
169 * timestamp (in jiffies) of the last zone reclaim that did not 178 * timestamp (in jiffies) of the last zone reclaim that did not
170 * result in freeing of pages. This is used to avoid repeated scans 179 * result in freeing of pages. This is used to avoid repeated scans
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index 3ca0c1989fc2..3fd5c11e544a 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -3,6 +3,9 @@
3 3
4#include <linux/types.h> 4#include <linux/types.h>
5#include <linux/percpu.h> 5#include <linux/percpu.h>
6#include <linux/config.h>
7#include <linux/mmzone.h>
8#include <asm/atomic.h>
6 9
7/* 10/*
8 * Global page accounting. One instance per CPU. Only unsigned longs are 11 * Global page accounting. One instance per CPU. Only unsigned longs are
@@ -134,5 +137,129 @@ extern void __mod_page_state_offset(unsigned long offset, unsigned long delta);
134 137
135DECLARE_PER_CPU(struct page_state, page_states); 138DECLARE_PER_CPU(struct page_state, page_states);
136 139
137#endif /* _LINUX_VMSTAT_H */ 140/*
141 * Zone based page accounting with per cpu differentials.
142 */
143extern atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS];
144
145static inline void zone_page_state_add(long x, struct zone *zone,
146 enum zone_stat_item item)
147{
148 atomic_long_add(x, &zone->vm_stat[item]);
149 atomic_long_add(x, &vm_stat[item]);
150}
151
152static inline unsigned long global_page_state(enum zone_stat_item item)
153{
154 long x = atomic_long_read(&vm_stat[item]);
155#ifdef CONFIG_SMP
156 if (x < 0)
157 x = 0;
158#endif
159 return x;
160}
161
162static inline unsigned long zone_page_state(struct zone *zone,
163 enum zone_stat_item item)
164{
165 long x = atomic_long_read(&zone->vm_stat[item]);
166#ifdef CONFIG_SMP
167 if (x < 0)
168 x = 0;
169#endif
170 return x;
171}
172
173#ifdef CONFIG_NUMA
174/*
175 * Determine the per node value of a stat item. This function
176 * is called frequently in a NUMA machine, so try to be as
177 * frugal as possible.
178 */
179static inline unsigned long node_page_state(int node,
180 enum zone_stat_item item)
181{
182 struct zone *zones = NODE_DATA(node)->node_zones;
183
184 return
185#ifndef CONFIG_DMA_IS_NORMAL
186#if !defined(CONFIG_DMA_IS_DMA32) && BITS_PER_LONG >= 64
187 zone_page_state(&zones[ZONE_DMA32], item) +
188#endif
189 zone_page_state(&zones[ZONE_NORMAL], item) +
190#endif
191#ifdef CONFIG_HIGHMEM
192 zone_page_state(&zones[ZONE_HIGHMEM], item) +
193#endif
194 zone_page_state(&zones[ZONE_DMA], item);
195}
196#else
197#define node_page_state(node, item) global_page_state(item)
198#endif
199
200#define __add_zone_page_state(__z, __i, __d) \
201 __mod_zone_page_state(__z, __i, __d)
202#define __sub_zone_page_state(__z, __i, __d) \
203 __mod_zone_page_state(__z, __i,-(__d))
204
205#define add_zone_page_state(__z, __i, __d) mod_zone_page_state(__z, __i, __d)
206#define sub_zone_page_state(__z, __i, __d) mod_zone_page_state(__z, __i, -(__d))
207
208static inline void zap_zone_vm_stats(struct zone *zone)
209{
210 memset(zone->vm_stat, 0, sizeof(zone->vm_stat));
211}
212
213#ifdef CONFIG_SMP
214void __mod_zone_page_state(struct zone *, enum zone_stat_item item, int);
215void __inc_zone_page_state(struct page *, enum zone_stat_item);
216void __dec_zone_page_state(struct page *, enum zone_stat_item);
138 217
218void mod_zone_page_state(struct zone *, enum zone_stat_item, int);
219void inc_zone_page_state(struct page *, enum zone_stat_item);
220void dec_zone_page_state(struct page *, enum zone_stat_item);
221
222extern void inc_zone_state(struct zone *, enum zone_stat_item);
223
224void refresh_cpu_vm_stats(int);
225void refresh_vm_stats(void);
226
227#else /* CONFIG_SMP */
228
229/*
230 * We do not maintain differentials in a single processor configuration.
231 * The functions directly modify the zone and global counters.
232 */
233static inline void __mod_zone_page_state(struct zone *zone,
234 enum zone_stat_item item, int delta)
235{
236 zone_page_state_add(delta, zone, item);
237}
238
239static inline void __inc_zone_page_state(struct page *page,
240 enum zone_stat_item item)
241{
242 atomic_long_inc(&page_zone(page)->vm_stat[item]);
243 atomic_long_inc(&vm_stat[item]);
244}
245
246static inline void __dec_zone_page_state(struct page *page,
247 enum zone_stat_item item)
248{
249 atomic_long_dec(&page_zone(page)->vm_stat[item]);
250 atomic_long_dec(&vm_stat[item]);
251}
252
253/*
254 * We only use atomic operations to update counters. So there is no need to
255 * disable interrupts.
256 */
257#define inc_zone_page_state __inc_zone_page_state
258#define dec_zone_page_state __dec_zone_page_state
259#define mod_zone_page_state __mod_zone_page_state
260
261static inline void refresh_cpu_vm_stats(int cpu) { }
262static inline void refresh_vm_stats(void) { }
263#endif
264
265#endif /* _LINUX_VMSTAT_H */
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 87dc1297fe39..3a877fecc300 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2045,6 +2045,7 @@ static void __meminit free_area_init_core(struct pglist_data *pgdat,
2045 zone->nr_scan_inactive = 0; 2045 zone->nr_scan_inactive = 0;
2046 zone->nr_active = 0; 2046 zone->nr_active = 0;
2047 zone->nr_inactive = 0; 2047 zone->nr_inactive = 0;
2048 zap_zone_vm_stats(zone);
2048 atomic_set(&zone->reclaim_in_progress, 0); 2049 atomic_set(&zone->reclaim_in_progress, 0);
2049 if (!size) 2050 if (!size)
2050 continue; 2051 continue;
@@ -2147,6 +2148,7 @@ static int page_alloc_cpu_notify(struct notifier_block *self,
2147 } 2148 }
2148 2149
2149 local_irq_enable(); 2150 local_irq_enable();
2151 refresh_cpu_vm_stats(cpu);
2150 } 2152 }
2151 return NOTIFY_OK; 2153 return NOTIFY_OK;
2152} 2154}
diff --git a/mm/slab.c b/mm/slab.c
index 233e39d14caf..0c33820038cb 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -3763,6 +3763,7 @@ next:
3763 check_irq_on(); 3763 check_irq_on();
3764 mutex_unlock(&cache_chain_mutex); 3764 mutex_unlock(&cache_chain_mutex);
3765 next_reap_node(); 3765 next_reap_node();
3766 refresh_cpu_vm_stats(smp_processor_id());
3766 /* Set up the next iteration */ 3767 /* Set up the next iteration */
3767 schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC); 3768 schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC);
3768} 3769}
diff --git a/mm/vmstat.c b/mm/vmstat.c
index ad456202ff1a..210f9bbbb04f 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -3,10 +3,15 @@
3 * 3 *
4 * Manages VM statistics 4 * Manages VM statistics
5 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds 5 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
6 *
7 * zoned VM statistics
8 * Copyright (C) 2006 Silicon Graphics, Inc.,
9 * Christoph Lameter <christoph@lameter.com>
6 */ 10 */
7 11
8#include <linux/config.h> 12#include <linux/config.h>
9#include <linux/mm.h> 13#include <linux/mm.h>
14#include <linux/module.h>
10 15
11/* 16/*
12 * Accumulate the page_state information across all CPUs. 17 * Accumulate the page_state information across all CPUs.
@@ -143,6 +148,197 @@ void get_zone_counts(unsigned long *active,
143 } 148 }
144} 149}
145 150
151/*
152 * Manage combined zone based / global counters
153 *
154 * vm_stat contains the global counters
155 */
156atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS];
157EXPORT_SYMBOL(vm_stat);
158
159#ifdef CONFIG_SMP
160
161#define STAT_THRESHOLD 32
162
163/*
164 * Determine pointer to currently valid differential byte given a zone and
165 * the item number.
166 *
167 * Preemption must be off
168 */
169static inline s8 *diff_pointer(struct zone *zone, enum zone_stat_item item)
170{
171 return &zone_pcp(zone, smp_processor_id())->vm_stat_diff[item];
172}
173
174/*
175 * For use when we know that interrupts are disabled.
176 */
177void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
178 int delta)
179{
180 s8 *p;
181 long x;
182
183 p = diff_pointer(zone, item);
184 x = delta + *p;
185
186 if (unlikely(x > STAT_THRESHOLD || x < -STAT_THRESHOLD)) {
187 zone_page_state_add(x, zone, item);
188 x = 0;
189 }
190
191 *p = x;
192}
193EXPORT_SYMBOL(__mod_zone_page_state);
194
195/*
196 * For an unknown interrupt state
197 */
198void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
199 int delta)
200{
201 unsigned long flags;
202
203 local_irq_save(flags);
204 __mod_zone_page_state(zone, item, delta);
205 local_irq_restore(flags);
206}
207EXPORT_SYMBOL(mod_zone_page_state);
208
209/*
210 * Optimized increment and decrement functions.
211 *
212 * These are only for a single page and therefore can take a struct page *
213 * argument instead of struct zone *. This allows the inclusion of the code
214 * generated for page_zone(page) into the optimized functions.
215 *
216 * No overflow check is necessary and therefore the differential can be
217 * incremented or decremented in place which may allow the compilers to
218 * generate better code.
219 *
220 * The increment or decrement is known and therefore one boundary check can
221 * be omitted.
222 *
223 * Some processors have inc/dec instructions that are atomic vs an interrupt.
224 * However, the code must first determine the differential location in a zone
225 * based on the processor number and then inc/dec the counter. There is no
226 * guarantee without disabling preemption that the processor will not change
227 * in between and therefore the atomicity vs. interrupt cannot be exploited
228 * in a useful way here.
229 */
230void __inc_zone_page_state(struct page *page, enum zone_stat_item item)
231{
232 struct zone *zone = page_zone(page);
233 s8 *p = diff_pointer(zone, item);
234
235 (*p)++;
236
237 if (unlikely(*p > STAT_THRESHOLD)) {
238 zone_page_state_add(*p, zone, item);
239 *p = 0;
240 }
241}
242EXPORT_SYMBOL(__inc_zone_page_state);
243
244void __dec_zone_page_state(struct page *page, enum zone_stat_item item)
245{
246 struct zone *zone = page_zone(page);
247 s8 *p = diff_pointer(zone, item);
248
249 (*p)--;
250
251 if (unlikely(*p < -STAT_THRESHOLD)) {
252 zone_page_state_add(*p, zone, item);
253 *p = 0;
254 }
255}
256EXPORT_SYMBOL(__dec_zone_page_state);
257
258void inc_zone_page_state(struct page *page, enum zone_stat_item item)
259{
260 unsigned long flags;
261 struct zone *zone;
262 s8 *p;
263
264 zone = page_zone(page);
265 local_irq_save(flags);
266 p = diff_pointer(zone, item);
267
268 (*p)++;
269
270 if (unlikely(*p > STAT_THRESHOLD)) {
271 zone_page_state_add(*p, zone, item);
272 *p = 0;
273 }
274 local_irq_restore(flags);
275}
276EXPORT_SYMBOL(inc_zone_page_state);
277
278void dec_zone_page_state(struct page *page, enum zone_stat_item item)
279{
280 unsigned long flags;
281 struct zone *zone;
282 s8 *p;
283
284 zone = page_zone(page);
285 local_irq_save(flags);
286 p = diff_pointer(zone, item);
287
288 (*p)--;
289
290 if (unlikely(*p < -STAT_THRESHOLD)) {
291 zone_page_state_add(*p, zone, item);
292 *p = 0;
293 }
294 local_irq_restore(flags);
295}
296EXPORT_SYMBOL(dec_zone_page_state);
297
298/*
299 * Update the zone counters for one cpu.
300 */
301void refresh_cpu_vm_stats(int cpu)
302{
303 struct zone *zone;
304 int i;
305 unsigned long flags;
306
307 for_each_zone(zone) {
308 struct per_cpu_pageset *pcp;
309
310 pcp = zone_pcp(zone, cpu);
311
312 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
313 if (pcp->vm_stat_diff[i]) {
314 local_irq_save(flags);
315 zone_page_state_add(pcp->vm_stat_diff[i],
316 zone, i);
317 pcp->vm_stat_diff[i] = 0;
318 local_irq_restore(flags);
319 }
320 }
321}
322
323static void __refresh_cpu_vm_stats(void *dummy)
324{
325 refresh_cpu_vm_stats(smp_processor_id());
326}
327
328/*
329 * Consolidate all counters.
330 *
331 * Note that the result is less inaccurate but still inaccurate
332 * if concurrent processes are allowed to run.
333 */
334void refresh_vm_stats(void)
335{
336 on_each_cpu(__refresh_cpu_vm_stats, NULL, 0, 1);
337}
338EXPORT_SYMBOL(refresh_vm_stats);
339
340#endif
341
146#ifdef CONFIG_PROC_FS 342#ifdef CONFIG_PROC_FS
147 343
148#include <linux/seq_file.h> 344#include <linux/seq_file.h>
@@ -204,6 +400,9 @@ struct seq_operations fragmentation_op = {
204}; 400};
205 401
206static char *vmstat_text[] = { 402static char *vmstat_text[] = {
403 /* Zoned VM counters */
404
405 /* Page state */
207 "nr_dirty", 406 "nr_dirty",
208 "nr_writeback", 407 "nr_writeback",
209 "nr_unstable", 408 "nr_unstable",
@@ -297,6 +496,11 @@ static int zoneinfo_show(struct seq_file *m, void *arg)
297 zone->nr_scan_active, zone->nr_scan_inactive, 496 zone->nr_scan_active, zone->nr_scan_inactive,
298 zone->spanned_pages, 497 zone->spanned_pages,
299 zone->present_pages); 498 zone->present_pages);
499
500 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
501 seq_printf(m, "\n %-12s %lu", vmstat_text[i],
502 zone_page_state(zone, i));
503
300 seq_printf(m, 504 seq_printf(m,
301 "\n protection: (%lu", 505 "\n protection: (%lu",
302 zone->lowmem_reserve[0]); 506 zone->lowmem_reserve[0]);
@@ -368,19 +572,25 @@ struct seq_operations zoneinfo_op = {
368 572
369static void *vmstat_start(struct seq_file *m, loff_t *pos) 573static void *vmstat_start(struct seq_file *m, loff_t *pos)
370{ 574{
575 unsigned long *v;
371 struct page_state *ps; 576 struct page_state *ps;
577 int i;
372 578
373 if (*pos >= ARRAY_SIZE(vmstat_text)) 579 if (*pos >= ARRAY_SIZE(vmstat_text))
374 return NULL; 580 return NULL;
375 581
376 ps = kmalloc(sizeof(*ps), GFP_KERNEL); 582 v = kmalloc(NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long)
377 m->private = ps; 583 + sizeof(*ps), GFP_KERNEL);
378 if (!ps) 584 m->private = v;
585 if (!v)
379 return ERR_PTR(-ENOMEM); 586 return ERR_PTR(-ENOMEM);
587 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
588 v[i] = global_page_state(i);
589 ps = (struct page_state *)(v + NR_VM_ZONE_STAT_ITEMS);
380 get_full_page_state(ps); 590 get_full_page_state(ps);
381 ps->pgpgin /= 2; /* sectors -> kbytes */ 591 ps->pgpgin /= 2; /* sectors -> kbytes */
382 ps->pgpgout /= 2; 592 ps->pgpgout /= 2;
383 return (unsigned long *)ps + *pos; 593 return v + *pos;
384} 594}
385 595
386static void *vmstat_next(struct seq_file *m, void *arg, loff_t *pos) 596static void *vmstat_next(struct seq_file *m, void *arg, loff_t *pos)