author     Christoph Lameter <clameter@sgi.com>      2006-06-30 04:55:45 -0400
committer  Linus Torvalds <torvalds@g5.osdl.org>     2006-06-30 14:25:36 -0400
commit     f8891e5e1f93a128c3900f82035e8541357896a7
tree       97b078ac97970962b17c85d39fd64cb48dc01168   /include/linux/vmstat.h
parent     ca889e6c45e0b112cb2ca9d35afc66297519b5d5
[PATCH] Light weight event counters
The remaining counters in page_state after the zoned VM counter patches have been applied are all just for show in /proc/vmstat. They have no essential function for the VM.

We use a simple increment of per cpu variables. In order to avoid the most severe races we disable preempt. Preempt does not prevent the race between an increment and an interrupt handler incrementing the same statistics counter. However, that race is exceedingly rare: we may only lose one increment or so, and there is no requirement (at least not in the kernel) that the VM event counters have to be accurate.

In the non-preempt case this results in a simple increment for each counter. For many architectures this will be reduced by the compiler to a single instruction. This single instruction is atomic for i386 and x86_64, and therefore even the rare race condition in an interrupt is avoided for both architectures in most cases.

The patchset also adds an off switch for embedded systems that allows building Linux kernels without these counters.

The implementation of these counters is through inline code that hopefully results in only a single increment instruction being emitted (i386, x86_64), or in the increment being hidden through instruction concurrency (EPIC architectures such as ia64 can get that done).

Benefits:
- VM event counter operations usually reduce to a single inline instruction on i386 and x86_64.
- No interrupt disable, only preempt disable for the preempt case. Preempt disable can also be avoided by moving the counter into a spinlock.
- Handling is similar to zoned VM counters.
- Simple and easily extendable.
- Can be omitted to reduce memory use for embedded use.

References:
RFC     http://marc.theaimsgroup.com/?l=linux-kernel&m=113512330605497&w=2
RFC     http://marc.theaimsgroup.com/?l=linux-kernel&m=114988082814934&w=2
local_t http://marc.theaimsgroup.com/?l=linux-kernel&m=114991748606690&w=2
V2      http://marc.theaimsgroup.com/?t=115014808400007&r=1&w=2
V3      http://marc.theaimsgroup.com/?l=linux-kernel&m=115024767022346&w=2
V4      http://marc.theaimsgroup.com/?l=linux-kernel&m=115047968808926&w=2

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
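As a quick illustration of the scheme described above (an editor's sketch, not part of the patch), the following userspace C program models the counting path: each CPU owns a plain array of unsigned longs that is bumped with an ordinary, unlocked increment, and totals only exist when the per-CPU blocks are summed at read time. The fake CPU id, NR_FAKE_CPUS and the fixed array are stand-ins for the kernel's per-CPU machinery (DECLARE_PER_CPU, get_cpu_var and friends).

/*
 * Editor's sketch, not kernel code: a userspace model of the lightweight
 * VM event counters.  A counter update is a plain increment of the current
 * CPU's slot; a rare lost update from concurrent access is tolerated,
 * exactly as the commit message allows.
 */
#include <stdio.h>

#define NR_FAKE_CPUS 4	/* stand-in for the machine's CPU count */

enum vm_event_item { PGPGIN, PGPGOUT, PGFAULT, PGMAJFAULT, NR_VM_EVENT_ITEMS };

struct vm_event_state {
	unsigned long event[NR_VM_EVENT_ITEMS];
};

/* one counter block per "CPU", like DECLARE_PER_CPU(struct vm_event_state, ...) */
static struct vm_event_state vm_event_states[NR_FAKE_CPUS];

/* plain, unlocked increment of the current CPU's counter */
static void count_vm_event(int cpu, enum vm_event_item item)
{
	vm_event_states[cpu].event[item]++;
}

/* like all_vm_events(): fold the per-CPU blocks into global totals */
static void all_vm_events(unsigned long *ret)
{
	int cpu, item;

	for (item = 0; item < NR_VM_EVENT_ITEMS; item++) {
		ret[item] = 0;
		for (cpu = 0; cpu < NR_FAKE_CPUS; cpu++)
			ret[item] += vm_event_states[cpu].event[item];
	}
}

int main(void)
{
	unsigned long totals[NR_VM_EVENT_ITEMS];

	count_vm_event(0, PGFAULT);	/* a fault handled on CPU 0 */
	count_vm_event(1, PGFAULT);	/* another one on CPU 1 */
	count_vm_event(1, PGMAJFAULT);

	all_vm_events(totals);
	printf("pgfault %lu\npgmajfault %lu\n",
	       totals[PGFAULT], totals[PGMAJFAULT]);
	return 0;
}

Because readers fold the per-CPU blocks only when totals are requested (as /proc/vmstat does), an update never needs an atomic operation or interrupt disabling; the occasional lost increment from a racing interrupt is the accepted cost.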
Diffstat (limited to 'include/linux/vmstat.h')
-rw-r--r--   include/linux/vmstat.h   170
1 file changed, 66 insertions(+), 104 deletions(-)
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index 16173b63ee6..3e0daf54133 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -7,115 +7,77 @@
 #include <linux/mmzone.h>
 #include <asm/atomic.h>
 
+#ifdef CONFIG_VM_EVENT_COUNTERS
 /*
- * Global page accounting. One instance per CPU. Only unsigned longs are
- * allowed.
+ * Light weight per cpu counter implementation.
  *
- * - Fields can be modified with xxx_page_state and xxx_page_state_zone at
- *   any time safely (which protects the instance from modification by
- *   interrupt.
- * - The __xxx_page_state variants can be used safely when interrupts are
- *   disabled.
- * - The __xxx_page_state variants can be used if the field is only
- *   modified from process context and protected from preemption, or only
- *   modified from interrupt context. In this case, the field should be
- *   commented here.
+ * Counters should only be incremented and no critical kernel component
+ * should rely on the counter values.
+ *
+ * Counters are handled completely inline. On many platforms the code
+ * generated will simply be the increment of a global address.
  */
-struct page_state {
-	unsigned long pgpgin;		/* Disk reads */
-	unsigned long pgpgout;		/* Disk writes */
-	unsigned long pswpin;		/* swap reads */
-	unsigned long pswpout;		/* swap writes */
-
-	unsigned long pgalloc_high;	/* page allocations */
-	unsigned long pgalloc_normal;
-	unsigned long pgalloc_dma32;
-	unsigned long pgalloc_dma;
-
-	unsigned long pgfree;		/* page freeings */
-	unsigned long pgactivate;	/* pages moved inactive->active */
-	unsigned long pgdeactivate;	/* pages moved active->inactive */
-
-	unsigned long pgfault;		/* faults (major+minor) */
-	unsigned long pgmajfault;	/* faults (major only) */
-
-	unsigned long pgrefill_high;	/* inspected in refill_inactive_zone */
-	unsigned long pgrefill_normal;
-	unsigned long pgrefill_dma32;
-	unsigned long pgrefill_dma;
-
-	unsigned long pgsteal_high;	/* total highmem pages reclaimed */
-	unsigned long pgsteal_normal;
-	unsigned long pgsteal_dma32;
-	unsigned long pgsteal_dma;
-
-	unsigned long pgscan_kswapd_high;/* total highmem pages scanned */
-	unsigned long pgscan_kswapd_normal;
-	unsigned long pgscan_kswapd_dma32;
-	unsigned long pgscan_kswapd_dma;
-
-	unsigned long pgscan_direct_high;/* total highmem pages scanned */
-	unsigned long pgscan_direct_normal;
-	unsigned long pgscan_direct_dma32;
-	unsigned long pgscan_direct_dma;
-
-	unsigned long pginodesteal;	/* pages reclaimed via inode freeing */
-	unsigned long slabs_scanned;	/* slab objects scanned */
-	unsigned long kswapd_steal;	/* pages reclaimed by kswapd */
-	unsigned long kswapd_inodesteal;/* reclaimed via kswapd inode freeing */
-	unsigned long pageoutrun;	/* kswapd's calls to page reclaim */
-	unsigned long allocstall;	/* direct reclaim calls */
-
-	unsigned long pgrotated;	/* pages rotated to tail of the LRU */
+
+#define FOR_ALL_ZONES(x) x##_DMA, x##_DMA32, x##_NORMAL, x##_HIGH
+
+enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
+		FOR_ALL_ZONES(PGALLOC),
+		PGFREE, PGACTIVATE, PGDEACTIVATE,
+		PGFAULT, PGMAJFAULT,
+		FOR_ALL_ZONES(PGREFILL),
+		FOR_ALL_ZONES(PGSTEAL),
+		FOR_ALL_ZONES(PGSCAN_KSWAPD),
+		FOR_ALL_ZONES(PGSCAN_DIRECT),
+		PGINODESTEAL, SLABS_SCANNED, KSWAPD_STEAL, KSWAPD_INODESTEAL,
+		PAGEOUTRUN, ALLOCSTALL, PGROTATED,
+		NR_VM_EVENT_ITEMS
+};
+
+struct vm_event_state {
+	unsigned long event[NR_VM_EVENT_ITEMS];
 };
 
-extern void get_full_page_state(struct page_state *ret);
-extern void mod_page_state_offset(unsigned long offset, unsigned long delta);
-extern void __mod_page_state_offset(unsigned long offset, unsigned long delta);
-
-#define mod_page_state(member, delta)	\
-	mod_page_state_offset(offsetof(struct page_state, member), (delta))
-
-#define __mod_page_state(member, delta)	\
-	__mod_page_state_offset(offsetof(struct page_state, member), (delta))
-
-#define inc_page_state(member)		mod_page_state(member, 1UL)
-#define dec_page_state(member)		mod_page_state(member, 0UL - 1)
-#define add_page_state(member,delta)	mod_page_state(member, (delta))
-#define sub_page_state(member,delta)	mod_page_state(member, 0UL - (delta))
-
-#define __inc_page_state(member)	__mod_page_state(member, 1UL)
-#define __dec_page_state(member)	__mod_page_state(member, 0UL - 1)
-#define __add_page_state(member,delta)	__mod_page_state(member, (delta))
-#define __sub_page_state(member,delta)	__mod_page_state(member, 0UL - (delta))
-
-#define page_state(member) (*__page_state(offsetof(struct page_state, member)))
-
-#define state_zone_offset(zone, member)					\
-({									\
-	unsigned offset;						\
-	if (is_highmem(zone))						\
-		offset = offsetof(struct page_state, member##_high);	\
-	else if (is_normal(zone))					\
-		offset = offsetof(struct page_state, member##_normal);	\
-	else if (is_dma32(zone))					\
-		offset = offsetof(struct page_state, member##_dma32);	\
-	else								\
-		offset = offsetof(struct page_state, member##_dma);	\
-	offset;								\
-})
-
-#define __mod_page_state_zone(zone, member, delta)			\
- do {									\
-	__mod_page_state_offset(state_zone_offset(zone, member), (delta)); \
- } while (0)
-
-#define mod_page_state_zone(zone, member, delta)			\
- do {									\
-	mod_page_state_offset(state_zone_offset(zone, member), (delta)); \
- } while (0)
-
-DECLARE_PER_CPU(struct page_state, page_states);
+DECLARE_PER_CPU(struct vm_event_state, vm_event_states);
+
+static inline void __count_vm_event(enum vm_event_item item)
+{
+	__get_cpu_var(vm_event_states.event[item])++;
+}
+
+static inline void count_vm_event(enum vm_event_item item)
+{
+	get_cpu_var(vm_event_states.event[item])++;
+	put_cpu();
+}
+
+static inline void __count_vm_events(enum vm_event_item item, long delta)
+{
+	__get_cpu_var(vm_event_states.event[item]) += delta;
+}
+
+static inline void count_vm_events(enum vm_event_item item, long delta)
+{
+	get_cpu_var(vm_event_states.event[item])++;
+	put_cpu();
+}
+
+extern void all_vm_events(unsigned long *);
+extern void vm_events_fold_cpu(int cpu);
+
+#else
+
+/* Disable counters */
+#define get_cpu_vm_events(e)	0L
+#define count_vm_event(e)	do { } while (0)
+#define count_vm_events(e,d)	do { } while (0)
+#define __count_vm_event(e)	do { } while (0)
+#define __count_vm_events(e,d)	do { } while (0)
+#define vm_events_fold_cpu(x)	do { } while (0)
+
+#endif /* CONFIG_VM_EVENT_COUNTERS */
+
+#define __count_zone_vm_events(item, zone, delta) \
+		__count_vm_events(item##_DMA + zone_idx(zone), delta)
 
 /*
  * Zone based page accounting with per cpu differentials.
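One detail of the new header worth spelling out (again an editor's illustration, not part of the patch): FOR_ALL_ZONES() lists each per-zone counter in _DMA, _DMA32, _NORMAL, _HIGH order, which is what lets __count_zone_vm_events() pick the right counter by adding zone_idx(zone) to the _DMA item. The small program below only demonstrates that index arithmetic; the zone numbering used is an assumed model of the zone layout of that kernel generation, and zone_name is an invented helper.

/*
 * Editor's sketch, not kernel code: shows how adding a zone index to the
 * _DMA item of a FOR_ALL_ZONES() group selects the per-zone counter,
 * mirroring what __count_zone_vm_events() does.
 */
#include <stdio.h>

#define FOR_ALL_ZONES(x) x##_DMA, x##_DMA32, x##_NORMAL, x##_HIGH

enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
		FOR_ALL_ZONES(PGALLOC),
		NR_EXAMPLE_ITEMS };

/* assumed zone ordering: index 0 = DMA, 1 = DMA32, 2 = NORMAL, 3 = HIGHMEM */
static const char *zone_name[] = { "DMA", "DMA32", "NORMAL", "HIGHMEM" };

int main(void)
{
	int zone;

	for (zone = 0; zone < 4; zone++)
		printf("page allocation in zone %-7s -> counter index %d\n",
		       zone_name[zone], PGALLOC_DMA + zone);
	return 0;
}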