author		Christoph Lameter <clameter@sgi.com>	2006-06-30 04:55:45 -0400
committer	Linus Torvalds <torvalds@g5.osdl.org>	2006-06-30 14:25:36 -0400
commit		f8891e5e1f93a128c3900f82035e8541357896a7 (patch)
tree		97b078ac97970962b17c85d39fd64cb48dc01168
parent		ca889e6c45e0b112cb2ca9d35afc66297519b5d5 (diff)
[PATCH] Light weight event counters
The remaining counters in page_state after the zoned VM counter patches have been applied are all just for show in /proc/vmstat. They have no essential function for the VM.

We use a simple increment of per cpu variables. In order to avoid the most severe races we disable preempt. Preempt does not prevent the race between an increment and an interrupt handler incrementing the same statistics counter. However, that race is exceedingly rare: we may only lose one increment or so, and there is no requirement (at least not in the kernel) that the vm event counters have to be accurate.

In the non preempt case this results in a simple increment for each counter. For many architectures this will be reduced by the compiler to a single instruction. This single instruction is atomic for i386 and x86_64, so even the rare race condition in an interrupt is avoided for both architectures in most cases.

The patchset also adds an off switch for embedded systems that allows building Linux kernels without these counters.

The implementation of these counters is through inline code that hopefully results in only a single increment instruction being emitted (i386, x86_64), or in the increment being hidden through instruction concurrency (EPIC architectures such as ia64 can get that done).

Benefits:
- VM event counter operations usually reduce to a single inline instruction on i386 and x86_64.
- No interrupt disable, only preempt disable for the preempt case. Preempt disable can also be avoided by moving the counter into a spinlock.
- Handling is similar to zoned VM counters.
- Simple and easily extendable.
- Can be omitted to reduce memory use for embedded use.

References:
RFC     http://marc.theaimsgroup.com/?l=linux-kernel&m=113512330605497&w=2
RFC     http://marc.theaimsgroup.com/?l=linux-kernel&m=114988082814934&w=2
local_t http://marc.theaimsgroup.com/?l=linux-kernel&m=114991748606690&w=2
V2      http://marc.theaimsgroup.com/?t=115014808400007&r=1&w=2
V3      http://marc.theaimsgroup.com/?l=linux-kernel&m=115024767022346&w=2
V4      http://marc.theaimsgroup.com/?l=linux-kernel&m=115047968808926&w=2

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
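As a usage illustration, a minimal sketch of the interface this patch introduces (based on the count_vm_event(), __count_vm_events() and all_vm_events() declarations added to include/linux/vmstat.h below; the surrounding example functions are hypothetical and not part of the patch):

	/* Hypothetical example, not part of the patch. */
	#include <linux/vmstat.h>

	static void example_fault_path(void)
	{
		/* Plain increment; disables preemption around the per cpu access. */
		count_vm_event(PGFAULT);

		/* Add a delta when the caller already has preemption or
		   interrupts disabled. */
		__count_vm_events(PGPGIN, 8);
	}

	static unsigned long example_total_page_faults(void)
	{
		unsigned long ev[NR_VM_EVENT_ITEMS];

		/* Approximate sum of the event counters across all online cpus. */
		all_vm_events(ev);
		return ev[PGFAULT];
	}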
-rw-r--r--	arch/s390/appldata/appldata_base.c	1
-rw-r--r--	arch/s390/appldata/appldata_mem.c	20
-rw-r--r--	block/ll_rw_blk.c	4
-rw-r--r--	drivers/parisc/led.c	11
-rw-r--r--	fs/inode.c	9
-rw-r--r--	fs/ncpfs/mmap.c	2
-rw-r--r--	include/linux/vmstat.h	170
-rw-r--r--	init/Kconfig	9
-rw-r--r--	mm/filemap.c	4
-rw-r--r--	mm/memory.c	4
-rw-r--r--	mm/page_alloc.c	21
-rw-r--r--	mm/page_io.c	4
-rw-r--r--	mm/shmem.c	4
-rw-r--r--	mm/swap.c	4
-rw-r--r--	mm/vmscan.c	23
-rw-r--r--	mm/vmstat.c	171
16 files changed, 212 insertions, 249 deletions
diff --git a/arch/s390/appldata/appldata_base.c b/arch/s390/appldata/appldata_base.c
index 61bc44626c04..2476ca739c1e 100644
--- a/arch/s390/appldata/appldata_base.c
+++ b/arch/s390/appldata/appldata_base.c
@@ -766,7 +766,6 @@ unsigned long nr_iowait(void)
 #endif /* MODULE */
 EXPORT_SYMBOL_GPL(si_swapinfo);
 EXPORT_SYMBOL_GPL(nr_threads);
-EXPORT_SYMBOL_GPL(get_full_page_state);
 EXPORT_SYMBOL_GPL(nr_running);
 EXPORT_SYMBOL_GPL(nr_iowait);
 //EXPORT_SYMBOL_GPL(nr_context_switches);
diff --git a/arch/s390/appldata/appldata_mem.c b/arch/s390/appldata/appldata_mem.c
index 180ba79a6267..4811e2dac864 100644
--- a/arch/s390/appldata/appldata_mem.c
+++ b/arch/s390/appldata/appldata_mem.c
@@ -107,21 +107,21 @@ static void appldata_get_mem_data(void *data)
 	 * serialized through the appldata_ops_lock and can use static
 	 */
 	static struct sysinfo val;
-	static struct page_state ps;
+	unsigned long ev[NR_VM_EVENT_ITEMS];
 	struct appldata_mem_data *mem_data;
 
 	mem_data = data;
 	mem_data->sync_count_1++;
 
-	get_full_page_state(&ps);
-	mem_data->pgpgin = ps.pgpgin >> 1;
-	mem_data->pgpgout = ps.pgpgout >> 1;
-	mem_data->pswpin = ps.pswpin;
-	mem_data->pswpout = ps.pswpout;
-	mem_data->pgalloc = ps.pgalloc_high + ps.pgalloc_normal +
-				ps.pgalloc_dma;
-	mem_data->pgfault = ps.pgfault;
-	mem_data->pgmajfault = ps.pgmajfault;
+	all_vm_events(ev);
+	mem_data->pgpgin = ev[PGPGIN] >> 1;
+	mem_data->pgpgout = ev[PGPGOUT] >> 1;
+	mem_data->pswpin = ev[PSWPIN];
+	mem_data->pswpout = ev[PSWPOUT];
+	mem_data->pgalloc = ev[PGALLOC_HIGH] + ev[PGALLOC_NORMAL] +
+				ev[PGALLOC_DMA];
+	mem_data->pgfault = ev[PGFAULT];
+	mem_data->pgmajfault = ev[PGMAJFAULT];
 
 	si_meminfo(&val);
 	mem_data->sharedram = val.sharedram;
diff --git a/block/ll_rw_blk.c b/block/ll_rw_blk.c
index eee03a3876a3..fb83547f563e 100644
--- a/block/ll_rw_blk.c
+++ b/block/ll_rw_blk.c
@@ -3117,9 +3117,9 @@ void submit_bio(int rw, struct bio *bio)
 	BIO_BUG_ON(!bio->bi_io_vec);
 	bio->bi_rw |= rw;
 	if (rw & WRITE)
-		mod_page_state(pgpgout, count);
+		count_vm_events(PGPGOUT, count);
 	else
-		mod_page_state(pgpgin, count);
+		count_vm_events(PGPGIN, count);
 
 	if (unlikely(block_dump)) {
 		char b[BDEVNAME_SIZE];
diff --git a/drivers/parisc/led.c b/drivers/parisc/led.c
index 298f2ddb2c17..d7024c7483bd 100644
--- a/drivers/parisc/led.c
+++ b/drivers/parisc/led.c
@@ -411,16 +411,17 @@ static __inline__ int led_get_net_activity(void)
 static __inline__ int led_get_diskio_activity(void)
 {
 	static unsigned long last_pgpgin, last_pgpgout;
-	struct page_state pgstat;
+	unsigned long events[NR_VM_EVENT_ITEMS];
 	int changed;
 
-	get_full_page_state(&pgstat); /* get no of sectors in & out */
+	all_vm_events(events);
 
 	/* Just use a very simple calculation here. Do not care about overflow,
 	   since we only want to know if there was activity or not. */
-	changed = (pgstat.pgpgin != last_pgpgin) || (pgstat.pgpgout != last_pgpgout);
-	last_pgpgin = pgstat.pgpgin;
-	last_pgpgout = pgstat.pgpgout;
+	changed = (events[PGPGIN] != last_pgpgin) ||
+			(events[PGPGOUT] != last_pgpgout);
+	last_pgpgin = events[PGPGIN];
+	last_pgpgout = events[PGPGOUT];
 
 	return (changed ? LED_DISK_IO : 0);
 }
diff --git a/fs/inode.c b/fs/inode.c
index f42961eb983b..14a6c4147e4e 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -452,15 +452,14 @@ static void prune_icache(int nr_to_scan)
 		nr_pruned++;
 	}
 	inodes_stat.nr_unused -= nr_pruned;
+	if (current_is_kswapd())
+		__count_vm_events(KSWAPD_INODESTEAL, reap);
+	else
+		__count_vm_events(PGINODESTEAL, reap);
 	spin_unlock(&inode_lock);
 
 	dispose_list(&freeable);
 	mutex_unlock(&iprune_mutex);
-
-	if (current_is_kswapd())
-		mod_page_state(kswapd_inodesteal, reap);
-	else
-		mod_page_state(pginodesteal, reap);
 }
 
 /*
diff --git a/fs/ncpfs/mmap.c b/fs/ncpfs/mmap.c
index 52d60c3d8996..e7d5a3097fe6 100644
--- a/fs/ncpfs/mmap.c
+++ b/fs/ncpfs/mmap.c
@@ -93,7 +93,7 @@ static struct page* ncp_file_mmap_nopage(struct vm_area_struct *area,
 	 */
 	if (type)
 		*type = VM_FAULT_MAJOR;
-	inc_page_state(pgmajfault);
+	count_vm_event(PGMAJFAULT);
 	return page;
 }
 
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index 16173b63ee67..3e0daf54133e 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -7,115 +7,77 @@
 #include <linux/mmzone.h>
 #include <asm/atomic.h>
 
-/*
- * Global page accounting.  One instance per CPU.  Only unsigned longs are
- * allowed.
- *
- * - Fields can be modified with xxx_page_state and xxx_page_state_zone at
- *   any time safely (which protects the instance from modification by
- *   interrupt.
- * - The __xxx_page_state variants can be used safely when interrupts are
- *   disabled.
- * - The __xxx_page_state variants can be used if the field is only
- *   modified from process context and protected from preemption, or only
- *   modified from interrupt context. In this case, the field should be
- *   commented here.
- */
-struct page_state {
-	unsigned long pgpgin;			/* Disk reads */
-	unsigned long pgpgout;			/* Disk writes */
-	unsigned long pswpin;			/* swap reads */
-	unsigned long pswpout;			/* swap writes */
-
-	unsigned long pgalloc_high;		/* page allocations */
-	unsigned long pgalloc_normal;
-	unsigned long pgalloc_dma32;
-	unsigned long pgalloc_dma;
-
-	unsigned long pgfree;			/* page freeings */
-	unsigned long pgactivate;		/* pages moved inactive->active */
-	unsigned long pgdeactivate;		/* pages moved active->inactive */
-
-	unsigned long pgfault;			/* faults (major+minor) */
-	unsigned long pgmajfault;		/* faults (major only) */
-
-	unsigned long pgrefill_high;		/* inspected in refill_inactive_zone */
-	unsigned long pgrefill_normal;
-	unsigned long pgrefill_dma32;
-	unsigned long pgrefill_dma;
-
-	unsigned long pgsteal_high;		/* total highmem pages reclaimed */
-	unsigned long pgsteal_normal;
-	unsigned long pgsteal_dma32;
-	unsigned long pgsteal_dma;
-
-	unsigned long pgscan_kswapd_high;/* total highmem pages scanned */
-	unsigned long pgscan_kswapd_normal;
-	unsigned long pgscan_kswapd_dma32;
-	unsigned long pgscan_kswapd_dma;
-
-	unsigned long pgscan_direct_high;/* total highmem pages scanned */
-	unsigned long pgscan_direct_normal;
-	unsigned long pgscan_direct_dma32;
-	unsigned long pgscan_direct_dma;
-
-	unsigned long pginodesteal;	/* pages reclaimed via inode freeing */
-	unsigned long slabs_scanned;	/* slab objects scanned */
-	unsigned long kswapd_steal;	/* pages reclaimed by kswapd */
-	unsigned long kswapd_inodesteal;/* reclaimed via kswapd inode freeing */
-	unsigned long pageoutrun;	/* kswapd's calls to page reclaim */
-	unsigned long allocstall;	/* direct reclaim calls */
-
-	unsigned long pgrotated;	/* pages rotated to tail of the LRU */
-};
-
-extern void get_full_page_state(struct page_state *ret);
-extern void mod_page_state_offset(unsigned long offset, unsigned long delta);
-extern void __mod_page_state_offset(unsigned long offset, unsigned long delta);
-
-#define mod_page_state(member, delta)	\
-	mod_page_state_offset(offsetof(struct page_state, member), (delta))
-
-#define __mod_page_state(member, delta)	\
-	__mod_page_state_offset(offsetof(struct page_state, member), (delta))
-
-#define inc_page_state(member)		mod_page_state(member, 1UL)
-#define dec_page_state(member)		mod_page_state(member, 0UL - 1)
-#define add_page_state(member,delta)	mod_page_state(member, (delta))
-#define sub_page_state(member,delta)	mod_page_state(member, 0UL - (delta))
-
-#define __inc_page_state(member)	__mod_page_state(member, 1UL)
-#define __dec_page_state(member)	__mod_page_state(member, 0UL - 1)
-#define __add_page_state(member,delta)	__mod_page_state(member, (delta))
-#define __sub_page_state(member,delta)	__mod_page_state(member, 0UL - (delta))
-
-#define page_state(member) (*__page_state(offsetof(struct page_state, member)))
-
-#define state_zone_offset(zone, member)					\
-({									\
-	unsigned offset;						\
-	if (is_highmem(zone))						\
-		offset = offsetof(struct page_state, member##_high);	\
-	else if (is_normal(zone))					\
-		offset = offsetof(struct page_state, member##_normal);	\
-	else if (is_dma32(zone))					\
-		offset = offsetof(struct page_state, member##_dma32);	\
-	else								\
-		offset = offsetof(struct page_state, member##_dma);	\
-	offset;								\
-})
-
-#define __mod_page_state_zone(zone, member, delta)			\
- do {									\
-	__mod_page_state_offset(state_zone_offset(zone, member), (delta)); \
- } while (0)
-
-#define mod_page_state_zone(zone, member, delta)			\
- do {									\
-	mod_page_state_offset(state_zone_offset(zone, member), (delta)); \
- } while (0)
-
-DECLARE_PER_CPU(struct page_state, page_states);
-
+#ifdef CONFIG_VM_EVENT_COUNTERS
+/*
+ * Light weight per cpu counter implementation.
+ *
+ * Counters should only be incremented and no critical kernel component
+ * should rely on the counter values.
+ *
+ * Counters are handled completely inline. On many platforms the code
+ * generated will simply be the increment of a global address.
+ */
+
+#define FOR_ALL_ZONES(x) x##_DMA, x##_DMA32, x##_NORMAL, x##_HIGH
+
+enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
+		FOR_ALL_ZONES(PGALLOC),
+		PGFREE, PGACTIVATE, PGDEACTIVATE,
+		PGFAULT, PGMAJFAULT,
+		FOR_ALL_ZONES(PGREFILL),
+		FOR_ALL_ZONES(PGSTEAL),
+		FOR_ALL_ZONES(PGSCAN_KSWAPD),
+		FOR_ALL_ZONES(PGSCAN_DIRECT),
+		PGINODESTEAL, SLABS_SCANNED, KSWAPD_STEAL, KSWAPD_INODESTEAL,
+		PAGEOUTRUN, ALLOCSTALL, PGROTATED,
+		NR_VM_EVENT_ITEMS
+};
+
+struct vm_event_state {
+	unsigned long event[NR_VM_EVENT_ITEMS];
+};
+
+DECLARE_PER_CPU(struct vm_event_state, vm_event_states);
+
+static inline void __count_vm_event(enum vm_event_item item)
+{
+	__get_cpu_var(vm_event_states.event[item])++;
+}
+
+static inline void count_vm_event(enum vm_event_item item)
+{
+	get_cpu_var(vm_event_states.event[item])++;
+	put_cpu();
+}
+
+static inline void __count_vm_events(enum vm_event_item item, long delta)
+{
+	__get_cpu_var(vm_event_states.event[item]) += delta;
+}
+
+static inline void count_vm_events(enum vm_event_item item, long delta)
+{
+	get_cpu_var(vm_event_states.event[item])++;
+	put_cpu();
+}
+
+extern void all_vm_events(unsigned long *);
+extern void vm_events_fold_cpu(int cpu);
+
+#else
+
+/* Disable counters */
+#define get_cpu_vm_events(e)	0L
+#define count_vm_event(e)	do { } while (0)
+#define count_vm_events(e,d)	do { } while (0)
+#define __count_vm_event(e)	do { } while (0)
+#define __count_vm_events(e,d)	do { } while (0)
+#define vm_events_fold_cpu(x)	do { } while (0)
+
+#endif /* CONFIG_VM_EVENT_COUNTERS */
+
+#define __count_zone_vm_events(item, zone, delta) \
+			__count_vm_events(item##_DMA + zone_idx(zone), delta)
 
 /*
  * Zone based page accounting with per cpu differentials.
diff --git a/init/Kconfig b/init/Kconfig
index f70f2fd273c2..f515948889a7 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -379,6 +379,15 @@ config SLAB
 	   SLOB is more space efficient but does not scale well and is
 	   more susceptible to fragmentation.
 
+config VM_EVENT_COUNTERS
+	default y
+	bool "Enable VM event counters for /proc/vmstat" if EMBEDDED
+	help
+	  VM event counters are only needed to for event counts to be
+	  shown. They have no function for the kernel itself. This
+	  option allows the disabling of the VM event counters.
+	  /proc/vmstat will only show page counts.
+
 endmenu			# General setup
 
 config TINY_SHMEM
diff --git a/mm/filemap.c b/mm/filemap.c
index 87d62c44c3f0..796a5471b495 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1416,7 +1416,7 @@ retry_find:
 	 */
 	if (!did_readaround) {
 		majmin = VM_FAULT_MAJOR;
-		inc_page_state(pgmajfault);
+		count_vm_event(PGMAJFAULT);
 	}
 	did_readaround = 1;
 	ra_pages = max_sane_readahead(file->f_ra.ra_pages);
@@ -1487,7 +1487,7 @@ no_cached_page:
 page_not_uptodate:
 	if (!did_readaround) {
 		majmin = VM_FAULT_MAJOR;
-		inc_page_state(pgmajfault);
+		count_vm_event(PGMAJFAULT);
 	}
 	lock_page(page);
 
diff --git a/mm/memory.c b/mm/memory.c
index 1a78791590fa..7e2a4b1580e3 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1951,7 +1951,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
 
 		/* Had to read the page from swap area: Major fault */
 		ret = VM_FAULT_MAJOR;
-		inc_page_state(pgmajfault);
+		count_vm_event(PGMAJFAULT);
 		grab_swap_token();
 	}
 
@@ -2324,7 +2324,7 @@ int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 
 	__set_current_state(TASK_RUNNING);
 
-	inc_page_state(pgfault);
+	count_vm_event(PGFAULT);
 
 	if (unlikely(is_vm_hugetlb_page(vma)))
 		return hugetlb_fault(mm, vma, address, write_access);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index d61671260f92..30b0b97ad023 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -456,7 +456,7 @@ static void __free_pages_ok(struct page *page, unsigned int order)
 
 	kernel_map_pages(page, 1 << order, 0);
 	local_irq_save(flags);
-	__mod_page_state(pgfree, 1 << order);
+	__count_vm_events(PGFREE, 1 << order);
 	free_one_page(page_zone(page), page, order);
 	local_irq_restore(flags);
 }
@@ -729,7 +729,7 @@ static void fastcall free_hot_cold_page(struct page *page, int cold)
 
 	pcp = &zone_pcp(zone, get_cpu())->pcp[cold];
 	local_irq_save(flags);
-	__inc_page_state(pgfree);
+	__count_vm_event(PGFREE);
 	list_add(&page->lru, &pcp->list);
 	pcp->count++;
 	if (pcp->count >= pcp->high) {
@@ -805,7 +805,7 @@ again:
 		goto failed;
 	}
 
-	__mod_page_state_zone(zone, pgalloc, 1 << order);
+	__count_zone_vm_events(PGALLOC, zone, 1 << order);
 	zone_statistics(zonelist, zone);
 	local_irq_restore(flags);
 	put_cpu();
@@ -2101,24 +2101,11 @@ static int page_alloc_cpu_notify(struct notifier_block *self,
 				unsigned long action, void *hcpu)
 {
 	int cpu = (unsigned long)hcpu;
-	unsigned long *src, *dest;
 
 	if (action == CPU_DEAD) {
-		int i;
-
 		local_irq_disable();
 		__drain_pages(cpu);
-
-		/* Add dead cpu's page_states to our own. */
-		dest = (unsigned long *)&__get_cpu_var(page_states);
-		src = (unsigned long *)&per_cpu(page_states, cpu);
-
-		for (i = 0; i < sizeof(struct page_state)/sizeof(unsigned long);
-				i++) {
-			dest[i] += src[i];
-			src[i] = 0;
-		}
-
+		vm_events_fold_cpu(cpu);
 		local_irq_enable();
 		refresh_cpu_vm_stats(cpu);
 	}
diff --git a/mm/page_io.c b/mm/page_io.c
index bb2b0d53889c..88029948d00a 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -101,7 +101,7 @@ int swap_writepage(struct page *page, struct writeback_control *wbc)
 	}
 	if (wbc->sync_mode == WB_SYNC_ALL)
 		rw |= (1 << BIO_RW_SYNC);
-	inc_page_state(pswpout);
+	count_vm_event(PSWPOUT);
 	set_page_writeback(page);
 	unlock_page(page);
 	submit_bio(rw, bio);
@@ -123,7 +123,7 @@ int swap_readpage(struct file *file, struct page *page)
 		ret = -ENOMEM;
 		goto out;
 	}
-	inc_page_state(pswpin);
+	count_vm_event(PSWPIN);
 	submit_bio(READ, bio);
 out:
 	return ret;
diff --git a/mm/shmem.c b/mm/shmem.c
index b14ff817d162..a9c09e0ba709 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1045,12 +1045,12 @@ repeat:
 		swappage = lookup_swap_cache(swap);
 		if (!swappage) {
 			shmem_swp_unmap(entry);
-			spin_unlock(&info->lock);
 			/* here we actually do the io */
 			if (type && *type == VM_FAULT_MINOR) {
-				inc_page_state(pgmajfault);
+				__count_vm_event(PGMAJFAULT);
 				*type = VM_FAULT_MAJOR;
 			}
+			spin_unlock(&info->lock);
 			swappage = shmem_swapin(info, swap, idx);
 			if (!swappage) {
 				spin_lock(&info->lock);
diff --git a/mm/swap.c b/mm/swap.c
index 990868afc1c6..8fd095c4ae51 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -87,7 +87,7 @@ int rotate_reclaimable_page(struct page *page)
 	spin_lock_irqsave(&zone->lru_lock, flags);
 	if (PageLRU(page) && !PageActive(page)) {
 		list_move_tail(&page->lru, &zone->inactive_list);
-		inc_page_state(pgrotated);
+		__count_vm_event(PGROTATED);
 	}
 	if (!test_clear_page_writeback(page))
 		BUG();
@@ -107,7 +107,7 @@ void fastcall activate_page(struct page *page)
 		del_page_from_inactive_list(zone, page);
 		SetPageActive(page);
 		add_page_to_active_list(zone, page);
-		inc_page_state(pgactivate);
+		__count_vm_event(PGACTIVATE);
 	}
 	spin_unlock_irq(&zone->lru_lock);
 }
diff --git a/mm/vmscan.c b/mm/vmscan.c
index d6942436ac97..ff2ebe9458a3 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -215,7 +215,7 @@ unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
 			break;
 		if (shrink_ret < nr_before)
 			ret += nr_before - shrink_ret;
-		mod_page_state(slabs_scanned, this_scan);
+		count_vm_events(SLABS_SCANNED, this_scan);
 		total_scan -= this_scan;
 
 		cond_resched();
@@ -569,7 +569,7 @@ keep:
 	list_splice(&ret_pages, page_list);
 	if (pagevec_count(&freed_pvec))
 		__pagevec_release_nonlru(&freed_pvec);
-	mod_page_state(pgactivate, pgactivate);
+	count_vm_events(PGACTIVATE, pgactivate);
 	return nr_reclaimed;
 }
 
@@ -659,11 +659,11 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
 		nr_reclaimed += nr_freed;
 		local_irq_disable();
 		if (current_is_kswapd()) {
-			__mod_page_state_zone(zone, pgscan_kswapd, nr_scan);
-			__mod_page_state(kswapd_steal, nr_freed);
+			__count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scan);
+			__count_vm_events(KSWAPD_STEAL, nr_freed);
 		} else
-			__mod_page_state_zone(zone, pgscan_direct, nr_scan);
-		__mod_page_state_zone(zone, pgsteal, nr_freed);
+			__count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scan);
+		__count_vm_events(PGACTIVATE, nr_freed);
 
 		if (nr_taken == 0)
 			goto done;
@@ -841,11 +841,10 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
 		}
 	}
 	zone->nr_active += pgmoved;
-	spin_unlock(&zone->lru_lock);
 
-	__mod_page_state_zone(zone, pgrefill, pgscanned);
-	__mod_page_state(pgdeactivate, pgdeactivate);
-	local_irq_enable();
+	__count_zone_vm_events(PGREFILL, zone, pgscanned);
+	__count_vm_events(PGDEACTIVATE, pgdeactivate);
+	spin_unlock_irq(&zone->lru_lock);
 
 	pagevec_release(&pvec);
 }
@@ -977,7 +976,7 @@ unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
 		.swappiness = vm_swappiness,
 	};
 
-	inc_page_state(allocstall);
+	count_vm_event(ALLOCSTALL);
 
 	for (i = 0; zones[i] != NULL; i++) {
 		struct zone *zone = zones[i];
@@ -1074,7 +1073,7 @@ loop_again:
 	total_scanned = 0;
 	nr_reclaimed = 0;
 	sc.may_writepage = !laptop_mode;
-	inc_page_state(pageoutrun);
+	count_vm_event(PAGEOUTRUN);
 
 	for (i = 0; i < pgdat->nr_zones; i++) {
 		struct zone *zone = pgdat->node_zones + i;
diff --git a/mm/vmstat.c b/mm/vmstat.c
index ee7f89666250..73b83d67bab6 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -13,66 +13,6 @@
 #include <linux/mm.h>
 #include <linux/module.h>
 
-/*
- * Accumulate the page_state information across all CPUs.
- * The result is unavoidably approximate - it can change
- * during and after execution of this function.
- */
-DEFINE_PER_CPU(struct page_state, page_states) = {0};
-
-static void __get_page_state(struct page_state *ret, int nr, cpumask_t *cpumask)
-{
-	unsigned cpu;
-
-	memset(ret, 0, nr * sizeof(unsigned long));
-	cpus_and(*cpumask, *cpumask, cpu_online_map);
-
-	for_each_cpu_mask(cpu, *cpumask) {
-		unsigned long *in;
-		unsigned long *out;
-		unsigned off;
-		unsigned next_cpu;
-
-		in = (unsigned long *)&per_cpu(page_states, cpu);
-
-		next_cpu = next_cpu(cpu, *cpumask);
-		if (likely(next_cpu < NR_CPUS))
-			prefetch(&per_cpu(page_states, next_cpu));
-
-		out = (unsigned long *)ret;
-		for (off = 0; off < nr; off++)
-			*out++ += *in++;
-	}
-}
-
-void get_full_page_state(struct page_state *ret)
-{
-	cpumask_t mask = CPU_MASK_ALL;
-
-	__get_page_state(ret, sizeof(*ret) / sizeof(unsigned long), &mask);
-}
-
-void __mod_page_state_offset(unsigned long offset, unsigned long delta)
-{
-	void *ptr;
-
-	ptr = &__get_cpu_var(page_states);
-	*(unsigned long *)(ptr + offset) += delta;
-}
-EXPORT_SYMBOL(__mod_page_state_offset);
-
-void mod_page_state_offset(unsigned long offset, unsigned long delta)
-{
-	unsigned long flags;
-	void *ptr;
-
-	local_irq_save(flags);
-	ptr = &__get_cpu_var(page_states);
-	*(unsigned long *)(ptr + offset) += delta;
-	local_irq_restore(flags);
-}
-EXPORT_SYMBOL(mod_page_state_offset);
-
 void __get_zone_counts(unsigned long *active, unsigned long *inactive,
 			unsigned long *free, struct pglist_data *pgdat)
 {
@@ -106,6 +46,63 @@ void get_zone_counts(unsigned long *active,
 	}
 }
 
+#ifdef CONFIG_VM_EVENT_COUNTERS
+DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}};
+EXPORT_PER_CPU_SYMBOL(vm_event_states);
+
+static void sum_vm_events(unsigned long *ret, cpumask_t *cpumask)
+{
+	int cpu = 0;
+	int i;
+
+	memset(ret, 0, NR_VM_EVENT_ITEMS * sizeof(unsigned long));
+
+	cpu = first_cpu(*cpumask);
+	while (cpu < NR_CPUS) {
+		struct vm_event_state *this = &per_cpu(vm_event_states, cpu);
+
+		cpu = next_cpu(cpu, *cpumask);
+
+		if (cpu < NR_CPUS)
+			prefetch(&per_cpu(vm_event_states, cpu));
+
+
+		for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
+			ret[i] += this->event[i];
+	}
+}
+
+/*
+ * Accumulate the vm event counters across all CPUs.
+ * The result is unavoidably approximate - it can change
+ * during and after execution of this function.
+*/
+void all_vm_events(unsigned long *ret)
+{
+	sum_vm_events(ret, &cpu_online_map);
+}
+
+#ifdef CONFIG_HOTPLUG
+/*
+ * Fold the foreign cpu events into our own.
+ *
+ * This is adding to the events on one processor
+ * but keeps the global counts constant.
+ */
+void vm_events_fold_cpu(int cpu)
+{
+	struct vm_event_state *fold_state = &per_cpu(vm_event_states, cpu);
+	int i;
+
+	for (i = 0; i < NR_VM_EVENT_ITEMS; i++) {
+		count_vm_events(i, fold_state->event[i]);
+		fold_state->event[i] = 0;
+	}
+}
+#endif /* CONFIG_HOTPLUG */
+
+#endif /* CONFIG_VM_EVENT_COUNTERS */
+
 /*
  * Manage combined zone based / global counters
  *
@@ -405,16 +402,16 @@ static char *vmstat_text[] = {
405 "numa_other", 402 "numa_other",
406#endif 403#endif
407 404
408 /* Event counters */ 405#ifdef CONFIG_VM_EVENT_COUNTERS
409 "pgpgin", 406 "pgpgin",
410 "pgpgout", 407 "pgpgout",
411 "pswpin", 408 "pswpin",
412 "pswpout", 409 "pswpout",
413 410
414 "pgalloc_high",
415 "pgalloc_normal",
416 "pgalloc_dma32",
417 "pgalloc_dma", 411 "pgalloc_dma",
412 "pgalloc_dma32",
413 "pgalloc_normal",
414 "pgalloc_high",
418 415
419 "pgfree", 416 "pgfree",
420 "pgactivate", 417 "pgactivate",
@@ -423,25 +420,25 @@ static char *vmstat_text[] = {
423 "pgfault", 420 "pgfault",
424 "pgmajfault", 421 "pgmajfault",
425 422
426 "pgrefill_high",
427 "pgrefill_normal",
428 "pgrefill_dma32",
429 "pgrefill_dma", 423 "pgrefill_dma",
424 "pgrefill_dma32",
425 "pgrefill_normal",
426 "pgrefill_high",
430 427
431 "pgsteal_high",
432 "pgsteal_normal",
433 "pgsteal_dma32",
434 "pgsteal_dma", 428 "pgsteal_dma",
429 "pgsteal_dma32",
430 "pgsteal_normal",
431 "pgsteal_high",
435 432
436 "pgscan_kswapd_high",
437 "pgscan_kswapd_normal",
438 "pgscan_kswapd_dma32",
439 "pgscan_kswapd_dma", 433 "pgscan_kswapd_dma",
434 "pgscan_kswapd_dma32",
435 "pgscan_kswapd_normal",
436 "pgscan_kswapd_high",
440 437
441 "pgscan_direct_high",
442 "pgscan_direct_normal",
443 "pgscan_direct_dma32",
444 "pgscan_direct_dma", 438 "pgscan_direct_dma",
439 "pgscan_direct_dma32",
440 "pgscan_direct_normal",
441 "pgscan_direct_high",
445 442
446 "pginodesteal", 443 "pginodesteal",
447 "slabs_scanned", 444 "slabs_scanned",
@@ -451,6 +448,7 @@ static char *vmstat_text[] = {
451 "allocstall", 448 "allocstall",
452 449
453 "pgrotated", 450 "pgrotated",
451#endif
454}; 452};
455 453
456/* 454/*
@@ -553,23 +551,32 @@ struct seq_operations zoneinfo_op = {
 static void *vmstat_start(struct seq_file *m, loff_t *pos)
 {
 	unsigned long *v;
-	struct page_state *ps;
+#ifdef CONFIG_VM_EVENT_COUNTERS
+	unsigned long *e;
+#endif
 	int i;
 
 	if (*pos >= ARRAY_SIZE(vmstat_text))
 		return NULL;
 
+#ifdef CONFIG_VM_EVENT_COUNTERS
 	v = kmalloc(NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long)
-			+ sizeof(*ps), GFP_KERNEL);
+			+ sizeof(struct vm_event_state), GFP_KERNEL);
+#else
+	v = kmalloc(NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long),
+			GFP_KERNEL);
+#endif
 	m->private = v;
 	if (!v)
 		return ERR_PTR(-ENOMEM);
 	for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
 		v[i] = global_page_state(i);
-	ps = (struct page_state *)(v + NR_VM_ZONE_STAT_ITEMS);
-	get_full_page_state(ps);
-	ps->pgpgin /= 2;		/* sectors -> kbytes */
-	ps->pgpgout /= 2;
+#ifdef CONFIG_VM_EVENT_COUNTERS
+	e = v + NR_VM_ZONE_STAT_ITEMS;
+	all_vm_events(e);
+	e[PGPGIN] /= 2;		/* sectors -> kbytes */
+	e[PGPGOUT] /= 2;
+#endif
 	return v + *pos;
 }
 