author		Christoph Lameter <clameter@sgi.com>	2006-06-30 04:55:32 -0400
committer	Linus Torvalds <torvalds@g5.osdl.org>	2006-06-30 14:25:34 -0400
commit		f6ac2354d791195ca40822b84d73d48a4e8b7f2b (patch)
tree		5f600175cf3591eac3d32bb8cebfd45d0aabf804 /mm/vmstat.c
parent		672b2714ae57af16fe7d760dc4e0918a7a6cb0fa (diff)
[PATCH] zoned vm counters: create vmstat.c/.h from page_alloc.c/.h
NOTE: ZVC are *not* the lightweight event counters.  ZVCs are reliable whereas
event counters do not need to be.

Zone based VM statistics are necessary to be able to determine what the state
of memory in one zone is.  In a NUMA system this can be helpful for local
reclaim and other memory optimizations that may be able to shift VM load in
order to get more balanced memory use.

It is also useful to know how the computing load affects the memory
allocations on various zones.  This patchset allows the retrieval of that
data from userspace.

The patchset introduces a framework for counters that is a cross between the
existing page_stats --which are simply global counters split per cpu-- and
the approach of deferred incremental updates implemented for nr_pagecache.

Small per cpu 8 bit counters are added to struct zone.  If a counter exceeds
certain thresholds then the counters are accumulated in an array of
atomic_long in the zone and in a global array that sums up all zone values.
The small 8 bit counters are next to the per cpu page pointers and so they
will be high in the cpu cache when pages are allocated and freed.

Access to VM counter information for a zone and for the whole machine is then
possible by simply indexing an array (thanks to Nick Piggin for pointing out
that approach).  Access to the total number of pages of various types no
longer requires summing up all per cpu counters.

Benefits of this patchset right now:

- Ability for UP and SMP configurations to determine how memory
  is balanced between the DMA, NORMAL and HIGHMEM zones.

- Loops over all processors are avoided in writeback and
  reclaim paths.  We can avoid caching the writeback information
  because the needed information is directly accessible.

- Special handling for nr_pagecache removed.

- zone_reclaim_interval vanishes since VM stats
  can now determine when it is worth doing local reclaim.

- Fast inline per node page state determination.

- Accurate counters in /sys/devices/system/node/node*/meminfo.  Current
  counters simply count which processor allocated a page somewhere and
  guesstimate based on that.  So the counters were not useful to show the
  actual distribution of page use on a specific zone.

- The swap_prefetch patch requires per node statistics in order to
  figure out when processors of a node can prefetch.  This patch provides
  some of the needed numbers.

- Detailed VM counters available in more /proc and /sys status files.

References to earlier discussions:

V1 http://marc.theaimsgroup.com/?l=linux-kernel&m=113511649910826&w=2
V2 http://marc.theaimsgroup.com/?l=linux-kernel&m=114980851924230&w=2
V3 http://marc.theaimsgroup.com/?l=linux-kernel&m=115014697910351&w=2
V4 http://marc.theaimsgroup.com/?l=linux-kernel&m=115024767318740&w=2

Performance tests with AIM7 did not show any regressions; it even seems to be
a tad faster.  Tested on ia64/NUMA.  Builds fine on i386 SMP/UP.  Includes
fixes for s390/arm/uml arch code.

This patch:

Move counter code from page_alloc.c/page-flags.h to vmstat.c/h.

Create vmstat.c/vmstat.h by separating the counter code and the proc
functions.

Move the vm_stat_text array before zoneinfo_show.

[akpm@osdl.org: s390 build fix]
[akpm@osdl.org: HOTPLUG_CPU build fix]
Signed-off-by: Christoph Lameter <clameter@sgi.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Trond Myklebust <trond.myklebust@fys.uio.no>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
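
Editor's aside: the deferred-update scheme the message describes (small per
cpu deltas folded into zone-wide and machine-wide totals once a threshold is
crossed) can be pictured with a minimal C sketch.  This is only an
illustrative toy, not code from this patch: the names toy_zone, toy_zvc_add,
toy_zone_page_state, ZVC_THRESHOLD and NR_TOY_ITEMS are invented here, and a
single delta stands in for what is really per-cpu storage inside struct zone
in later patches of this series.

/*
 * Illustrative sketch only -- all names invented; the real zoned VM
 * counters use per-cpu deltas and atomic_long_t inside struct zone.
 */
#include <stdatomic.h>

enum toy_item { TOY_NR_MAPPED, TOY_NR_SLAB, NR_TOY_ITEMS };

#define ZVC_THRESHOLD	32	/* fold deltas into shared totals past this */

struct toy_zone {
	signed char pcp_diff[NR_TOY_ITEMS];	/* cheap 8-bit delta (per cpu in reality) */
	atomic_long vm_stat[NR_TOY_ITEMS];	/* zone-wide totals */
};

static atomic_long toy_global_stat[NR_TOY_ITEMS];	/* machine-wide totals */

/* Account a small change cheaply; touch the shared arrays only when the
 * local delta grows beyond the threshold. */
static void toy_zvc_add(struct toy_zone *zone, enum toy_item item, int delta)
{
	int d = zone->pcp_diff[item] + delta;

	if (d > ZVC_THRESHOLD || d < -ZVC_THRESHOLD) {
		atomic_fetch_add(&zone->vm_stat[item], d);
		atomic_fetch_add(&toy_global_stat[item], d);
		d = 0;
	}
	zone->pcp_diff[item] = d;
}

/* Reading a counter is then a single array lookup rather than a sum over
 * every cpu's private state. */
static long toy_zone_page_state(struct toy_zone *zone, enum toy_item item)
{
	return atomic_load(&zone->vm_stat[item]);
}

The point of the sketch is the read side: once deltas are folded eagerly
enough, consumers such as the reclaim and writeback paths only index an
array, which is what eliminates the per-cpu summing loops mentioned in the
benefits list above.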
Diffstat (limited to 'mm/vmstat.c')
-rw-r--r--	mm/vmstat.c	417
1 file changed, 417 insertions(+), 0 deletions(-)
diff --git a/mm/vmstat.c b/mm/vmstat.c
new file mode 100644
index 000000000000..ad456202ff1a
--- /dev/null
+++ b/mm/vmstat.c
@@ -0,0 +1,417 @@
/*
 *  linux/mm/vmstat.c
 *
 *  Manages VM statistics
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 */

#include <linux/config.h>
#include <linux/mm.h>

/*
 * Accumulate the page_state information across all CPUs.
 * The result is unavoidably approximate - it can change
 * during and after execution of this function.
 */
DEFINE_PER_CPU(struct page_state, page_states) = {0};

atomic_t nr_pagecache = ATOMIC_INIT(0);
EXPORT_SYMBOL(nr_pagecache);
#ifdef CONFIG_SMP
DEFINE_PER_CPU(long, nr_pagecache_local) = 0;
#endif

static void __get_page_state(struct page_state *ret, int nr, cpumask_t *cpumask)
{
	unsigned cpu;

	memset(ret, 0, nr * sizeof(unsigned long));
	cpus_and(*cpumask, *cpumask, cpu_online_map);

	for_each_cpu_mask(cpu, *cpumask) {
		unsigned long *in;
		unsigned long *out;
		unsigned off;
		unsigned next_cpu;

		in = (unsigned long *)&per_cpu(page_states, cpu);

		next_cpu = next_cpu(cpu, *cpumask);
		if (likely(next_cpu < NR_CPUS))
			prefetch(&per_cpu(page_states, next_cpu));

		out = (unsigned long *)ret;
		for (off = 0; off < nr; off++)
			*out++ += *in++;
	}
}

void get_page_state_node(struct page_state *ret, int node)
{
	int nr;
	cpumask_t mask = node_to_cpumask(node);

	nr = offsetof(struct page_state, GET_PAGE_STATE_LAST);
	nr /= sizeof(unsigned long);

	__get_page_state(ret, nr+1, &mask);
}

void get_page_state(struct page_state *ret)
{
	int nr;
	cpumask_t mask = CPU_MASK_ALL;

	nr = offsetof(struct page_state, GET_PAGE_STATE_LAST);
	nr /= sizeof(unsigned long);

	__get_page_state(ret, nr + 1, &mask);
}

void get_full_page_state(struct page_state *ret)
{
	cpumask_t mask = CPU_MASK_ALL;

	__get_page_state(ret, sizeof(*ret) / sizeof(unsigned long), &mask);
}

unsigned long read_page_state_offset(unsigned long offset)
{
	unsigned long ret = 0;
	int cpu;

	for_each_online_cpu(cpu) {
		unsigned long in;

		in = (unsigned long)&per_cpu(page_states, cpu) + offset;
		ret += *((unsigned long *)in);
	}
	return ret;
}

void __mod_page_state_offset(unsigned long offset, unsigned long delta)
{
	void *ptr;

	ptr = &__get_cpu_var(page_states);
	*(unsigned long *)(ptr + offset) += delta;
}
EXPORT_SYMBOL(__mod_page_state_offset);

void mod_page_state_offset(unsigned long offset, unsigned long delta)
{
	unsigned long flags;
	void *ptr;

	local_irq_save(flags);
	ptr = &__get_cpu_var(page_states);
	*(unsigned long *)(ptr + offset) += delta;
	local_irq_restore(flags);
}
EXPORT_SYMBOL(mod_page_state_offset);

void __get_zone_counts(unsigned long *active, unsigned long *inactive,
			unsigned long *free, struct pglist_data *pgdat)
{
	struct zone *zones = pgdat->node_zones;
	int i;

	*active = 0;
	*inactive = 0;
	*free = 0;
	for (i = 0; i < MAX_NR_ZONES; i++) {
		*active += zones[i].nr_active;
		*inactive += zones[i].nr_inactive;
		*free += zones[i].free_pages;
	}
}

void get_zone_counts(unsigned long *active,
		unsigned long *inactive, unsigned long *free)
{
	struct pglist_data *pgdat;

	*active = 0;
	*inactive = 0;
	*free = 0;
	for_each_online_pgdat(pgdat) {
		unsigned long l, m, n;
		__get_zone_counts(&l, &m, &n, pgdat);
		*active += l;
		*inactive += m;
		*free += n;
	}
}

#ifdef CONFIG_PROC_FS

#include <linux/seq_file.h>

static void *frag_start(struct seq_file *m, loff_t *pos)
{
	pg_data_t *pgdat;
	loff_t node = *pos;
	for (pgdat = first_online_pgdat();
	     pgdat && node;
	     pgdat = next_online_pgdat(pgdat))
		--node;

	return pgdat;
}

static void *frag_next(struct seq_file *m, void *arg, loff_t *pos)
{
	pg_data_t *pgdat = (pg_data_t *)arg;

	(*pos)++;
	return next_online_pgdat(pgdat);
}

static void frag_stop(struct seq_file *m, void *arg)
{
}

/*
 * This walks the free areas for each zone.
 */
static int frag_show(struct seq_file *m, void *arg)
{
	pg_data_t *pgdat = (pg_data_t *)arg;
	struct zone *zone;
	struct zone *node_zones = pgdat->node_zones;
	unsigned long flags;
	int order;

	for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
		if (!populated_zone(zone))
			continue;

		spin_lock_irqsave(&zone->lock, flags);
		seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
		for (order = 0; order < MAX_ORDER; ++order)
			seq_printf(m, "%6lu ", zone->free_area[order].nr_free);
		spin_unlock_irqrestore(&zone->lock, flags);
		seq_putc(m, '\n');
	}
	return 0;
}

struct seq_operations fragmentation_op = {
	.start	= frag_start,
	.next	= frag_next,
	.stop	= frag_stop,
	.show	= frag_show,
};

static char *vmstat_text[] = {
	"nr_dirty",
	"nr_writeback",
	"nr_unstable",
	"nr_page_table_pages",
	"nr_mapped",
	"nr_slab",

	"pgpgin",
	"pgpgout",
	"pswpin",
	"pswpout",

	"pgalloc_high",
	"pgalloc_normal",
	"pgalloc_dma32",
	"pgalloc_dma",

	"pgfree",
	"pgactivate",
	"pgdeactivate",

	"pgfault",
	"pgmajfault",

	"pgrefill_high",
	"pgrefill_normal",
	"pgrefill_dma32",
	"pgrefill_dma",

	"pgsteal_high",
	"pgsteal_normal",
	"pgsteal_dma32",
	"pgsteal_dma",

	"pgscan_kswapd_high",
	"pgscan_kswapd_normal",
	"pgscan_kswapd_dma32",
	"pgscan_kswapd_dma",

	"pgscan_direct_high",
	"pgscan_direct_normal",
	"pgscan_direct_dma32",
	"pgscan_direct_dma",

	"pginodesteal",
	"slabs_scanned",
	"kswapd_steal",
	"kswapd_inodesteal",
	"pageoutrun",
	"allocstall",

	"pgrotated",
	"nr_bounce",
};

/*
 * Output information about zones in @pgdat.
 */
static int zoneinfo_show(struct seq_file *m, void *arg)
{
	pg_data_t *pgdat = arg;
	struct zone *zone;
	struct zone *node_zones = pgdat->node_zones;
	unsigned long flags;

	for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; zone++) {
		int i;

		if (!populated_zone(zone))
			continue;

		spin_lock_irqsave(&zone->lock, flags);
		seq_printf(m, "Node %d, zone %8s", pgdat->node_id, zone->name);
		seq_printf(m,
			   "\n pages free %lu"
			   "\n min %lu"
			   "\n low %lu"
			   "\n high %lu"
			   "\n active %lu"
			   "\n inactive %lu"
			   "\n scanned %lu (a: %lu i: %lu)"
			   "\n spanned %lu"
			   "\n present %lu",
			   zone->free_pages,
			   zone->pages_min,
			   zone->pages_low,
			   zone->pages_high,
			   zone->nr_active,
			   zone->nr_inactive,
			   zone->pages_scanned,
			   zone->nr_scan_active, zone->nr_scan_inactive,
			   zone->spanned_pages,
			   zone->present_pages);
		seq_printf(m,
			   "\n protection: (%lu",
			   zone->lowmem_reserve[0]);
		for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++)
			seq_printf(m, ", %lu", zone->lowmem_reserve[i]);
		seq_printf(m,
			   ")"
			   "\n pagesets");
		for_each_online_cpu(i) {
			struct per_cpu_pageset *pageset;
			int j;

			pageset = zone_pcp(zone, i);
			for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) {
				if (pageset->pcp[j].count)
					break;
			}
			if (j == ARRAY_SIZE(pageset->pcp))
				continue;
			for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) {
				seq_printf(m,
					   "\n cpu: %i pcp: %i"
					   "\n count: %i"
					   "\n high: %i"
					   "\n batch: %i",
					   i, j,
					   pageset->pcp[j].count,
					   pageset->pcp[j].high,
					   pageset->pcp[j].batch);
			}
#ifdef CONFIG_NUMA
			seq_printf(m,
				   "\n numa_hit: %lu"
				   "\n numa_miss: %lu"
				   "\n numa_foreign: %lu"
				   "\n interleave_hit: %lu"
				   "\n local_node: %lu"
				   "\n other_node: %lu",
				   pageset->numa_hit,
				   pageset->numa_miss,
				   pageset->numa_foreign,
				   pageset->interleave_hit,
				   pageset->local_node,
				   pageset->other_node);
#endif
		}
		seq_printf(m,
			   "\n all_unreclaimable: %u"
			   "\n prev_priority: %i"
			   "\n temp_priority: %i"
			   "\n start_pfn: %lu",
			   zone->all_unreclaimable,
			   zone->prev_priority,
			   zone->temp_priority,
			   zone->zone_start_pfn);
		spin_unlock_irqrestore(&zone->lock, flags);
		seq_putc(m, '\n');
	}
	return 0;
}

struct seq_operations zoneinfo_op = {
	.start	= frag_start, /* iterate over all zones. The same as in
			       * fragmentation. */
	.next	= frag_next,
	.stop	= frag_stop,
	.show	= zoneinfo_show,
};

static void *vmstat_start(struct seq_file *m, loff_t *pos)
{
	struct page_state *ps;

	if (*pos >= ARRAY_SIZE(vmstat_text))
		return NULL;

	ps = kmalloc(sizeof(*ps), GFP_KERNEL);
	m->private = ps;
	if (!ps)
		return ERR_PTR(-ENOMEM);
	get_full_page_state(ps);
	ps->pgpgin /= 2;	/* sectors -> kbytes */
	ps->pgpgout /= 2;
	return (unsigned long *)ps + *pos;
}

static void *vmstat_next(struct seq_file *m, void *arg, loff_t *pos)
{
	(*pos)++;
	if (*pos >= ARRAY_SIZE(vmstat_text))
		return NULL;
	return (unsigned long *)m->private + *pos;
}

static int vmstat_show(struct seq_file *m, void *arg)
{
	unsigned long *l = arg;
	unsigned long off = l - (unsigned long *)m->private;

	seq_printf(m, "%s %lu\n", vmstat_text[off], *l);
	return 0;
}

static void vmstat_stop(struct seq_file *m, void *arg)
{
	kfree(m->private);
	m->private = NULL;
}

struct seq_operations vmstat_op = {
	.start	= vmstat_start,
	.next	= vmstat_next,
	.stop	= vmstat_stop,
	.show	= vmstat_show,
};

#endif /* CONFIG_PROC_FS */
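
Editor's aside: as a hedged usage sketch (ordinary portable userspace C, not
part of the patch), here is how the seq_file interface served by vmstat_show()
above can be consumed.  It assumes the counters are exposed through
/proc/vmstat; each line is emitted as "%s %lu\n", i.e. a counter name followed
by its value.

#include <stdio.h>

/* Minimal reader of /proc/vmstat; the "name value" line format matches the
 * seq_printf(m, "%s %lu\n", ...) call in vmstat_show() above. */
int main(void)
{
	FILE *f = fopen("/proc/vmstat", "r");
	char name[64];
	unsigned long value;

	if (!f) {
		perror("/proc/vmstat");
		return 1;
	}
	while (fscanf(f, "%63s %lu", name, &value) == 2)
		printf("%-24s %lu\n", name, value);
	fclose(f);
	return 0;
}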