aboutsummaryrefslogtreecommitdiffstats
path: root/mm/page_alloc.c
diff options
context:
space:
mode:
authorChristoph Lameter <clameter@sgi.com>2006-06-30 04:55:32 -0400
committerLinus Torvalds <torvalds@g5.osdl.org>2006-06-30 14:25:34 -0400
commitf6ac2354d791195ca40822b84d73d48a4e8b7f2b (patch)
tree5f600175cf3591eac3d32bb8cebfd45d0aabf804 /mm/page_alloc.c
parent672b2714ae57af16fe7d760dc4e0918a7a6cb0fa (diff)
[PATCH] zoned vm counters: create vmstat.c/.h from page_alloc.c/.h
NOTE: ZVCs are *not* the lightweight event counters. ZVCs are reliable whereas event counters do not need to be. Zone based VM statistics are necessary to be able to determine what the state of memory in one zone is. In a NUMA system this can be helpful for local reclaim and other memory optimizations that may be able to shift VM load in order to get more balanced memory use. It is also useful to know how the computing load affects the memory allocations on various zones. This patchset allows the retrieval of that data from userspace. The patchset introduces a framework for counters that is a cross between the existing page_stats --which are simply global counters split per cpu-- and the approach of deferred incremental updates implemented for nr_pagecache. Small per cpu 8 bit counters are added to struct zone. If the counter exceeds certain thresholds then the counters are accumulated in an array of atomic_long in the zone and in a global array that sums up all zone values. The small 8 bit counters are next to the per cpu page pointers and so they will be high in the cpu cache when pages are allocated and freed. Access to VM counter information for a zone and for the whole machine is then possible by simply indexing an array (Thanks to Nick Piggin for pointing out that approach). The access to the total number of pages of various types no longer requires the summing up of all per cpu counters. Benefits of this patchset right now: - Ability for UP and SMP configurations to determine how memory is balanced between the DMA, NORMAL and HIGHMEM zones. - Loops over all processors are avoided in writeback and reclaim paths. We can avoid caching the writeback information because the needed information is directly accessible. - Special handling for nr_pagecache removed. - zone_reclaim_interval vanishes since VM stats can now determine when it is worth doing local reclaim. - Fast inline per node page state determination. 
- Accurate counters in /sys/devices/system/node/node*/meminfo. Current counters simply count which processor allocated a page somewhere and guesstimate based on that. So the counters were not useful to show the actual distribution of page use on a specific zone. - The swap_prefetch patch requires per node statistics in order to figure out when processors of a node can prefetch. This patch provides some of the needed numbers. - Detailed VM counters available in more /proc and /sys status files. References to earlier discussions: V1 http://marc.theaimsgroup.com/?l=linux-kernel&m=113511649910826&w=2 V2 http://marc.theaimsgroup.com/?l=linux-kernel&m=114980851924230&w=2 V3 http://marc.theaimsgroup.com/?l=linux-kernel&m=115014697910351&w=2 V4 http://marc.theaimsgroup.com/?l=linux-kernel&m=115024767318740&w=2 Performance tests with AIM7 did not show any regressions. Seems to be a tad faster even. Tested on ia64/NUMA. Builds fine on i386, SMP / UP. Includes fixes for s390/arm/uml arch code. This patch: Move counter code from page_alloc.c/page-flags.h to vmstat.c/h. Create vmstat.c/vmstat.h by separating the counter code and the proc functions. Move the vm_stat_text array before zoneinfo_show. [akpm@osdl.org: s390 build fix] [akpm@osdl.org: HOTPLUG_CPU build fix] Signed-off-by: Christoph Lameter <clameter@sgi.com> Cc: Heiko Carstens <heiko.carstens@de.ibm.com> Cc: Martin Schwidefsky <schwidefsky@de.ibm.com> Cc: Trond Myklebust <trond.myklebust@fys.uio.no> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r--mm/page_alloc.c407
1 files changed, 0 insertions, 407 deletions
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 084a2de7e52a..87dc1297fe39 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1231,141 +1231,6 @@ static void show_node(struct zone *zone)
1231#define show_node(zone) do { } while (0) 1231#define show_node(zone) do { } while (0)
1232#endif 1232#endif
1233 1233
1234/*
1235 * Accumulate the page_state information across all CPUs.
1236 * The result is unavoidably approximate - it can change
1237 * during and after execution of this function.
1238 */
1239static DEFINE_PER_CPU(struct page_state, page_states) = {0};
1240
1241atomic_t nr_pagecache = ATOMIC_INIT(0);
1242EXPORT_SYMBOL(nr_pagecache);
1243#ifdef CONFIG_SMP
1244DEFINE_PER_CPU(long, nr_pagecache_local) = 0;
1245#endif
1246
1247static void __get_page_state(struct page_state *ret, int nr, cpumask_t *cpumask)
1248{
1249 unsigned cpu;
1250
1251 memset(ret, 0, nr * sizeof(unsigned long));
1252 cpus_and(*cpumask, *cpumask, cpu_online_map);
1253
1254 for_each_cpu_mask(cpu, *cpumask) {
1255 unsigned long *in;
1256 unsigned long *out;
1257 unsigned off;
1258 unsigned next_cpu;
1259
1260 in = (unsigned long *)&per_cpu(page_states, cpu);
1261
1262 next_cpu = next_cpu(cpu, *cpumask);
1263 if (likely(next_cpu < NR_CPUS))
1264 prefetch(&per_cpu(page_states, next_cpu));
1265
1266 out = (unsigned long *)ret;
1267 for (off = 0; off < nr; off++)
1268 *out++ += *in++;
1269 }
1270}
1271
1272void get_page_state_node(struct page_state *ret, int node)
1273{
1274 int nr;
1275 cpumask_t mask = node_to_cpumask(node);
1276
1277 nr = offsetof(struct page_state, GET_PAGE_STATE_LAST);
1278 nr /= sizeof(unsigned long);
1279
1280 __get_page_state(ret, nr+1, &mask);
1281}
1282
1283void get_page_state(struct page_state *ret)
1284{
1285 int nr;
1286 cpumask_t mask = CPU_MASK_ALL;
1287
1288 nr = offsetof(struct page_state, GET_PAGE_STATE_LAST);
1289 nr /= sizeof(unsigned long);
1290
1291 __get_page_state(ret, nr + 1, &mask);
1292}
1293
1294void get_full_page_state(struct page_state *ret)
1295{
1296 cpumask_t mask = CPU_MASK_ALL;
1297
1298 __get_page_state(ret, sizeof(*ret) / sizeof(unsigned long), &mask);
1299}
1300
1301unsigned long read_page_state_offset(unsigned long offset)
1302{
1303 unsigned long ret = 0;
1304 int cpu;
1305
1306 for_each_online_cpu(cpu) {
1307 unsigned long in;
1308
1309 in = (unsigned long)&per_cpu(page_states, cpu) + offset;
1310 ret += *((unsigned long *)in);
1311 }
1312 return ret;
1313}
1314
1315void __mod_page_state_offset(unsigned long offset, unsigned long delta)
1316{
1317 void *ptr;
1318
1319 ptr = &__get_cpu_var(page_states);
1320 *(unsigned long *)(ptr + offset) += delta;
1321}
1322EXPORT_SYMBOL(__mod_page_state_offset);
1323
1324void mod_page_state_offset(unsigned long offset, unsigned long delta)
1325{
1326 unsigned long flags;
1327 void *ptr;
1328
1329 local_irq_save(flags);
1330 ptr = &__get_cpu_var(page_states);
1331 *(unsigned long *)(ptr + offset) += delta;
1332 local_irq_restore(flags);
1333}
1334EXPORT_SYMBOL(mod_page_state_offset);
1335
1336void __get_zone_counts(unsigned long *active, unsigned long *inactive,
1337 unsigned long *free, struct pglist_data *pgdat)
1338{
1339 struct zone *zones = pgdat->node_zones;
1340 int i;
1341
1342 *active = 0;
1343 *inactive = 0;
1344 *free = 0;
1345 for (i = 0; i < MAX_NR_ZONES; i++) {
1346 *active += zones[i].nr_active;
1347 *inactive += zones[i].nr_inactive;
1348 *free += zones[i].free_pages;
1349 }
1350}
1351
1352void get_zone_counts(unsigned long *active,
1353 unsigned long *inactive, unsigned long *free)
1354{
1355 struct pglist_data *pgdat;
1356
1357 *active = 0;
1358 *inactive = 0;
1359 *free = 0;
1360 for_each_online_pgdat(pgdat) {
1361 unsigned long l, m, n;
1362 __get_zone_counts(&l, &m, &n, pgdat);
1363 *active += l;
1364 *inactive += m;
1365 *free += n;
1366 }
1367}
1368
1369void si_meminfo(struct sysinfo *val) 1234void si_meminfo(struct sysinfo *val)
1370{ 1235{
1371 val->totalram = totalram_pages; 1236 val->totalram = totalram_pages;
@@ -2253,278 +2118,6 @@ void __init free_area_init(unsigned long *zones_size)
2253 __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL); 2118 __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL);
2254} 2119}
2255 2120
2256#ifdef CONFIG_PROC_FS
2257
2258#include <linux/seq_file.h>
2259
2260static void *frag_start(struct seq_file *m, loff_t *pos)
2261{
2262 pg_data_t *pgdat;
2263 loff_t node = *pos;
2264 for (pgdat = first_online_pgdat();
2265 pgdat && node;
2266 pgdat = next_online_pgdat(pgdat))
2267 --node;
2268
2269 return pgdat;
2270}
2271
2272static void *frag_next(struct seq_file *m, void *arg, loff_t *pos)
2273{
2274 pg_data_t *pgdat = (pg_data_t *)arg;
2275
2276 (*pos)++;
2277 return next_online_pgdat(pgdat);
2278}
2279
2280static void frag_stop(struct seq_file *m, void *arg)
2281{
2282}
2283
2284/*
2285 * This walks the free areas for each zone.
2286 */
2287static int frag_show(struct seq_file *m, void *arg)
2288{
2289 pg_data_t *pgdat = (pg_data_t *)arg;
2290 struct zone *zone;
2291 struct zone *node_zones = pgdat->node_zones;
2292 unsigned long flags;
2293 int order;
2294
2295 for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
2296 if (!populated_zone(zone))
2297 continue;
2298
2299 spin_lock_irqsave(&zone->lock, flags);
2300 seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
2301 for (order = 0; order < MAX_ORDER; ++order)
2302 seq_printf(m, "%6lu ", zone->free_area[order].nr_free);
2303 spin_unlock_irqrestore(&zone->lock, flags);
2304 seq_putc(m, '\n');
2305 }
2306 return 0;
2307}
2308
2309struct seq_operations fragmentation_op = {
2310 .start = frag_start,
2311 .next = frag_next,
2312 .stop = frag_stop,
2313 .show = frag_show,
2314};
2315
2316/*
2317 * Output information about zones in @pgdat.
2318 */
2319static int zoneinfo_show(struct seq_file *m, void *arg)
2320{
2321 pg_data_t *pgdat = arg;
2322 struct zone *zone;
2323 struct zone *node_zones = pgdat->node_zones;
2324 unsigned long flags;
2325
2326 for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; zone++) {
2327 int i;
2328
2329 if (!populated_zone(zone))
2330 continue;
2331
2332 spin_lock_irqsave(&zone->lock, flags);
2333 seq_printf(m, "Node %d, zone %8s", pgdat->node_id, zone->name);
2334 seq_printf(m,
2335 "\n pages free %lu"
2336 "\n min %lu"
2337 "\n low %lu"
2338 "\n high %lu"
2339 "\n active %lu"
2340 "\n inactive %lu"
2341 "\n scanned %lu (a: %lu i: %lu)"
2342 "\n spanned %lu"
2343 "\n present %lu",
2344 zone->free_pages,
2345 zone->pages_min,
2346 zone->pages_low,
2347 zone->pages_high,
2348 zone->nr_active,
2349 zone->nr_inactive,
2350 zone->pages_scanned,
2351 zone->nr_scan_active, zone->nr_scan_inactive,
2352 zone->spanned_pages,
2353 zone->present_pages);
2354 seq_printf(m,
2355 "\n protection: (%lu",
2356 zone->lowmem_reserve[0]);
2357 for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++)
2358 seq_printf(m, ", %lu", zone->lowmem_reserve[i]);
2359 seq_printf(m,
2360 ")"
2361 "\n pagesets");
2362 for_each_online_cpu(i) {
2363 struct per_cpu_pageset *pageset;
2364 int j;
2365
2366 pageset = zone_pcp(zone, i);
2367 for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) {
2368 if (pageset->pcp[j].count)
2369 break;
2370 }
2371 if (j == ARRAY_SIZE(pageset->pcp))
2372 continue;
2373 for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) {
2374 seq_printf(m,
2375 "\n cpu: %i pcp: %i"
2376 "\n count: %i"
2377 "\n high: %i"
2378 "\n batch: %i",
2379 i, j,
2380 pageset->pcp[j].count,
2381 pageset->pcp[j].high,
2382 pageset->pcp[j].batch);
2383 }
2384#ifdef CONFIG_NUMA
2385 seq_printf(m,
2386 "\n numa_hit: %lu"
2387 "\n numa_miss: %lu"
2388 "\n numa_foreign: %lu"
2389 "\n interleave_hit: %lu"
2390 "\n local_node: %lu"
2391 "\n other_node: %lu",
2392 pageset->numa_hit,
2393 pageset->numa_miss,
2394 pageset->numa_foreign,
2395 pageset->interleave_hit,
2396 pageset->local_node,
2397 pageset->other_node);
2398#endif
2399 }
2400 seq_printf(m,
2401 "\n all_unreclaimable: %u"
2402 "\n prev_priority: %i"
2403 "\n temp_priority: %i"
2404 "\n start_pfn: %lu",
2405 zone->all_unreclaimable,
2406 zone->prev_priority,
2407 zone->temp_priority,
2408 zone->zone_start_pfn);
2409 spin_unlock_irqrestore(&zone->lock, flags);
2410 seq_putc(m, '\n');
2411 }
2412 return 0;
2413}
2414
2415struct seq_operations zoneinfo_op = {
2416 .start = frag_start, /* iterate over all zones. The same as in
2417 * fragmentation. */
2418 .next = frag_next,
2419 .stop = frag_stop,
2420 .show = zoneinfo_show,
2421};
2422
2423static char *vmstat_text[] = {
2424 "nr_dirty",
2425 "nr_writeback",
2426 "nr_unstable",
2427 "nr_page_table_pages",
2428 "nr_mapped",
2429 "nr_slab",
2430
2431 "pgpgin",
2432 "pgpgout",
2433 "pswpin",
2434 "pswpout",
2435
2436 "pgalloc_high",
2437 "pgalloc_normal",
2438 "pgalloc_dma32",
2439 "pgalloc_dma",
2440
2441 "pgfree",
2442 "pgactivate",
2443 "pgdeactivate",
2444
2445 "pgfault",
2446 "pgmajfault",
2447
2448 "pgrefill_high",
2449 "pgrefill_normal",
2450 "pgrefill_dma32",
2451 "pgrefill_dma",
2452
2453 "pgsteal_high",
2454 "pgsteal_normal",
2455 "pgsteal_dma32",
2456 "pgsteal_dma",
2457
2458 "pgscan_kswapd_high",
2459 "pgscan_kswapd_normal",
2460 "pgscan_kswapd_dma32",
2461 "pgscan_kswapd_dma",
2462
2463 "pgscan_direct_high",
2464 "pgscan_direct_normal",
2465 "pgscan_direct_dma32",
2466 "pgscan_direct_dma",
2467
2468 "pginodesteal",
2469 "slabs_scanned",
2470 "kswapd_steal",
2471 "kswapd_inodesteal",
2472 "pageoutrun",
2473 "allocstall",
2474
2475 "pgrotated",
2476 "nr_bounce",
2477};
2478
2479static void *vmstat_start(struct seq_file *m, loff_t *pos)
2480{
2481 struct page_state *ps;
2482
2483 if (*pos >= ARRAY_SIZE(vmstat_text))
2484 return NULL;
2485
2486 ps = kmalloc(sizeof(*ps), GFP_KERNEL);
2487 m->private = ps;
2488 if (!ps)
2489 return ERR_PTR(-ENOMEM);
2490 get_full_page_state(ps);
2491 ps->pgpgin /= 2; /* sectors -> kbytes */
2492 ps->pgpgout /= 2;
2493 return (unsigned long *)ps + *pos;
2494}
2495
2496static void *vmstat_next(struct seq_file *m, void *arg, loff_t *pos)
2497{
2498 (*pos)++;
2499 if (*pos >= ARRAY_SIZE(vmstat_text))
2500 return NULL;
2501 return (unsigned long *)m->private + *pos;
2502}
2503
2504static int vmstat_show(struct seq_file *m, void *arg)
2505{
2506 unsigned long *l = arg;
2507 unsigned long off = l - (unsigned long *)m->private;
2508
2509 seq_printf(m, "%s %lu\n", vmstat_text[off], *l);
2510 return 0;
2511}
2512
2513static void vmstat_stop(struct seq_file *m, void *arg)
2514{
2515 kfree(m->private);
2516 m->private = NULL;
2517}
2518
2519struct seq_operations vmstat_op = {
2520 .start = vmstat_start,
2521 .next = vmstat_next,
2522 .stop = vmstat_stop,
2523 .show = vmstat_show,
2524};
2525
2526#endif /* CONFIG_PROC_FS */
2527
2528#ifdef CONFIG_HOTPLUG_CPU 2121#ifdef CONFIG_HOTPLUG_CPU
2529static int page_alloc_cpu_notify(struct notifier_block *self, 2122static int page_alloc_cpu_notify(struct notifier_block *self,
2530 unsigned long action, void *hcpu) 2123 unsigned long action, void *hcpu)