author		Mel Gorman <mgorman@suse.de>	2014-08-06 19:07:14 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2014-08-06 21:01:20 -0400
commit		3484b2de9499df23c4604a513b36f96326ae81ad (patch)
tree		fbaaed00603474887953459b01e09c5b9ff7379f /include/linux/mmzone.h
parent		24b7e5819ad5cbef2b7c7376510862aa8319d240 (diff)
mm: rearrange zone fields into read-only, page alloc, statistics and page reclaim lines
The arrangement of struct zone has changed over time and now it has
reached the point where there is some inappropriate sharing going on.
On x86-64 for example:

o The zone->node field is shared with the zone lock and zone->node is
  accessed frequently from the page allocator due to the fair zone
  allocation policy.

o span_seqlock is almost never used but shares a line with free_area.

o Some zone statistics share a cache line with the LRU lock so
  reclaim-intensive and allocator-intensive workloads can bounce the
  cache line on a stat update.

This patch rearranges struct zone to put read-only and read-mostly
fields together and then splits the page allocator intensive fields,
the zone statistics and the page reclaim intensive fields into their
own cache lines.

Note that the type of lowmem_reserve changes because the watermark
calculations are signed, avoiding a signed/unsigned conversion there.

On the test configuration I used, the overall size of struct zone
shrank by one cache line.  On smaller machines this is not likely to
be noticeable.  However, on a 4-node NUMA machine running tiobench,
the system CPU overhead is reduced by this patch:

                       3.16.0-rc3     3.16.0-rc3
                          vanilla  rearrange-v5r9
User                       746.94          759.78
System                   65336.22        58350.98
Elapsed                  27553.52        27282.02

Signed-off-by: Mel Gorman <mgorman@suse.de>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
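For readers skimming the diff below: the ZONE_PADDING(_pad1_/_pad2_/_pad3_) markers are what actually split the field groups onto separate cache lines; they expand to a zero-size, cache-line-aligned struct member, so the next group starts on a fresh line. The user-space sketch below illustrates the same layout trick. It is illustration only, not kernel code: the demo_zone fields, the LINE_PADDING macro and the 64-byte line size are assumptions made for the example.

/*
 * Stand-alone sketch of cache-line grouping; builds with gcc or clang.
 * LINE_PADDING roughly mimics what ZONE_PADDING does in mmzone.h: a
 * zero-size member (GNU extension) whose alignment pushes the next
 * field group onto a new cache line.
 */
#include <stdio.h>
#include <stddef.h>

#define CACHE_LINE 64

struct line_pad { char x[0]; } __attribute__((aligned(CACHE_LINE)));
#define LINE_PADDING(name)	struct line_pad name;

struct demo_zone {
	/* Read-mostly fields: read constantly, written rarely */
	unsigned long watermark[3];
	long lowmem_reserve[4];
	int node;

	LINE_PADDING(_pad1_)

	/* Write-intensive allocator fields */
	unsigned long lock;		/* stand-in for spinlock_t */
	unsigned long free_count;

	LINE_PADDING(_pad2_)

	/* Write-intensive reclaim fields */
	unsigned long lru_lock;		/* stand-in for spinlock_t */
	unsigned long pages_scanned;
};

int main(void)
{
	/* Each group starts on its own 64-byte line, so allocator and
	 * reclaim writers do not dirty the read-mostly line. */
	printf("watermark   @ %3zu\n", offsetof(struct demo_zone, watermark));
	printf("lock        @ %3zu\n", offsetof(struct demo_zone, lock));
	printf("lru_lock    @ %3zu\n", offsetof(struct demo_zone, lru_lock));
	printf("struct size = %3zu\n", sizeof(struct demo_zone));
	return 0;
}

On an LP64 build this prints offsets 0, 64 and 128, showing that the allocator and reclaim groups no longer share a line with the read-mostly fields.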
Diffstat (limited to 'include/linux/mmzone.h')
-rw-r--r--	include/linux/mmzone.h	211
1 file changed, 108 insertions(+), 103 deletions(-)
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 559e659288fc..ed0876bb902c 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -324,19 +324,12 @@ enum zone_type {
 #ifndef __GENERATING_BOUNDS_H
 
 struct zone {
-	/* Fields commonly accessed by the page allocator */
+	/* Read-mostly fields */
 
 	/* zone watermarks, access with *_wmark_pages(zone) macros */
 	unsigned long watermark[NR_WMARK];
 
 	/*
-	 * When free pages are below this point, additional steps are taken
-	 * when reading the number of free pages to avoid per-cpu counter
-	 * drift allowing watermarks to be breached
-	 */
-	unsigned long percpu_drift_mark;
-
-	/*
 	 * We don't know if the memory that we're going to allocate will be freeable
 	 * or/and it will be released eventually, so to avoid totally wasting several
 	 * GB of ram we must reserve some of the lower zone memory (otherwise we risk
@@ -344,41 +337,26 @@ struct zone {
 	 * on the higher zones). This array is recalculated at runtime if the
 	 * sysctl_lowmem_reserve_ratio sysctl changes.
 	 */
-	unsigned long lowmem_reserve[MAX_NR_ZONES];
-
-	/*
-	 * This is a per-zone reserve of pages that should not be
-	 * considered dirtyable memory.
-	 */
-	unsigned long dirty_balance_reserve;
+	long lowmem_reserve[MAX_NR_ZONES];
 
 #ifdef CONFIG_NUMA
 	int node;
+#endif
+
 	/*
-	 * zone reclaim becomes active if more unmapped pages exist.
+	 * The target ratio of ACTIVE_ANON to INACTIVE_ANON pages on
+	 * this zone's LRU. Maintained by the pageout code.
 	 */
-	unsigned long min_unmapped_pages;
-	unsigned long min_slab_pages;
-#endif
+	unsigned int inactive_ratio;
+
+	struct pglist_data *zone_pgdat;
 	struct per_cpu_pageset __percpu *pageset;
+
 	/*
-	 * free areas of different sizes
+	 * This is a per-zone reserve of pages that should not be
+	 * considered dirtyable memory.
 	 */
-	spinlock_t lock;
-#if defined CONFIG_COMPACTION || defined CONFIG_CMA
-	/* Set to true when the PG_migrate_skip bits should be cleared */
-	bool compact_blockskip_flush;
-
-	/* pfn where compaction free scanner should start */
-	unsigned long compact_cached_free_pfn;
-	/* pfn where async and sync compaction migration scanner should start */
-	unsigned long compact_cached_migrate_pfn[2];
-#endif
-#ifdef CONFIG_MEMORY_HOTPLUG
-	/* see spanned/present_pages for more description */
-	seqlock_t span_seqlock;
-#endif
-	struct free_area free_area[MAX_ORDER];
+	unsigned long dirty_balance_reserve;
 
 #ifndef CONFIG_SPARSEMEM
 	/*
@@ -388,74 +366,14 @@ struct zone {
 	unsigned long *pageblock_flags;
 #endif /* CONFIG_SPARSEMEM */
 
-#ifdef CONFIG_COMPACTION
-	/*
-	 * On compaction failure, 1<<compact_defer_shift compactions
-	 * are skipped before trying again. The number attempted since
-	 * last failure is tracked with compact_considered.
-	 */
-	unsigned int compact_considered;
-	unsigned int compact_defer_shift;
-	int compact_order_failed;
-#endif
-
-	ZONE_PADDING(_pad1_)
-
-	/* Fields commonly accessed by the page reclaim scanner */
-	spinlock_t lru_lock;
-	struct lruvec lruvec;
-
-	/* Evictions & activations on the inactive file list */
-	atomic_long_t inactive_age;
-
-	unsigned long pages_scanned;	/* since last reclaim */
-	unsigned long flags;		/* zone flags, see below */
-
-	/* Zone statistics */
-	atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS];
-
-	/*
-	 * The target ratio of ACTIVE_ANON to INACTIVE_ANON pages on
-	 * this zone's LRU. Maintained by the pageout code.
-	 */
-	unsigned int inactive_ratio;
-
-
-	ZONE_PADDING(_pad2_)
-	/* Rarely used or read-mostly fields */
-
+#ifdef CONFIG_NUMA
 	/*
-	 * wait_table -- the array holding the hash table
-	 * wait_table_hash_nr_entries -- the size of the hash table array
-	 * wait_table_bits -- wait_table_size == (1 << wait_table_bits)
-	 *
-	 * The purpose of all these is to keep track of the people
-	 * waiting for a page to become available and make them
-	 * runnable again when possible. The trouble is that this
-	 * consumes a lot of space, especially when so few things
-	 * wait on pages at a given time. So instead of using
-	 * per-page waitqueues, we use a waitqueue hash table.
-	 *
-	 * The bucket discipline is to sleep on the same queue when
-	 * colliding and wake all in that wait queue when removing.
-	 * When something wakes, it must check to be sure its page is
-	 * truly available, a la thundering herd. The cost of a
-	 * collision is great, but given the expected load of the
-	 * table, they should be so rare as to be outweighed by the
-	 * benefits from the saved space.
-	 *
-	 * __wait_on_page_locked() and unlock_page() in mm/filemap.c, are the
-	 * primary users of these fields, and in mm/page_alloc.c
-	 * free_area_init_core() performs the initialization of them.
+	 * zone reclaim becomes active if more unmapped pages exist.
 	 */
-	wait_queue_head_t * wait_table;
-	unsigned long wait_table_hash_nr_entries;
-	unsigned long wait_table_bits;
+	unsigned long min_unmapped_pages;
+	unsigned long min_slab_pages;
+#endif /* CONFIG_NUMA */
 
-	/*
-	 * Discontig memory support fields.
-	 */
-	struct pglist_data *zone_pgdat;
 	/* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */
 	unsigned long zone_start_pfn;
 
@@ -500,9 +418,11 @@ struct zone {
 	 * adjust_managed_page_count() should be used instead of directly
 	 * touching zone->managed_pages and totalram_pages.
 	 */
+	unsigned long managed_pages;
 	unsigned long spanned_pages;
 	unsigned long present_pages;
-	unsigned long managed_pages;
+
+	const char *name;
 
 	/*
 	 * Number of MIGRATE_RESEVE page block. To maintain for just
@@ -510,10 +430,95 @@ struct zone {
 	 */
 	int nr_migrate_reserve_block;
 
+#ifdef CONFIG_MEMORY_HOTPLUG
+	/* see spanned/present_pages for more description */
+	seqlock_t span_seqlock;
+#endif
+
 	/*
-	 * rarely used fields:
+	 * wait_table -- the array holding the hash table
+	 * wait_table_hash_nr_entries -- the size of the hash table array
+	 * wait_table_bits -- wait_table_size == (1 << wait_table_bits)
+	 *
+	 * The purpose of all these is to keep track of the people
+	 * waiting for a page to become available and make them
+	 * runnable again when possible. The trouble is that this
+	 * consumes a lot of space, especially when so few things
+	 * wait on pages at a given time. So instead of using
+	 * per-page waitqueues, we use a waitqueue hash table.
+	 *
+	 * The bucket discipline is to sleep on the same queue when
+	 * colliding and wake all in that wait queue when removing.
+	 * When something wakes, it must check to be sure its page is
+	 * truly available, a la thundering herd. The cost of a
+	 * collision is great, but given the expected load of the
+	 * table, they should be so rare as to be outweighed by the
+	 * benefits from the saved space.
+	 *
+	 * __wait_on_page_locked() and unlock_page() in mm/filemap.c, are the
+	 * primary users of these fields, and in mm/page_alloc.c
+	 * free_area_init_core() performs the initialization of them.
 	 */
-	const char *name;
+	wait_queue_head_t *wait_table;
+	unsigned long wait_table_hash_nr_entries;
+	unsigned long wait_table_bits;
+
+	ZONE_PADDING(_pad1_)
+
+	/* Write-intensive fields used from the page allocator */
+	spinlock_t lock;
+
+	/* free areas of different sizes */
+	struct free_area free_area[MAX_ORDER];
+
+	/* zone flags, see below */
+	unsigned long flags;
+
+	ZONE_PADDING(_pad2_)
+
+	/* Write-intensive fields used by page reclaim */
+
+	/* Fields commonly accessed by the page reclaim scanner */
+	spinlock_t lru_lock;
+	unsigned long pages_scanned;	/* since last reclaim */
+	struct lruvec lruvec;
+
+	/* Evictions & activations on the inactive file list */
+	atomic_long_t inactive_age;
+
+	/*
+	 * When free pages are below this point, additional steps are taken
+	 * when reading the number of free pages to avoid per-cpu counter
+	 * drift allowing watermarks to be breached
+	 */
+	unsigned long percpu_drift_mark;
+
+#if defined CONFIG_COMPACTION || defined CONFIG_CMA
+	/* pfn where compaction free scanner should start */
+	unsigned long compact_cached_free_pfn;
+	/* pfn where async and sync compaction migration scanner should start */
+	unsigned long compact_cached_migrate_pfn[2];
+#endif
+
+#ifdef CONFIG_COMPACTION
+	/*
+	 * On compaction failure, 1<<compact_defer_shift compactions
+	 * are skipped before trying again. The number attempted since
+	 * last failure is tracked with compact_considered.
+	 */
+	unsigned int compact_considered;
+	unsigned int compact_defer_shift;
+	int compact_order_failed;
+#endif
+
+#if defined CONFIG_COMPACTION || defined CONFIG_CMA
+	/* Set to true when the PG_migrate_skip bits should be cleared */
+	bool compact_blockskip_flush;
+#endif
+
+	ZONE_PADDING(_pad3_)
+	/* Zone statistics */
+	atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS];
 } ____cacheline_internodealigned_in_smp;
 
 typedef enum {
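A side note on the lowmem_reserve change visible in the second hunk: the changelog says the field becomes a plain long because the watermark calculations are signed and a signed/unsigned conversion should be avoided. The stand-alone sketch below shows the C promotion rule behind that concern; the variable names are invented for illustration and nothing here is kernel code.

#include <stdio.h>

int main(void)
{
	long free_pages = -8;		/* signed count, may dip below zero */
	long min = 32;
	unsigned long ureserve = 64;	/* old type of lowmem_reserve */
	long sreserve = 64;		/* new type of lowmem_reserve */

	/*
	 * With an unsigned reserve the sum min + ureserve is unsigned,
	 * so free_pages is converted too: -8 becomes a huge value and
	 * the check wrongly reports "enough free pages".
	 */
	printf("unsigned reserve: %s\n",
	       free_pages > min + ureserve ? "enough" : "below watermark");

	/* With a signed reserve the comparison stays signed and is sane. */
	printf("signed reserve:   %s\n",
	       free_pages > min + sreserve ? "enough" : "below watermark");
	return 0;
}

Building this with -Wextra typically flags the first comparison with -Wsign-compare, which is exactly the conversion the patch sidesteps by making the reserve signed.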