author		Mel Gorman <mgorman@suse.de>	2014-08-06 19:07:14 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2014-08-06 21:01:20 -0400
commit		3484b2de9499df23c4604a513b36f96326ae81ad (patch)
tree		fbaaed00603474887953459b01e09c5b9ff7379f /include/linux/mmzone.h
parent		24b7e5819ad5cbef2b7c7376510862aa8319d240 (diff)
mm: rearrange zone fields into read-only, page alloc, statistics and page reclaim lines
The arrangement of struct zone has changed over time and now it has
reached the point where there is some inappropriate sharing going on.
On x86-64 for example:

o The zone->node field is shared with the zone lock and zone->node is
  accessed frequently from the page allocator due to the fair zone
  allocation policy.

o span_seqlock is almost never used but shares a line with free_area.

o Some zone statistics share a cache line with the LRU lock so
  reclaim-intensive and allocator-intensive workloads can bounce the
  cache line on a stat update.

This patch rearranges struct zone to put read-only and read-mostly
fields together and then splits the page allocator intensive fields,
the zone statistics and the page reclaim intensive fields into their
own cache lines.

Note that the type of lowmem_reserve changes because the watermark
calculations are signed, avoiding a signed/unsigned conversion there.

On the test configuration I used, the overall size of struct zone
shrank by one cache line.  On smaller machines this is not likely to
be noticeable.  However, on a 4-node NUMA machine running tiobench,
the system CPU overhead is reduced by this patch:

                       3.16.0-rc3     3.16.0-rc3
                          vanilla  rearrange-v5r9
User                       746.94          759.78
System                   65336.22        58350.98
Elapsed                  27553.52        27282.02

Signed-off-by: Mel Gorman <mgorman@suse.de>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
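For readers skimming the diff below: the ZONE_PADDING(_pad1_/_pad2_/_pad3_) markers are what actually split the field groups onto separate cache lines; they expand to a zero-size, cache-line-aligned struct member, so the next group starts on a fresh line. The user-space sketch below illustrates the same layout trick. It is illustration only, not kernel code: the demo_zone fields, the LINE_PADDING macro and the 64-byte line size are assumptions made for the example.

/*
 * Stand-alone sketch of cache-line grouping; builds with gcc or clang.
 * LINE_PADDING roughly mimics what ZONE_PADDING does in mmzone.h: a
 * zero-size member (GNU extension) whose alignment pushes the next
 * field group onto a new cache line.
 */
#include <stdio.h>
#include <stddef.h>

#define CACHE_LINE 64

struct line_pad { char x[0]; } __attribute__((aligned(CACHE_LINE)));
#define LINE_PADDING(name)	struct line_pad name;

struct demo_zone {
	/* Read-mostly fields: read constantly, written rarely */
	unsigned long watermark[3];
	long lowmem_reserve[4];
	int node;

	LINE_PADDING(_pad1_)

	/* Write-intensive allocator fields */
	unsigned long lock;		/* stand-in for spinlock_t */
	unsigned long free_count;

	LINE_PADDING(_pad2_)

	/* Write-intensive reclaim fields */
	unsigned long lru_lock;		/* stand-in for spinlock_t */
	unsigned long pages_scanned;
};

int main(void)
{
	/* Each group starts on its own 64-byte line, so allocator and
	 * reclaim writers do not dirty the read-mostly line. */
	printf("watermark   @ %3zu\n", offsetof(struct demo_zone, watermark));
	printf("lock        @ %3zu\n", offsetof(struct demo_zone, lock));
	printf("lru_lock    @ %3zu\n", offsetof(struct demo_zone, lru_lock));
	printf("struct size = %3zu\n", sizeof(struct demo_zone));
	return 0;
}

On an LP64 build this prints offsets 0, 64 and 128, showing that the allocator and reclaim groups no longer share a line with the read-mostly fields.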
Diffstat (limited to 'include/linux/mmzone.h')
-rw-r--r--	include/linux/mmzone.h	211
1 file changed, 108 insertions(+), 103 deletions(-)
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 559e659288fc..ed0876bb902c 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -324,19 +324,12 @@ enum zone_type {
 #ifndef __GENERATING_BOUNDS_H
 
 struct zone {
-	/* Fields commonly accessed by the page allocator */
+	/* Read-mostly fields */
 
 	/* zone watermarks, access with *_wmark_pages(zone) macros */
 	unsigned long watermark[NR_WMARK];
 
 	/*
-	 * When free pages are below this point, additional steps are taken
-	 * when reading the number of free pages to avoid per-cpu counter
-	 * drift allowing watermarks to be breached
-	 */
-	unsigned long percpu_drift_mark;
-
-	/*
 	 * We don't know if the memory that we're going to allocate will be freeable
 	 * or/and it will be released eventually, so to avoid totally wasting several
 	 * GB of ram we must reserve some of the lower zone memory (otherwise we risk
@@ -344,41 +337,26 @@ struct zone {
 	 * on the higher zones). This array is recalculated at runtime if the
 	 * sysctl_lowmem_reserve_ratio sysctl changes.
 	 */
-	unsigned long lowmem_reserve[MAX_NR_ZONES];
-
-	/*
-	 * This is a per-zone reserve of pages that should not be
-	 * considered dirtyable memory.
-	 */
-	unsigned long dirty_balance_reserve;
+	long lowmem_reserve[MAX_NR_ZONES];
 
 #ifdef CONFIG_NUMA
 	int node;
+#endif
+
 	/*
-	 * zone reclaim becomes active if more unmapped pages exist.
+	 * The target ratio of ACTIVE_ANON to INACTIVE_ANON pages on
+	 * this zone's LRU. Maintained by the pageout code.
 	 */
-	unsigned long min_unmapped_pages;
-	unsigned long min_slab_pages;
-#endif
+	unsigned int inactive_ratio;
+
+	struct pglist_data *zone_pgdat;
 	struct per_cpu_pageset __percpu *pageset;
+
 	/*
-	 * free areas of different sizes
+	 * This is a per-zone reserve of pages that should not be
+	 * considered dirtyable memory.
 	 */
-	spinlock_t lock;
-#if defined CONFIG_COMPACTION || defined CONFIG_CMA
-	/* Set to true when the PG_migrate_skip bits should be cleared */
-	bool compact_blockskip_flush;
-
-	/* pfn where compaction free scanner should start */
-	unsigned long compact_cached_free_pfn;
-	/* pfn where async and sync compaction migration scanner should start */
-	unsigned long compact_cached_migrate_pfn[2];
-#endif
-#ifdef CONFIG_MEMORY_HOTPLUG
-	/* see spanned/present_pages for more description */
-	seqlock_t span_seqlock;
-#endif
-	struct free_area free_area[MAX_ORDER];
+	unsigned long dirty_balance_reserve;
 
 #ifndef CONFIG_SPARSEMEM
 	/*
@@ -388,74 +366,14 @@ struct zone {
 	unsigned long *pageblock_flags;
 #endif /* CONFIG_SPARSEMEM */
 
-#ifdef CONFIG_COMPACTION
-	/*
-	 * On compaction failure, 1<<compact_defer_shift compactions
-	 * are skipped before trying again. The number attempted since
-	 * last failure is tracked with compact_considered.
-	 */
-	unsigned int compact_considered;
-	unsigned int compact_defer_shift;
-	int compact_order_failed;
-#endif
-
-	ZONE_PADDING(_pad1_)
-
-	/* Fields commonly accessed by the page reclaim scanner */
-	spinlock_t lru_lock;
-	struct lruvec lruvec;
-
-	/* Evictions & activations on the inactive file list */
-	atomic_long_t inactive_age;
-
-	unsigned long pages_scanned;	/* since last reclaim */
-	unsigned long flags;		/* zone flags, see below */
-
-	/* Zone statistics */
-	atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS];
-
-	/*
-	 * The target ratio of ACTIVE_ANON to INACTIVE_ANON pages on
-	 * this zone's LRU. Maintained by the pageout code.
-	 */
-	unsigned int inactive_ratio;
-
-
-	ZONE_PADDING(_pad2_)
-	/* Rarely used or read-mostly fields */
-
+#ifdef CONFIG_NUMA
 	/*
-	 * wait_table -- the array holding the hash table
-	 * wait_table_hash_nr_entries -- the size of the hash table array
-	 * wait_table_bits -- wait_table_size == (1 << wait_table_bits)
-	 *
-	 * The purpose of all these is to keep track of the people
-	 * waiting for a page to become available and make them
-	 * runnable again when possible. The trouble is that this
-	 * consumes a lot of space, especially when so few things
-	 * wait on pages at a given time. So instead of using
-	 * per-page waitqueues, we use a waitqueue hash table.
-	 *
-	 * The bucket discipline is to sleep on the same queue when
-	 * colliding and wake all in that wait queue when removing.
-	 * When something wakes, it must check to be sure its page is
-	 * truly available, a la thundering herd. The cost of a
-	 * collision is great, but given the expected load of the
-	 * table, they should be so rare as to be outweighed by the
-	 * benefits from the saved space.
-	 *
-	 * __wait_on_page_locked() and unlock_page() in mm/filemap.c, are the
-	 * primary users of these fields, and in mm/page_alloc.c
-	 * free_area_init_core() performs the initialization of them.
+	 * zone reclaim becomes active if more unmapped pages exist.
 	 */
-	wait_queue_head_t * wait_table;
-	unsigned long wait_table_hash_nr_entries;
-	unsigned long wait_table_bits;
+	unsigned long min_unmapped_pages;
+	unsigned long min_slab_pages;
+#endif /* CONFIG_NUMA */
 
-	/*
-	 * Discontig memory support fields.
-	 */
-	struct pglist_data *zone_pgdat;
 	/* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */
 	unsigned long zone_start_pfn;
 
@@ -500,9 +418,11 @@ struct zone {
 	 * adjust_managed_page_count() should be used instead of directly
 	 * touching zone->managed_pages and totalram_pages.
 	 */
+	unsigned long managed_pages;
 	unsigned long spanned_pages;
 	unsigned long present_pages;
-	unsigned long managed_pages;
+
+	const char *name;
 
 	/*
 	 * Number of MIGRATE_RESEVE page block. To maintain for just
@@ -510,10 +430,95 @@ struct zone {
 	 */
 	int nr_migrate_reserve_block;
 
+#ifdef CONFIG_MEMORY_HOTPLUG
+	/* see spanned/present_pages for more description */
+	seqlock_t span_seqlock;
+#endif
+
 	/*
-	 * rarely used fields:
+	 * wait_table -- the array holding the hash table
+	 * wait_table_hash_nr_entries -- the size of the hash table array
+	 * wait_table_bits -- wait_table_size == (1 << wait_table_bits)
+	 *
+	 * The purpose of all these is to keep track of the people
+	 * waiting for a page to become available and make them
+	 * runnable again when possible. The trouble is that this
+	 * consumes a lot of space, especially when so few things
+	 * wait on pages at a given time. So instead of using
+	 * per-page waitqueues, we use a waitqueue hash table.
+	 *
+	 * The bucket discipline is to sleep on the same queue when
+	 * colliding and wake all in that wait queue when removing.
+	 * When something wakes, it must check to be sure its page is
+	 * truly available, a la thundering herd. The cost of a
+	 * collision is great, but given the expected load of the
+	 * table, they should be so rare as to be outweighed by the
+	 * benefits from the saved space.
+	 *
+	 * __wait_on_page_locked() and unlock_page() in mm/filemap.c, are the
+	 * primary users of these fields, and in mm/page_alloc.c
+	 * free_area_init_core() performs the initialization of them.
 	 */
-	const char *name;
+	wait_queue_head_t *wait_table;
+	unsigned long wait_table_hash_nr_entries;
+	unsigned long wait_table_bits;
+
+	ZONE_PADDING(_pad1_)
+
+	/* Write-intensive fields used from the page allocator */
+	spinlock_t lock;
+
+	/* free areas of different sizes */
+	struct free_area free_area[MAX_ORDER];
+
+	/* zone flags, see below */
+	unsigned long flags;
+
+	ZONE_PADDING(_pad2_)
+
+	/* Write-intensive fields used by page reclaim */
+
+	/* Fields commonly accessed by the page reclaim scanner */
+	spinlock_t lru_lock;
+	unsigned long pages_scanned;	/* since last reclaim */
+	struct lruvec lruvec;
+
+	/* Evictions & activations on the inactive file list */
+	atomic_long_t inactive_age;
+
+	/*
+	 * When free pages are below this point, additional steps are taken
+	 * when reading the number of free pages to avoid per-cpu counter
+	 * drift allowing watermarks to be breached
+	 */
+	unsigned long percpu_drift_mark;
+
+#if defined CONFIG_COMPACTION || defined CONFIG_CMA
+	/* pfn where compaction free scanner should start */
+	unsigned long compact_cached_free_pfn;
+	/* pfn where async and sync compaction migration scanner should start */
+	unsigned long compact_cached_migrate_pfn[2];
+#endif
+
+#ifdef CONFIG_COMPACTION
+	/*
+	 * On compaction failure, 1<<compact_defer_shift compactions
+	 * are skipped before trying again. The number attempted since
+	 * last failure is tracked with compact_considered.
+	 */
+	unsigned int compact_considered;
+	unsigned int compact_defer_shift;
+	int compact_order_failed;
+#endif
+
+#if defined CONFIG_COMPACTION || defined CONFIG_CMA
+	/* Set to true when the PG_migrate_skip bits should be cleared */
+	bool compact_blockskip_flush;
+#endif
+
+	ZONE_PADDING(_pad3_)
+	/* Zone statistics */
+	atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS];
 } ____cacheline_internodealigned_in_smp;
 
 typedef enum {
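A side note on the lowmem_reserve change visible in the second hunk: the changelog says the field becomes a plain long because the watermark calculations are signed and a signed/unsigned conversion should be avoided. The stand-alone sketch below shows the C promotion rule behind that concern; the variable names are invented for illustration and nothing here is kernel code.

#include <stdio.h>

int main(void)
{
	long free_pages = -8;		/* signed count, may dip below zero */
	long min = 32;
	unsigned long ureserve = 64;	/* old type of lowmem_reserve */
	long sreserve = 64;		/* new type of lowmem_reserve */

	/*
	 * With an unsigned reserve the sum min + ureserve is unsigned,
	 * so free_pages is converted too: -8 becomes a huge value and
	 * the check wrongly reports "enough free pages".
	 */
	printf("unsigned reserve: %s\n",
	       free_pages > min + ureserve ? "enough" : "below watermark");

	/* With a signed reserve the comparison stays signed and is sane. */
	printf("signed reserve:   %s\n",
	       free_pages > min + sreserve ? "enough" : "below watermark");
	return 0;
}

Building this with -Wextra typically flags the first comparison with -Wsign-compare, which is exactly the conversion the patch sidesteps by making the reserve signed.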