aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorMel Gorman <mgorman@suse.de>2014-06-04 19:10:16 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2014-06-04 19:54:09 -0400
commite58469bafd0524e848c3733bc3918d854595e20f (patch)
tree5881b188eb2c179576773aedf5b11983047350aa
parent5dab29113ca56335c78be3f98bf5ddf2ef8eb6a6 (diff)
mm: page_alloc: use word-based accesses for get/set pageblock bitmaps
The test_bit operations in get/set pageblock flags are expensive. This patch reads the bitmap on a word basis and uses shifts and masks to isolate the bits of interest. Similarly, masks are used to set a local copy of the bitmap, and cmpxchg is then used to update the bitmap if no other changes have been made in parallel. In a test running dd onto tmpfs the overhead of the pageblock-related functions went from 1.27% in profiles to 0.5%. In addition to the performance benefits, this patch closes races that are possible between: a) get_ and set_pageblock_migratetype(), where get_pageblock_migratetype() reads some of the bits before and the rest of the bits after set_pageblock_migratetype() has updated them. b) set_pageblock_migratetype() and set_pageblock_skip(), where the non-atomic read-modify-update set bit operation in set_pageblock_skip() will cause lost updates to some bits changed in the set_pageblock_migratetype(). Joonsoo Kim first reported the case a) via code inspection. Vlastimil Babka's testing with a debug patch showed that either a) or b) occurs roughly once per mmtests' stress-highalloc benchmark (although not necessarily in the same pageblock). Furthermore, during development of unrelated compaction patches, it was observed that with frequent calls to {start,undo}_isolate_page_range() the race occurs several thousand times and has resulted in NULL pointer dereferences in move_freepages() and free_one_page() in places where free_list[migratetype] is manipulated by e.g. list_move(). Further debugging confirmed that migratetype had an invalid value of 6, causing out-of-bounds access to the free_list array. That confirmed that the race exists, although it may be extremely rare, and is currently only fatal where page isolation is performed due to memory hot remove.
Races on pageblocks being updated by set_pageblock_migratetype(), where both old and new migratetype are lower than MIGRATE_RESERVE, currently cannot result in an invalid value being observed, although theoretically they may still lead to unexpected creation or destruction of MIGRATE_RESERVE pageblocks. Furthermore, things could get suddenly worse when memory isolation is used more, or when new migratetypes are added. After this patch, the race has no longer been observed in testing. Signed-off-by: Mel Gorman <mgorman@suse.de> Acked-by: Vlastimil Babka <vbabka@suse.cz> Reported-by: Joonsoo Kim <iamjoonsoo.kim@lge.com> Reported-and-tested-by: Vlastimil Babka <vbabka@suse.cz> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: Jan Kara <jack@suse.cz> Cc: Michal Hocko <mhocko@suse.cz> Cc: Hugh Dickins <hughd@google.com> Cc: Dave Hansen <dave.hansen@intel.com> Cc: Theodore Ts'o <tytso@mit.edu> Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com> Cc: Oleg Nesterov <oleg@redhat.com> Cc: Rik van Riel <riel@redhat.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: <stable@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r--include/linux/mmzone.h6
-rw-r--r--include/linux/pageblock-flags.h37
-rw-r--r--mm/page_alloc.c52
3 files changed, 68 insertions, 27 deletions
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 10a96ee68311..8ef1e3f71e0f 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -75,9 +75,13 @@ enum {
75 75
76extern int page_group_by_mobility_disabled; 76extern int page_group_by_mobility_disabled;
77 77
78#define NR_MIGRATETYPE_BITS (PB_migrate_end - PB_migrate + 1)
79#define MIGRATETYPE_MASK ((1UL << NR_MIGRATETYPE_BITS) - 1)
80
78static inline int get_pageblock_migratetype(struct page *page) 81static inline int get_pageblock_migratetype(struct page *page)
79{ 82{
80 return get_pageblock_flags_group(page, PB_migrate, PB_migrate_end); 83 BUILD_BUG_ON(PB_migrate_end - PB_migrate != 2);
84 return get_pageblock_flags_mask(page, PB_migrate_end, MIGRATETYPE_MASK);
81} 85}
82 86
83struct free_area { 87struct free_area {
diff --git a/include/linux/pageblock-flags.h b/include/linux/pageblock-flags.h
index 2ee8cd2466b5..c08730c10c7a 100644
--- a/include/linux/pageblock-flags.h
+++ b/include/linux/pageblock-flags.h
@@ -30,9 +30,12 @@ enum pageblock_bits {
30 PB_migrate, 30 PB_migrate,
31 PB_migrate_end = PB_migrate + 3 - 1, 31 PB_migrate_end = PB_migrate + 3 - 1,
32 /* 3 bits required for migrate types */ 32 /* 3 bits required for migrate types */
33#ifdef CONFIG_COMPACTION
34 PB_migrate_skip,/* If set the block is skipped by compaction */ 33 PB_migrate_skip,/* If set the block is skipped by compaction */
35#endif /* CONFIG_COMPACTION */ 34
35 /*
36 * Assume the bits will always align on a word. If this assumption
37 * changes then get/set pageblock needs updating.
38 */
36 NR_PAGEBLOCK_BITS 39 NR_PAGEBLOCK_BITS
37}; 40};
38 41
@@ -62,11 +65,33 @@ extern int pageblock_order;
62/* Forward declaration */ 65/* Forward declaration */
63struct page; 66struct page;
64 67
68unsigned long get_pageblock_flags_mask(struct page *page,
69 unsigned long end_bitidx,
70 unsigned long mask);
71void set_pageblock_flags_mask(struct page *page,
72 unsigned long flags,
73 unsigned long end_bitidx,
74 unsigned long mask);
75
65/* Declarations for getting and setting flags. See mm/page_alloc.c */ 76/* Declarations for getting and setting flags. See mm/page_alloc.c */
66unsigned long get_pageblock_flags_group(struct page *page, 77static inline unsigned long get_pageblock_flags_group(struct page *page,
67 int start_bitidx, int end_bitidx); 78 int start_bitidx, int end_bitidx)
68void set_pageblock_flags_group(struct page *page, unsigned long flags, 79{
69 int start_bitidx, int end_bitidx); 80 unsigned long nr_flag_bits = end_bitidx - start_bitidx + 1;
81 unsigned long mask = (1 << nr_flag_bits) - 1;
82
83 return get_pageblock_flags_mask(page, end_bitidx, mask);
84}
85
86static inline void set_pageblock_flags_group(struct page *page,
87 unsigned long flags,
88 int start_bitidx, int end_bitidx)
89{
90 unsigned long nr_flag_bits = end_bitidx - start_bitidx + 1;
91 unsigned long mask = (1 << nr_flag_bits) - 1;
92
93 set_pageblock_flags_mask(page, flags, end_bitidx, mask);
94}
70 95
71#ifdef CONFIG_COMPACTION 96#ifdef CONFIG_COMPACTION
72#define get_pageblock_skip(page) \ 97#define get_pageblock_skip(page) \
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 485932c577e7..6e937809c87a 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -6028,53 +6028,65 @@ static inline int pfn_to_bitidx(struct zone *zone, unsigned long pfn)
6028 * @end_bitidx: The last bit of interest 6028 * @end_bitidx: The last bit of interest
6029 * returns pageblock_bits flags 6029 * returns pageblock_bits flags
6030 */ 6030 */
6031unsigned long get_pageblock_flags_group(struct page *page, 6031unsigned long get_pageblock_flags_mask(struct page *page,
6032 int start_bitidx, int end_bitidx) 6032 unsigned long end_bitidx,
6033 unsigned long mask)
6033{ 6034{
6034 struct zone *zone; 6035 struct zone *zone;
6035 unsigned long *bitmap; 6036 unsigned long *bitmap;
6036 unsigned long pfn, bitidx; 6037 unsigned long pfn, bitidx, word_bitidx;
6037 unsigned long flags = 0; 6038 unsigned long word;
6038 unsigned long value = 1;
6039 6039
6040 zone = page_zone(page); 6040 zone = page_zone(page);
6041 pfn = page_to_pfn(page); 6041 pfn = page_to_pfn(page);
6042 bitmap = get_pageblock_bitmap(zone, pfn); 6042 bitmap = get_pageblock_bitmap(zone, pfn);
6043 bitidx = pfn_to_bitidx(zone, pfn); 6043 bitidx = pfn_to_bitidx(zone, pfn);
6044 word_bitidx = bitidx / BITS_PER_LONG;
6045 bitidx &= (BITS_PER_LONG-1);
6044 6046
6045 for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1) 6047 word = bitmap[word_bitidx];
6046 if (test_bit(bitidx + start_bitidx, bitmap)) 6048 bitidx += end_bitidx;
6047 flags |= value; 6049 return (word >> (BITS_PER_LONG - bitidx - 1)) & mask;
6048
6049 return flags;
6050} 6050}
6051 6051
6052/** 6052/**
6053 * set_pageblock_flags_group - Set the requested group of flags for a pageblock_nr_pages block of pages 6053 * set_pageblock_flags_mask - Set the requested group of flags for a pageblock_nr_pages block of pages
6054 * @page: The page within the block of interest 6054 * @page: The page within the block of interest
6055 * @start_bitidx: The first bit of interest 6055 * @start_bitidx: The first bit of interest
6056 * @end_bitidx: The last bit of interest 6056 * @end_bitidx: The last bit of interest
6057 * @flags: The flags to set 6057 * @flags: The flags to set
6058 */ 6058 */
6059void set_pageblock_flags_group(struct page *page, unsigned long flags, 6059void set_pageblock_flags_mask(struct page *page, unsigned long flags,
6060 int start_bitidx, int end_bitidx) 6060 unsigned long end_bitidx,
6061 unsigned long mask)
6061{ 6062{
6062 struct zone *zone; 6063 struct zone *zone;
6063 unsigned long *bitmap; 6064 unsigned long *bitmap;
6064 unsigned long pfn, bitidx; 6065 unsigned long pfn, bitidx, word_bitidx;
6065 unsigned long value = 1; 6066 unsigned long old_word, word;
6067
6068 BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4);
6066 6069
6067 zone = page_zone(page); 6070 zone = page_zone(page);
6068 pfn = page_to_pfn(page); 6071 pfn = page_to_pfn(page);
6069 bitmap = get_pageblock_bitmap(zone, pfn); 6072 bitmap = get_pageblock_bitmap(zone, pfn);
6070 bitidx = pfn_to_bitidx(zone, pfn); 6073 bitidx = pfn_to_bitidx(zone, pfn);
6074 word_bitidx = bitidx / BITS_PER_LONG;
6075 bitidx &= (BITS_PER_LONG-1);
6076
6071 VM_BUG_ON_PAGE(!zone_spans_pfn(zone, pfn), page); 6077 VM_BUG_ON_PAGE(!zone_spans_pfn(zone, pfn), page);
6072 6078
6073 for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1) 6079 bitidx += end_bitidx;
6074 if (flags & value) 6080 mask <<= (BITS_PER_LONG - bitidx - 1);
6075 __set_bit(bitidx + start_bitidx, bitmap); 6081 flags <<= (BITS_PER_LONG - bitidx - 1);
6076 else 6082
6077 __clear_bit(bitidx + start_bitidx, bitmap); 6083 word = ACCESS_ONCE(bitmap[word_bitidx]);
6084 for (;;) {
6085 old_word = cmpxchg(&bitmap[word_bitidx], word, (word & ~mask) | flags);
6086 if (word == old_word)
6087 break;
6088 word = old_word;
6089 }
6078} 6090}
6079 6091
6080/* 6092/*