path: root/mm
author	Linus Torvalds <torvalds@linux-foundation.org>	2016-10-26 13:15:30 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2016-10-27 12:27:57 -0400
commit	9dcb8b685fc30813b35ab4b4bf39244430753190 (patch)
tree	db5fe274cbc405031d0239205a8024e96969a18e /mm
parent	9fe68cad6e74967b88d0c6aeca7d9cd6b6e91942 (diff)
mm: remove per-zone hashtable of bitlock waitqueues
The per-zone waitqueues exist because of a scalability issue with the page waitqueues on some NUMA machines, but it turns out that they hurt normal loads, and now with the vmalloced stacks they also end up breaking gfs2 that uses a bit_wait on a stack object:

     wait_on_bit(&gh->gh_iflags, HIF_WAIT, TASK_UNINTERRUPTIBLE)

where 'gh' can be a reference to the local variable 'mount_gh' on the stack of fill_super().

The reason the per-zone hash table breaks for this case is that there is no "zone" for virtual allocations, and trying to look up the physical page to get at it will fail (with a BUG_ON()).

It turns out that I actually complained to the mm people about the per-zone hash table for another reason just a month ago: the zone lookup also hurts the regular use of "unlock_page()" a lot, because the zone lookup ends up forcing several unnecessary cache misses and generates horrible code.

As part of that earlier discussion, we had a much better solution for the NUMA scalability issue - by just making the page lock have a separate contention bit, the waitqueue doesn't even have to be looked at for the normal case.

Peter Zijlstra already has a patch for that, but let's see if anybody even notices. In the meantime, let's fix the actual gfs2 breakage by simplifying the bitlock waitqueues and removing the per-zone issue.

Reported-by: Andreas Gruenbacher <agruenba@redhat.com>
Tested-by: Bob Peterson <rpeterso@redhat.com>
Acked-by: Mel Gorman <mgorman@techsingularity.net>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Steven Whitehouse <swhiteho@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
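The mm side of the fix works because bit_waitqueue() can pick a waitqueue from the virtual address and bit number alone; the table it hashes into lives in kernel/sched/wait.c and is therefore outside this mm-limited diffstat. Below is a minimal userspace sketch of that hashing idea, assuming a small global table and a 64-bit build; struct waitqueue, hash_long64 and the bucket count are illustrative stand-ins, not the kernel's actual definitions.

#include <stdio.h>

#define WAIT_TABLE_BITS	8			/* 256 buckets in the sketch */
#define WAIT_TABLE_SIZE	(1UL << WAIT_TABLE_BITS)

struct waitqueue { int waiters; };		/* stand-in for wait_queue_head_t */
static struct waitqueue bit_wait_table[WAIT_TABLE_SIZE];

/* Multiplicative hash of the top bits, roughly what hash_long() does on 64-bit. */
static unsigned long hash_long64(unsigned long val, unsigned int bits)
{
	return (val * 0x61C8864680B583EBUL) >> (64 - bits);	/* assumes 64-bit long */
}

/*
 * Choose a waitqueue from the virtual address and bit number alone.
 * Nothing here needs virt_to_page()/page_zone(), so a flag word on a
 * vmalloc'ed stack (the gfs2 case above) hashes just as well as a
 * page-cache page does.
 */
static struct waitqueue *bit_waitqueue(void *word, int bit)
{
	const int shift = sizeof(long) == 4 ? 5 : 6;	/* log2(BITS_PER_LONG) */
	unsigned long val = (unsigned long)word << shift | bit;

	return bit_wait_table + hash_long64(val, WAIT_TABLE_BITS);
}

int main(void)
{
	long stack_flags = 0;	/* plays the role of gh->gh_iflags on the stack */

	printf("bucket %td of %lu\n",
	       bit_waitqueue(&stack_flags, 0) - bit_wait_table, WAIT_TABLE_SIZE);
	return 0;
}

Under such a scheme the waitqueue choice depends only on the word's address, which is exactly why the page_zone()-based lookup removed in the diff below can go away.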
Diffstat (limited to 'mm')
-rw-r--r--	mm/filemap.c	4
-rw-r--r--	mm/memory_hotplug.c	28
-rw-r--r--	mm/page_alloc.c	115
3 files changed, 3 insertions, 144 deletions
diff --git a/mm/filemap.c b/mm/filemap.c
index 849f459ad078..c7fe2f16503f 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -790,9 +790,7 @@ EXPORT_SYMBOL(__page_cache_alloc);
  */
 wait_queue_head_t *page_waitqueue(struct page *page)
 {
-	const struct zone *zone = page_zone(page);
-
-	return &zone->wait_table[hash_ptr(page, zone->wait_table_bits)];
+	return bit_waitqueue(page, 0);
 }
 EXPORT_SYMBOL(page_waitqueue);
 
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 962927309b6e..b18dab401be6 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -268,7 +268,6 @@ void __init register_page_bootmem_info_node(struct pglist_data *pgdat)
 	unsigned long i, pfn, end_pfn, nr_pages;
 	int node = pgdat->node_id;
 	struct page *page;
-	struct zone *zone;
 
 	nr_pages = PAGE_ALIGN(sizeof(struct pglist_data)) >> PAGE_SHIFT;
 	page = virt_to_page(pgdat);
@@ -276,19 +275,6 @@ void __init register_page_bootmem_info_node(struct pglist_data *pgdat)
 	for (i = 0; i < nr_pages; i++, page++)
 		get_page_bootmem(node, page, NODE_INFO);
 
-	zone = &pgdat->node_zones[0];
-	for (; zone < pgdat->node_zones + MAX_NR_ZONES - 1; zone++) {
-		if (zone_is_initialized(zone)) {
-			nr_pages = zone->wait_table_hash_nr_entries
-				* sizeof(wait_queue_head_t);
-			nr_pages = PAGE_ALIGN(nr_pages) >> PAGE_SHIFT;
-			page = virt_to_page(zone->wait_table);
-
-			for (i = 0; i < nr_pages; i++, page++)
-				get_page_bootmem(node, page, NODE_INFO);
-		}
-	}
-
 	pfn = pgdat->node_start_pfn;
 	end_pfn = pgdat_end_pfn(pgdat);
 
@@ -2158,20 +2144,6 @@ void try_offline_node(int nid)
 	 */
 	node_set_offline(nid);
 	unregister_one_node(nid);
-
-	/* free waittable in each zone */
-	for (i = 0; i < MAX_NR_ZONES; i++) {
-		struct zone *zone = pgdat->node_zones + i;
-
-		/*
-		 * wait_table may be allocated from boot memory,
-		 * here only free if it's allocated by vmalloc.
-		 */
-		if (is_vmalloc_addr(zone->wait_table)) {
-			vfree(zone->wait_table);
-			zone->wait_table = NULL;
-		}
-	}
 }
 EXPORT_SYMBOL(try_offline_node);
 
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 2b3bf6767d54..de7c6e43b1c9 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4977,72 +4977,6 @@ void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone)
 }
 
 /*
- * Helper functions to size the waitqueue hash table.
- * Essentially these want to choose hash table sizes sufficiently
- * large so that collisions trying to wait on pages are rare.
- * But in fact, the number of active page waitqueues on typical
- * systems is ridiculously low, less than 200. So this is even
- * conservative, even though it seems large.
- *
- * The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to
- * waitqueues, i.e. the size of the waitq table given the number of pages.
- */
-#define PAGES_PER_WAITQUEUE	256
-
-#ifndef CONFIG_MEMORY_HOTPLUG
-static inline unsigned long wait_table_hash_nr_entries(unsigned long pages)
-{
-	unsigned long size = 1;
-
-	pages /= PAGES_PER_WAITQUEUE;
-
-	while (size < pages)
-		size <<= 1;
-
-	/*
-	 * Once we have dozens or even hundreds of threads sleeping
-	 * on IO we've got bigger problems than wait queue collision.
-	 * Limit the size of the wait table to a reasonable size.
-	 */
-	size = min(size, 4096UL);
-
-	return max(size, 4UL);
-}
-#else
-/*
- * A zone's size might be changed by hot-add, so it is not possible to determine
- * a suitable size for its wait_table.  So we use the maximum size now.
- *
- * The max wait table size = 4096 x sizeof(wait_queue_head_t).   ie:
- *
- *    i386 (preemption config)    : 4096 x 16 = 64Kbyte.
- *    ia64, x86-64 (no preemption): 4096 x 20 = 80Kbyte.
- *    ia64, x86-64 (preemption)   : 4096 x 24 = 96Kbyte.
- *
- * The maximum entries are prepared when a zone's memory is (512K + 256) pages
- * or more by the traditional way. (See above).  It equals:
- *
- *    i386, x86-64, powerpc(4K page size) : =  ( 2G + 1M)byte.
- *    ia64(16K page size)                 : =  ( 8G + 4M)byte.
- *    powerpc (64K page size)             : =  (32G +16M)byte.
- */
-static inline unsigned long wait_table_hash_nr_entries(unsigned long pages)
-{
-	return 4096UL;
-}
-#endif
-
-/*
- * This is an integer logarithm so that shifts can be used later
- * to extract the more random high bits from the multiplicative
- * hash function before the remainder is taken.
- */
-static inline unsigned long wait_table_bits(unsigned long size)
-{
-	return ffz(~size);
-}
-
-/*
  * Initially all pages are reserved - free ones are freed
  * up by free_all_bootmem() once the early boot process is
  * done. Non-atomic initialization, single-pass.
@@ -5304,49 +5238,6 @@ void __init setup_per_cpu_pageset(void)
 			alloc_percpu(struct per_cpu_nodestat);
 }
 
-static noinline __ref
-int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
-{
-	int i;
-	size_t alloc_size;
-
-	/*
-	 * The per-page waitqueue mechanism uses hashed waitqueues
-	 * per zone.
-	 */
-	zone->wait_table_hash_nr_entries =
-		 wait_table_hash_nr_entries(zone_size_pages);
-	zone->wait_table_bits =
-		 wait_table_bits(zone->wait_table_hash_nr_entries);
-	alloc_size = zone->wait_table_hash_nr_entries
-					* sizeof(wait_queue_head_t);
-
-	if (!slab_is_available()) {
-		zone->wait_table = (wait_queue_head_t *)
-			memblock_virt_alloc_node_nopanic(
-				alloc_size, zone->zone_pgdat->node_id);
-	} else {
-		/*
-		 * This case means that a zone whose size was 0 gets new memory
-		 * via memory hot-add.
-		 * But it may be the case that a new node was hot-added.  In
-		 * this case vmalloc() will not be able to use this new node's
-		 * memory - this wait_table must be initialized to use this new
-		 * node itself as well.
-		 * To use this new node's memory, further consideration will be
-		 * necessary.
-		 */
-		zone->wait_table = vmalloc(alloc_size);
-	}
-	if (!zone->wait_table)
-		return -ENOMEM;
-
-	for (i = 0; i < zone->wait_table_hash_nr_entries; ++i)
-		init_waitqueue_head(zone->wait_table + i);
-
-	return 0;
-}
-
 static __meminit void zone_pcp_init(struct zone *zone)
 {
 	/*
@@ -5367,10 +5258,7 @@ int __meminit init_currently_empty_zone(struct zone *zone,
 					unsigned long size)
 {
 	struct pglist_data *pgdat = zone->zone_pgdat;
-	int ret;
-	ret = zone_wait_table_init(zone, size);
-	if (ret)
-		return ret;
+
 	pgdat->nr_zones = zone_idx(zone) + 1;
 
 	zone->zone_start_pfn = zone_start_pfn;
@@ -5382,6 +5270,7 @@ int __meminit init_currently_empty_zone(struct zone *zone,
 			zone_start_pfn, (zone_start_pfn + size));
 
 	zone_init_free_lists(zone);
+	zone->initialized = 1;
 
 	return 0;
 }