path: root/mm
author	Linus Torvalds <torvalds@linux-foundation.org>	2016-10-26 13:15:30 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2016-10-27 12:27:57 -0400
commit	9dcb8b685fc30813b35ab4b4bf39244430753190 (patch)
tree	db5fe274cbc405031d0239205a8024e96969a18e /mm
parent	9fe68cad6e74967b88d0c6aeca7d9cd6b6e91942 (diff)
mm: remove per-zone hashtable of bitlock waitqueues
The per-zone waitqueues exist because of a scalability issue with the page waitqueues on some NUMA machines, but it turns out that they hurt normal loads, and now with the vmalloced stacks they also end up breaking gfs2 that uses a bit_wait on a stack object:

     wait_on_bit(&gh->gh_iflags, HIF_WAIT, TASK_UNINTERRUPTIBLE)

where 'gh' can be a reference to the local variable 'mount_gh' on the stack of fill_super().

The reason the per-zone hash table breaks for this case is that there is no "zone" for virtual allocations, and trying to look up the physical page to get at it will fail (with a BUG_ON()).

It turns out that I actually complained to the mm people about the per-zone hash table for another reason just a month ago: the zone lookup also hurts the regular use of "unlock_page()" a lot, because the zone lookup ends up forcing several unnecessary cache misses and generates horrible code.

As part of that earlier discussion, we had a much better solution for the NUMA scalability issue - by just making the page lock have a separate contention bit, the waitqueue doesn't even have to be looked at for the normal case.

Peter Zijlstra already has a patch for that, but let's see if anybody even notices. In the meantime, let's fix the actual gfs2 breakage by simplifying the bitlock waitqueues and removing the per-zone issue.

Reported-by: Andreas Gruenbacher <agruenba@redhat.com>
Tested-by: Bob Peterson <rpeterso@redhat.com>
Acked-by: Mel Gorman <mgorman@techsingularity.net>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Steven Whitehouse <swhiteho@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
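The mm side of the fix works because bit_waitqueue() can pick a waitqueue from the virtual address and bit number alone; the table it hashes into lives in kernel/sched/wait.c and is therefore outside this mm-limited diffstat. Below is a minimal userspace sketch of that hashing idea, assuming a small global table and a 64-bit build; struct waitqueue, hash_long64 and the bucket count are illustrative stand-ins, not the kernel's actual definitions.

#include <stdio.h>

#define WAIT_TABLE_BITS	8			/* 256 buckets in the sketch */
#define WAIT_TABLE_SIZE	(1UL << WAIT_TABLE_BITS)

struct waitqueue { int waiters; };		/* stand-in for wait_queue_head_t */
static struct waitqueue bit_wait_table[WAIT_TABLE_SIZE];

/* Multiplicative hash of the top bits, roughly what hash_long() does on 64-bit. */
static unsigned long hash_long64(unsigned long val, unsigned int bits)
{
	return (val * 0x61C8864680B583EBUL) >> (64 - bits);	/* assumes 64-bit long */
}

/*
 * Choose a waitqueue from the virtual address and bit number alone.
 * Nothing here needs virt_to_page()/page_zone(), so a flag word on a
 * vmalloc'ed stack (the gfs2 case above) hashes just as well as a
 * page-cache page does.
 */
static struct waitqueue *bit_waitqueue(void *word, int bit)
{
	const int shift = sizeof(long) == 4 ? 5 : 6;	/* log2(BITS_PER_LONG) */
	unsigned long val = (unsigned long)word << shift | bit;

	return bit_wait_table + hash_long64(val, WAIT_TABLE_BITS);
}

int main(void)
{
	long stack_flags = 0;	/* plays the role of gh->gh_iflags on the stack */

	printf("bucket %td of %lu\n",
	       bit_waitqueue(&stack_flags, 0) - bit_wait_table, WAIT_TABLE_SIZE);
	return 0;
}

Under such a scheme the waitqueue choice depends only on the word's address, which is exactly why the page_zone()-based lookup removed in the diff below can go away.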
Diffstat (limited to 'mm')
-rw-r--r--	mm/filemap.c	4
-rw-r--r--	mm/memory_hotplug.c	28
-rw-r--r--	mm/page_alloc.c	115
3 files changed, 3 insertions, 144 deletions
diff --git a/mm/filemap.c b/mm/filemap.c
index 849f459ad078..c7fe2f16503f 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -790,9 +790,7 @@ EXPORT_SYMBOL(__page_cache_alloc);
  */
 wait_queue_head_t *page_waitqueue(struct page *page)
 {
-	const struct zone *zone = page_zone(page);
-
-	return &zone->wait_table[hash_ptr(page, zone->wait_table_bits)];
+	return bit_waitqueue(page, 0);
 }
 EXPORT_SYMBOL(page_waitqueue);
 
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 962927309b6e..b18dab401be6 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -268,7 +268,6 @@ void __init register_page_bootmem_info_node(struct pglist_data *pgdat)
 	unsigned long i, pfn, end_pfn, nr_pages;
 	int node = pgdat->node_id;
 	struct page *page;
-	struct zone *zone;
 
 	nr_pages = PAGE_ALIGN(sizeof(struct pglist_data)) >> PAGE_SHIFT;
 	page = virt_to_page(pgdat);
@@ -276,19 +275,6 @@ void __init register_page_bootmem_info_node(struct pglist_data *pgdat)
 	for (i = 0; i < nr_pages; i++, page++)
 		get_page_bootmem(node, page, NODE_INFO);
 
-	zone = &pgdat->node_zones[0];
-	for (; zone < pgdat->node_zones + MAX_NR_ZONES - 1; zone++) {
-		if (zone_is_initialized(zone)) {
-			nr_pages = zone->wait_table_hash_nr_entries
-				* sizeof(wait_queue_head_t);
-			nr_pages = PAGE_ALIGN(nr_pages) >> PAGE_SHIFT;
-			page = virt_to_page(zone->wait_table);
-
-			for (i = 0; i < nr_pages; i++, page++)
-				get_page_bootmem(node, page, NODE_INFO);
-		}
-	}
-
 	pfn = pgdat->node_start_pfn;
 	end_pfn = pgdat_end_pfn(pgdat);
 
@@ -2158,20 +2144,6 @@ void try_offline_node(int nid)
 	 */
 	node_set_offline(nid);
 	unregister_one_node(nid);
-
-	/* free waittable in each zone */
-	for (i = 0; i < MAX_NR_ZONES; i++) {
-		struct zone *zone = pgdat->node_zones + i;
-
-		/*
-		 * wait_table may be allocated from boot memory,
-		 * here only free if it's allocated by vmalloc.
-		 */
-		if (is_vmalloc_addr(zone->wait_table)) {
-			vfree(zone->wait_table);
-			zone->wait_table = NULL;
-		}
-	}
 }
 EXPORT_SYMBOL(try_offline_node);
 
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 2b3bf6767d54..de7c6e43b1c9 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4977,72 +4977,6 @@ void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone)
 }
 
 /*
- * Helper functions to size the waitqueue hash table.
- * Essentially these want to choose hash table sizes sufficiently
- * large so that collisions trying to wait on pages are rare.
- * But in fact, the number of active page waitqueues on typical
- * systems is ridiculously low, less than 200. So this is even
- * conservative, even though it seems large.
- *
- * The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to
- * waitqueues, i.e. the size of the waitq table given the number of pages.
- */
-#define PAGES_PER_WAITQUEUE	256
-
-#ifndef CONFIG_MEMORY_HOTPLUG
-static inline unsigned long wait_table_hash_nr_entries(unsigned long pages)
-{
-	unsigned long size = 1;
-
-	pages /= PAGES_PER_WAITQUEUE;
-
-	while (size < pages)
-		size <<= 1;
-
-	/*
-	 * Once we have dozens or even hundreds of threads sleeping
-	 * on IO we've got bigger problems than wait queue collision.
-	 * Limit the size of the wait table to a reasonable size.
-	 */
-	size = min(size, 4096UL);
-
-	return max(size, 4UL);
-}
-#else
-/*
- * A zone's size might be changed by hot-add, so it is not possible to determine
- * a suitable size for its wait_table.  So we use the maximum size now.
- *
- * The max wait table size = 4096 x sizeof(wait_queue_head_t).   ie:
- *
- *    i386 (preemption config)    : 4096 x 16 = 64Kbyte.
- *    ia64, x86-64 (no preemption): 4096 x 20 = 80Kbyte.
- *    ia64, x86-64 (preemption)   : 4096 x 24 = 96Kbyte.
- *
- * The maximum entries are prepared when a zone's memory is (512K + 256) pages
- * or more by the traditional way. (See above).  It equals:
- *
- *    i386, x86-64, powerpc(4K page size) : =  ( 2G + 1M)byte.
- *    ia64(16K page size)                 : =  ( 8G + 4M)byte.
- *    powerpc (64K page size)             : =  (32G +16M)byte.
- */
-static inline unsigned long wait_table_hash_nr_entries(unsigned long pages)
-{
-	return 4096UL;
-}
-#endif
-
-/*
- * This is an integer logarithm so that shifts can be used later
- * to extract the more random high bits from the multiplicative
- * hash function before the remainder is taken.
- */
-static inline unsigned long wait_table_bits(unsigned long size)
-{
-	return ffz(~size);
-}
-
-/*
  * Initially all pages are reserved - free ones are freed
  * up by free_all_bootmem() once the early boot process is
  * done. Non-atomic initialization, single-pass.
@@ -5304,49 +5238,6 @@ void __init setup_per_cpu_pageset(void)
 			alloc_percpu(struct per_cpu_nodestat);
 }
 
-static noinline __ref
-int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
-{
-	int i;
-	size_t alloc_size;
-
-	/*
-	 * The per-page waitqueue mechanism uses hashed waitqueues
-	 * per zone.
-	 */
-	zone->wait_table_hash_nr_entries =
-		 wait_table_hash_nr_entries(zone_size_pages);
-	zone->wait_table_bits =
-		 wait_table_bits(zone->wait_table_hash_nr_entries);
-	alloc_size = zone->wait_table_hash_nr_entries
-					* sizeof(wait_queue_head_t);
-
-	if (!slab_is_available()) {
-		zone->wait_table = (wait_queue_head_t *)
-			memblock_virt_alloc_node_nopanic(
-				alloc_size, zone->zone_pgdat->node_id);
-	} else {
-		/*
-		 * This case means that a zone whose size was 0 gets new memory
-		 * via memory hot-add.
-		 * But it may be the case that a new node was hot-added.  In
-		 * this case vmalloc() will not be able to use this new node's
-		 * memory - this wait_table must be initialized to use this new
-		 * node itself as well.
-		 * To use this new node's memory, further consideration will be
-		 * necessary.
-		 */
-		zone->wait_table = vmalloc(alloc_size);
-	}
-	if (!zone->wait_table)
-		return -ENOMEM;
-
-	for (i = 0; i < zone->wait_table_hash_nr_entries; ++i)
-		init_waitqueue_head(zone->wait_table + i);
-
-	return 0;
-}
-
 static __meminit void zone_pcp_init(struct zone *zone)
 {
 	/*
@@ -5367,10 +5258,7 @@ int __meminit init_currently_empty_zone(struct zone *zone,
 					unsigned long size)
 {
 	struct pglist_data *pgdat = zone->zone_pgdat;
-	int ret;
-	ret = zone_wait_table_init(zone, size);
-	if (ret)
-		return ret;
+
 	pgdat->nr_zones = zone_idx(zone) + 1;
 
 	zone->zone_start_pfn = zone_start_pfn;
@@ -5382,6 +5270,7 @@ int __meminit init_currently_empty_zone(struct zone *zone,
 			zone_start_pfn, (zone_start_pfn + size));
 
 	zone_init_free_lists(zone);
+	zone->initialized = 1;
 
 	return 0;
 }