author	Michal Hocko <mhocko@suse.com>	2017-09-06 19:20:37 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2017-09-06 20:27:26 -0400
commit	b93e0f329e24f3615aa551fd9b99a75fb7c9195f (patch)
tree	3b6e1cade48a38d71456258f634ce868fa7dd20d
parent	11cd8638c37f6c400cc472cc52b6eccb505aba6e (diff)
mm, memory_hotplug: get rid of zonelists_mutex
zonelists_mutex was introduced by commit 4eaf3f64397c ("mem-hotplug: fix potential race while building zonelist for new populated zone") to protect zonelist building from races. This is no longer needed, though, because both memory online and offline are fully serialized.

New users have grown since then, however. Notably, setup_per_zone_wmarks wants to prevent races between memory hotplug, khugepaged setup, and manual min_free_kbytes updates via sysctl (see commit cfd3da1e49bb ("mm: Serialize access to min_free_kbytes")). Let's add a private lock for that purpose. This will not prevent an observer from seeing a memory hotplug operation halfway through, but that shouldn't be a big deal because memory hotplug updates watermarks explicitly, so we will eventually get the full picture. The lock just makes sure we won't race when updating watermarks, which could lead to weird results.

Also, __build_all_zonelists manipulates global data, so add a private lock for it as well. This doesn't seem to be necessary today, but it is more robust to have a lock there.

While we are at it, make sure we document that memory online/offline depends on full serialization, either via mem_hotplug_begin() or device_lock.

Link: http://lkml.kernel.org/r/20170721143915.14161-9-mhocko@kernel.org
Signed-off-by: Michal Hocko <mhocko@suse.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Joonsoo Kim <js1304@gmail.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Shaohua Li <shaohua.li@intel.com>
Cc: Toshi Kani <toshi.kani@hpe.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Haicheng Li <haicheng.li@linux.intel.com>
Cc: Wu Fengguang <fengguang.wu@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
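For reference, the replacement for the global mutex is a function-local static lock: each lock is visible only inside the single function whose data it protects, so it cannot accumulate unrelated users the way zonelists_mutex did. A minimal sketch of the pattern, with illustrative names rather than code from this patch:

	#include <linux/spinlock.h>

	/* Same shape as the private locks added below. */
	static void update_shared_state(void)
	{
		/* Function-local: no other code can ever take this lock. */
		static DEFINE_SPINLOCK(lock);

		spin_lock(&lock);
		/* ... update the data that only this function modifies ... */
		spin_unlock(&lock);
	}

Since this is a spinlock, the serialized section must not sleep; the critical sections in this patch are short and non-sleeping, so a spinlock is sufficient.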
-rw-r--r--	include/linux/mmzone.h	1
-rw-r--r--	mm/memory_hotplug.c	12
-rw-r--r--	mm/page_alloc.c	18
3 files changed, 11 insertions, 20 deletions
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 551f68bec2fa..e7e92c8f4883 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -770,7 +770,6 @@ static inline bool is_dev_zone(const struct zone *zone)
 
 #include <linux/memory_hotplug.h>
 
-extern struct mutex zonelists_mutex;
 void build_all_zonelists(pg_data_t *pgdat);
 void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx);
 bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 2f0c7ebc7624..73bf17df6899 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -897,7 +897,7 @@ static struct zone * __meminit move_pfn_range(int online_type, int nid,
 	return zone;
 }
 
-/* Must be protected by mem_hotplug_begin() */
+/* Must be protected by mem_hotplug_begin() or a device_lock */
 int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_type)
 {
 	unsigned long flags;
@@ -926,7 +926,6 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
 	 * This means the page allocator ignores this zone.
 	 * So, zonelist must be updated after online.
 	 */
-	mutex_lock(&zonelists_mutex);
 	if (!populated_zone(zone)) {
 		need_zonelists_rebuild = 1;
 		setup_zone_pageset(zone);
@@ -937,7 +936,6 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
 	if (ret) {
 		if (need_zonelists_rebuild)
 			zone_pcp_reset(zone);
-		mutex_unlock(&zonelists_mutex);
 		goto failed_addition;
 	}
 
@@ -955,8 +953,6 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
 		zone_pcp_update(zone);
 	}
 
-	mutex_unlock(&zonelists_mutex);
-
 	init_per_zone_wmark_min();
 
 	if (onlined_pages) {
@@ -1027,9 +1023,7 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
 	 * The node we allocated has no zone fallback lists. For avoiding
 	 * to access not-initialized zonelist, build here.
 	 */
-	mutex_lock(&zonelists_mutex);
 	build_all_zonelists(pgdat);
-	mutex_unlock(&zonelists_mutex);
 
 	/*
 	 * zone->managed_pages is set to an approximate value in
@@ -1696,9 +1690,7 @@ repeat:
 
 	if (!populated_zone(zone)) {
 		zone_pcp_reset(zone);
-		mutex_lock(&zonelists_mutex);
 		build_all_zonelists(NULL);
-		mutex_unlock(&zonelists_mutex);
 	} else
 		zone_pcp_update(zone);
 
@@ -1724,7 +1716,7 @@ failed_removal:
 	return ret;
 }
 
-/* Must be protected by mem_hotplug_begin() */
+/* Must be protected by mem_hotplug_begin() or a device_lock */
 int offline_pages(unsigned long start_pfn, unsigned long nr_pages)
 {
 	return __offline_pages(start_pfn, start_pfn + nr_pages, 120 * HZ);
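The comment changes above document the serialization contract rather than add any locking: online_pages() and offline_pages() take no lock themselves and rely entirely on the caller. A hypothetical caller, shown only to illustrate the contract (the real callers live in mm/memory_hotplug.c and drivers/base/memory.c):

	#include <linux/memory_hotplug.h>

	/* Hypothetical wrapper: the whole operation runs fully serialized. */
	static int example_online(unsigned long start_pfn, unsigned long nr_pages)
	{
		int ret;

		mem_hotplug_begin();	/* or hold the memory block's device_lock */
		ret = online_pages(start_pfn, nr_pages, MMOP_ONLINE_KEEP);
		mem_hotplug_done();

		return ret;
	}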
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index e3086d0fd945..0bea94af0423 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5131,17 +5131,14 @@ static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch);
 static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset);
 static DEFINE_PER_CPU(struct per_cpu_nodestat, boot_nodestats);
 
-/*
- * Global mutex to protect against size modification of zonelists
- * as well as to serialize pageset setup for the new populated zone.
- */
-DEFINE_MUTEX(zonelists_mutex);
-
 static void __build_all_zonelists(void *data)
 {
 	int nid;
 	int __maybe_unused cpu;
 	pg_data_t *self = data;
+	static DEFINE_SPINLOCK(lock);
+
+	spin_lock(&lock);
 
 #ifdef CONFIG_NUMA
 	memset(node_load, 0, sizeof(node_load));
@@ -5173,6 +5170,8 @@ static void __build_all_zonelists(void *data)
 		set_cpu_numa_mem(cpu, local_memory_node(cpu_to_node(cpu)));
 #endif
 	}
+
+	spin_unlock(&lock);
 }
 
 static noinline void __init
@@ -5203,7 +5202,6 @@ build_all_zonelists_init(void)
 }
 
 /*
- * Called with zonelists_mutex held always
  * unless system_state == SYSTEM_BOOTING.
  *
  * __ref due to call of __init annotated helper build_all_zonelists_init
@@ -6939,9 +6937,11 @@ static void __setup_per_zone_wmarks(void)
  */
 void setup_per_zone_wmarks(void)
 {
-	mutex_lock(&zonelists_mutex);
+	static DEFINE_SPINLOCK(lock);
+
+	spin_lock(&lock);
 	__setup_per_zone_wmarks();
-	mutex_unlock(&zonelists_mutex);
+	spin_unlock(&lock);
 }
 
 /*
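For context on why setup_per_zone_wmarks() needs its own lock at all: besides the hotplug path above (via init_per_zone_wmark_min()), it is reachable from khugepaged setup and from the min_free_kbytes sysctl. The sysctl entry point, paraphrased from mm/page_alloc.c of this era (details trimmed, so treat it as a sketch rather than the exact source):

	int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write,
			void __user *buffer, size_t *length, loff_t *ppos)
	{
		int rc;

		rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
		if (rc)
			return rc;

		if (write) {
			user_min_free_kbytes = min_free_kbytes;
			setup_per_zone_wmarks();	/* serialized by the new private lock */
		}
		return 0;
	}

With the private spinlock, concurrent callers can interleave only between whole watermark updates, never in the middle of one.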