path: root/mm/page_alloc.c
author    Michal Hocko <mhocko@suse.com>    2017-09-06 19:20:13 -0400
committer Linus Torvalds <torvalds@linux-foundation.org>    2017-09-06 20:27:25 -0400
commit    c9bff3eebc09be23fbc868f5e6731666d23cbea3 (patch)
tree      3a98933875f125df12e7fa44279bea2e7067f802 /mm/page_alloc.c
parent    5a47074f0279421778f97b1b1e75686696a5f42a (diff)
mm, page_alloc: rip out ZONELIST_ORDER_ZONE
Patch series "cleanup zonelists initialization", v1.

This is aimed at cleaning up the zonelists initialization code we have, but the primary motivation was bug report [2], which got resolved, but the usage of stop_machine is just too ugly to live. Most patches are straightforward, but 3 of them need special consideration.

Patch 1 removes zone-ordered zonelists completely. I am CCing linux-api because this is a user-visible change. As I argue in the patch description, I do not think we have a strong use case for it these days. I have kept the sysctl in place and warn in the log if somebody tries to configure zone-ordered zonelists. If somebody has a real use case for it, we can revert this patch, but I do not expect anybody will actually notice runtime differences. This patch is not strictly needed for the rest, but it made patch 6 easier to implement.

Patch 7 removes stop_machine from build_all_zonelists without adding any special synchronization between iterators and the updater, which I _believe_ is acceptable as explained in the changelog. I hope I am not missing anything.

Patch 8 then removes zonelists_mutex, which is kind of ugly as well and not really needed AFAICS, but care should be taken when double-checking my thinking.

This patch (of 9):

Supporting zone-ordered zonelists costs us a lot of code, while the usefulness is arguable, if it exists at all. Mel has already made node ordering the default on 64b systems. 32b systems are still using ZONELIST_ORDER_ZONE because it is considered better to fall back to a different NUMA node rather than consume precious lowmem zones.

This argument is, however, weakened by the fact that memory reclaim has been reworked to be node rather than zone oriented. This means that lowmem requests already have to skip over all highmem pages on LRUs, so zone ordering doesn't save much reclaim time. The only remaining advantage of zone ordering is under light memory pressure, when highmem requests never hit lowmem zones and lowmem pressure doesn't need to reclaim.

Considering that 32b NUMA systems are rather suboptimal already and it is generally advisable to use a 64b kernel on such hardware, I believe we should rather care about code maintainability and just get rid of ZONELIST_ORDER_ZONE altogether. Keep the sysctl in place and warn if somebody tries to set zone ordering either from the kernel command line or the sysctl.

[mhocko@suse.com: reading vm.numa_zonelist_order will never terminate]
Link: http://lkml.kernel.org/r/20170721143915.14161-2-mhocko@kernel.org
Signed-off-by: Michal Hocko <mhocko@suse.com>
Acked-by: Mel Gorman <mgorman@suse.de>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Joonsoo Kim <js1304@gmail.com>
Cc: Shaohua Li <shaohua.li@intel.com>
Cc: Toshi Kani <toshi.kani@hpe.com>
Cc: Abdul Haleem <abdhalee@linux.vnet.ibm.com>
Cc: <linux-api@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
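To make the user-visible change concrete, the following is a minimal user-space sketch (not the kernel function itself; the standalone function name and the sample values are illustrative only) of the accept/warn behaviour that remains for numa_zonelist_order after this patch: only the "default"/"node" spellings pass, and anything else, including the old "zone", is rejected with a warning.

#include <stdio.h>

/*
 * Sketch of the simplified parser left behind by this patch: only the first
 * character is inspected, "d"/"D" and "n"/"N" are accepted, everything else
 * (notably "zone") is warned about and rejected.
 */
static int parse_numa_zonelist_order(const char *s)
{
        if (!(*s == 'd' || *s == 'D' || *s == 'n' || *s == 'N')) {
                fprintf(stderr,
                        "Ignoring unsupported numa_zonelist_order value: %s\n", s);
                return -1;      /* the kernel returns -EINVAL here */
        }
        return 0;               /* accepted, but it no longer changes anything */
}

int main(void)
{
        const char *samples[] = { "default", "Node", "zone", "bogus" };
        size_t i;

        /* Show which spellings survive the patch and which only warn. */
        for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
                printf("%-8s -> %s\n", samples[i],
                       parse_numa_zonelist_order(samples[i]) ? "rejected" : "accepted");
        return 0;
}

In other words, after this patch booting with numa_zonelist_order=zone or writing "zone" to the sysctl only produces the warning (the sysctl write also fails with EINVAL); the zonelists are always built in node order.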
Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r--  mm/page_alloc.c  179
1 file changed, 21 insertions(+), 158 deletions(-)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index dcc8a1cf55b6..6b23df1be909 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4858,52 +4858,18 @@ static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist,
         return nr_zones;
 }
 
-
-/*
- * zonelist_order:
- * 0 = automatic detection of better ordering.
- * 1 = order by ([node] distance, -zonetype)
- * 2 = order by (-zonetype, [node] distance)
- *
- * If not NUMA, ZONELIST_ORDER_ZONE and ZONELIST_ORDER_NODE will create
- * the same zonelist. So only NUMA can configure this param.
- */
-#define ZONELIST_ORDER_DEFAULT 0
-#define ZONELIST_ORDER_NODE 1
-#define ZONELIST_ORDER_ZONE 2
-
-/* zonelist order in the kernel.
- * set_zonelist_order() will set this to NODE or ZONE.
- */
-static int current_zonelist_order = ZONELIST_ORDER_DEFAULT;
-static char zonelist_order_name[3][8] = {"Default", "Node", "Zone"};
-
-
 #ifdef CONFIG_NUMA
-/* The value user specified ....changed by config */
-static int user_zonelist_order = ZONELIST_ORDER_DEFAULT;
-/* string for sysctl */
-#define NUMA_ZONELIST_ORDER_LEN 16
-char numa_zonelist_order[16] = "default";
-
-/*
- * interface for configure zonelist ordering.
- * command line option "numa_zonelist_order"
- * = "[dD]efault - default, automatic configuration.
- * = "[nN]ode - order by node locality, then by zone within node
- * = "[zZ]one - order by zone, then by locality within zone
- */
 
 static int __parse_numa_zonelist_order(char *s)
 {
-        if (*s == 'd' || *s == 'D') {
-                user_zonelist_order = ZONELIST_ORDER_DEFAULT;
-        } else if (*s == 'n' || *s == 'N') {
-                user_zonelist_order = ZONELIST_ORDER_NODE;
-        } else if (*s == 'z' || *s == 'Z') {
-                user_zonelist_order = ZONELIST_ORDER_ZONE;
-        } else {
-                pr_warn("Ignoring invalid numa_zonelist_order value: %s\n", s);
-                return -EINVAL;
-        }
+        /*
+         * We used to support different zonlists modes but they turned
+         * out to be just not useful. Let's keep the warning in place
+         * if somebody still use the cmd line parameter so that we do
+         * not fail it silently
+         */
+        if (!(*s == 'd' || *s == 'D' || *s == 'n' || *s == 'N')) {
+                pr_warn("Ignoring unsupported numa_zonelist_order value: %s\n", s);
+                return -EINVAL;
+        }
         return 0;
@@ -4911,19 +4877,15 @@ static int __parse_numa_zonelist_order(char *s)
 
 static __init int setup_numa_zonelist_order(char *s)
 {
-        int ret;
-
         if (!s)
                 return 0;
 
-        ret = __parse_numa_zonelist_order(s);
-        if (ret == 0)
-                strlcpy(numa_zonelist_order, s, NUMA_ZONELIST_ORDER_LEN);
-
-        return ret;
+        return __parse_numa_zonelist_order(s);
 }
 early_param("numa_zonelist_order", setup_numa_zonelist_order);
 
+char numa_zonelist_order[] = "Node";
+
 /*
  * sysctl handler for numa_zonelist_order
  */
@@ -4931,42 +4893,17 @@ int numa_zonelist_order_handler(struct ctl_table *table, int write,
                 void __user *buffer, size_t *length,
                 loff_t *ppos)
 {
-        char saved_string[NUMA_ZONELIST_ORDER_LEN];
+        char *str;
         int ret;
-        static DEFINE_MUTEX(zl_order_mutex);
 
-        mutex_lock(&zl_order_mutex);
-        if (write) {
-                if (strlen((char *)table->data) >= NUMA_ZONELIST_ORDER_LEN) {
-                        ret = -EINVAL;
-                        goto out;
-                }
-                strcpy(saved_string, (char *)table->data);
-        }
-        ret = proc_dostring(table, write, buffer, length, ppos);
-        if (ret)
-                goto out;
-        if (write) {
-                int oldval = user_zonelist_order;
+        if (!write)
+                return proc_dostring(table, write, buffer, length, ppos);
+        str = memdup_user_nul(buffer, 16);
+        if (IS_ERR(str))
+                return PTR_ERR(str);
 
-                ret = __parse_numa_zonelist_order((char *)table->data);
-                if (ret) {
-                        /*
-                         * bogus value. restore saved string
-                         */
-                        strncpy((char *)table->data, saved_string,
-                                NUMA_ZONELIST_ORDER_LEN);
-                        user_zonelist_order = oldval;
-                } else if (oldval != user_zonelist_order) {
-                        mem_hotplug_begin();
-                        mutex_lock(&zonelists_mutex);
-                        build_all_zonelists(NULL, NULL);
-                        mutex_unlock(&zonelists_mutex);
-                        mem_hotplug_done();
-                }
-        }
-out:
-        mutex_unlock(&zl_order_mutex);
+        ret = __parse_numa_zonelist_order(str);
+        kfree(str);
         return ret;
 }
 
@@ -5075,70 +5012,12 @@ static void build_thisnode_zonelists(pg_data_t *pgdat)
  */
 static int node_order[MAX_NUMNODES];
 
-static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes)
-{
-        int pos, j, node;
-        int zone_type;          /* needs to be signed */
-        struct zone *z;
-        struct zonelist *zonelist;
-
-        zonelist = &pgdat->node_zonelists[ZONELIST_FALLBACK];
-        pos = 0;
-        for (zone_type = MAX_NR_ZONES - 1; zone_type >= 0; zone_type--) {
-                for (j = 0; j < nr_nodes; j++) {
-                        node = node_order[j];
-                        z = &NODE_DATA(node)->node_zones[zone_type];
-                        if (managed_zone(z)) {
-                                zoneref_set_zone(z,
-                                        &zonelist->_zonerefs[pos++]);
-                                check_highest_zone(zone_type);
-                        }
-                }
-        }
-        zonelist->_zonerefs[pos].zone = NULL;
-        zonelist->_zonerefs[pos].zone_idx = 0;
-}
-
-#if defined(CONFIG_64BIT)
-/*
- * Devices that require DMA32/DMA are relatively rare and do not justify a
- * penalty to every machine in case the specialised case applies. Default
- * to Node-ordering on 64-bit NUMA machines
- */
-static int default_zonelist_order(void)
-{
-        return ZONELIST_ORDER_NODE;
-}
-#else
-/*
- * On 32-bit, the Normal zone needs to be preserved for allocations accessible
- * by the kernel. If processes running on node 0 deplete the low memory zone
- * then reclaim will occur more frequency increasing stalls and potentially
- * be easier to OOM if a large percentage of the zone is under writeback or
- * dirty. The problem is significantly worse if CONFIG_HIGHPTE is not set.
- * Hence, default to zone ordering on 32-bit.
- */
-static int default_zonelist_order(void)
-{
-        return ZONELIST_ORDER_ZONE;
-}
-#endif /* CONFIG_64BIT */
-
-static void set_zonelist_order(void)
-{
-        if (user_zonelist_order == ZONELIST_ORDER_DEFAULT)
-                current_zonelist_order = default_zonelist_order();
-        else
-                current_zonelist_order = user_zonelist_order;
-}
-
 static void build_zonelists(pg_data_t *pgdat)
 {
         int i, node, load;
         nodemask_t used_mask;
         int local_node, prev_node;
         struct zonelist *zonelist;
-        unsigned int order = current_zonelist_order;
 
         /* initialize zonelists */
         for (i = 0; i < MAX_ZONELISTS; i++) {
@@ -5168,15 +5047,7 @@ static void build_zonelists(pg_data_t *pgdat)
 
                 prev_node = node;
                 load--;
-                if (order == ZONELIST_ORDER_NODE)
-                        build_zonelists_in_node_order(pgdat, node);
-                else
-                        node_order[i++] = node; /* remember order */
-        }
-
-        if (order == ZONELIST_ORDER_ZONE) {
-                /* calculate node order -- i.e., DMA last! */
-                build_zonelists_in_zone_order(pgdat, i);
+                build_zonelists_in_node_order(pgdat, node);
         }
 
         build_thisnode_zonelists(pgdat);
@@ -5204,11 +5075,6 @@ static void setup_min_unmapped_ratio(void);
 static void setup_min_slab_ratio(void);
 #else   /* CONFIG_NUMA */
 
-static void set_zonelist_order(void)
-{
-        current_zonelist_order = ZONELIST_ORDER_ZONE;
-}
-
 static void build_zonelists(pg_data_t *pgdat)
 {
         int node, local_node;
@@ -5348,8 +5214,6 @@ build_all_zonelists_init(void)
  */
 void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone)
 {
-        set_zonelist_order();
-
         if (system_state == SYSTEM_BOOTING) {
                 build_all_zonelists_init();
         } else {
@@ -5375,9 +5239,8 @@ void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone)
         else
                 page_group_by_mobility_disabled = 0;
 
-        pr_info("Built %i zonelists in %s order, mobility grouping %s. Total pages: %ld\n",
+        pr_info("Built %i zonelists, mobility grouping %s. Total pages: %ld\n",
                 nr_online_nodes,
-                zonelist_order_name[current_zonelist_order],
                 page_group_by_mobility_disabled ? "off" : "on",
                 vm_total_pages);
 #ifdef CONFIG_NUMA