summaryrefslogtreecommitdiffstats
path: root/mm/page_alloc.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r--mm/page_alloc.c179
1 files changed, 21 insertions, 158 deletions
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index dcc8a1cf55b6..6b23df1be909 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4858,52 +4858,18 @@ static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist,
4858 return nr_zones; 4858 return nr_zones;
4859} 4859}
4860 4860
4861
4862/*
4863 * zonelist_order:
4864 * 0 = automatic detection of better ordering.
4865 * 1 = order by ([node] distance, -zonetype)
4866 * 2 = order by (-zonetype, [node] distance)
4867 *
4868 * If not NUMA, ZONELIST_ORDER_ZONE and ZONELIST_ORDER_NODE will create
4869 * the same zonelist. So only NUMA can configure this param.
4870 */
4871#define ZONELIST_ORDER_DEFAULT 0
4872#define ZONELIST_ORDER_NODE 1
4873#define ZONELIST_ORDER_ZONE 2
4874
4875/* zonelist order in the kernel.
4876 * set_zonelist_order() will set this to NODE or ZONE.
4877 */
4878static int current_zonelist_order = ZONELIST_ORDER_DEFAULT;
4879static char zonelist_order_name[3][8] = {"Default", "Node", "Zone"};
4880
4881
4882#ifdef CONFIG_NUMA 4861#ifdef CONFIG_NUMA
4883/* The value user specified ....changed by config */
4884static int user_zonelist_order = ZONELIST_ORDER_DEFAULT;
4885/* string for sysctl */
4886#define NUMA_ZONELIST_ORDER_LEN 16
4887char numa_zonelist_order[16] = "default";
4888
4889/*
4890 * interface for configure zonelist ordering.
4891 * command line option "numa_zonelist_order"
4892 * = "[dD]efault - default, automatic configuration.
4893 * = "[nN]ode - order by node locality, then by zone within node
4894 * = "[zZ]one - order by zone, then by locality within zone
4895 */
4896 4862
4897static int __parse_numa_zonelist_order(char *s) 4863static int __parse_numa_zonelist_order(char *s)
4898{ 4864{
4899 if (*s == 'd' || *s == 'D') { 4865 /*
4900 user_zonelist_order = ZONELIST_ORDER_DEFAULT; 4866 * We used to support different zonelist modes but they turned
4901 } else if (*s == 'n' || *s == 'N') { 4867 * out to be just not useful. Let's keep the warning in place
4902 user_zonelist_order = ZONELIST_ORDER_NODE; 4868 * if somebody still uses the cmd line parameter so that we do
4903 } else if (*s == 'z' || *s == 'Z') { 4869 * not fail it silently
4904 user_zonelist_order = ZONELIST_ORDER_ZONE; 4870 */
4905 } else { 4871 if (!(*s == 'd' || *s == 'D' || *s == 'n' || *s == 'N')) {
4906 pr_warn("Ignoring invalid numa_zonelist_order value: %s\n", s); 4872 pr_warn("Ignoring unsupported numa_zonelist_order value: %s\n", s);
4907 return -EINVAL; 4873 return -EINVAL;
4908 } 4874 }
4909 return 0; 4875 return 0;
@@ -4911,19 +4877,15 @@ static int __parse_numa_zonelist_order(char *s)
4911 4877
4912static __init int setup_numa_zonelist_order(char *s) 4878static __init int setup_numa_zonelist_order(char *s)
4913{ 4879{
4914 int ret;
4915
4916 if (!s) 4880 if (!s)
4917 return 0; 4881 return 0;
4918 4882
4919 ret = __parse_numa_zonelist_order(s); 4883 return __parse_numa_zonelist_order(s);
4920 if (ret == 0)
4921 strlcpy(numa_zonelist_order, s, NUMA_ZONELIST_ORDER_LEN);
4922
4923 return ret;
4924} 4884}
4925early_param("numa_zonelist_order", setup_numa_zonelist_order); 4885early_param("numa_zonelist_order", setup_numa_zonelist_order);
4926 4886
4887char numa_zonelist_order[] = "Node";
4888
4927/* 4889/*
4928 * sysctl handler for numa_zonelist_order 4890 * sysctl handler for numa_zonelist_order
4929 */ 4891 */
@@ -4931,42 +4893,17 @@ int numa_zonelist_order_handler(struct ctl_table *table, int write,
4931 void __user *buffer, size_t *length, 4893 void __user *buffer, size_t *length,
4932 loff_t *ppos) 4894 loff_t *ppos)
4933{ 4895{
4934 char saved_string[NUMA_ZONELIST_ORDER_LEN]; 4896 char *str;
4935 int ret; 4897 int ret;
4936 static DEFINE_MUTEX(zl_order_mutex);
4937 4898
4938 mutex_lock(&zl_order_mutex); 4899 if (!write)
4939 if (write) { 4900 return proc_dostring(table, write, buffer, length, ppos);
4940 if (strlen((char *)table->data) >= NUMA_ZONELIST_ORDER_LEN) { 4901 str = memdup_user_nul(buffer, 16);
4941 ret = -EINVAL; 4902 if (IS_ERR(str))
4942 goto out; 4903 return PTR_ERR(str);
4943 }
4944 strcpy(saved_string, (char *)table->data);
4945 }
4946 ret = proc_dostring(table, write, buffer, length, ppos);
4947 if (ret)
4948 goto out;
4949 if (write) {
4950 int oldval = user_zonelist_order;
4951 4904
4952 ret = __parse_numa_zonelist_order((char *)table->data); 4905 ret = __parse_numa_zonelist_order(str);
4953 if (ret) { 4906 kfree(str);
4954 /*
4955 * bogus value. restore saved string
4956 */
4957 strncpy((char *)table->data, saved_string,
4958 NUMA_ZONELIST_ORDER_LEN);
4959 user_zonelist_order = oldval;
4960 } else if (oldval != user_zonelist_order) {
4961 mem_hotplug_begin();
4962 mutex_lock(&zonelists_mutex);
4963 build_all_zonelists(NULL, NULL);
4964 mutex_unlock(&zonelists_mutex);
4965 mem_hotplug_done();
4966 }
4967 }
4968out:
4969 mutex_unlock(&zl_order_mutex);
4970 return ret; 4907 return ret;
4971} 4908}
4972 4909
@@ -5075,70 +5012,12 @@ static void build_thisnode_zonelists(pg_data_t *pgdat)
5075 */ 5012 */
5076static int node_order[MAX_NUMNODES]; 5013static int node_order[MAX_NUMNODES];
5077 5014
5078static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes)
5079{
5080 int pos, j, node;
5081 int zone_type; /* needs to be signed */
5082 struct zone *z;
5083 struct zonelist *zonelist;
5084
5085 zonelist = &pgdat->node_zonelists[ZONELIST_FALLBACK];
5086 pos = 0;
5087 for (zone_type = MAX_NR_ZONES - 1; zone_type >= 0; zone_type--) {
5088 for (j = 0; j < nr_nodes; j++) {
5089 node = node_order[j];
5090 z = &NODE_DATA(node)->node_zones[zone_type];
5091 if (managed_zone(z)) {
5092 zoneref_set_zone(z,
5093 &zonelist->_zonerefs[pos++]);
5094 check_highest_zone(zone_type);
5095 }
5096 }
5097 }
5098 zonelist->_zonerefs[pos].zone = NULL;
5099 zonelist->_zonerefs[pos].zone_idx = 0;
5100}
5101
5102#if defined(CONFIG_64BIT)
5103/*
5104 * Devices that require DMA32/DMA are relatively rare and do not justify a
5105 * penalty to every machine in case the specialised case applies. Default
5106 * to Node-ordering on 64-bit NUMA machines
5107 */
5108static int default_zonelist_order(void)
5109{
5110 return ZONELIST_ORDER_NODE;
5111}
5112#else
5113/*
5114 * On 32-bit, the Normal zone needs to be preserved for allocations accessible
5115 * by the kernel. If processes running on node 0 deplete the low memory zone
5116 * then reclaim will occur more frequently, increasing stalls and potentially
5117 * be easier to OOM if a large percentage of the zone is under writeback or
5118 * dirty. The problem is significantly worse if CONFIG_HIGHPTE is not set.
5119 * Hence, default to zone ordering on 32-bit.
5120 */
5121static int default_zonelist_order(void)
5122{
5123 return ZONELIST_ORDER_ZONE;
5124}
5125#endif /* CONFIG_64BIT */
5126
5127static void set_zonelist_order(void)
5128{
5129 if (user_zonelist_order == ZONELIST_ORDER_DEFAULT)
5130 current_zonelist_order = default_zonelist_order();
5131 else
5132 current_zonelist_order = user_zonelist_order;
5133}
5134
5135static void build_zonelists(pg_data_t *pgdat) 5015static void build_zonelists(pg_data_t *pgdat)
5136{ 5016{
5137 int i, node, load; 5017 int i, node, load;
5138 nodemask_t used_mask; 5018 nodemask_t used_mask;
5139 int local_node, prev_node; 5019 int local_node, prev_node;
5140 struct zonelist *zonelist; 5020 struct zonelist *zonelist;
5141 unsigned int order = current_zonelist_order;
5142 5021
5143 /* initialize zonelists */ 5022 /* initialize zonelists */
5144 for (i = 0; i < MAX_ZONELISTS; i++) { 5023 for (i = 0; i < MAX_ZONELISTS; i++) {
@@ -5168,15 +5047,7 @@ static void build_zonelists(pg_data_t *pgdat)
5168 5047
5169 prev_node = node; 5048 prev_node = node;
5170 load--; 5049 load--;
5171 if (order == ZONELIST_ORDER_NODE) 5050 build_zonelists_in_node_order(pgdat, node);
5172 build_zonelists_in_node_order(pgdat, node);
5173 else
5174 node_order[i++] = node; /* remember order */
5175 }
5176
5177 if (order == ZONELIST_ORDER_ZONE) {
5178 /* calculate node order -- i.e., DMA last! */
5179 build_zonelists_in_zone_order(pgdat, i);
5180 } 5051 }
5181 5052
5182 build_thisnode_zonelists(pgdat); 5053 build_thisnode_zonelists(pgdat);
@@ -5204,11 +5075,6 @@ static void setup_min_unmapped_ratio(void);
5204static void setup_min_slab_ratio(void); 5075static void setup_min_slab_ratio(void);
5205#else /* CONFIG_NUMA */ 5076#else /* CONFIG_NUMA */
5206 5077
5207static void set_zonelist_order(void)
5208{
5209 current_zonelist_order = ZONELIST_ORDER_ZONE;
5210}
5211
5212static void build_zonelists(pg_data_t *pgdat) 5078static void build_zonelists(pg_data_t *pgdat)
5213{ 5079{
5214 int node, local_node; 5080 int node, local_node;
@@ -5348,8 +5214,6 @@ build_all_zonelists_init(void)
5348 */ 5214 */
5349void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone) 5215void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone)
5350{ 5216{
5351 set_zonelist_order();
5352
5353 if (system_state == SYSTEM_BOOTING) { 5217 if (system_state == SYSTEM_BOOTING) {
5354 build_all_zonelists_init(); 5218 build_all_zonelists_init();
5355 } else { 5219 } else {
@@ -5375,9 +5239,8 @@ void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone)
5375 else 5239 else
5376 page_group_by_mobility_disabled = 0; 5240 page_group_by_mobility_disabled = 0;
5377 5241
5378 pr_info("Built %i zonelists in %s order, mobility grouping %s. Total pages: %ld\n", 5242 pr_info("Built %i zonelists, mobility grouping %s. Total pages: %ld\n",
5379 nr_online_nodes, 5243 nr_online_nodes,
5380 zonelist_order_name[current_zonelist_order],
5381 page_group_by_mobility_disabled ? "off" : "on", 5244 page_group_by_mobility_disabled ? "off" : "on",
5382 vm_total_pages); 5245 vm_total_pages);
5383#ifdef CONFIG_NUMA 5246#ifdef CONFIG_NUMA