diff options
Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r-- | mm/page_alloc.c | 179 |
1 files changed, 21 insertions, 158 deletions
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index dcc8a1cf55b6..6b23df1be909 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
@@ -4858,52 +4858,18 @@ static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, | |||
4858 | return nr_zones; | 4858 | return nr_zones; |
4859 | } | 4859 | } |
4860 | 4860 | ||
4861 | |||
4862 | /* | ||
4863 | * zonelist_order: | ||
4864 | * 0 = automatic detection of better ordering. | ||
4865 | * 1 = order by ([node] distance, -zonetype) | ||
4866 | * 2 = order by (-zonetype, [node] distance) | ||
4867 | * | ||
4868 | * If not NUMA, ZONELIST_ORDER_ZONE and ZONELIST_ORDER_NODE will create | ||
4869 | * the same zonelist. So only NUMA can configure this param. | ||
4870 | */ | ||
4871 | #define ZONELIST_ORDER_DEFAULT 0 | ||
4872 | #define ZONELIST_ORDER_NODE 1 | ||
4873 | #define ZONELIST_ORDER_ZONE 2 | ||
4874 | |||
4875 | /* zonelist order in the kernel. | ||
4876 | * set_zonelist_order() will set this to NODE or ZONE. | ||
4877 | */ | ||
4878 | static int current_zonelist_order = ZONELIST_ORDER_DEFAULT; | ||
4879 | static char zonelist_order_name[3][8] = {"Default", "Node", "Zone"}; | ||
4880 | |||
4881 | |||
4882 | #ifdef CONFIG_NUMA | 4861 | #ifdef CONFIG_NUMA |
4883 | /* The value user specified ....changed by config */ | ||
4884 | static int user_zonelist_order = ZONELIST_ORDER_DEFAULT; | ||
4885 | /* string for sysctl */ | ||
4886 | #define NUMA_ZONELIST_ORDER_LEN 16 | ||
4887 | char numa_zonelist_order[16] = "default"; | ||
4888 | |||
4889 | /* | ||
4890 | * interface for configure zonelist ordering. | ||
4891 | * command line option "numa_zonelist_order" | ||
4892 | * = "[dD]efault - default, automatic configuration. | ||
4893 | * = "[nN]ode - order by node locality, then by zone within node | ||
4894 | * = "[zZ]one - order by zone, then by locality within zone | ||
4895 | */ | ||
4896 | 4862 | ||
4897 | static int __parse_numa_zonelist_order(char *s) | 4863 | static int __parse_numa_zonelist_order(char *s) |
4898 | { | 4864 | { |
4899 | if (*s == 'd' || *s == 'D') { | 4865 | /* |
4900 | user_zonelist_order = ZONELIST_ORDER_DEFAULT; | 4866 | * We used to support different zonelist modes but they turned |
4901 | } else if (*s == 'n' || *s == 'N') { | 4867 | * out to be just not useful. Let's keep the warning in place |
4902 | user_zonelist_order = ZONELIST_ORDER_NODE; | 4868 | * if somebody still uses the cmd line parameter so that we do |
4903 | } else if (*s == 'z' || *s == 'Z') { | 4869 | * not fail it silently |
4904 | user_zonelist_order = ZONELIST_ORDER_ZONE; | 4870 | */ |
4905 | } else { | 4871 | if (!(*s == 'd' || *s == 'D' || *s == 'n' || *s == 'N')) { |
4906 | pr_warn("Ignoring invalid numa_zonelist_order value: %s\n", s); | 4872 | pr_warn("Ignoring unsupported numa_zonelist_order value: %s\n", s); |
4907 | return -EINVAL; | 4873 | return -EINVAL; |
4908 | } | 4874 | } |
4909 | return 0; | 4875 | return 0; |
@@ -4911,19 +4877,15 @@ static int __parse_numa_zonelist_order(char *s) | |||
4911 | 4877 | ||
4912 | static __init int setup_numa_zonelist_order(char *s) | 4878 | static __init int setup_numa_zonelist_order(char *s) |
4913 | { | 4879 | { |
4914 | int ret; | ||
4915 | |||
4916 | if (!s) | 4880 | if (!s) |
4917 | return 0; | 4881 | return 0; |
4918 | 4882 | ||
4919 | ret = __parse_numa_zonelist_order(s); | 4883 | return __parse_numa_zonelist_order(s); |
4920 | if (ret == 0) | ||
4921 | strlcpy(numa_zonelist_order, s, NUMA_ZONELIST_ORDER_LEN); | ||
4922 | |||
4923 | return ret; | ||
4924 | } | 4884 | } |
4925 | early_param("numa_zonelist_order", setup_numa_zonelist_order); | 4885 | early_param("numa_zonelist_order", setup_numa_zonelist_order); |
4926 | 4886 | ||
4887 | char numa_zonelist_order[] = "Node"; | ||
4888 | |||
4927 | /* | 4889 | /* |
4928 | * sysctl handler for numa_zonelist_order | 4890 | * sysctl handler for numa_zonelist_order |
4929 | */ | 4891 | */ |
@@ -4931,42 +4893,17 @@ int numa_zonelist_order_handler(struct ctl_table *table, int write, | |||
4931 | void __user *buffer, size_t *length, | 4893 | void __user *buffer, size_t *length, |
4932 | loff_t *ppos) | 4894 | loff_t *ppos) |
4933 | { | 4895 | { |
4934 | char saved_string[NUMA_ZONELIST_ORDER_LEN]; | 4896 | char *str; |
4935 | int ret; | 4897 | int ret; |
4936 | static DEFINE_MUTEX(zl_order_mutex); | ||
4937 | 4898 | ||
4938 | mutex_lock(&zl_order_mutex); | 4899 | if (!write) |
4939 | if (write) { | 4900 | return proc_dostring(table, write, buffer, length, ppos); |
4940 | if (strlen((char *)table->data) >= NUMA_ZONELIST_ORDER_LEN) { | 4901 | str = memdup_user_nul(buffer, 16); |
4941 | ret = -EINVAL; | 4902 | if (IS_ERR(str)) |
4942 | goto out; | 4903 | return PTR_ERR(str); |
4943 | } | ||
4944 | strcpy(saved_string, (char *)table->data); | ||
4945 | } | ||
4946 | ret = proc_dostring(table, write, buffer, length, ppos); | ||
4947 | if (ret) | ||
4948 | goto out; | ||
4949 | if (write) { | ||
4950 | int oldval = user_zonelist_order; | ||
4951 | 4904 | ||
4952 | ret = __parse_numa_zonelist_order((char *)table->data); | 4905 | ret = __parse_numa_zonelist_order(str); |
4953 | if (ret) { | 4906 | kfree(str); |
4954 | /* | ||
4955 | * bogus value. restore saved string | ||
4956 | */ | ||
4957 | strncpy((char *)table->data, saved_string, | ||
4958 | NUMA_ZONELIST_ORDER_LEN); | ||
4959 | user_zonelist_order = oldval; | ||
4960 | } else if (oldval != user_zonelist_order) { | ||
4961 | mem_hotplug_begin(); | ||
4962 | mutex_lock(&zonelists_mutex); | ||
4963 | build_all_zonelists(NULL, NULL); | ||
4964 | mutex_unlock(&zonelists_mutex); | ||
4965 | mem_hotplug_done(); | ||
4966 | } | ||
4967 | } | ||
4968 | out: | ||
4969 | mutex_unlock(&zl_order_mutex); | ||
4970 | return ret; | 4907 | return ret; |
4971 | } | 4908 | } |
4972 | 4909 | ||
@@ -5075,70 +5012,12 @@ static void build_thisnode_zonelists(pg_data_t *pgdat) | |||
5075 | */ | 5012 | */ |
5076 | static int node_order[MAX_NUMNODES]; | 5013 | static int node_order[MAX_NUMNODES]; |
5077 | 5014 | ||
5078 | static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes) | ||
5079 | { | ||
5080 | int pos, j, node; | ||
5081 | int zone_type; /* needs to be signed */ | ||
5082 | struct zone *z; | ||
5083 | struct zonelist *zonelist; | ||
5084 | |||
5085 | zonelist = &pgdat->node_zonelists[ZONELIST_FALLBACK]; | ||
5086 | pos = 0; | ||
5087 | for (zone_type = MAX_NR_ZONES - 1; zone_type >= 0; zone_type--) { | ||
5088 | for (j = 0; j < nr_nodes; j++) { | ||
5089 | node = node_order[j]; | ||
5090 | z = &NODE_DATA(node)->node_zones[zone_type]; | ||
5091 | if (managed_zone(z)) { | ||
5092 | zoneref_set_zone(z, | ||
5093 | &zonelist->_zonerefs[pos++]); | ||
5094 | check_highest_zone(zone_type); | ||
5095 | } | ||
5096 | } | ||
5097 | } | ||
5098 | zonelist->_zonerefs[pos].zone = NULL; | ||
5099 | zonelist->_zonerefs[pos].zone_idx = 0; | ||
5100 | } | ||
5101 | |||
5102 | #if defined(CONFIG_64BIT) | ||
5103 | /* | ||
5104 | * Devices that require DMA32/DMA are relatively rare and do not justify a | ||
5105 | * penalty to every machine in case the specialised case applies. Default | ||
5106 | * to Node-ordering on 64-bit NUMA machines | ||
5107 | */ | ||
5108 | static int default_zonelist_order(void) | ||
5109 | { | ||
5110 | return ZONELIST_ORDER_NODE; | ||
5111 | } | ||
5112 | #else | ||
5113 | /* | ||
5114 | * On 32-bit, the Normal zone needs to be preserved for allocations accessible | ||
5115 | * by the kernel. If processes running on node 0 deplete the low memory zone | ||
5116 | * then reclaim will occur more frequently, increasing stalls and potentially | ||
5117 | * be easier to OOM if a large percentage of the zone is under writeback or | ||
5118 | * dirty. The problem is significantly worse if CONFIG_HIGHPTE is not set. | ||
5119 | * Hence, default to zone ordering on 32-bit. | ||
5120 | */ | ||
5121 | static int default_zonelist_order(void) | ||
5122 | { | ||
5123 | return ZONELIST_ORDER_ZONE; | ||
5124 | } | ||
5125 | #endif /* CONFIG_64BIT */ | ||
5126 | |||
5127 | static void set_zonelist_order(void) | ||
5128 | { | ||
5129 | if (user_zonelist_order == ZONELIST_ORDER_DEFAULT) | ||
5130 | current_zonelist_order = default_zonelist_order(); | ||
5131 | else | ||
5132 | current_zonelist_order = user_zonelist_order; | ||
5133 | } | ||
5134 | |||
5135 | static void build_zonelists(pg_data_t *pgdat) | 5015 | static void build_zonelists(pg_data_t *pgdat) |
5136 | { | 5016 | { |
5137 | int i, node, load; | 5017 | int i, node, load; |
5138 | nodemask_t used_mask; | 5018 | nodemask_t used_mask; |
5139 | int local_node, prev_node; | 5019 | int local_node, prev_node; |
5140 | struct zonelist *zonelist; | 5020 | struct zonelist *zonelist; |
5141 | unsigned int order = current_zonelist_order; | ||
5142 | 5021 | ||
5143 | /* initialize zonelists */ | 5022 | /* initialize zonelists */ |
5144 | for (i = 0; i < MAX_ZONELISTS; i++) { | 5023 | for (i = 0; i < MAX_ZONELISTS; i++) { |
@@ -5168,15 +5047,7 @@ static void build_zonelists(pg_data_t *pgdat) | |||
5168 | 5047 | ||
5169 | prev_node = node; | 5048 | prev_node = node; |
5170 | load--; | 5049 | load--; |
5171 | if (order == ZONELIST_ORDER_NODE) | 5050 | build_zonelists_in_node_order(pgdat, node); |
5172 | build_zonelists_in_node_order(pgdat, node); | ||
5173 | else | ||
5174 | node_order[i++] = node; /* remember order */ | ||
5175 | } | ||
5176 | |||
5177 | if (order == ZONELIST_ORDER_ZONE) { | ||
5178 | /* calculate node order -- i.e., DMA last! */ | ||
5179 | build_zonelists_in_zone_order(pgdat, i); | ||
5180 | } | 5051 | } |
5181 | 5052 | ||
5182 | build_thisnode_zonelists(pgdat); | 5053 | build_thisnode_zonelists(pgdat); |
@@ -5204,11 +5075,6 @@ static void setup_min_unmapped_ratio(void); | |||
5204 | static void setup_min_slab_ratio(void); | 5075 | static void setup_min_slab_ratio(void); |
5205 | #else /* CONFIG_NUMA */ | 5076 | #else /* CONFIG_NUMA */ |
5206 | 5077 | ||
5207 | static void set_zonelist_order(void) | ||
5208 | { | ||
5209 | current_zonelist_order = ZONELIST_ORDER_ZONE; | ||
5210 | } | ||
5211 | |||
5212 | static void build_zonelists(pg_data_t *pgdat) | 5078 | static void build_zonelists(pg_data_t *pgdat) |
5213 | { | 5079 | { |
5214 | int node, local_node; | 5080 | int node, local_node; |
@@ -5348,8 +5214,6 @@ build_all_zonelists_init(void) | |||
5348 | */ | 5214 | */ |
5349 | void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone) | 5215 | void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone) |
5350 | { | 5216 | { |
5351 | set_zonelist_order(); | ||
5352 | |||
5353 | if (system_state == SYSTEM_BOOTING) { | 5217 | if (system_state == SYSTEM_BOOTING) { |
5354 | build_all_zonelists_init(); | 5218 | build_all_zonelists_init(); |
5355 | } else { | 5219 | } else { |
@@ -5375,9 +5239,8 @@ void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone) | |||
5375 | else | 5239 | else |
5376 | page_group_by_mobility_disabled = 0; | 5240 | page_group_by_mobility_disabled = 0; |
5377 | 5241 | ||
5378 | pr_info("Built %i zonelists in %s order, mobility grouping %s. Total pages: %ld\n", | 5242 | pr_info("Built %i zonelists, mobility grouping %s. Total pages: %ld\n", |
5379 | nr_online_nodes, | 5243 | nr_online_nodes, |
5380 | zonelist_order_name[current_zonelist_order], | ||
5381 | page_group_by_mobility_disabled ? "off" : "on", | 5244 | page_group_by_mobility_disabled ? "off" : "on", |
5382 | vm_total_pages); | 5245 | vm_total_pages); |
5383 | #ifdef CONFIG_NUMA | 5246 | #ifdef CONFIG_NUMA |