summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorMichal Hocko <mhocko@suse.com>2017-09-06 19:20:30 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2017-09-06 20:27:26 -0400
commit9d3be21bf9c0b849a13e6b51e9c2ce7ccdf50851 (patch)
tree1897d5fc7888af09361c82b5b7ab040124f97c54
parent34ad1296571f7a004a761e3afc18e79428a726a8 (diff)
mm, page_alloc: simplify zonelist initialization
build_zonelists gradually builds zonelists from the nearest to the most distant node. As we do not know how many populated zones we will have in each node, we rely on the _zoneref to terminate the initialized part of the zonelist by a NULL zone. While this is functionally correct, it is quite suboptimal because we cannot allow updaters to race with zonelist users because they could see an empty zonelist and fail the allocation or hit the OOM killer in the worst case. We can do much better, though. We can store the node ordering into an already existing node_order array and then give this array to build_zonelists_in_node_order and do the whole initialization at once. Zonelist consumers still might see a halfway initialized state, but that should be much more tolerable because the list will not be empty and they would either see some zone twice or skip over some zone(s) in the worst case, which shouldn't lead to immediate failures. While at it, let's simplify build_zonelists_node, which is rather confusing now. It gets an index into the zoneref array and returns the updated index for the next iteration. Let's rename the function to build_zonerefs_node to better reflect its purpose and give it the zoneref array to update. The function doesn't use the index anymore. It just returns the number of added zones so that the caller can advance the zoneref array start for the next update. This patch alone doesn't introduce any functional change yet, though; it is merely preparatory work for later changes. Link: http://lkml.kernel.org/r/20170721143915.14161-7-mhocko@kernel.org Signed-off-by: Michal Hocko <mhocko@suse.com> Acked-by: Vlastimil Babka <vbabka@suse.cz> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: Joonsoo Kim <js1304@gmail.com> Cc: Mel Gorman <mgorman@suse.de> Cc: Shaohua Li <shaohua.li@intel.com> Cc: Toshi Kani <toshi.kani@hpe.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r--mm/page_alloc.c81
1 files changed, 41 insertions, 40 deletions
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 2523d5b3b22f..36a2f18c5e0a 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4839,18 +4839,17 @@ static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref)
4839 * 4839 *
4840 * Add all populated zones of a node to the zonelist. 4840 * Add all populated zones of a node to the zonelist.
4841 */ 4841 */
4842static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, 4842static int build_zonerefs_node(pg_data_t *pgdat, struct zoneref *zonerefs)
4843 int nr_zones)
4844{ 4843{
4845 struct zone *zone; 4844 struct zone *zone;
4846 enum zone_type zone_type = MAX_NR_ZONES; 4845 enum zone_type zone_type = MAX_NR_ZONES;
4846 int nr_zones = 0;
4847 4847
4848 do { 4848 do {
4849 zone_type--; 4849 zone_type--;
4850 zone = pgdat->node_zones + zone_type; 4850 zone = pgdat->node_zones + zone_type;
4851 if (managed_zone(zone)) { 4851 if (managed_zone(zone)) {
4852 zoneref_set_zone(zone, 4852 zoneref_set_zone(zone, &zonerefs[nr_zones++]);
4853 &zonelist->_zonerefs[nr_zones++]);
4854 check_highest_zone(zone_type); 4853 check_highest_zone(zone_type);
4855 } 4854 }
4856 } while (zone_type); 4855 } while (zone_type);
@@ -4977,17 +4976,24 @@ static int find_next_best_node(int node, nodemask_t *used_node_mask)
4977 * This results in maximum locality--normal zone overflows into local 4976 * This results in maximum locality--normal zone overflows into local
4978 * DMA zone, if any--but risks exhausting DMA zone. 4977 * DMA zone, if any--but risks exhausting DMA zone.
4979 */ 4978 */
4980static void build_zonelists_in_node_order(pg_data_t *pgdat, int node) 4979static void build_zonelists_in_node_order(pg_data_t *pgdat, int *node_order,
4980 unsigned nr_nodes)
4981{ 4981{
4982 int j; 4982 struct zoneref *zonerefs;
4983 struct zonelist *zonelist; 4983 int i;
4984
4985 zonerefs = pgdat->node_zonelists[ZONELIST_FALLBACK]._zonerefs;
4986
4987 for (i = 0; i < nr_nodes; i++) {
4988 int nr_zones;
4984 4989
4985 zonelist = &pgdat->node_zonelists[ZONELIST_FALLBACK]; 4990 pg_data_t *node = NODE_DATA(node_order[i]);
4986 for (j = 0; zonelist->_zonerefs[j].zone != NULL; j++) 4991
4987 ; 4992 nr_zones = build_zonerefs_node(node, zonerefs);
4988 j = build_zonelists_node(NODE_DATA(node), zonelist, j); 4993 zonerefs += nr_zones;
4989 zonelist->_zonerefs[j].zone = NULL; 4994 }
4990 zonelist->_zonerefs[j].zone_idx = 0; 4995 zonerefs->zone = NULL;
4996 zonerefs->zone_idx = 0;
4991} 4997}
4992 4998
4993/* 4999/*
@@ -4995,13 +5001,14 @@ static void build_zonelists_in_node_order(pg_data_t *pgdat, int node)
4995 */ 5001 */
4996static void build_thisnode_zonelists(pg_data_t *pgdat) 5002static void build_thisnode_zonelists(pg_data_t *pgdat)
4997{ 5003{
4998 int j; 5004 struct zoneref *zonerefs;
4999 struct zonelist *zonelist; 5005 int nr_zones;
5000 5006
5001 zonelist = &pgdat->node_zonelists[ZONELIST_NOFALLBACK]; 5007 zonerefs = pgdat->node_zonelists[ZONELIST_NOFALLBACK]._zonerefs;
5002 j = build_zonelists_node(pgdat, zonelist, 0); 5008 nr_zones = build_zonerefs_node(pgdat, zonerefs);
5003 zonelist->_zonerefs[j].zone = NULL; 5009 zonerefs += nr_zones;
5004 zonelist->_zonerefs[j].zone_idx = 0; 5010 zonerefs->zone = NULL;
5011 zonerefs->zone_idx = 0;
5005} 5012}
5006 5013
5007/* 5014/*
@@ -5010,21 +5017,13 @@ static void build_thisnode_zonelists(pg_data_t *pgdat)
5010 * exhausted, but results in overflowing to remote node while memory 5017 * exhausted, but results in overflowing to remote node while memory
5011 * may still exist in local DMA zone. 5018 * may still exist in local DMA zone.
5012 */ 5019 */
5013static int node_order[MAX_NUMNODES];
5014 5020
5015static void build_zonelists(pg_data_t *pgdat) 5021static void build_zonelists(pg_data_t *pgdat)
5016{ 5022{
5017 int i, node, load; 5023 static int node_order[MAX_NUMNODES];
5024 int node, load, nr_nodes = 0;
5018 nodemask_t used_mask; 5025 nodemask_t used_mask;
5019 int local_node, prev_node; 5026 int local_node, prev_node;
5020 struct zonelist *zonelist;
5021
5022 /* initialize zonelists */
5023 for (i = 0; i < MAX_ZONELISTS; i++) {
5024 zonelist = pgdat->node_zonelists + i;
5025 zonelist->_zonerefs[0].zone = NULL;
5026 zonelist->_zonerefs[0].zone_idx = 0;
5027 }
5028 5027
5029 /* NUMA-aware ordering of nodes */ 5028 /* NUMA-aware ordering of nodes */
5030 local_node = pgdat->node_id; 5029 local_node = pgdat->node_id;
@@ -5033,8 +5032,6 @@ static void build_zonelists(pg_data_t *pgdat)
5033 nodes_clear(used_mask); 5032 nodes_clear(used_mask);
5034 5033
5035 memset(node_order, 0, sizeof(node_order)); 5034 memset(node_order, 0, sizeof(node_order));
5036 i = 0;
5037
5038 while ((node = find_next_best_node(local_node, &used_mask)) >= 0) { 5035 while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
5039 /* 5036 /*
5040 * We don't want to pressure a particular node. 5037 * We don't want to pressure a particular node.
@@ -5045,11 +5042,12 @@ static void build_zonelists(pg_data_t *pgdat)
5045 node_distance(local_node, prev_node)) 5042 node_distance(local_node, prev_node))
5046 node_load[node] = load; 5043 node_load[node] = load;
5047 5044
5045 node_order[nr_nodes++] = node;
5048 prev_node = node; 5046 prev_node = node;
5049 load--; 5047 load--;
5050 build_zonelists_in_node_order(pgdat, node);
5051 } 5048 }
5052 5049
5050 build_zonelists_in_node_order(pgdat, node_order, nr_nodes);
5053 build_thisnode_zonelists(pgdat); 5051 build_thisnode_zonelists(pgdat);
5054} 5052}
5055 5053
@@ -5078,13 +5076,14 @@ static void setup_min_slab_ratio(void);
5078static void build_zonelists(pg_data_t *pgdat) 5076static void build_zonelists(pg_data_t *pgdat)
5079{ 5077{
5080 int node, local_node; 5078 int node, local_node;
5081 enum zone_type j; 5079 struct zoneref *zonerefs;
5082 struct zonelist *zonelist; 5080 int nr_zones;
5083 5081
5084 local_node = pgdat->node_id; 5082 local_node = pgdat->node_id;
5085 5083
5086 zonelist = &pgdat->node_zonelists[ZONELIST_FALLBACK]; 5084 zonerefs = pgdat->node_zonelists[ZONELIST_FALLBACK]._zonerefs;
5087 j = build_zonelists_node(pgdat, zonelist, 0); 5085 nr_zones = build_zonerefs_node(pgdat, zonerefs);
5086 zonerefs += nr_zones;
5088 5087
5089 /* 5088 /*
5090 * Now we build the zonelist so that it contains the zones 5089 * Now we build the zonelist so that it contains the zones
@@ -5097,16 +5096,18 @@ static void build_zonelists(pg_data_t *pgdat)
5097 for (node = local_node + 1; node < MAX_NUMNODES; node++) { 5096 for (node = local_node + 1; node < MAX_NUMNODES; node++) {
5098 if (!node_online(node)) 5097 if (!node_online(node))
5099 continue; 5098 continue;
5100 j = build_zonelists_node(NODE_DATA(node), zonelist, j); 5099 nr_zones = build_zonerefs_node(NODE_DATA(node), zonerefs);
5100 zonerefs += nr_zones;
5101 } 5101 }
5102 for (node = 0; node < local_node; node++) { 5102 for (node = 0; node < local_node; node++) {
5103 if (!node_online(node)) 5103 if (!node_online(node))
5104 continue; 5104 continue;
5105 j = build_zonelists_node(NODE_DATA(node), zonelist, j); 5105 nr_zones = build_zonerefs_node(NODE_DATA(node), zonerefs);
5106 zonerefs += nr_zones;
5106 } 5107 }
5107 5108
5108 zonelist->_zonerefs[j].zone = NULL; 5109 zonerefs->zone = NULL;
5109 zonelist->_zonerefs[j].zone_idx = 0; 5110 zonerefs->zone_idx = 0;
5110} 5111}
5111 5112
5112#endif /* CONFIG_NUMA */ 5113#endif /* CONFIG_NUMA */