-rw-r--r--   include/linux/cpuset.h |   2
-rw-r--r--   include/linux/mmzone.h |  85
-rw-r--r--   mm/mempolicy.c         |   2
-rw-r--r--   mm/page_alloc.c        | 188
4 files changed, 265 insertions, 12 deletions
diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index 4d8adf663681..748d2c996631 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -23,6 +23,7 @@ extern void cpuset_fork(struct task_struct *p);
 extern void cpuset_exit(struct task_struct *p);
 extern cpumask_t cpuset_cpus_allowed(struct task_struct *p);
 extern nodemask_t cpuset_mems_allowed(struct task_struct *p);
+#define cpuset_current_mems_allowed (current->mems_allowed)
 void cpuset_init_current_mems_allowed(void);
 void cpuset_update_task_memory_state(void);
 #define cpuset_nodes_subset_current_mems_allowed(nodes) \
@@ -83,6 +84,7 @@ static inline nodemask_t cpuset_mems_allowed(struct task_struct *p)
 	return node_possible_map;
 }
 
+#define cpuset_current_mems_allowed (node_online_map)
 static inline void cpuset_init_current_mems_allowed(void) {}
 static inline void cpuset_update_task_memory_state(void) {}
 #define cpuset_nodes_subset_current_mems_allowed(nodes) (1)
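
The point of defining the macro in both branches is that generic code can name the caller's allowed-node mask without caring whether cpusets are configured: with CONFIG_CPUSETS it expands to current->mems_allowed, without it to node_online_map. A rough userspace sketch of the pattern (illustrative only; the types, masks and task object below are simplified stand-ins, not kernel code):

#include <stdio.h>

#define CONFIG_CPUSETS 1		/* comment out to exercise the other branch */

typedef unsigned long nodemask_t;

static nodemask_t node_online_map = 0x3;	/* nodes 0 and 1 online */

struct task { nodemask_t mems_allowed; };
static struct task current_task = { .mems_allowed = 0x1 };	/* node 0 only */

#ifdef CONFIG_CPUSETS
#define cpuset_current_mems_allowed (current_task.mems_allowed)
#else
#define cpuset_current_mems_allowed (node_online_map)
#endif

static int node_isset(int nid, nodemask_t mask)
{
	return (mask >> nid) & 1;
}

int main(void)
{
	/* same macro name, different mask depending on configuration */
	printf("node 1 online:  %d\n", node_isset(1, node_online_map));
	printf("node 1 allowed: %d\n", node_isset(1, cpuset_current_mems_allowed));
	return 0;
}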
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index e06683e2bea3..09bf9d8d7b72 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -288,19 +288,94 @@ struct zone {
  */
 #define DEF_PRIORITY 12
 
+/* Maximum number of zones on a zonelist */
+#define MAX_ZONES_PER_ZONELIST (MAX_NUMNODES * MAX_NR_ZONES)
+
+#ifdef CONFIG_NUMA
+/*
+ * We cache key information from each zonelist for smaller cache
+ * footprint when scanning for free pages in get_page_from_freelist().
+ *
+ * 1) The BITMAP fullzones tracks which zones in a zonelist have come
+ *    up short of free memory since the last time (last_fullzone_zap)
+ *    we zero'd fullzones.
+ * 2) The array z_to_n[] maps each zone in the zonelist to its node
+ *    id, so that we can efficiently evaluate whether that node is
+ *    set in the current task's mems_allowed.
+ *
+ * Both fullzones and z_to_n[] are one-to-one with the zonelist,
+ * indexed by a zone's offset in the zonelist zones[] array.
+ *
+ * The get_page_from_freelist() routine does two scans.  During the
+ * first scan, we skip zones whose corresponding bit in 'fullzones'
+ * is set or whose corresponding node in current->mems_allowed (which
+ * comes from cpusets) is not set.  During the second scan, we bypass
+ * this zonelist_cache, to ensure we look methodically at each zone.
+ *
+ * Once per second, we zero out (zap) fullzones, forcing us to
+ * reconsider nodes that might have regained more free memory.
+ * The field last_full_zap is the time we last zapped fullzones.
+ *
+ * This mechanism reduces the amount of time we waste repeatedly
+ * re-examining zones for free memory when they came up low on
+ * memory only a moment ago.
+ *
+ * The zonelist_cache struct members logically belong in struct
+ * zonelist.  However, the mempolicy zonelists constructed for
+ * MPOL_BIND are intentionally variable length (and usually much
+ * shorter).  A general purpose mechanism for handling structs with
+ * multiple variable length members is more mechanism than we want
+ * here.  We resort to some special case hackery instead.
+ *
+ * The MPOL_BIND zonelists don't need this zonelist_cache (in good
+ * part because they are shorter), so we put the fixed length stuff
+ * at the front of the zonelist struct, ending in a variable length
+ * zones[], as is needed by MPOL_BIND.
+ *
+ * Then we put the optional zonelist cache on the end of the zonelist
+ * struct.  This optional stuff is found by a 'zlcache_ptr' pointer in
+ * the fixed length portion at the front of the struct.  This pointer
+ * both enables us to find the zonelist cache, and, in the case of
+ * MPOL_BIND zonelists (which just set zlcache_ptr to NULL), to know
+ * that the zonelist cache is not there.
+ *
+ * The end result is that struct zonelists come in two flavors:
+ *  1) The full, fixed length version, shown below, and
+ *  2) The custom zonelists for MPOL_BIND.
+ * The custom MPOL_BIND zonelists have a NULL zlcache_ptr and no zlcache.
+ *
+ * Even though there may be multiple CPU cores on a node modifying
+ * fullzones or last_full_zap in the same zonelist_cache at the same
+ * time, we don't lock it.  This is just hint data - if it is wrong now
+ * and then, the allocator will still function, perhaps a bit slower.
+ */
+
+
+struct zonelist_cache {
+	DECLARE_BITMAP(fullzones, MAX_ZONES_PER_ZONELIST);	/* zone full? */
+	unsigned short z_to_n[MAX_ZONES_PER_ZONELIST];		/* zone->nid */
+	unsigned long last_full_zap;		/* when last zap'd (jiffies) */
+};
+#else
+struct zonelist_cache;
+#endif
+
 /*
  * One allocation request operates on a zonelist. A zonelist
  * is a list of zones, the first one is the 'goal' of the
  * allocation, the other zones are fallback zones, in decreasing
  * priority.
  *
- * Right now a zonelist takes up less than a cacheline. We never
- * modify it apart from boot-up, and only a few indices are used,
- * so despite the zonelist table being relatively big, the cache
- * footprint of this construct is very small.
+ * If zlcache_ptr is not NULL, then it is just the address of zlcache,
+ * as explained above.  If zlcache_ptr is NULL, there is no zlcache.
  */
 
 struct zonelist {
-	struct zone *zones[MAX_NUMNODES * MAX_NR_ZONES + 1]; // NULL delimited
+	struct zonelist_cache *zlcache_ptr;		     // NULL or &zlcache
+	struct zone *zones[MAX_ZONES_PER_ZONELIST + 1];      // NULL delimited
+#ifdef CONFIG_NUMA
+	struct zonelist_cache zlcache;			     // optional ...
+#endif
 };
 
 #ifdef CONFIG_ARCH_POPULATES_NODE_MAP
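
The key structural idea above is that fullzones and z_to_n[] are parallel to zones[]: the offset of a zone pointer within zones[] is also its index into both cache members. A self-contained userspace sketch of that indexing (plain arrays stand in for the kernel's bitmap and nodemask types; the names and sizes are illustrative, not from the patch):

#include <stdio.h>

#define NZONES 8

struct zone { int nid; };

struct toy_zonelist {
	struct zone *zones[NZONES + 1];		/* NULL delimited */
	unsigned char fullzones[NZONES];	/* stand-in for the bitmap */
	unsigned short z_to_n[NZONES];		/* zone index -> node id */
};

/* Is the zone at *z worth trying, given an allowed-node predicate? */
static int worth_trying(struct toy_zonelist *zl, struct zone **z,
			int (*node_allowed)(int nid))
{
	int i = z - zl->zones;		/* zonelist offset doubles as cache index */

	return node_allowed(zl->z_to_n[i]) && !zl->fullzones[i];
}

static int any_node(int nid) { return 1; }

int main(void)
{
	static struct zone za = { .nid = 0 }, zb = { .nid = 1 };
	struct toy_zonelist zl = { .zones = { &za, &zb, NULL } };

	zl.z_to_n[0] = za.nid;
	zl.z_to_n[1] = zb.nid;
	zl.fullzones[1] = 1;		/* pretend zone 1 recently came up full */

	printf("zone 0 worth trying: %d\n", worth_trying(&zl, &zl.zones[0], any_node));
	printf("zone 1 worth trying: %d\n", worth_trying(&zl, &zl.zones[1], any_node));
	return 0;
}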
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 617fb31086ee..fb907236bbd8 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -141,9 +141,11 @@ static struct zonelist *bind_zonelist(nodemask_t *nodes)
 	enum zone_type k;
 
 	max = 1 + MAX_NR_ZONES * nodes_weight(*nodes);
+	max++;			/* space for zlcache_ptr (see mmzone.h) */
 	zl = kmalloc(sizeof(struct zone *) * max, GFP_KERNEL);
 	if (!zl)
 		return NULL;
+	zl->zlcache_ptr = NULL;
 	num = 0;
 	/* First put in the highest zones from all nodes, then all the next
 	   lower zones etc. Avoid empty zones because the memory allocator
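
The max++ works because the MPOL_BIND zonelist is allocated as an array of pointer-sized slots, and the zlcache_ptr header that now sits at the front of struct zonelist is itself one pointer, so one extra slot covers it. A small userspace sketch of that size arithmetic (the struct below is a simplified stand-in, written under the assumption that both members are plain pointers; it is not the kernel definition):

#include <assert.h>
#include <stdlib.h>

struct zone;
struct zonelist_cache;

struct zonelist_hdr {			/* fixed-length front of a zonelist */
	struct zonelist_cache *zlcache_ptr;
	struct zone *zones[];		/* NULL delimited, variable length */
};

int main(void)
{
	int nzones = 3;			/* zones this policy allows */
	size_t max = nzones + 1 /* NULL terminator */ + 1 /* zlcache_ptr slot */;

	/* both members are pointers, so pointer-sized slots suffice */
	assert(sizeof(struct zonelist_cache *) == sizeof(struct zone *));
	assert(sizeof(struct zonelist_hdr) <= sizeof(struct zone *));

	struct zonelist_hdr *zl = malloc(sizeof(struct zone *) * max);
	if (!zl)
		return 1;
	zl->zlcache_ptr = NULL;		/* marks "no zonelist cache" */
	zl->zones[nzones] = NULL;	/* terminator still fits in the allocation */
	free(zl);
	return 0;
}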
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 23bc5bcbdcf9..230771d3c6b6 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -918,6 +918,126 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
 	return 1;
 }
 
+#ifdef CONFIG_NUMA
+/*
+ * zlc_setup - Setup for "zonelist cache".  Uses cached zone data to
+ * skip over zones that are not allowed by the cpuset, or that have
+ * been recently (in last second) found to be nearly full.  See further
+ * comments in mmzone.h.  Reduces cache footprint of zonelist scans
+ * that have to skip over a lot of full or unallowed zones.
+ *
+ * If the zonelist cache is present in the passed-in zonelist, then
+ * returns a pointer to the allowed node mask (either the current
+ * task's mems_allowed, or node_online_map.)
+ *
+ * If the zonelist cache is not available for this zonelist, does
+ * nothing and returns NULL.
+ *
+ * If the fullzones BITMAP in the zonelist cache is stale (more than
+ * a second since last zap'd) then we zap it out (clear its bits.)
+ *
+ * We hold off even calling zlc_setup, until after we've checked the
+ * first zone in the zonelist, on the theory that most allocations will
+ * be satisfied from that first zone, so best to examine that zone as
+ * quickly as we can.
+ */
+static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
+{
+	struct zonelist_cache *zlc;	/* cached zonelist speedup info */
+	nodemask_t *allowednodes;	/* zonelist_cache approximation */
+
+	zlc = zonelist->zlcache_ptr;
+	if (!zlc)
+		return NULL;
+
+	if (jiffies - zlc->last_full_zap > 1 * HZ) {
+		bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
+		zlc->last_full_zap = jiffies;
+	}
+
+	allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ?
+					&cpuset_current_mems_allowed :
+					&node_online_map;
+	return allowednodes;
+}
+
+/*
+ * Given 'z' scanning a zonelist, run a couple of quick checks to see
+ * if it is worth looking at further for free memory:
+ * 1) Check that the zone isn't thought to be full (doesn't have its
+ *    bit set in the zonelist_cache fullzones BITMAP).
+ * 2) Check that the zone's node (obtained from the zonelist_cache
+ *    z_to_n[] mapping) is allowed in the passed-in allowednodes mask.
+ * Return true (non-zero) if zone is worth looking at further, or
+ * else return false (zero) if it is not.
+ *
+ * This check -ignores- the distinction between various watermarks,
+ * such as GFP_HIGH, GFP_ATOMIC, PF_MEMALLOC, ...  If a zone is
+ * found to be full for any variation of these watermarks, it will
+ * be considered full for up to one second by all requests, unless
+ * we are so low on memory on all allowed nodes that we are forced
+ * into the second scan of the zonelist.
+ *
+ * In the second scan we ignore this zonelist cache and exactly
+ * apply the watermarks to all zones, even if it is slower to do so.
+ * We are low on memory in the second scan, and should leave no stone
+ * unturned looking for a free page.
+ */
+static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zone **z,
+						nodemask_t *allowednodes)
+{
+	struct zonelist_cache *zlc;	/* cached zonelist speedup info */
+	int i;				/* index of *z in zonelist zones */
+	int n;				/* node that zone *z is on */
+
+	zlc = zonelist->zlcache_ptr;
+	if (!zlc)
+		return 1;
+
+	i = z - zonelist->zones;
+	n = zlc->z_to_n[i];
+
+	/* This zone is worth trying if it is allowed but not full */
+	return node_isset(n, *allowednodes) && !test_bit(i, zlc->fullzones);
+}
+
+/*
+ * Given 'z' scanning a zonelist, set the corresponding bit in
+ * zlc->fullzones, so that subsequent attempts to allocate a page
+ * from that zone don't waste time re-examining it.
+ */
+static void zlc_mark_zone_full(struct zonelist *zonelist, struct zone **z)
+{
+	struct zonelist_cache *zlc;	/* cached zonelist speedup info */
+	int i;				/* index of *z in zonelist zones */
+
+	zlc = zonelist->zlcache_ptr;
+	if (!zlc)
+		return;
+
+	i = z - zonelist->zones;
+
+	set_bit(i, zlc->fullzones);
+}
+
+#else	/* CONFIG_NUMA */
+
+static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
+{
+	return NULL;
+}
+
+static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zone **z,
+				nodemask_t *allowednodes)
+{
+	return 1;
+}
+
+static void zlc_mark_zone_full(struct zonelist *zonelist, struct zone **z)
+{
+}
+#endif	/* CONFIG_NUMA */
+
 /*
  * get_page_from_freelist goes through the zonelist trying to allocate
  * a page.
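
One small point worth noting in zlc_setup(): the staleness test "jiffies - zlc->last_full_zap > 1 * HZ" relies on unsigned subtraction, which still yields the elapsed tick count when the jiffies counter wraps around. A tiny userspace illustration of that property (the HZ value and timestamps here are made up for the example; it is not kernel code):

#include <stdio.h>

int main(void)
{
	unsigned long hz = 250;				/* illustrative tick rate */
	unsigned long last_full_zap = (unsigned long)-100;	/* just before wraparound */
	unsigned long jiffies = 200;			/* shortly after wraparound */

	/* unsigned subtraction gives 300 ticks despite the wrap */
	if (jiffies - last_full_zap > 1 * hz)
		printf("stale: %lu ticks since last zap\n", jiffies - last_full_zap);
	return 0;
}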
@@ -926,23 +1046,32 @@ static struct page *
 get_page_from_freelist(gfp_t gfp_mask, unsigned int order,
 		struct zonelist *zonelist, int alloc_flags)
 {
-	struct zone **z = zonelist->zones;
+	struct zone **z;
 	struct page *page = NULL;
-	int classzone_idx = zone_idx(*z);
+	int classzone_idx = zone_idx(zonelist->zones[0]);
 	struct zone *zone;
+	nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */
+	int zlc_active = 0;		/* set if using zonelist_cache */
+	int did_zlc_setup = 0;		/* just call zlc_setup() one time */
 
+zonelist_scan:
 	/*
-	 * Go through the zonelist once, looking for a zone with enough free.
+	 * Scan zonelist, looking for a zone with enough free.
 	 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
 	 */
+	z = zonelist->zones;
+
 	do {
+		if (NUMA_BUILD && zlc_active &&
+			!zlc_zone_worth_trying(zonelist, z, allowednodes))
+				continue;
 		zone = *z;
 		if (unlikely(NUMA_BUILD && (gfp_mask & __GFP_THISNODE) &&
 			zone->zone_pgdat != zonelist->zones[0]->zone_pgdat))
 				break;
 		if ((alloc_flags & ALLOC_CPUSET) &&
 			!cpuset_zone_allowed(zone, gfp_mask))
-				continue;
+				goto try_next_zone;
 
 		if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
 			unsigned long mark;
@@ -956,15 +1085,30 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order,
 					classzone_idx, alloc_flags)) {
 				if (!zone_reclaim_mode ||
 				    !zone_reclaim(zone, gfp_mask, order))
-					continue;
+					goto this_zone_full;
 			}
 		}
 
 		page = buffered_rmqueue(zonelist, zone, order, gfp_mask);
 		if (page)
 			break;
-
+this_zone_full:
+		if (NUMA_BUILD)
+			zlc_mark_zone_full(zonelist, z);
+try_next_zone:
+		if (NUMA_BUILD && !did_zlc_setup) {
+			/* we do zlc_setup after the first zone is tried */
+			allowednodes = zlc_setup(zonelist, alloc_flags);
+			zlc_active = 1;
+			did_zlc_setup = 1;
+		}
 	} while (*(++z) != NULL);
+
+	if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) {
+		/* Disable zlc cache for second zonelist scan */
+		zlc_active = 0;
+		goto zonelist_scan;
+	}
 	return page;
 }
 
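
Taken together, these hunks turn get_page_from_freelist() into a two-pass scan: once zlc_setup() has run, the first pass skips zones the cache says are full or on disallowed nodes, and only if that pass ends without a page does control jump back to zonelist_scan with zlc_active cleared, so the second pass examines every zone the slow way. A self-contained simulation of that shape (userspace C with made-up zone data; it mirrors the control flow only, not the kernel APIs):

#include <stdio.h>

#define NZONES 3

static int zone_has_page[NZONES] = { 0, 0, 1 };	/* only zone 2 can satisfy us */
static int fullzones[NZONES];			/* the "zonelist cache" hint */

static int try_alloc(void)
{
	int zlc_active = 1;			/* first pass: trust the hints */

zonelist_scan:
	for (int i = 0; i < NZONES; i++) {
		if (zlc_active && fullzones[i])
			continue;		/* skip zones cached as full */
		if (!zone_has_page[i]) {
			fullzones[i] = 1;	/* remember this zone came up empty */
			continue;
		}
		return i;			/* "allocated" from zone i */
	}
	if (zlc_active) {
		zlc_active = 0;			/* second pass: ignore the hints */
		goto zonelist_scan;
	}
	return -1;				/* genuinely out of memory */
}

int main(void)
{
	fullzones[2] = 1;	/* stale hint: zone 2 was marked full a moment ago */
	printf("allocated from zone %d\n", try_alloc());
	return 0;
}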
@@ -1535,6 +1679,24 @@ static void __meminit build_zonelists(pg_data_t *pgdat)
 	}
 }
 
+/* Construct the zonelist performance cache - see further mmzone.h */
+static void __meminit build_zonelist_cache(pg_data_t *pgdat)
+{
+	int i;
+
+	for (i = 0; i < MAX_NR_ZONES; i++) {
+		struct zonelist *zonelist;
+		struct zonelist_cache *zlc;
+		struct zone **z;
+
+		zonelist = pgdat->node_zonelists + i;
+		zonelist->zlcache_ptr = zlc = &zonelist->zlcache;
+		bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
+		for (z = zonelist->zones; *z; z++)
+			zlc->z_to_n[z - zonelist->zones] = zone_to_nid(*z);
+	}
+}
+
 #else	/* CONFIG_NUMA */
 
 static void __meminit build_zonelists(pg_data_t *pgdat)
@@ -1572,14 +1734,26 @@ static void __meminit build_zonelists(pg_data_t *pgdat)
 	}
 }
 
+/* non-NUMA variant of zonelist performance cache - just NULL zlcache_ptr */
+static void __meminit build_zonelist_cache(pg_data_t *pgdat)
+{
+	int i;
+
+	for (i = 0; i < MAX_NR_ZONES; i++)
+		pgdat->node_zonelists[i].zlcache_ptr = NULL;
+}
+
 #endif	/* CONFIG_NUMA */
 
 /* return values int ....just for stop_machine_run() */
 static int __meminit __build_all_zonelists(void *dummy)
 {
 	int nid;
-	for_each_online_node(nid)
+
+	for_each_online_node(nid) {
 		build_zonelists(NODE_DATA(nid));
+		build_zonelist_cache(NODE_DATA(nid));
+	}
 	return 0;
 }
 