 include/linux/cpuset.h |   2
 include/linux/mmzone.h |  85
 mm/mempolicy.c         |   2
 mm/page_alloc.c        | 188
 4 files changed, 265 insertions(+), 12 deletions(-)
diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index 4d8adf663681..748d2c996631 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -23,6 +23,7 @@ extern void cpuset_fork(struct task_struct *p);
 extern void cpuset_exit(struct task_struct *p);
 extern cpumask_t cpuset_cpus_allowed(struct task_struct *p);
 extern nodemask_t cpuset_mems_allowed(struct task_struct *p);
+#define cpuset_current_mems_allowed (current->mems_allowed)
 void cpuset_init_current_mems_allowed(void);
 void cpuset_update_task_memory_state(void);
 #define cpuset_nodes_subset_current_mems_allowed(nodes) \
@@ -83,6 +84,7 @@ static inline nodemask_t cpuset_mems_allowed(struct task_struct *p)
 	return node_possible_map;
 }
 
+#define cpuset_current_mems_allowed (node_online_map)
 static inline void cpuset_init_current_mems_allowed(void) {}
 static inline void cpuset_update_task_memory_state(void) {}
 #define cpuset_nodes_subset_current_mems_allowed(nodes) (1)
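A quick usage sketch (illustrative only; the helper name is made up and is not part of the patch): both expansions of the new macro are nodemask_t lvalues, so a caller can take its address, which is exactly what the zlc_setup() routine added to mm/page_alloc.c below relies on.

/* Hypothetical helper, for illustration only. */
static nodemask_t *allowed_nodes_for_alloc(void)
{
	/*
	 * With CONFIG_CPUSETS this is &current->mems_allowed; without
	 * cpusets it degenerates to &node_online_map.  Either way it is
	 * an addressable nodemask_t, so the pointer form works in both
	 * configurations.
	 */
	return &cpuset_current_mems_allowed;
}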
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index e06683e2bea3..09bf9d8d7b72 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -288,19 +288,94 @@ struct zone {
  */
 #define DEF_PRIORITY 12
 
+/* Maximum number of zones on a zonelist */
+#define MAX_ZONES_PER_ZONELIST (MAX_NUMNODES * MAX_NR_ZONES)
+
+#ifdef CONFIG_NUMA
+/*
+ * We cache key information from each zonelist for smaller cache
+ * footprint when scanning for free pages in get_page_from_freelist().
+ *
+ * 1) The BITMAP fullzones tracks which zones in a zonelist have come
+ *    up short of free memory since the last time (last_full_zap)
+ *    we zero'd fullzones.
+ * 2) The array z_to_n[] maps each zone in the zonelist to its node
+ *    id, so that we can efficiently evaluate whether that node is
+ *    set in the current task's mems_allowed.
+ *
+ * Both fullzones and z_to_n[] are one-to-one with the zonelist,
+ * indexed by a zone's offset in the zonelist zones[] array.
+ *
+ * The get_page_from_freelist() routine does two scans.  During the
+ * first scan, we skip zones whose corresponding bit in 'fullzones'
+ * is set or whose corresponding node in current->mems_allowed (which
+ * comes from cpusets) is not set.  During the second scan, we bypass
+ * this zonelist_cache, to ensure we look methodically at each zone.
+ *
+ * Once per second, we zero out (zap) fullzones, forcing us to
+ * reconsider nodes that might have regained more free memory.
+ * The field last_full_zap is the time we last zapped fullzones.
+ *
+ * This mechanism reduces the amount of time we waste repeatedly
+ * re-examining zones for free memory when they just came up low on
+ * memory a moment ago.
+ *
+ * The zonelist_cache struct members logically belong in struct
+ * zonelist.  However, the mempolicy zonelists constructed for
+ * MPOL_BIND are intentionally variable length (and usually much
+ * shorter).  A general purpose mechanism for handling structs with
+ * multiple variable length members is more mechanism than we want
+ * here.  We resort to some special case hackery instead.
+ *
+ * The MPOL_BIND zonelists don't need this zonelist_cache (in good
+ * part because they are shorter), so we put the fixed length stuff
+ * at the front of the zonelist struct, ending in a variable length
+ * zones[], as is needed by MPOL_BIND.
+ *
+ * Then we put the optional zonelist cache on the end of the zonelist
+ * struct.  This optional stuff is found by a 'zlcache_ptr' pointer in
+ * the fixed length portion at the front of the struct.  This pointer
+ * both enables us to find the zonelist cache, and in the case of
+ * MPOL_BIND zonelists (which simply set zlcache_ptr to NULL), lets us
+ * know that the zonelist cache is not there.
+ *
+ * The end result is that struct zonelists come in two flavors:
+ * 1) The full, fixed length version, shown below, and
+ * 2) The custom zonelists for MPOL_BIND.
+ * The custom MPOL_BIND zonelists have a NULL zlcache_ptr and no zlcache.
+ *
+ * Even though there may be multiple CPU cores on a node modifying
+ * fullzones or last_full_zap in the same zonelist_cache at the same
+ * time, we don't lock it.  This is just hint data - if it is wrong now
+ * and then, the allocator will still function, perhaps a bit slower.
+ */
+
+
+struct zonelist_cache {
+	DECLARE_BITMAP(fullzones, MAX_ZONES_PER_ZONELIST);	/* zone full? */
+	unsigned short z_to_n[MAX_ZONES_PER_ZONELIST];		/* zone->nid */
+	unsigned long last_full_zap;		/* when last zap'd (jiffies) */
+};
+#else
+struct zonelist_cache;
+#endif
+
 /*
  * One allocation request operates on a zonelist. A zonelist
  * is a list of zones, the first one is the 'goal' of the
  * allocation, the other zones are fallback zones, in decreasing
  * priority.
  *
- * Right now a zonelist takes up less than a cacheline. We never
- * modify it apart from boot-up, and only a few indices are used,
- * so despite the zonelist table being relatively big, the cache
- * footprint of this construct is very small.
+ * If zlcache_ptr is not NULL, then it is just the address of zlcache,
+ * as explained above.  If zlcache_ptr is NULL, there is no zlcache.
  */
+
 struct zonelist {
-	struct zone *zones[MAX_NUMNODES * MAX_NR_ZONES + 1]; // NULL delimited
+	struct zonelist_cache *zlcache_ptr;		     // NULL or &zlcache
+	struct zone *zones[MAX_ZONES_PER_ZONELIST + 1];      // NULL delimited
+#ifdef CONFIG_NUMA
+	struct zonelist_cache zlcache;			     // optional ...
+#endif
 };
 
 #ifdef CONFIG_ARCH_POPULATES_NODE_MAP
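To make the "smaller cache footprint" claim concrete, here is a rough worked example. The configuration values are assumptions for illustration, not taken from the patch.

/*
 * Hypothetical configuration: MAX_NUMNODES == 64, MAX_NR_ZONES == 4,
 * so MAX_ZONES_PER_ZONELIST == 256.  A mirror of zonelist_cache then
 * weighs in at roughly:
 */
struct example_zonelist_cache {			/* mirrors zonelist_cache */
	DECLARE_BITMAP(fullzones, 256);		/* 256 bits  =  32 bytes */
	unsigned short z_to_n[256];		/* 256 * 2   = 512 bytes */
	unsigned long last_full_zap;		/* 8 bytes on 64-bit     */
};
/*
 * A first-pass scan that skips full or disallowed zones mostly touches
 * the 32-byte bitmap and the z_to_n[] shorts, instead of chasing up to
 * 256 struct zone pointers (2KB of pointers alone) and the cachelines
 * behind them.
 */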
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 617fb31086ee..fb907236bbd8 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -141,9 +141,11 @@ static struct zonelist *bind_zonelist(nodemask_t *nodes)
 	enum zone_type k;
 
 	max = 1 + MAX_NR_ZONES * nodes_weight(*nodes);
+	max++;			/* space for zlcache_ptr (see mmzone.h) */
 	zl = kmalloc(sizeof(struct zone *) * max, GFP_KERNEL);
 	if (!zl)
 		return NULL;
+	zl->zlcache_ptr = NULL;
 	num = 0;
 	/* First put in the highest zones from all nodes, then all the next
 	   lower zones etc. Avoid empty zones because the memory allocator
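The extra `max++` slot works because zlcache_ptr is the first member of struct zonelist and is itself pointer sized, so `sizeof(struct zone *) * max` covers it along with the NULL-terminated zones[] array. A sketch (hypothetical, not in the patch) of how that layout assumption could be stated at compile time:

static inline void bind_zonelist_layout_check(void)	/* hypothetical */
{
	/*
	 * bind_zonelist() allocates 'max' pointer-sized slots: slot 0 is
	 * consumed by zlcache_ptr, the rest hold the NULL-terminated
	 * zones[] array.
	 */
	BUILD_BUG_ON(offsetof(struct zonelist, zones) != sizeof(struct zone *));
}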
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 23bc5bcbdcf9..230771d3c6b6 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -918,6 +918,126 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
 	return 1;
 }
 
+#ifdef CONFIG_NUMA
+/*
+ * zlc_setup - Setup for "zonelist cache".  Uses cached zone data to
+ * skip over zones that are not allowed by the cpuset, or that have
+ * been recently (in last second) found to be nearly full.  See further
+ * comments in mmzone.h.  Reduces cache footprint of zonelist scans
+ * that have to skip over a lot of full or unallowed zones.
+ *
+ * If the zonelist cache is present in the passed in zonelist, then
+ * returns a pointer to the allowed node mask (either the current
+ * task's mems_allowed, or node_online_map.)
+ *
+ * If the zonelist cache is not available for this zonelist, does
+ * nothing and returns NULL.
+ *
+ * If the fullzones BITMAP in the zonelist cache is stale (more than
+ * a second since last zap'd) then we zap it out (clear its bits.)
+ *
+ * We hold off even calling zlc_setup, until after we've checked the
+ * first zone in the zonelist, on the theory that most allocations will
+ * be satisfied from that first zone, so best to examine that zone as
+ * quickly as we can.
+ */
+static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
+{
+	struct zonelist_cache *zlc;	/* cached zonelist speedup info */
+	nodemask_t *allowednodes;	/* zonelist_cache approximation */
+
+	zlc = zonelist->zlcache_ptr;
+	if (!zlc)
+		return NULL;
+
+	if (jiffies - zlc->last_full_zap > 1 * HZ) {
+		bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
+		zlc->last_full_zap = jiffies;
+	}
+
+	allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ?
+					&cpuset_current_mems_allowed :
+					&node_online_map;
+	return allowednodes;
+}
+
+/*
+ * Given 'z' scanning a zonelist, run a couple of quick checks to see
+ * if it is worth looking at further for free memory:
+ * 1) Check that the zone isn't thought to be full (doesn't have its
+ *    bit set in the zonelist_cache fullzones BITMAP).
+ * 2) Check that the zone's node (obtained from the zonelist_cache
+ *    z_to_n[] mapping) is allowed in the passed in allowednodes mask.
+ * Return true (non-zero) if zone is worth looking at further, or
+ * else return false (zero) if it is not.
+ *
+ * This check -ignores- the distinction between various watermarks,
+ * such as GFP_HIGH, GFP_ATOMIC, PF_MEMALLOC, ...  If a zone is
+ * found to be full for any variation of these watermarks, it will
+ * be considered full for up to one second by all requests, unless
+ * we are so low on memory on all allowed nodes that we are forced
+ * into the second scan of the zonelist.
+ *
+ * In the second scan we ignore this zonelist cache and exactly
+ * apply the watermarks to all zones, even if it is slower to do so.
+ * We are low on memory in the second scan, and should leave no stone
+ * unturned looking for a free page.
+ */
+static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zone **z,
+						nodemask_t *allowednodes)
+{
+	struct zonelist_cache *zlc;	/* cached zonelist speedup info */
+	int i;				/* index of *z in zonelist zones */
+	int n;				/* node that zone *z is on */
+
+	zlc = zonelist->zlcache_ptr;
+	if (!zlc)
+		return 1;
+
+	i = z - zonelist->zones;
+	n = zlc->z_to_n[i];
+
+	/* This zone is worth trying if it is allowed but not full */
+	return node_isset(n, *allowednodes) && !test_bit(i, zlc->fullzones);
+}
+
+/*
+ * Given 'z' scanning a zonelist, set the corresponding bit in
+ * zlc->fullzones, so that subsequent attempts to allocate a page
+ * from that zone don't waste time re-examining it.
+ */
+static void zlc_mark_zone_full(struct zonelist *zonelist, struct zone **z)
+{
+	struct zonelist_cache *zlc;	/* cached zonelist speedup info */
+	int i;				/* index of *z in zonelist zones */
+
+	zlc = zonelist->zlcache_ptr;
+	if (!zlc)
+		return;
+
+	i = z - zonelist->zones;
+
+	set_bit(i, zlc->fullzones);
+}
+
+#else	/* CONFIG_NUMA */
+
+static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
+{
+	return NULL;
+}
+
+static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zone **z,
+				nodemask_t *allowednodes)
+{
+	return 1;
+}
+
+static void zlc_mark_zone_full(struct zonelist *zonelist, struct zone **z)
+{
+}
+#endif	/* CONFIG_NUMA */
+
 /*
  * get_page_from_freelist goes through the zonelist trying to allocate
  * a page.
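One note on the staleness test in zlc_setup(): `jiffies - zlc->last_full_zap > 1 * HZ` deliberately uses unsigned wraparound arithmetic. An equivalent spelling using the jiffies comparison helpers would be (sketch only, not a proposed change to the patch):

	/* zap the fullzones hint bitmap roughly once per second */
	if (time_after(jiffies, zlc->last_full_zap + HZ)) {
		bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
		zlc->last_full_zap = jiffies;
	}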
@@ -926,23 +1046,32 @@ static struct page *
 get_page_from_freelist(gfp_t gfp_mask, unsigned int order,
 		struct zonelist *zonelist, int alloc_flags)
 {
-	struct zone **z = zonelist->zones;
+	struct zone **z;
 	struct page *page = NULL;
-	int classzone_idx = zone_idx(*z);
+	int classzone_idx = zone_idx(zonelist->zones[0]);
 	struct zone *zone;
+	nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */
+	int zlc_active = 0;		/* set if using zonelist_cache */
+	int did_zlc_setup = 0;		/* just call zlc_setup() one time */
 
+zonelist_scan:
 	/*
-	 * Go through the zonelist once, looking for a zone with enough free.
+	 * Scan zonelist, looking for a zone with enough free.
 	 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
 	 */
+	z = zonelist->zones;
+
 	do {
+		if (NUMA_BUILD && zlc_active &&
+			!zlc_zone_worth_trying(zonelist, z, allowednodes))
+				continue;
 		zone = *z;
 		if (unlikely(NUMA_BUILD && (gfp_mask & __GFP_THISNODE) &&
 			zone->zone_pgdat != zonelist->zones[0]->zone_pgdat))
 				break;
 		if ((alloc_flags & ALLOC_CPUSET) &&
 			!cpuset_zone_allowed(zone, gfp_mask))
-				continue;
+				goto try_next_zone;
 
 		if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
 			unsigned long mark;
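NUMA_BUILD here is a compile-time constant (1 on CONFIG_NUMA kernels, 0 otherwise), so the zonelist-cache branches vanish entirely on non-NUMA builds while still being type-checked on every configuration. In essence (sketch of the idea only; this is not a new definition added by the patch):

#ifdef CONFIG_NUMA
#define NUMA_BUILD	1	/* "if (NUMA_BUILD && ...)" stays live      */
#else
#define NUMA_BUILD	0	/* "if (0 && ...)" is optimised away,       */
#endif				/* but the zlc_* calls still compile-check  */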
@@ -956,15 +1085,30 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order,
 				       classzone_idx, alloc_flags)) {
 				if (!zone_reclaim_mode ||
 				    !zone_reclaim(zone, gfp_mask, order))
-					continue;
+					goto this_zone_full;
 			}
 		}
 
 		page = buffered_rmqueue(zonelist, zone, order, gfp_mask);
 		if (page)
 			break;
-
+this_zone_full:
+		if (NUMA_BUILD)
+			zlc_mark_zone_full(zonelist, z);
+try_next_zone:
+		if (NUMA_BUILD && !did_zlc_setup) {
+			/* we do zlc_setup after the first zone is tried */
+			allowednodes = zlc_setup(zonelist, alloc_flags);
+			zlc_active = 1;
+			did_zlc_setup = 1;
+		}
 	} while (*(++z) != NULL);
+
+	if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) {
+		/* Disable zlc cache for second zonelist scan */
+		zlc_active = 0;
+		goto zonelist_scan;
+	}
 	return page;
 }
 
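Putting the labels together, the rewritten get_page_from_freelist() is a two-pass scan. A heavily simplified sketch of that structure (illustrative only; the watermark, __GFP_THISNODE and cpuset checks are folded into a hypothetical try_this_zone() helper):

static struct page *two_pass_scan_sketch(struct zonelist *zl, int alloc_flags)
{
	nodemask_t *allowed = NULL;
	int zlc_active = 0, did_setup = 0;
	struct zone **z;
	struct page *page;

scan:
	for (z = zl->zones; *z; z++) {
		if (zlc_active && !zlc_zone_worth_trying(zl, z, allowed))
			continue;		/* cached: full or not allowed */
		page = try_this_zone(*z, alloc_flags);	/* hypothetical */
		if (page)
			return page;
		zlc_mark_zone_full(zl, z);	/* remember this failure */
		if (!did_setup) {		/* arm the cache after zone 0 */
			allowed = zlc_setup(zl, alloc_flags);
			zlc_active = did_setup = 1;
		}
	}
	if (zlc_active) {	/* nothing found: retry without the hints */
		zlc_active = 0;
		goto scan;
	}
	return NULL;
}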
@@ -1535,6 +1679,24 @@ static void __meminit build_zonelists(pg_data_t *pgdat)
 	}
 }
 
+/* Construct the zonelist performance cache - see further mmzone.h */
+static void __meminit build_zonelist_cache(pg_data_t *pgdat)
+{
+	int i;
+
+	for (i = 0; i < MAX_NR_ZONES; i++) {
+		struct zonelist *zonelist;
+		struct zonelist_cache *zlc;
+		struct zone **z;
+
+		zonelist = pgdat->node_zonelists + i;
+		zonelist->zlcache_ptr = zlc = &zonelist->zlcache;
+		bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
+		for (z = zonelist->zones; *z; z++)
+			zlc->z_to_n[z - zonelist->zones] = zone_to_nid(*z);
+	}
+}
+
 #else	/* CONFIG_NUMA */
 
 static void __meminit build_zonelists(pg_data_t *pgdat)
@@ -1572,14 +1734,26 @@ static void __meminit build_zonelists(pg_data_t *pgdat)
 	}
 }
 
+/* non-NUMA variant of zonelist performance cache - just NULL zlcache_ptr */
+static void __meminit build_zonelist_cache(pg_data_t *pgdat)
+{
+	int i;
+
+	for (i = 0; i < MAX_NR_ZONES; i++)
+		pgdat->node_zonelists[i].zlcache_ptr = NULL;
+}
+
 #endif	/* CONFIG_NUMA */
 
 /* return values int ....just for stop_machine_run() */
 static int __meminit __build_all_zonelists(void *dummy)
 {
 	int nid;
-	for_each_online_node(nid)
+
+	for_each_online_node(nid) {
 		build_zonelists(NODE_DATA(nid));
+		build_zonelist_cache(NODE_DATA(nid));
+	}
 	return 0;
 }
 
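Finally, a concrete (hypothetical) picture of what build_zonelist_cache() produces: take a two-node machine where node 0's zonelist falls back from node 0 to node 1, and each node has ZONE_NORMAL and ZONE_DMA. The GFP_KERNEL zonelist and its cache would then look roughly like this:

/*
 * zones[]   = { node0/Normal, node0/DMA, node1/Normal, node1/DMA, NULL }
 * z_to_n[]  = { 0,            0,         1,            1 }
 * fullzones = all bits clear; last_full_zap = jiffies at build time
 *
 * If an allocation later finds node0/Normal and node0/DMA short of free
 * pages, bits 0 and 1 get set in fullzones; for up to one second,
 * first-pass scans then do their real work starting at node1/Normal,
 * touching only the hint bitmap for the zones they skip.
 */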