aboutsummaryrefslogtreecommitdiffstats
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/mempolicy.c2
-rw-r--r--mm/page_alloc.c188
2 files changed, 183 insertions, 7 deletions
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 617fb31086ee..fb907236bbd8 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -141,9 +141,11 @@ static struct zonelist *bind_zonelist(nodemask_t *nodes)
141 enum zone_type k; 141 enum zone_type k;
142 142
143 max = 1 + MAX_NR_ZONES * nodes_weight(*nodes); 143 max = 1 + MAX_NR_ZONES * nodes_weight(*nodes);
144 max++; /* space for zlcache_ptr (see mmzone.h) */
144 zl = kmalloc(sizeof(struct zone *) * max, GFP_KERNEL); 145 zl = kmalloc(sizeof(struct zone *) * max, GFP_KERNEL);
145 if (!zl) 146 if (!zl)
146 return NULL; 147 return NULL;
148 zl->zlcache_ptr = NULL;
147 num = 0; 149 num = 0;
148 /* First put in the highest zones from all nodes, then all the next 150 /* First put in the highest zones from all nodes, then all the next
149 lower zones etc. Avoid empty zones because the memory allocator 151 lower zones etc. Avoid empty zones because the memory allocator
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 23bc5bcbdcf9..230771d3c6b6 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -918,6 +918,126 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
918 return 1; 918 return 1;
919} 919}
920 920
921#ifdef CONFIG_NUMA
922/*
923 * zlc_setup - Setup for "zonelist cache". Uses cached zone data to
924 * skip over zones that are not allowed by the cpuset, or that have
925 * been recently (in last second) found to be nearly full. See further
926 * comments in mmzone.h. Reduces cache footprint of zonelist scans
 927 * that have to skip over a lot of full or unallowed zones.
928 *
929 * If the zonelist cache is present in the passed in zonelist, then
930 * returns a pointer to the allowed node mask (either the current
931 * tasks mems_allowed, or node_online_map.)
932 *
933 * If the zonelist cache is not available for this zonelist, does
934 * nothing and returns NULL.
935 *
936 * If the fullzones BITMAP in the zonelist cache is stale (more than
937 * a second since last zap'd) then we zap it out (clear its bits.)
938 *
939 * We hold off even calling zlc_setup, until after we've checked the
940 * first zone in the zonelist, on the theory that most allocations will
941 * be satisfied from that first zone, so best to examine that zone as
942 * quickly as we can.
943 */
944static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
945{
946 struct zonelist_cache *zlc; /* cached zonelist speedup info */
947 nodemask_t *allowednodes; /* zonelist_cache approximation */
948
949 zlc = zonelist->zlcache_ptr;
950 if (!zlc)
951 return NULL;
952
953 if (jiffies - zlc->last_full_zap > 1 * HZ) {
954 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
955 zlc->last_full_zap = jiffies;
956 }
957
958 allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ?
959 &cpuset_current_mems_allowed :
960 &node_online_map;
961 return allowednodes;
962}
963
964/*
965 * Given 'z' scanning a zonelist, run a couple of quick checks to see
966 * if it is worth looking at further for free memory:
967 * 1) Check that the zone isn't thought to be full (doesn't have its
968 * bit set in the zonelist_cache fullzones BITMAP).
969 * 2) Check that the zones node (obtained from the zonelist_cache
970 * z_to_n[] mapping) is allowed in the passed in allowednodes mask.
971 * Return true (non-zero) if zone is worth looking at further, or
972 * else return false (zero) if it is not.
973 *
974 * This check -ignores- the distinction between various watermarks,
975 * such as GFP_HIGH, GFP_ATOMIC, PF_MEMALLOC, ... If a zone is
976 * found to be full for any variation of these watermarks, it will
977 * be considered full for up to one second by all requests, unless
978 * we are so low on memory on all allowed nodes that we are forced
979 * into the second scan of the zonelist.
980 *
981 * In the second scan we ignore this zonelist cache and exactly
 982 * apply the watermarks to all zones, even if it is slower to do so.
983 * We are low on memory in the second scan, and should leave no stone
984 * unturned looking for a free page.
985 */
986static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zone **z,
987 nodemask_t *allowednodes)
988{
989 struct zonelist_cache *zlc; /* cached zonelist speedup info */
990 int i; /* index of *z in zonelist zones */
991 int n; /* node that zone *z is on */
992
993 zlc = zonelist->zlcache_ptr;
994 if (!zlc)
995 return 1;
996
997 i = z - zonelist->zones;
998 n = zlc->z_to_n[i];
999
1000 /* This zone is worth trying if it is allowed but not full */
1001 return node_isset(n, *allowednodes) && !test_bit(i, zlc->fullzones);
1002}
1003
1004/*
1005 * Given 'z' scanning a zonelist, set the corresponding bit in
1006 * zlc->fullzones, so that subsequent attempts to allocate a page
1007 * from that zone don't waste time re-examining it.
1008 */
1009static void zlc_mark_zone_full(struct zonelist *zonelist, struct zone **z)
1010{
1011 struct zonelist_cache *zlc; /* cached zonelist speedup info */
1012 int i; /* index of *z in zonelist zones */
1013
1014 zlc = zonelist->zlcache_ptr;
1015 if (!zlc)
1016 return;
1017
1018 i = z - zonelist->zones;
1019
1020 set_bit(i, zlc->fullzones);
1021}
1022
1023#else /* CONFIG_NUMA */
1024
1025static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
1026{
1027 return NULL;
1028}
1029
1030static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zone **z,
1031 nodemask_t *allowednodes)
1032{
1033 return 1;
1034}
1035
1036static void zlc_mark_zone_full(struct zonelist *zonelist, struct zone **z)
1037{
1038}
1039#endif /* CONFIG_NUMA */
1040
921/* 1041/*
922 * get_page_from_freelist goes through the zonelist trying to allocate 1042 * get_page_from_freelist goes through the zonelist trying to allocate
923 * a page. 1043 * a page.
@@ -926,23 +1046,32 @@ static struct page *
926get_page_from_freelist(gfp_t gfp_mask, unsigned int order, 1046get_page_from_freelist(gfp_t gfp_mask, unsigned int order,
927 struct zonelist *zonelist, int alloc_flags) 1047 struct zonelist *zonelist, int alloc_flags)
928{ 1048{
929 struct zone **z = zonelist->zones; 1049 struct zone **z;
930 struct page *page = NULL; 1050 struct page *page = NULL;
931 int classzone_idx = zone_idx(*z); 1051 int classzone_idx = zone_idx(zonelist->zones[0]);
932 struct zone *zone; 1052 struct zone *zone;
1053 nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */
1054 int zlc_active = 0; /* set if using zonelist_cache */
1055 int did_zlc_setup = 0; /* just call zlc_setup() one time */
933 1056
1057zonelist_scan:
934 /* 1058 /*
935 * Go through the zonelist once, looking for a zone with enough free. 1059 * Scan zonelist, looking for a zone with enough free.
936 * See also cpuset_zone_allowed() comment in kernel/cpuset.c. 1060 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
937 */ 1061 */
1062 z = zonelist->zones;
1063
938 do { 1064 do {
1065 if (NUMA_BUILD && zlc_active &&
1066 !zlc_zone_worth_trying(zonelist, z, allowednodes))
1067 continue;
939 zone = *z; 1068 zone = *z;
940 if (unlikely(NUMA_BUILD && (gfp_mask & __GFP_THISNODE) && 1069 if (unlikely(NUMA_BUILD && (gfp_mask & __GFP_THISNODE) &&
941 zone->zone_pgdat != zonelist->zones[0]->zone_pgdat)) 1070 zone->zone_pgdat != zonelist->zones[0]->zone_pgdat))
942 break; 1071 break;
943 if ((alloc_flags & ALLOC_CPUSET) && 1072 if ((alloc_flags & ALLOC_CPUSET) &&
944 !cpuset_zone_allowed(zone, gfp_mask)) 1073 !cpuset_zone_allowed(zone, gfp_mask))
945 continue; 1074 goto try_next_zone;
946 1075
947 if (!(alloc_flags & ALLOC_NO_WATERMARKS)) { 1076 if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
948 unsigned long mark; 1077 unsigned long mark;
@@ -956,15 +1085,30 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order,
956 classzone_idx, alloc_flags)) { 1085 classzone_idx, alloc_flags)) {
957 if (!zone_reclaim_mode || 1086 if (!zone_reclaim_mode ||
958 !zone_reclaim(zone, gfp_mask, order)) 1087 !zone_reclaim(zone, gfp_mask, order))
959 continue; 1088 goto this_zone_full;
960 } 1089 }
961 } 1090 }
962 1091
963 page = buffered_rmqueue(zonelist, zone, order, gfp_mask); 1092 page = buffered_rmqueue(zonelist, zone, order, gfp_mask);
964 if (page) 1093 if (page)
965 break; 1094 break;
966 1095this_zone_full:
1096 if (NUMA_BUILD)
1097 zlc_mark_zone_full(zonelist, z);
1098try_next_zone:
1099 if (NUMA_BUILD && !did_zlc_setup) {
1100 /* we do zlc_setup after the first zone is tried */
1101 allowednodes = zlc_setup(zonelist, alloc_flags);
1102 zlc_active = 1;
1103 did_zlc_setup = 1;
1104 }
967 } while (*(++z) != NULL); 1105 } while (*(++z) != NULL);
1106
1107 if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) {
1108 /* Disable zlc cache for second zonelist scan */
1109 zlc_active = 0;
1110 goto zonelist_scan;
1111 }
968 return page; 1112 return page;
969} 1113}
970 1114
@@ -1535,6 +1679,24 @@ static void __meminit build_zonelists(pg_data_t *pgdat)
1535 } 1679 }
1536} 1680}
1537 1681
1682/* Construct the zonelist performance cache - see further mmzone.h */
1683static void __meminit build_zonelist_cache(pg_data_t *pgdat)
1684{
1685 int i;
1686
1687 for (i = 0; i < MAX_NR_ZONES; i++) {
1688 struct zonelist *zonelist;
1689 struct zonelist_cache *zlc;
1690 struct zone **z;
1691
1692 zonelist = pgdat->node_zonelists + i;
1693 zonelist->zlcache_ptr = zlc = &zonelist->zlcache;
1694 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
1695 for (z = zonelist->zones; *z; z++)
1696 zlc->z_to_n[z - zonelist->zones] = zone_to_nid(*z);
1697 }
1698}
1699
1538#else /* CONFIG_NUMA */ 1700#else /* CONFIG_NUMA */
1539 1701
1540static void __meminit build_zonelists(pg_data_t *pgdat) 1702static void __meminit build_zonelists(pg_data_t *pgdat)
@@ -1572,14 +1734,26 @@ static void __meminit build_zonelists(pg_data_t *pgdat)
1572 } 1734 }
1573} 1735}
1574 1736
1737/* non-NUMA variant of zonelist performance cache - just NULL zlcache_ptr */
1738static void __meminit build_zonelist_cache(pg_data_t *pgdat)
1739{
1740 int i;
1741
1742 for (i = 0; i < MAX_NR_ZONES; i++)
1743 pgdat->node_zonelists[i].zlcache_ptr = NULL;
1744}
1745
1575#endif /* CONFIG_NUMA */ 1746#endif /* CONFIG_NUMA */
1576 1747
1577/* return values int ....just for stop_machine_run() */ 1748/* return values int ....just for stop_machine_run() */
1578static int __meminit __build_all_zonelists(void *dummy) 1749static int __meminit __build_all_zonelists(void *dummy)
1579{ 1750{
1580 int nid; 1751 int nid;
1581 for_each_online_node(nid) 1752
1753 for_each_online_node(nid) {
1582 build_zonelists(NODE_DATA(nid)); 1754 build_zonelists(NODE_DATA(nid));
1755 build_zonelist_cache(NODE_DATA(nid));
1756 }
1583 return 0; 1757 return 0;
1584} 1758}
1585 1759