Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r--	mm/page_alloc.c	188
1 file changed, 181 insertions(+), 7 deletions(-)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 23bc5bcbdcf9..230771d3c6b6 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -918,6 +918,126 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
 	return 1;
 }
 
+#ifdef CONFIG_NUMA
+/*
+ * zlc_setup - Setup for "zonelist cache".  Uses cached zone data to
+ * skip over zones that are not allowed by the cpuset, or that have
+ * been recently (in the last second) found to be nearly full.  See
+ * further comments in mmzone.h.  Reduces cache footprint of zonelist
+ * scans that have to skip over a lot of full or unallowed zones.
+ *
+ * If the zonelist cache is present in the passed-in zonelist, then
+ * returns a pointer to the allowed node mask (either the current
+ * task's mems_allowed, or node_online_map.)
+ *
+ * If the zonelist cache is not available for this zonelist, does
+ * nothing and returns NULL.
+ *
+ * If the fullzones BITMAP in the zonelist cache is stale (more than
+ * a second since last zap'd) then we zap it out (clear its bits.)
+ *
+ * We hold off even calling zlc_setup, until after we've checked the
+ * first zone in the zonelist, on the theory that most allocations will
+ * be satisfied from that first zone, so best to examine that zone as
+ * quickly as we can.
+ */
+static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
+{
+	struct zonelist_cache *zlc;	/* cached zonelist speedup info */
+	nodemask_t *allowednodes;	/* zonelist_cache approximation */
+
+	zlc = zonelist->zlcache_ptr;
+	if (!zlc)
+		return NULL;
+
+	if (jiffies - zlc->last_full_zap > 1 * HZ) {
+		bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
+		zlc->last_full_zap = jiffies;
+	}
+
+	allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ?
+					&cpuset_current_mems_allowed :
+					&node_online_map;
+	return allowednodes;
+}
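
These helpers lean on a zonelist_cache embedded in each zonelist by the companion mmzone.h change. A minimal sketch of that structure, with field names inferred from the code above (the authoritative definition lives on the mmzone.h side of this patch):

struct zonelist_cache {
	unsigned short z_to_n[MAX_ZONES_PER_ZONELIST];	    /* zone index -> node id */
	DECLARE_BITMAP(fullzones, MAX_ZONES_PER_ZONELIST);  /* zone recently full? */
	unsigned long last_full_zap;	/* jiffies when fullzones was last cleared */
};

zlc_setup() reads zonelist->zlcache_ptr, which is either NULL (non-NUMA builds, see the stubs below) or points at such a cache, so callers stay free of #ifdefs.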
+
+/*
+ * Given 'z' scanning a zonelist, run a couple of quick checks to see
+ * if it is worth looking at further for free memory:
+ * 1) Check that the zone isn't thought to be full (doesn't have its
+ *    bit set in the zonelist_cache fullzones BITMAP).
+ * 2) Check that the zone's node (obtained from the zonelist_cache
+ *    z_to_n[] mapping) is allowed in the passed-in allowednodes mask.
+ * Return true (non-zero) if zone is worth looking at further, or
+ * else return false (zero) if it is not.
+ *
+ * This check -ignores- the distinction between various watermarks,
+ * such as GFP_HIGH, GFP_ATOMIC, PF_MEMALLOC, ...  If a zone is
+ * found to be full for any variation of these watermarks, it will
+ * be considered full for up to one second by all requests, unless
+ * we are so low on memory on all allowed nodes that we are forced
+ * into the second scan of the zonelist.
+ *
+ * In the second scan we ignore this zonelist cache and exactly
+ * apply the watermarks to all zones, even if it is slower to do so.
+ * We are low on memory in the second scan, and should leave no stone
+ * unturned looking for a free page.
+ */
+static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zone **z,
+						nodemask_t *allowednodes)
+{
+	struct zonelist_cache *zlc;	/* cached zonelist speedup info */
+	int i;				/* index of *z in zonelist zones */
+	int n;				/* node that zone *z is on */
+
+	zlc = zonelist->zlcache_ptr;
+	if (!zlc)
+		return 1;
+
+	i = z - zonelist->zones;
+	n = zlc->z_to_n[i];
+
+	/* This zone is worth trying if it is allowed but not full */
+	return node_isset(n, *allowednodes) && !test_bit(i, zlc->fullzones);
+}
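
The 'i = z - zonelist->zones' arithmetic works because z is a cursor into the zonelist's NULL-terminated zones[] array. A standalone toy model of the same two checks, in plain userspace C with bit masks standing in for node_isset() and test_bit() (all values invented):

#include <stdio.h>

#define MAX_ZONES 4

int main(void)
{
	void *zones[MAX_ZONES] = { (void *)1, (void *)2, (void *)3, NULL };
	int z_to_n[MAX_ZONES] = { 0, 0, 1, -1 };  /* zone index -> node id */
	unsigned long fullzones = 0;	/* bit i set => zones[i] recently full */
	unsigned long allowednodes = 1UL << 0;	/* only node 0 allowed */
	void **z = &zones[2];		/* scan cursor, as in the real loop */
	int i = z - zones;		/* index recovered by pointer math */
	int n = z_to_n[i];

	int worth = (allowednodes & (1UL << n)) && !(fullzones & (1UL << i));
	printf("zone %d on node %d: %s\n", i, n, worth ? "try" : "skip");
	return 0;
}

Here zone 2 sits on node 1, which the mask disallows, so the scan skips it without ever dereferencing the zone itself.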
+
+/*
+ * Given 'z' scanning a zonelist, set the corresponding bit in
+ * zlc->fullzones, so that subsequent attempts to allocate a page
+ * from that zone don't waste time re-examining it.
+ */
+static void zlc_mark_zone_full(struct zonelist *zonelist, struct zone **z)
+{
+	struct zonelist_cache *zlc;	/* cached zonelist speedup info */
+	int i;				/* index of *z in zonelist zones */
+
+	zlc = zonelist->zlcache_ptr;
+	if (!zlc)
+		return;
+
+	i = z - zonelist->zones;
+
+	set_bit(i, zlc->fullzones);
+}
+
+#else	/* CONFIG_NUMA */
+
+static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
+{
+	return NULL;
+}
+
+static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zone **z,
+				nodemask_t *allowednodes)
+{
+	return 1;
+}
+
+static void zlc_mark_zone_full(struct zonelist *zonelist, struct zone **z)
+{
+}
+#endif	/* CONFIG_NUMA */
+
 /*
  * get_page_from_freelist goes through the zonelist trying to allocate
  * a page.
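
The one-second zap in zlc_setup() is the only invalidation the fullzones bitmap ever gets. A hedged userspace analogue of that expiry logic, with time(NULL) seconds replacing the jiffies/HZ arithmetic:

#include <stdio.h>
#include <string.h>
#include <time.h>
#include <unistd.h>

#define MAX_ZONES 8

struct zlc_model {
	unsigned char fullzones[MAX_ZONES];	/* stands in for the bitmap */
	time_t last_full_zap;			/* stands in for the jiffies stamp */
};

/* Clear all "full" marks once they are more than a second old. */
static void zlc_zap_if_stale(struct zlc_model *zlc)
{
	if (time(NULL) - zlc->last_full_zap > 1) {
		memset(zlc->fullzones, 0, sizeof(zlc->fullzones));
		zlc->last_full_zap = time(NULL);
	}
}

int main(void)
{
	struct zlc_model zlc = { .last_full_zap = time(NULL) };

	zlc.fullzones[3] = 1;		/* zone 3 looked full a moment ago */
	sleep(2);			/* let the mark go stale */
	zlc_zap_if_stale(&zlc);
	printf("zone 3 full? %d\n", zlc.fullzones[3]);	/* prints 0 */
	return 0;
}

After sleeping past the expiry window the stale mark is wiped, so a zone that freed memory in the meantime becomes eligible again on the next scan.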
@@ -926,23 +1046,32 @@ static struct page *
 get_page_from_freelist(gfp_t gfp_mask, unsigned int order,
 		struct zonelist *zonelist, int alloc_flags)
 {
-	struct zone **z = zonelist->zones;
+	struct zone **z;
 	struct page *page = NULL;
-	int classzone_idx = zone_idx(*z);
+	int classzone_idx = zone_idx(zonelist->zones[0]);
 	struct zone *zone;
+	nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */
+	int zlc_active = 0;		/* set if using zonelist_cache */
+	int did_zlc_setup = 0;		/* just call zlc_setup() one time */
 
+zonelist_scan:
 	/*
-	 * Go through the zonelist once, looking for a zone with enough free.
+	 * Scan zonelist, looking for a zone with enough free.
 	 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
 	 */
+	z = zonelist->zones;
+
 	do {
+		if (NUMA_BUILD && zlc_active &&
+			!zlc_zone_worth_trying(zonelist, z, allowednodes))
+				continue;
 		zone = *z;
 		if (unlikely(NUMA_BUILD && (gfp_mask & __GFP_THISNODE) &&
 			zone->zone_pgdat != zonelist->zones[0]->zone_pgdat))
 				break;
 		if ((alloc_flags & ALLOC_CPUSET) &&
 			!cpuset_zone_allowed(zone, gfp_mask))
-			continue;
+				goto try_next_zone;
 
 		if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
 			unsigned long mark;
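
NUMA_BUILD here is the compile-time constant from kernel.h (1 under CONFIG_NUMA, else 0), so non-NUMA kernels get the cache hooks optimized away while the code is still parsed and type-checked. A minimal userspace model of that pattern, assuming the usual kernel.h-style definition:

#include <stdio.h>

#ifdef CONFIG_NUMA
#define NUMA_BUILD 1		/* mirrors the kernel.h definition */
#else
#define NUMA_BUILD 0
#endif

static int zlc_active = 0;	/* stand-in for the real state variable */

static int zlc_zone_worth_trying_model(void)
{
	return 1;		/* stand-in for the real check */
}

int main(void)
{
	/* With NUMA_BUILD == 0 the whole condition folds to false and the
	 * compiler drops the call, yet the code never bitrots unseen. */
	if (NUMA_BUILD && zlc_active && !zlc_zone_worth_trying_model())
		puts("skip this zone");
	else
		puts("examine this zone");
	return 0;
}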
@@ -956,15 +1085,30 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order,
 				classzone_idx, alloc_flags)) {
 				if (!zone_reclaim_mode ||
 					!zone_reclaim(zone, gfp_mask, order))
-					continue;
+					goto this_zone_full;
 			}
 		}
 
 		page = buffered_rmqueue(zonelist, zone, order, gfp_mask);
 		if (page)
 			break;
-
+this_zone_full:
+		if (NUMA_BUILD)
+			zlc_mark_zone_full(zonelist, z);
+try_next_zone:
+		if (NUMA_BUILD && !did_zlc_setup) {
+			/* we do zlc_setup after the first zone is tried */
+			allowednodes = zlc_setup(zonelist, alloc_flags);
+			zlc_active = 1;
+			did_zlc_setup = 1;
+		}
 	} while (*(++z) != NULL);
+
+	if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) {
+		/* Disable zlc cache for second zonelist scan */
+		zlc_active = 0;
+		goto zonelist_scan;
+	}
 	return page;
 }
 
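The net effect is a two-pass scan: pass one trusts the cache and may skip zones, and only if nothing is found does pass two re-walk the same list with the cache disabled. A compressed standalone model of that control flow (illustrative C; the zone count and free-page values are invented):

#include <stdio.h>

#define NZONES 3

int main(void)
{
	int free_pages[NZONES] = { 0, 0, 1 };	/* only the last zone has memory */
	unsigned cached_full = 0x7;	/* stale cache: everything marked "full" */
	int use_cache = 1, got = -1, i;

restart:
	for (i = 0; i < NZONES; i++) {
		if (use_cache && (cached_full & (1u << i)))
			continue;		/* pass 1: trust the cache */
		if (free_pages[i]) {
			got = i;
			break;
		}
		cached_full |= 1u << i;		/* remember this zone is full */
	}
	if (got < 0 && use_cache) {
		use_cache = 0;			/* pass 2: leave no stone unturned */
		goto restart;
	}
	printf("allocated from zone %d\n", got);
	return 0;
}

The goto mirrors the zonelist_scan label above: the second pass pays the full cost of every zone check, but by then the allocator is short on memory anyway.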
@@ -1535,6 +1679,24 @@ static void __meminit build_zonelists(pg_data_t *pgdat)
 	}
 }
 
+/* Construct the zonelist performance cache - see further mmzone.h */
+static void __meminit build_zonelist_cache(pg_data_t *pgdat)
+{
+	int i;
+
+	for (i = 0; i < MAX_NR_ZONES; i++) {
+		struct zonelist *zonelist;
+		struct zonelist_cache *zlc;
+		struct zone **z;
+
+		zonelist = pgdat->node_zonelists + i;
+		zonelist->zlcache_ptr = zlc = &zonelist->zlcache;
+		bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
+		for (z = zonelist->zones; *z; z++)
+			zlc->z_to_n[z - zonelist->zones] = zone_to_nid(*z);
+	}
+}
+
 #else	/* CONFIG_NUMA */
 
 static void __meminit build_zonelists(pg_data_t *pgdat)
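
zone_to_nid() is snapshotted once per zonelist here, so the hot scan path never has to touch a struct zone just to learn its node. A toy illustration of the resulting mapping for a hypothetical two-node box (zone names and ordering invented):

#include <stdio.h>

struct zone_model { int nid; const char *name; };

int main(void)
{
	struct zone_model zones[] = {
		{ 0, "Normal" }, { 0, "DMA" }, { 1, "Normal" }, { 1, "DMA" },
	};
	int z_to_n[4];
	int i;

	for (i = 0; i < 4; i++)
		z_to_n[i] = zones[i].nid;	/* mirrors zone_to_nid(*z) */

	for (i = 0; i < 4; i++)
		printf("zone %d (%s) -> node %d\n", i, zones[i].name, z_to_n[i]);
	return 0;
}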
@@ -1572,14 +1734,26 @@ static void __meminit build_zonelists(pg_data_t *pgdat)
 	}
 }
 
+/* non-NUMA variant of zonelist performance cache - just NULL zlcache_ptr */
+static void __meminit build_zonelist_cache(pg_data_t *pgdat)
+{
+	int i;
+
+	for (i = 0; i < MAX_NR_ZONES; i++)
+		pgdat->node_zonelists[i].zlcache_ptr = NULL;
+}
+
 #endif	/* CONFIG_NUMA */
 
 /* return values int ....just for stop_machine_run() */
 static int __meminit __build_all_zonelists(void *dummy)
 {
 	int nid;
-	for_each_online_node(nid)
+
+	for_each_online_node(nid) {
 		build_zonelists(NODE_DATA(nid));
+		build_zonelist_cache(NODE_DATA(nid));
+	}
 	return 0;
 }
 