Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r--	mm/page_alloc.c | 188 +++++++++++++++++++++++++++++++++++++++---
1 file changed, 181 insertions(+), 7 deletions(-)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 23bc5bcbdcf9..230771d3c6b6 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -918,6 +918,126 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
 	return 1;
 }
 
+#ifdef CONFIG_NUMA
+/*
+ * zlc_setup - Setup for "zonelist cache".  Uses cached zone data to
+ * skip over zones that are not allowed by the cpuset, or that have
+ * been recently (in the last second) found to be nearly full.  See
+ * further comments in mmzone.h.  Reduces cache footprint of zonelist
+ * scans that have to skip over a lot of full or unallowed zones.
+ *
+ * If the zonelist cache is present in the passed-in zonelist, then
+ * returns a pointer to the allowed node mask (either the current
+ * task's mems_allowed, or node_online_map.)
+ *
+ * If the zonelist cache is not available for this zonelist, does
+ * nothing and returns NULL.
+ *
+ * If the fullzones BITMAP in the zonelist cache is stale (more than
+ * a second since last zapped) then we zap it out (clear its bits.)
+ *
+ * We hold off even calling zlc_setup until after we've checked the
+ * first zone in the zonelist, on the theory that most allocations will
+ * be satisfied from that first zone, so best to examine that zone as
+ * quickly as we can.
+ */
+static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
+{
+	struct zonelist_cache *zlc;	/* cached zonelist speedup info */
+	nodemask_t *allowednodes;	/* zonelist_cache approximation */
+
+	zlc = zonelist->zlcache_ptr;
+	if (!zlc)
+		return NULL;
+
+	if (jiffies - zlc->last_full_zap > 1 * HZ) {
+		bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
+		zlc->last_full_zap = jiffies;
+	}
+
+	allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ?
+					&cpuset_current_mems_allowed :
+					&node_online_map;
+	return allowednodes;
+}
+
+/*
+ * Given 'z' scanning a zonelist, run a couple of quick checks to see
+ * if it is worth looking at further for free memory:
+ * 1) Check that the zone isn't thought to be full (doesn't have its
+ *    bit set in the zonelist_cache fullzones BITMAP).
+ * 2) Check that the zone's node (obtained from the zonelist_cache
+ *    z_to_n[] mapping) is allowed in the passed-in allowednodes mask.
+ * Return true (non-zero) if the zone is worth looking at further, or
+ * else return false (zero) if it is not.
+ *
+ * This check -ignores- the distinction between various watermarks,
+ * such as GFP_HIGH, GFP_ATOMIC, PF_MEMALLOC, ...  If a zone is
+ * found to be full for any variation of these watermarks, it will
+ * be considered full for up to one second by all requests, unless
+ * we are so low on memory on all allowed nodes that we are forced
+ * into the second scan of the zonelist.
+ *
+ * In the second scan we ignore this zonelist cache and exactly
+ * apply the watermarks to all zones, even if it is slower to do so.
+ * We are low on memory in the second scan, and should leave no stone
+ * unturned looking for a free page.
+ */
+static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zone **z,
+						nodemask_t *allowednodes)
+{
+	struct zonelist_cache *zlc;	/* cached zonelist speedup info */
+	int i;				/* index of *z in zonelist zones */
+	int n;				/* node that zone *z is on */
+
+	zlc = zonelist->zlcache_ptr;
+	if (!zlc)
+		return 1;
+
+	i = z - zonelist->zones;
+	n = zlc->z_to_n[i];
+
+	/* This zone is worth trying if it is allowed but not full */
+	return node_isset(n, *allowednodes) && !test_bit(i, zlc->fullzones);
+}
+
+/*
+ * Given 'z' scanning a zonelist, set the corresponding bit in
+ * zlc->fullzones, so that subsequent attempts to allocate a page
+ * from that zone don't waste time re-examining it.
+ */
+static void zlc_mark_zone_full(struct zonelist *zonelist, struct zone **z)
+{
+	struct zonelist_cache *zlc;	/* cached zonelist speedup info */
+	int i;				/* index of *z in zonelist zones */
+
+	zlc = zonelist->zlcache_ptr;
+	if (!zlc)
+		return;
+
+	i = z - zonelist->zones;
+
+	set_bit(i, zlc->fullzones);
+}
+
+#else	/* CONFIG_NUMA */
+
+static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
+{
+	return NULL;
+}
+
+static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zone **z,
+				nodemask_t *allowednodes)
+{
+	return 1;
+}
+
+static void zlc_mark_zone_full(struct zonelist *zonelist, struct zone **z)
+{
+}
+#endif	/* CONFIG_NUMA */
+
 /*
  * get_page_from_freelist goes through the zonelist trying to allocate
  * a page.
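The three zlc_* helpers above are the entire cache interface: zlc_setup() picks the allowed-nodes mask and zaps a stale fullzones bitmap, zlc_zone_worth_trying() is the cheap per-zone filter, and zlc_mark_zone_full() records a failed zone. For readers who want the mechanism in isolation, here is a minimal standalone C model of it, using plain arrays in place of the kernel's bitmap and nodemask types; all names here (zlc_model, model_setup, model_worth_trying) are illustrative, not kernel API:

#include <stdio.h>
#include <string.h>
#include <time.h>

#define NR_ZONES 3
#define NR_NODES 2

struct zlc_model {
	unsigned char fullzones[NR_ZONES];	/* models zlc->fullzones */
	int z_to_n[NR_ZONES];			/* zone index -> node id */
	time_t last_full_zap;			/* models zlc->last_full_zap */
};

/* models zlc_setup(): zap full-zone verdicts once they are a second old */
static void model_setup(struct zlc_model *zlc)
{
	time_t now = time(NULL);

	if (now - zlc->last_full_zap > 1) {
		memset(zlc->fullzones, 0, sizeof(zlc->fullzones));
		zlc->last_full_zap = now;
	}
}

/* models zlc_zone_worth_trying(): node allowed and zone not marked full */
static int model_worth_trying(const struct zlc_model *zlc, int i,
			      const unsigned char allowednodes[NR_NODES])
{
	return allowednodes[zlc->z_to_n[i]] && !zlc->fullzones[i];
}

int main(void)
{
	struct zlc_model zlc = { .z_to_n = {0, 0, 1} };
	unsigned char allowed[NR_NODES] = {1, 0}; /* cpuset allows node 0 only */

	model_setup(&zlc);
	zlc.fullzones[0] = 1;	/* models zlc_mark_zone_full() on zone 0 */

	for (int i = 0; i < NR_ZONES; i++)
		printf("zone %d worth trying: %d\n", i,
		       model_worth_trying(&zlc, i, allowed));
	return 0;
}

Note that both checks are simple array/bit lookups; the point of the cache is that the skip decision never touches struct zone, whose cachelines the failing scan would otherwise keep dragging in.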
@@ -926,23 +1046,32 @@ static struct page *
 get_page_from_freelist(gfp_t gfp_mask, unsigned int order,
 		struct zonelist *zonelist, int alloc_flags)
 {
-	struct zone **z = zonelist->zones;
+	struct zone **z;
 	struct page *page = NULL;
-	int classzone_idx = zone_idx(*z);
+	int classzone_idx = zone_idx(zonelist->zones[0]);
 	struct zone *zone;
+	nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */
+	int zlc_active = 0;		/* set if using zonelist_cache */
+	int did_zlc_setup = 0;		/* just call zlc_setup() one time */
 
+zonelist_scan:
 	/*
-	 * Go through the zonelist once, looking for a zone with enough free.
+	 * Scan zonelist, looking for a zone with enough free.
 	 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
 	 */
+	z = zonelist->zones;
+
 	do {
+		if (NUMA_BUILD && zlc_active &&
+			!zlc_zone_worth_trying(zonelist, z, allowednodes))
+				continue;
 		zone = *z;
 		if (unlikely(NUMA_BUILD && (gfp_mask & __GFP_THISNODE) &&
 			zone->zone_pgdat != zonelist->zones[0]->zone_pgdat))
 				break;
 		if ((alloc_flags & ALLOC_CPUSET) &&
 			!cpuset_zone_allowed(zone, gfp_mask))
-				continue;
+				goto try_next_zone;
 
 		if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
 			unsigned long mark;
@@ -956,15 +1085,30 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order,
 				    classzone_idx, alloc_flags)) {
 				if (!zone_reclaim_mode ||
 					!zone_reclaim(zone, gfp_mask, order))
-					continue;
+					goto this_zone_full;
 			}
 		}
 
 		page = buffered_rmqueue(zonelist, zone, order, gfp_mask);
 		if (page)
 			break;
-
+this_zone_full:
+		if (NUMA_BUILD)
+			zlc_mark_zone_full(zonelist, z);
+try_next_zone:
+		if (NUMA_BUILD && !did_zlc_setup) {
+			/* we do zlc_setup after the first zone is tried */
+			allowednodes = zlc_setup(zonelist, alloc_flags);
+			zlc_active = 1;
+			did_zlc_setup = 1;
+		}
 	} while (*(++z) != NULL);
+
+	if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) {
+		/* Disable zlc cache for second zonelist scan */
+		zlc_active = 0;
+		goto zonelist_scan;
+	}
 	return page;
 }
 
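Taken together, the rewritten loop makes at most two passes. The first pass consults the cache (once zlc_setup() has run, which is deferred until the first zone has been tried) and skips unpromising zones cheaply; only if the whole first pass comes up empty is the cache switched off and every zone re-examined with full watermark checks. A compact standalone model of that control flow, with stand-in names (try_alloc, scan_zonelist) rather than kernel calls:

#include <stdbool.h>
#include <stdio.h>

#define NR_ZONES 4

static bool zone_full[NR_ZONES];	/* models the fullzones bitmap */

/* stub allocator: always fails, so both passes are exercised */
static void *try_alloc(int z)
{
	(void)z;
	return NULL;
}

/* models the two-pass shape of get_page_from_freelist() above */
static void *scan_zonelist(void)
{
	void *page = NULL;
	bool zlc_active = false;	/* models zlc_active */
	bool did_setup = false;		/* models did_zlc_setup */

restart:				/* models the zonelist_scan: label */
	for (int z = 0; z < NR_ZONES; z++) {
		if (zlc_active && zone_full[z])
			continue;	/* cheap skip, no watermark checks */
		page = try_alloc(z);
		if (page)
			return page;
		zone_full[z] = true;	/* models zlc_mark_zone_full() */
		if (!did_setup) {
			/* setup deferred until the first zone has failed */
			zlc_active = true;
			did_setup = true;
		}
	}
	if (zlc_active) {
		/* desperate second scan: ignore the cache entirely */
		zlc_active = false;
		goto restart;
	}
	return NULL;
}

int main(void)
{
	printf("allocated: %p\n", scan_zonelist());
	return 0;
}

Because did_setup stays set, the second pass never re-enables the cache, so the retry terminates after exactly one extra sweep; the kernel code gets the same guarantee from did_zlc_setup.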
@@ -1535,6 +1679,24 @@ static void __meminit build_zonelists(pg_data_t *pgdat)
 	}
 }
 
+/* Construct the zonelist performance cache - see further mmzone.h */
+static void __meminit build_zonelist_cache(pg_data_t *pgdat)
+{
+	int i;
+
+	for (i = 0; i < MAX_NR_ZONES; i++) {
+		struct zonelist *zonelist;
+		struct zonelist_cache *zlc;
+		struct zone **z;
+
+		zonelist = pgdat->node_zonelists + i;
+		zonelist->zlcache_ptr = zlc = &zonelist->zlcache;
+		bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
+		for (z = zonelist->zones; *z; z++)
+			zlc->z_to_n[z - zonelist->zones] = zone_to_nid(*z);
+	}
+}
+
 #else	/* CONFIG_NUMA */
 
 static void __meminit build_zonelists(pg_data_t *pgdat)
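build_zonelist_cache() derives everything the hot path needs up front: zlcache_ptr flags the cache as present, fullzones starts empty, and z_to_n[] precomputes each zone's node id so that zlc_zone_worth_trying() never has to dereference struct zone. A userspace sketch of that precomputation, again with illustrative types only (zone_model, zonelist_model, build_cache):

#include <stdio.h>
#include <string.h>

#define MAX_ZONES_PER_LIST 8

struct zone_model { int nid; };		/* stand-in for struct zone */

struct zonelist_model {
	struct zone_model *zones[MAX_ZONES_PER_LIST + 1]; /* NULL-terminated */
	int z_to_n[MAX_ZONES_PER_LIST];		/* models zlc->z_to_n[] */
	unsigned char fullzones[MAX_ZONES_PER_LIST]; /* models zlc->fullzones */
};

/* models build_zonelist_cache(): precompute zone -> node once at setup */
static void build_cache(struct zonelist_model *zl)
{
	memset(zl->fullzones, 0, sizeof(zl->fullzones));
	for (int i = 0; zl->zones[i] != NULL; i++)
		zl->z_to_n[i] = zl->zones[i]->nid;  /* models zone_to_nid() */
}

int main(void)
{
	struct zone_model z0 = { .nid = 0 }, z1 = { .nid = 1 };
	struct zonelist_model zl = { .zones = { &z0, &z1, NULL } };

	build_cache(&zl);
	printf("zone 1 lives on node %d\n", zl.z_to_n[1]);	/* prints 1 */
	return 0;
}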
@@ -1572,14 +1734,26 @@ static void __meminit build_zonelists(pg_data_t *pgdat)
 	}
 }
 
+/* non-NUMA variant of zonelist performance cache - just NULL zlcache_ptr */
+static void __meminit build_zonelist_cache(pg_data_t *pgdat)
+{
+	int i;
+
+	for (i = 0; i < MAX_NR_ZONES; i++)
+		pgdat->node_zonelists[i].zlcache_ptr = NULL;
+}
+
 #endif	/* CONFIG_NUMA */
 
 /* return values int ....just for stop_machine_run() */
 static int __meminit __build_all_zonelists(void *dummy)
 {
 	int nid;
-	for_each_online_node(nid)
+
+	for_each_online_node(nid) {
 		build_zonelists(NODE_DATA(nid));
+		build_zonelist_cache(NODE_DATA(nid));
+	}
 	return 0;
 }
 
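The cache is rebuilt in lock-step with the zonelists themselves; a stale z_to_n[] or zlcache_ptr after a memory-hotplug rebuild would send the filter's verdicts to the wrong nodes. Judging from the "....just for stop_machine_run()" comment above, the caller drives this rebuild via stop_machine_run(), roughly as below; this is a sketch of the implied call site, not code from this patch:

/* Sketch of the call site implied by the stop_machine_run() comment:
 * the rebuild runs with all other CPUs held off, so no allocator can
 * observe a zonelist whose cache has not been rebuilt yet. */
void build_all_zonelists(void)
{
	stop_machine_run(__build_all_zonelists, NULL, NR_CPUS);
}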