Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r--	mm/page_alloc.c	228
1 file changed, 104 insertions(+), 124 deletions(-)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 259a71bacca4..104e69ca55e0 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -67,7 +67,6 @@ long nr_swap_pages;
 int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = { 256, 256, 32 };
 
 EXPORT_SYMBOL(totalram_pages);
-EXPORT_SYMBOL(nr_swap_pages);
 
 /*
  * Used by page_zone() to look up the address of the struct zone whose
@@ -736,9 +735,7 @@ buffered_rmqueue(struct zone *zone, int order, gfp_t gfp_flags)
 		}
 		local_irq_restore(flags);
 		put_cpu();
-	}
-
-	if (page == NULL) {
+	} else {
 		spin_lock_irqsave(&zone->lock, flags);
 		page = __rmqueue(zone, order);
 		spin_unlock_irqrestore(&zone->lock, flags);
@@ -758,20 +755,25 @@ buffered_rmqueue(struct zone *zone, int order, gfp_t gfp_flags)
 	return page;
 }
 
+#define ALLOC_NO_WATERMARKS	0x01 /* don't check watermarks at all */
+#define ALLOC_HARDER		0x02 /* try to alloc harder */
+#define ALLOC_HIGH		0x04 /* __GFP_HIGH set */
+#define ALLOC_CPUSET		0x08 /* check for correct cpuset */
+
 /*
  * Return 1 if free pages are above 'mark'. This takes into account the order
  * of the allocation.
  */
 int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
-		int classzone_idx, int can_try_harder, gfp_t gfp_high)
+		int classzone_idx, int alloc_flags)
 {
 	/* free_pages may go negative - that's OK */
 	long min = mark, free_pages = z->free_pages - (1 << order) + 1;
 	int o;
 
-	if (gfp_high)
+	if (alloc_flags & ALLOC_HIGH)
 		min -= min / 2;
-	if (can_try_harder)
+	if (alloc_flags & ALLOC_HARDER)
 		min -= min / 4;
 
 	if (free_pages <= min + z->lowmem_reserve[classzone_idx])
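The watermark reduction above is easy to check in isolation. Below is a minimal stand-alone sketch (illustrative only, not part of the patch): the ALLOC_* values are copied from the hunk above, the helper and the sample numbers are hypothetical, and the arithmetic mirrors zone_watermark_ok()'s halving for ALLOC_HIGH followed by a further quarter off for ALLOC_HARDER.

#include <stdio.h>

/* Flag values copied from the hunk above. */
#define ALLOC_NO_WATERMARKS	0x01
#define ALLOC_HARDER		0x02
#define ALLOC_HIGH		0x04
#define ALLOC_CPUSET		0x08

/* Hypothetical stand-alone copy of the threshold reduction in zone_watermark_ok(). */
static long effective_min(long mark, int alloc_flags)
{
	long min = mark;

	if (alloc_flags & ALLOC_HIGH)
		min -= min / 2;	/* __GFP_HIGH callers: halve the threshold */
	if (alloc_flags & ALLOC_HARDER)
		min -= min / 4;	/* atomic/realtime callers: take another quarter off */
	return min;
}

int main(void)
{
	long mark = 128;	/* e.g. a zone's pages_min */

	printf("no flags:      %ld\n", effective_min(mark, 0));				/* 128 */
	printf("ALLOC_HIGH:    %ld\n", effective_min(mark, ALLOC_HIGH));		/* 64 */
	printf("HIGH | HARDER: %ld\n",
	       effective_min(mark, ALLOC_HIGH | ALLOC_HARDER));				/* 48 */
	return 0;
}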
@@ -789,14 +791,40 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
 	return 1;
 }
 
-static inline int
-should_reclaim_zone(struct zone *z, gfp_t gfp_mask)
+/*
+ * get_page_from_freelist goes through the zonelist trying to allocate
+ * a page.
+ */
+static struct page *
+get_page_from_freelist(gfp_t gfp_mask, unsigned int order,
+		struct zonelist *zonelist, int alloc_flags)
 {
-	if (!z->reclaim_pages)
-		return 0;
-	if (gfp_mask & __GFP_NORECLAIM)
-		return 0;
-	return 1;
+	struct zone **z = zonelist->zones;
+	struct page *page = NULL;
+	int classzone_idx = zone_idx(*z);
+
+	/*
+	 * Go through the zonelist once, looking for a zone with enough free.
+	 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
+	 */
+	do {
+		if ((alloc_flags & ALLOC_CPUSET) &&
+				!cpuset_zone_allowed(*z, gfp_mask))
+			continue;
+
+		if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
+			if (!zone_watermark_ok(*z, order, (*z)->pages_low,
+					classzone_idx, alloc_flags))
+				continue;
+		}
+
+		page = buffered_rmqueue(*z, order, gfp_mask);
+		if (page) {
+			zone_statistics(zonelist, *z);
+			break;
+		}
+	} while (*(++z) != NULL);
+	return page;
 }
 
 /*
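One structural detail worth noting in the function above: zonelist->zones is a NULL-terminated array of zone pointers, and the do/while form walks it exactly once, relying on the first entry being non-NULL (which __alloc_pages checks before calling in, see the next hunk). A hypothetical user-space sketch of the same loop shape over a NULL-terminated array, not taken from the kernel:

#include <stdio.h>

int main(void)
{
	/* Hypothetical stand-in for zonelist->zones: a NULL-terminated array. */
	const char *zones[] = { "DMA", "Normal", "HighMem", NULL };
	const char **z = zones;

	/*
	 * Same shape as the loop in get_page_from_freelist(): the body runs
	 * for *z first, then ++z advances and the NULL sentinel ends the loop.
	 * The first entry must be non-NULL, as the caller is expected to check.
	 */
	do {
		printf("trying zone %s\n", *z);
	} while (*(++z) != NULL);

	return 0;
}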
@@ -807,105 +835,75 @@ __alloc_pages(gfp_t gfp_mask, unsigned int order,
 		struct zonelist *zonelist)
 {
 	const gfp_t wait = gfp_mask & __GFP_WAIT;
-	struct zone **zones, *z;
+	struct zone **z;
 	struct page *page;
 	struct reclaim_state reclaim_state;
 	struct task_struct *p = current;
-	int i;
-	int classzone_idx;
 	int do_retry;
-	int can_try_harder;
+	int alloc_flags;
 	int did_some_progress;
 
 	might_sleep_if(wait);
 
-	/*
-	 * The caller may dip into page reserves a bit more if the caller
-	 * cannot run direct reclaim, or is the caller has realtime scheduling
-	 * policy
-	 */
-	can_try_harder = (unlikely(rt_task(p)) && !in_interrupt()) || !wait;
+	z = zonelist->zones;  /* the list of zones suitable for gfp_mask */
 
-	zones = zonelist->zones;  /* the list of zones suitable for gfp_mask */
-
-	if (unlikely(zones[0] == NULL)) {
+	if (unlikely(*z == NULL)) {
 		/* Should this ever happen?? */
 		return NULL;
 	}
+restart:
+	page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order,
+				zonelist, ALLOC_CPUSET);
+	if (page)
+		goto got_pg;
 
-	classzone_idx = zone_idx(zones[0]);
+	do
+		wakeup_kswapd(*z, order);
+	while (*(++z));
 
-restart:
 	/*
-	 * Go through the zonelist once, looking for a zone with enough free.
-	 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
+	 * OK, we're below the kswapd watermark and have kicked background
+	 * reclaim. Now things get more complex, so set up alloc_flags according
+	 * to how we want to proceed.
+	 *
+	 * The caller may dip into page reserves a bit more if the caller
+	 * cannot run direct reclaim, or if the caller has realtime scheduling
+	 * policy.
 	 */
-	for (i = 0; (z = zones[i]) != NULL; i++) {
-		int do_reclaim = should_reclaim_zone(z, gfp_mask);
-
-		if (!cpuset_zone_allowed(z, __GFP_HARDWALL))
-			continue;
-
-		/*
-		 * If the zone is to attempt early page reclaim then this loop
-		 * will try to reclaim pages and check the watermark a second
-		 * time before giving up and falling back to the next zone.
-		 */
-zone_reclaim_retry:
-		if (!zone_watermark_ok(z, order, z->pages_low,
-				classzone_idx, 0, 0)) {
-			if (!do_reclaim)
-				continue;
-			else {
-				zone_reclaim(z, gfp_mask, order);
-				/* Only try reclaim once */
-				do_reclaim = 0;
-				goto zone_reclaim_retry;
-			}
-		}
-
-		page = buffered_rmqueue(z, order, gfp_mask);
-		if (page)
-			goto got_pg;
-	}
-
-	for (i = 0; (z = zones[i]) != NULL; i++)
-		wakeup_kswapd(z, order);
+	alloc_flags = 0;
+	if ((unlikely(rt_task(p)) && !in_interrupt()) || !wait)
+		alloc_flags |= ALLOC_HARDER;
+	if (gfp_mask & __GFP_HIGH)
+		alloc_flags |= ALLOC_HIGH;
+	if (wait)
+		alloc_flags |= ALLOC_CPUSET;
 
 	/*
 	 * Go through the zonelist again. Let __GFP_HIGH and allocations
-	 * coming from realtime tasks to go deeper into reserves
+	 * coming from realtime tasks go deeper into reserves.
 	 *
 	 * This is the last chance, in general, before the goto nopage.
 	 * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
 	 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
 	 */
-	for (i = 0; (z = zones[i]) != NULL; i++) {
-		if (!zone_watermark_ok(z, order, z->pages_min,
-				classzone_idx, can_try_harder,
-				gfp_mask & __GFP_HIGH))
-			continue;
-
-		if (wait && !cpuset_zone_allowed(z, gfp_mask))
-			continue;
-
-		page = buffered_rmqueue(z, order, gfp_mask);
-		if (page)
-			goto got_pg;
-	}
+	page = get_page_from_freelist(gfp_mask, order, zonelist, alloc_flags);
+	if (page)
+		goto got_pg;
 
 	/* This allocation should allow future memory freeing. */
 
 	if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE)))
 			&& !in_interrupt()) {
 		if (!(gfp_mask & __GFP_NOMEMALLOC)) {
+nofail_alloc:
 			/* go through the zonelist yet again, ignoring mins */
-			for (i = 0; (z = zones[i]) != NULL; i++) {
-				if (!cpuset_zone_allowed(z, gfp_mask))
-					continue;
-				page = buffered_rmqueue(z, order, gfp_mask);
-				if (page)
-					goto got_pg;
+			page = get_page_from_freelist(gfp_mask, order,
+				zonelist, ALLOC_NO_WATERMARKS|ALLOC_CPUSET);
+			if (page)
+				goto got_pg;
+			if (gfp_mask & __GFP_NOFAIL) {
+				blk_congestion_wait(WRITE, HZ/50);
+				goto nofail_alloc;
 			}
 		}
 		goto nopage;
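For reference, here is a small stand-alone sketch (illustrative only; the gfp bit values and the rt_task()/in_interrupt() inputs are simplified stand-ins, while the ALLOC_* values match the patch) of how the alloc_flags setup above combines for a few typical callers:

#include <stdio.h>

/* Simplified stand-ins for the two gfp bits used below (values arbitrary here). */
#define __GFP_WAIT	0x10
#define __GFP_HIGH	0x20

/* Flag values from the patch. */
#define ALLOC_NO_WATERMARKS	0x01
#define ALLOC_HARDER		0x02
#define ALLOC_HIGH		0x04
#define ALLOC_CPUSET		0x08

/*
 * Mirrors the alloc_flags setup in __alloc_pages(): callers that cannot
 * sleep, or realtime tasks outside interrupt context, get ALLOC_HARDER;
 * __GFP_HIGH maps to ALLOC_HIGH; only sleeping allocations keep honouring
 * cpusets at this point.
 */
static int setup_alloc_flags(unsigned int gfp_mask, int is_rt_task, int in_irq)
{
	int wait = gfp_mask & __GFP_WAIT;
	int alloc_flags = 0;

	if ((is_rt_task && !in_irq) || !wait)
		alloc_flags |= ALLOC_HARDER;
	if (gfp_mask & __GFP_HIGH)
		alloc_flags |= ALLOC_HIGH;
	if (wait)
		alloc_flags |= ALLOC_CPUSET;
	return alloc_flags;
}

int main(void)
{
	/* GFP_ATOMIC-like caller: cannot wait, __GFP_HIGH set -> HARDER|HIGH (0x06). */
	printf("atomic:  0x%x\n", setup_alloc_flags(__GFP_HIGH, 0, 1));
	/* GFP_KERNEL-like caller: can wait -> CPUSET only (0x08). */
	printf("kernel:  0x%x\n", setup_alloc_flags(__GFP_WAIT, 0, 0));
	/* Realtime task doing a sleeping allocation -> HARDER|CPUSET (0x0a). */
	printf("rt+wait: 0x%x\n", setup_alloc_flags(__GFP_WAIT, 1, 0));
	return 0;
}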
@@ -923,7 +921,7 @@ rebalance:
 	reclaim_state.reclaimed_slab = 0;
 	p->reclaim_state = &reclaim_state;
 
-	did_some_progress = try_to_free_pages(zones, gfp_mask);
+	did_some_progress = try_to_free_pages(zonelist->zones, gfp_mask);
 
 	p->reclaim_state = NULL;
 	p->flags &= ~PF_MEMALLOC;
@@ -931,19 +929,10 @@ rebalance:
 	cond_resched();
 
 	if (likely(did_some_progress)) {
-		for (i = 0; (z = zones[i]) != NULL; i++) {
-			if (!zone_watermark_ok(z, order, z->pages_min,
-					classzone_idx, can_try_harder,
-					gfp_mask & __GFP_HIGH))
-				continue;
-
-			if (!cpuset_zone_allowed(z, gfp_mask))
-				continue;
-
-			page = buffered_rmqueue(z, order, gfp_mask);
-			if (page)
-				goto got_pg;
-		}
+		page = get_page_from_freelist(gfp_mask, order,
+						zonelist, alloc_flags);
+		if (page)
+			goto got_pg;
 	} else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
 		/*
 		 * Go through the zonelist yet one more time, keep
@@ -951,18 +940,10 @@ rebalance:
 		 * a parallel oom killing, we must fail if we're still
 		 * under heavy pressure.
 		 */
-		for (i = 0; (z = zones[i]) != NULL; i++) {
-			if (!zone_watermark_ok(z, order, z->pages_high,
-					classzone_idx, 0, 0))
-				continue;
-
-			if (!cpuset_zone_allowed(z, __GFP_HARDWALL))
-				continue;
-
-			page = buffered_rmqueue(z, order, gfp_mask);
-			if (page)
-				goto got_pg;
-		}
+		page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order,
+						zonelist, ALLOC_CPUSET);
+		if (page)
+			goto got_pg;
 
 		out_of_memory(gfp_mask, order);
 		goto restart;
@@ -995,9 +976,7 @@ nopage:
 		dump_stack();
 		show_mem();
 	}
-	return NULL;
 got_pg:
-	zone_statistics(zonelist, z);
 	return page;
 }
 
@@ -1334,7 +1313,7 @@ void show_free_areas(void)
 	} else
 		printk("\n");
 
-	for_each_cpu(cpu) {
+	for_each_online_cpu(cpu) {
 		struct per_cpu_pageset *pageset;
 
 		pageset = zone_pcp(zone, cpu);
@@ -2426,13 +2405,18 @@ void setup_per_zone_pages_min(void)
 	}
 
 	for_each_zone(zone) {
+		unsigned long tmp;
 		spin_lock_irqsave(&zone->lru_lock, flags);
+		tmp = (pages_min * zone->present_pages) / lowmem_pages;
 		if (is_highmem(zone)) {
 			/*
-			 * Often, highmem doesn't need to reserve any pages.
-			 * But the pages_min/low/high values are also used for
-			 * batching up page reclaim activity so we need a
-			 * decent value here.
+			 * __GFP_HIGH and PF_MEMALLOC allocations usually don't
+			 * need highmem pages, so cap pages_min to a small
+			 * value here.
+			 *
+			 * The (pages_high-pages_low) and (pages_low-pages_min)
+			 * deltas control asynch page reclaim, and so should
+			 * not be capped for highmem.
 			 */
 			int min_pages;
 
@@ -2443,19 +2427,15 @@ void setup_per_zone_pages_min(void)
 				min_pages = 128;
 			zone->pages_min = min_pages;
 		} else {
-			/* if it's a lowmem zone, reserve a number of pages
+			/*
+			 * If it's a lowmem zone, reserve a number of pages
 			 * proportionate to the zone's size.
 			 */
-			zone->pages_min = (pages_min * zone->present_pages) /
-					   lowmem_pages;
+			zone->pages_min = tmp;
 		}
 
-		/*
-		 * When interpreting these watermarks, just keep in mind that:
-		 * zone->pages_min == (zone->pages_min * 4) / 4;
-		 */
-		zone->pages_low = (zone->pages_min * 5) / 4;
-		zone->pages_high = (zone->pages_min * 6) / 4;
+		zone->pages_low   = zone->pages_min + tmp / 4;
+		zone->pages_high  = zone->pages_min + tmp / 2;
 		spin_unlock_irqrestore(&zone->lru_lock, flags);
 	}
 }
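As a quick check of the new watermark arithmetic above (an illustration, not part of the commit): for a lowmem zone, where pages_min equals tmp, the old 5/4 and 6/4 multiples and the new pages_min + tmp/4 and pages_min + tmp/2 forms produce the same numbers; the behavioural change is on highmem, where pages_min is capped (at most 128, per the clamp visible above) but the low/high deltas still scale with the zone's uncapped share tmp. A minimal sketch assuming tmp works out to 1024 pages:

#include <stdio.h>

int main(void)
{
	/* Hypothetical zone whose proportional share of pages_min is 1024 pages. */
	unsigned long tmp = 1024;

	/* Lowmem: pages_min = tmp, and the old and new formulas agree. */
	unsigned long lo_min   = tmp;
	unsigned long old_low  = (lo_min * 5) / 4;	/* 1280 */
	unsigned long old_high = (lo_min * 6) / 4;	/* 1536 */
	unsigned long new_low  = lo_min + tmp / 4;	/* 1280 */
	unsigned long new_high = lo_min + tmp / 2;	/* 1536 */

	/*
	 * Highmem: pages_min is capped (128 here), but pages_low/pages_high
	 * are now offset by tmp/4 and tmp/2, so the reclaim deltas scale with
	 * zone size instead of with the capped pages_min.
	 */
	unsigned long hm_min = 128;

	printf("lowmem:  old low/high %lu/%lu, new low/high %lu/%lu\n",
	       old_low, old_high, new_low, new_high);
	printf("highmem: min %lu, low %lu, high %lu\n",
	       hm_min, hm_min + tmp / 4, hm_min + tmp / 2);
	return 0;
}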