Diffstat (limited to 'mm/page_alloc.c')

 mm/page_alloc.c | 228

 1 file changed, 104 insertions(+), 124 deletions(-)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 259a71bacca4..104e69ca55e0 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -67,7 +67,6 @@ long nr_swap_pages;
 int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = { 256, 256, 32 };
 
 EXPORT_SYMBOL(totalram_pages);
-EXPORT_SYMBOL(nr_swap_pages);
 
 /*
  * Used by page_zone() to look up the address of the struct zone whose
@@ -736,9 +735,7 @@ buffered_rmqueue(struct zone *zone, int order, gfp_t gfp_flags)
 		}
 		local_irq_restore(flags);
 		put_cpu();
-	}
-
-	if (page == NULL) {
+	} else {
 		spin_lock_irqsave(&zone->lock, flags);
 		page = __rmqueue(zone, order);
 		spin_unlock_irqrestore(&zone->lock, flags);
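A note on this hunk: rmqueue_bulk() already refills the per-cpu list from the buddy free lists, so if the list is still empty after a refill attempt, a direct __rmqueue() call should fail as well; turning the fall-through into an else therefore appears to keep order-0 behaviour while skipping a redundant second attempt. A user-space mock of the resulting split (all names here are illustrative stand-ins, not kernel API):

    /* Mock of the restructured fast path: order-0 is served only from the
     * per-cpu list (refilled in batches from the buddy pool), while higher
     * orders go straight to the buddy pool. */
    #include <stdio.h>

    struct mock_zone {
    	int pcp_count;   /* pages in the per-cpu list */
    	int buddy_free;  /* pages available from the buddy lists */
    };

    static int mock_rmqueue(struct mock_zone *z, int order)
    {
    	if (order == 0) {
    		if (z->pcp_count == 0 && z->buddy_free > 0) {
    			int batch = z->buddy_free < 16 ? z->buddy_free : 16;
    			z->pcp_count += batch;   /* rmqueue_bulk() analogue */
    			z->buddy_free -= batch;
    		}
    		if (z->pcp_count) {
    			z->pcp_count--;
    			return 1;        /* got a page */
    		}
    		return 0;        /* pcp empty, so buddy was empty too */
    	}
    	/* order > 0: __rmqueue() analogue, taken under zone->lock there */
    	if (z->buddy_free >= (1 << order)) {
    		z->buddy_free -= 1 << order;
    		return 1;
    	}
    	return 0;
    }

    int main(void)
    {
    	struct mock_zone z = { .pcp_count = 0, .buddy_free = 20 };

    	printf("order-0: %d (pcp now %d)\n", mock_rmqueue(&z, 0), z.pcp_count);
    	printf("order-2: %d (buddy now %d)\n", mock_rmqueue(&z, 2), z.buddy_free);
    	return 0;
    }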
@@ -758,20 +755,25 @@ buffered_rmqueue(struct zone *zone, int order, gfp_t gfp_flags)
 	return page;
 }
 
+#define ALLOC_NO_WATERMARKS	0x01 /* don't check watermarks at all */
+#define ALLOC_HARDER		0x02 /* try to alloc harder */
+#define ALLOC_HIGH		0x04 /* __GFP_HIGH set */
+#define ALLOC_CPUSET		0x08 /* check for correct cpuset */
+
 /*
  * Return 1 if free pages are above 'mark'. This takes into account the order
  * of the allocation.
  */
 int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
-		      int classzone_idx, int can_try_harder, gfp_t gfp_high)
+		      int classzone_idx, int alloc_flags)
 {
 	/* free_pages my go negative - that's OK */
 	long min = mark, free_pages = z->free_pages - (1 << order) + 1;
 	int o;
 
-	if (gfp_high)
+	if (alloc_flags & ALLOC_HIGH)
 		min -= min / 2;
-	if (can_try_harder)
+	if (alloc_flags & ALLOC_HARDER)
 		min -= min / 4;
 
 	if (free_pages <= min + z->lowmem_reserve[classzone_idx])
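The two flag-driven reductions compound: ALLOC_HIGH halves the mark, and ALLOC_HARDER then removes another quarter of the remainder, so an allocation with both flags only needs free pages above 3/8 of the original mark. A small user-space check of that arithmetic (the flag values mirror the defines above; everything else is illustrative):

    #include <stdio.h>

    #define ALLOC_HARDER	0x02
    #define ALLOC_HIGH	0x04

    /* Recompute the effective watermark the way zone_watermark_ok() does. */
    static long effective_min(long mark, int alloc_flags)
    {
    	long min = mark;

    	if (alloc_flags & ALLOC_HIGH)
    		min -= min / 2;	/* __GFP_HIGH: halve the mark */
    	if (alloc_flags & ALLOC_HARDER)
    		min -= min / 4;	/* rt task / !wait: another 25% off */
    	return min;
    }

    int main(void)
    {
    	/* e.g. mark = 1024: plain=1024, HIGH=512, HIGH|HARDER=384 */
    	printf("%ld %ld %ld\n",
    	       effective_min(1024, 0),
    	       effective_min(1024, ALLOC_HIGH),
    	       effective_min(1024, ALLOC_HIGH | ALLOC_HARDER));
    	return 0;
    }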
@@ -789,14 +791,40 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
 	return 1;
 }
 
-static inline int
-should_reclaim_zone(struct zone *z, gfp_t gfp_mask)
+/*
+ * get_page_from_freeliest goes through the zonelist trying to allocate
+ * a page.
+ */
+static struct page *
+get_page_from_freelist(gfp_t gfp_mask, unsigned int order,
+		struct zonelist *zonelist, int alloc_flags)
 {
-	if (!z->reclaim_pages)
-		return 0;
-	if (gfp_mask & __GFP_NORECLAIM)
-		return 0;
-	return 1;
+	struct zone **z = zonelist->zones;
+	struct page *page = NULL;
+	int classzone_idx = zone_idx(*z);
+
+	/*
+	 * Go through the zonelist once, looking for a zone with enough free.
+	 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
+	 */
+	do {
+		if ((alloc_flags & ALLOC_CPUSET) &&
+				!cpuset_zone_allowed(*z, gfp_mask))
+			continue;
+
+		if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
+			if (!zone_watermark_ok(*z, order, (*z)->pages_low,
+				       classzone_idx, alloc_flags))
+				continue;
+		}
+
+		page = buffered_rmqueue(*z, order, gfp_mask);
+		if (page) {
+			zone_statistics(zonelist, *z);
+			break;
+		}
+	} while (*(++z) != NULL);
+	return page;
 }
 
 /*
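The new helper walks the NULL-terminated zones[] array with a do/while; a continue inside a do/while jumps to the controlling expression, so *(++z) still advances the cursor before the next pass. A minimal sketch of that traversal pattern (stand-in types and helpers, not the kernel's):

    #include <stddef.h>
    #include <stdio.h>

    /* Stand-ins: an "allocation" succeeds when the zone has free capacity. */
    struct fake_zone { const char *name; int free; };

    static void *first_fit(struct fake_zone **z)
    {
    	void *page = NULL;

    	do {
    		if ((*z)->free == 0)
    			continue;	/* jumps to *(++z) != NULL below */
    		page = *z;		/* buffered_rmqueue() analogue */
    		break;
    	} while (*(++z) != NULL);

    	return page;
    }

    int main(void)
    {
    	struct fake_zone highmem = { "HighMem", 0 }, normal = { "Normal", 42 };
    	struct fake_zone *zones[] = { &highmem, &normal, NULL };
    	struct fake_zone *got = first_fit(zones);

    	printf("%s\n", got ? got->name : "none");	/* prints "Normal" */
    	return 0;
    }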
@@ -807,105 +835,75 @@ __alloc_pages(gfp_t gfp_mask, unsigned int order,
 		struct zonelist *zonelist)
 {
 	const gfp_t wait = gfp_mask & __GFP_WAIT;
-	struct zone **zones, *z;
+	struct zone **z;
 	struct page *page;
 	struct reclaim_state reclaim_state;
 	struct task_struct *p = current;
-	int i;
-	int classzone_idx;
 	int do_retry;
-	int can_try_harder;
+	int alloc_flags;
 	int did_some_progress;
 
 	might_sleep_if(wait);
 
-	/*
-	 * The caller may dip into page reserves a bit more if the caller
-	 * cannot run direct reclaim, or is the caller has realtime scheduling
-	 * policy
-	 */
-	can_try_harder = (unlikely(rt_task(p)) && !in_interrupt()) || !wait;
+	z = zonelist->zones;  /* the list of zones suitable for gfp_mask */
 
-	zones = zonelist->zones;  /* the list of zones suitable for gfp_mask */
-
-	if (unlikely(zones[0] == NULL)) {
+	if (unlikely(*z == NULL)) {
 		/* Should this ever happen?? */
 		return NULL;
 	}
+restart:
+	page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order,
+				zonelist, ALLOC_CPUSET);
+	if (page)
+		goto got_pg;
 
-	classzone_idx = zone_idx(zones[0]);
+	do
+		wakeup_kswapd(*z, order);
+	while (*(++z));
 
-restart:
 	/*
-	 * Go through the zonelist once, looking for a zone with enough free.
-	 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
+	 * OK, we're below the kswapd watermark and have kicked background
+	 * reclaim. Now things get more complex, so set up alloc_flags according
+	 * to how we want to proceed.
+	 *
+	 * The caller may dip into page reserves a bit more if the caller
+	 * cannot run direct reclaim, or if the caller has realtime scheduling
+	 * policy.
 	 */
-	for (i = 0; (z = zones[i]) != NULL; i++) {
-		int do_reclaim = should_reclaim_zone(z, gfp_mask);
-
-		if (!cpuset_zone_allowed(z, __GFP_HARDWALL))
-			continue;
-
-		/*
-		 * If the zone is to attempt early page reclaim then this loop
-		 * will try to reclaim pages and check the watermark a second
-		 * time before giving up and falling back to the next zone.
-		 */
-zone_reclaim_retry:
-		if (!zone_watermark_ok(z, order, z->pages_low,
-				       classzone_idx, 0, 0)) {
-			if (!do_reclaim)
-				continue;
-			else {
-				zone_reclaim(z, gfp_mask, order);
-				/* Only try reclaim once */
-				do_reclaim = 0;
-				goto zone_reclaim_retry;
-			}
-		}
-
-		page = buffered_rmqueue(z, order, gfp_mask);
-		if (page)
-			goto got_pg;
-	}
-
-	for (i = 0; (z = zones[i]) != NULL; i++)
-		wakeup_kswapd(z, order);
+	alloc_flags = 0;
+	if ((unlikely(rt_task(p)) && !in_interrupt()) || !wait)
+		alloc_flags |= ALLOC_HARDER;
+	if (gfp_mask & __GFP_HIGH)
+		alloc_flags |= ALLOC_HIGH;
+	if (wait)
+		alloc_flags |= ALLOC_CPUSET;
 
 	/*
 	 * Go through the zonelist again. Let __GFP_HIGH and allocations
-	 * coming from realtime tasks to go deeper into reserves
+	 * coming from realtime tasks go deeper into reserves.
 	 *
 	 * This is the last chance, in general, before the goto nopage.
 	 * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
 	 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
 	 */
-	for (i = 0; (z = zones[i]) != NULL; i++) {
-		if (!zone_watermark_ok(z, order, z->pages_min,
-				       classzone_idx, can_try_harder,
-				       gfp_mask & __GFP_HIGH))
-			continue;
-
-		if (wait && !cpuset_zone_allowed(z, gfp_mask))
-			continue;
-
-		page = buffered_rmqueue(z, order, gfp_mask);
-		if (page)
-			goto got_pg;
-	}
+	page = get_page_from_freelist(gfp_mask, order, zonelist, alloc_flags);
+	if (page)
+		goto got_pg;
 
 	/* This allocation should allow future memory freeing. */
 
 	if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE)))
 			&& !in_interrupt()) {
 		if (!(gfp_mask & __GFP_NOMEMALLOC)) {
+nofail_alloc:
 			/* go through the zonelist yet again, ignoring mins */
-			for (i = 0; (z = zones[i]) != NULL; i++) {
-				if (!cpuset_zone_allowed(z, gfp_mask))
-					continue;
-				page = buffered_rmqueue(z, order, gfp_mask);
-				if (page)
-					goto got_pg;
+			page = get_page_from_freelist(gfp_mask, order,
+				zonelist, ALLOC_NO_WATERMARKS|ALLOC_CPUSET);
+			if (page)
+				goto got_pg;
+			if (gfp_mask & __GFP_NOFAIL) {
+				blk_congestion_wait(WRITE, HZ/50);
+				goto nofail_alloc;
 			}
 		}
 		goto nopage;
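Taken together, the rewritten function tries the freelists in a fixed escalation order, with get_page_from_freelist() doing all zone iteration: one pass at pages_low with ALLOC_CPUSET, kick kswapd on every zone, a second pass with the computed alloc_flags, then a watermark-free pass for PF_MEMALLOC/TIF_MEMDIE callers (looping via nofail_alloc for __GFP_NOFAIL), and finally direct reclaim. A sketch of how the slow-path flags are derived (user-space mirror of the lines above; the FAKE_GFP_* bit values are made up for the demo, not the kernel's):

    #include <stdio.h>

    #define ALLOC_HARDER	0x02
    #define ALLOC_HIGH	0x04
    #define ALLOC_CPUSET	0x08

    #define FAKE_GFP_WAIT	0x10u	/* illustrative bit, not the kernel's value */
    #define FAKE_GFP_HIGH	0x20u	/* illustrative bit, not the kernel's value */

    /* Mirror of the alloc_flags setup above; rt and in_irq stand in for
     * rt_task(current) and in_interrupt(). */
    static int compute_alloc_flags(unsigned int gfp_mask, int rt, int in_irq)
    {
    	unsigned int wait = gfp_mask & FAKE_GFP_WAIT;
    	int alloc_flags = 0;

    	if ((rt && !in_irq) || !wait)
    		alloc_flags |= ALLOC_HARDER;	/* dip deeper into reserves */
    	if (gfp_mask & FAKE_GFP_HIGH)
    		alloc_flags |= ALLOC_HIGH;	/* halve the watermark */
    	if (wait)
    		alloc_flags |= ALLOC_CPUSET;	/* honor cpuset placement */
    	return alloc_flags;
    }

    int main(void)
    {
    	/* GFP_ATOMIC-like caller: !wait, high bit set -> 0x06 */
    	printf("atomic: 0x%x\n", compute_alloc_flags(FAKE_GFP_HIGH, 0, 0));
    	/* ordinary GFP_KERNEL-like caller: wait set -> 0x08 */
    	printf("kernel: 0x%x\n", compute_alloc_flags(FAKE_GFP_WAIT, 0, 0));
    	return 0;
    }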
@@ -923,7 +921,7 @@ rebalance:
 	reclaim_state.reclaimed_slab = 0;
 	p->reclaim_state = &reclaim_state;
 
-	did_some_progress = try_to_free_pages(zones, gfp_mask);
+	did_some_progress = try_to_free_pages(zonelist->zones, gfp_mask);
 
 	p->reclaim_state = NULL;
 	p->flags &= ~PF_MEMALLOC;
@@ -931,19 +929,10 @@ rebalance:
 	cond_resched();
 
 	if (likely(did_some_progress)) {
-		for (i = 0; (z = zones[i]) != NULL; i++) {
-			if (!zone_watermark_ok(z, order, z->pages_min,
-					       classzone_idx, can_try_harder,
-					       gfp_mask & __GFP_HIGH))
-				continue;
-
-			if (!cpuset_zone_allowed(z, gfp_mask))
-				continue;
-
-			page = buffered_rmqueue(z, order, gfp_mask);
-			if (page)
-				goto got_pg;
-		}
+		page = get_page_from_freelist(gfp_mask, order,
+						zonelist, alloc_flags);
+		if (page)
+			goto got_pg;
 	} else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
 		/*
 		 * Go through the zonelist yet one more time, keep
@@ -951,18 +940,10 @@ rebalance:
 		 * a parallel oom killing, we must fail if we're still
 		 * under heavy pressure.
 		 */
-		for (i = 0; (z = zones[i]) != NULL; i++) {
-			if (!zone_watermark_ok(z, order, z->pages_high,
-					       classzone_idx, 0, 0))
-				continue;
-
-			if (!cpuset_zone_allowed(z, __GFP_HARDWALL))
-				continue;
-
-			page = buffered_rmqueue(z, order, gfp_mask);
-			if (page)
-				goto got_pg;
-		}
+		page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order,
+				zonelist, ALLOC_CPUSET);
+		if (page)
+			goto got_pg;
 
 		out_of_memory(gfp_mask, order);
 		goto restart;
@@ -995,9 +976,7 @@ nopage:
 		dump_stack();
 		show_mem();
 	}
-	return NULL;
 got_pg:
-	zone_statistics(zonelist, z);
 	return page;
 }
 
@@ -1334,7 +1313,7 @@ void show_free_areas(void)
 		} else
 			printk("\n");
 
-		for_each_cpu(cpu) {
+		for_each_online_cpu(cpu) {
 			struct per_cpu_pageset *pageset;
 
 			pageset = zone_pcp(zone, cpu);
@@ -2426,13 +2405,18 @@ void setup_per_zone_pages_min(void)
 	}
 
 	for_each_zone(zone) {
+		unsigned long tmp;
 		spin_lock_irqsave(&zone->lru_lock, flags);
+		tmp = (pages_min * zone->present_pages) / lowmem_pages;
 		if (is_highmem(zone)) {
 			/*
-			 * Often, highmem doesn't need to reserve any pages.
-			 * But the pages_min/low/high values are also used for
-			 * batching up page reclaim activity so we need a
-			 * decent value here.
+			 * __GFP_HIGH and PF_MEMALLOC allocations usually don't
+			 * need highmem pages, so cap pages_min to a small
+			 * value here.
+			 *
+			 * The (pages_high-pages_low) and (pages_low-pages_min)
+			 * deltas controls asynch page reclaim, and so should
+			 * not be capped for highmem.
 			 */
 			int min_pages;
 
@@ -2443,19 +2427,15 @@ void setup_per_zone_pages_min(void)
 				min_pages = 128;
 			zone->pages_min = min_pages;
 		} else {
-			/* if it's a lowmem zone, reserve a number of pages
+			/*
+			 * If it's a lowmem zone, reserve a number of pages
 			 * proportionate to the zone's size.
 			 */
-			zone->pages_min = (pages_min * zone->present_pages) /
-					   lowmem_pages;
+			zone->pages_min = tmp;
 		}
 
-		/*
-		 * When interpreting these watermarks, just keep in mind that:
-		 * zone->pages_min == (zone->pages_min * 4) / 4;
-		 */
-		zone->pages_low = (zone->pages_min * 5) / 4;
-		zone->pages_high = (zone->pages_min * 6) / 4;
+		zone->pages_low = zone->pages_min + tmp / 4;
+		zone->pages_high = zone->pages_min + tmp / 2;
 		spin_unlock_irqrestore(&zone->lru_lock, flags);
 	}
 }
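Because tmp is the uncapped proportional share, the new formulas reproduce the old lowmem ratios (pages_min + tmp/4 equals pages_min * 5/4 when pages_min == tmp) while letting a highmem zone keep size-proportional reclaim deltas on top of its capped pages_min. A quick numeric check (plain C, values illustrative):

    #include <stdio.h>

    int main(void)
    {
    	/* lowmem zone: pages_min == tmp, so the ratios match the old code */
    	unsigned long tmp = 1000, pages_min = tmp;

    	printf("low=%lu high=%lu\n",
    	       pages_min + tmp / 4,	/* 1250 == (1000 * 5) / 4 */
    	       pages_min + tmp / 2);	/* 1500 == (1000 * 6) / 4 */

    	/* highmem zone: pages_min capped (e.g. 128), but the reclaim
    	 * deltas still scale with zone size via tmp */
    	pages_min = 128;
    	printf("low=%lu high=%lu\n",
    	       pages_min + tmp / 4,	/* 378 */
    	       pages_min + tmp / 2);	/* 628 */
    	return 0;
    }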