author     Pekka Enberg <penberg@kernel.org>  2011-03-11 11:10:45 -0500
committer  Pekka Enberg <penberg@kernel.org>  2011-03-11 11:10:45 -0500
commit     c9149556756d56c68451a4a8735c37e7062fd3d7 (patch)
tree       a2dae56b22adaa9a23c8f92f30c3b3ad3b610850 /mm
parent     d71f606f687ef9d0cdddfd3619ca7cb9a0b3fb63 (diff)
parent     5bfe53a77e8a3ffce4a10003c75f464a138e272d (diff)

Merge branch 'slab/rcu' into slab/next

Conflicts:
	mm/slub.c

Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig            |  2
-rw-r--r--  mm/huge_memory.c      | 72
-rw-r--r--  mm/kmemleak-test.c    |  6
-rw-r--r--  mm/kmemleak.c         | 13
-rw-r--r--  mm/memblock.c         |  2
-rw-r--r--  mm/memcontrol.c       | 98
-rw-r--r--  mm/memory-failure.c   | 94
-rw-r--r--  mm/memory.c           | 34
-rw-r--r--  mm/mempolicy.c        | 16
-rw-r--r--  mm/migrate.c          | 15
-rw-r--r--  mm/mlock.c            |  7
-rw-r--r--  mm/mremap.c           |  4
-rw-r--r--  mm/page_alloc.c       | 23
-rw-r--r--  mm/pgtable-generic.c  |  1
-rw-r--r--  mm/slab.c             | 39
-rw-r--r--  mm/slub.c             | 77
-rw-r--r--  mm/swapfile.c         |  2
-rw-r--r--  mm/truncate.c         |  2
-rw-r--r--  mm/vmscan.c           | 39
19 files changed, 356 insertions(+), 190 deletions(-)
diff --git a/mm/Kconfig b/mm/Kconfig
index 3ad483bdf505..e9c0c61f2ddd 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -179,7 +179,7 @@ config SPLIT_PTLOCK_CPUS
 config COMPACTION
 	bool "Allow for memory compaction"
 	select MIGRATION
-	depends on EXPERIMENTAL && HUGETLB_PAGE && MMU
+	depends on MMU
 	help
 	  Allows the compaction of memory for the allocation of huge pages.
 
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index e187454d82f6..dbe99a5f2073 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -650,10 +650,10 @@ static inline gfp_t alloc_hugepage_gfpmask(int defrag)
 
 static inline struct page *alloc_hugepage_vma(int defrag,
 					      struct vm_area_struct *vma,
-					      unsigned long haddr)
+					      unsigned long haddr, int nd)
 {
 	return alloc_pages_vma(alloc_hugepage_gfpmask(defrag),
-			       HPAGE_PMD_ORDER, vma, haddr);
+			       HPAGE_PMD_ORDER, vma, haddr, nd);
 }
 
 #ifndef CONFIG_NUMA
@@ -678,7 +678,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		if (unlikely(khugepaged_enter(vma)))
 			return VM_FAULT_OOM;
 		page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
-					  vma, haddr);
+					  vma, haddr, numa_node_id());
 		if (unlikely(!page))
 			goto out;
 		if (unlikely(mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))) {
@@ -799,8 +799,8 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
 	}
 
 	for (i = 0; i < HPAGE_PMD_NR; i++) {
-		pages[i] = alloc_page_vma(GFP_HIGHUSER_MOVABLE,
-					  vma, address);
+		pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE,
+					       vma, address, page_to_nid(page));
 		if (unlikely(!pages[i] ||
 			     mem_cgroup_newpage_charge(pages[i], mm,
 						       GFP_KERNEL))) {
@@ -902,7 +902,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	if (transparent_hugepage_enabled(vma) &&
 	    !transparent_hugepage_debug_cow())
 		new_page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
-					      vma, haddr);
+					      vma, haddr, numa_node_id());
 	else
 		new_page = NULL;
 
@@ -1162,7 +1162,12 @@ static void __split_huge_page_refcount(struct page *page)
 		/* after clearing PageTail the gup refcount can be released */
 		smp_mb();
 
-		page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
+		/*
+		 * retain hwpoison flag of the poisoned tail page:
+		 *   fix for the unsuitable process killed on Guest Machine(KVM)
+		 *   by the memory-failure.
+		 */
+		page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP | __PG_HWPOISON;
 		page_tail->flags |= (page->flags &
 				     ((1L << PG_referenced) |
 				      (1L << PG_swapbacked) |
@@ -1740,7 +1745,8 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
 static void collapse_huge_page(struct mm_struct *mm,
 			       unsigned long address,
 			       struct page **hpage,
-			       struct vm_area_struct *vma)
+			       struct vm_area_struct *vma,
+			       int node)
 {
 	pgd_t *pgd;
 	pud_t *pud;
@@ -1768,7 +1774,8 @@ static void collapse_huge_page(struct mm_struct *mm,
 	 * mmap_sem in read mode is good idea also to allow greater
 	 * scalability.
 	 */
-	new_page = alloc_hugepage_vma(khugepaged_defrag(), vma, address);
+	new_page = alloc_hugepage_vma(khugepaged_defrag(), vma, address,
+				      node);
 	if (unlikely(!new_page)) {
 		up_read(&mm->mmap_sem);
 		*hpage = ERR_PTR(-ENOMEM);
@@ -1806,6 +1813,8 @@ static void collapse_huge_page(struct mm_struct *mm,
 	/* VM_PFNMAP vmas may have vm_ops null but vm_file set */
 	if (!vma->anon_vma || vma->vm_ops || vma->vm_file)
 		goto out;
+	if (is_vma_temporary_stack(vma))
+		goto out;
 	VM_BUG_ON(is_linear_pfn_mapping(vma) || is_pfn_mapping(vma));
 
 	pgd = pgd_offset(mm, address);
@@ -1847,7 +1856,6 @@ static void collapse_huge_page(struct mm_struct *mm,
 		set_pmd_at(mm, address, pmd, _pmd);
 		spin_unlock(&mm->page_table_lock);
 		anon_vma_unlock(vma->anon_vma);
-		mem_cgroup_uncharge_page(new_page);
 		goto out;
 	}
 
@@ -1893,6 +1901,7 @@ out_up_write:
 	return;
 
 out:
+	mem_cgroup_uncharge_page(new_page);
 #ifdef CONFIG_NUMA
 	put_page(new_page);
 #endif
@@ -1912,6 +1921,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
 	struct page *page;
 	unsigned long _address;
 	spinlock_t *ptl;
+	int node = -1;
 
 	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
 
@@ -1942,6 +1952,13 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
 		page = vm_normal_page(vma, _address, pteval);
 		if (unlikely(!page))
 			goto out_unmap;
+		/*
+		 * Chose the node of the first page. This could
+		 * be more sophisticated and look at more pages,
+		 * but isn't for now.
+		 */
+		if (node == -1)
+			node = page_to_nid(page);
 		VM_BUG_ON(PageCompound(page));
 		if (!PageLRU(page) || PageLocked(page) || !PageAnon(page))
 			goto out_unmap;
@@ -1958,7 +1975,7 @@ out_unmap:
 	pte_unmap_unlock(pte, ptl);
 	if (ret)
 		/* collapse_huge_page will return with the mmap_sem released */
-		collapse_huge_page(mm, address, hpage, vma);
+		collapse_huge_page(mm, address, hpage, vma, node);
 out:
 	return ret;
 }
@@ -2027,32 +2044,27 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
 		if ((!(vma->vm_flags & VM_HUGEPAGE) &&
 		     !khugepaged_always()) ||
 		    (vma->vm_flags & VM_NOHUGEPAGE)) {
+		skip:
 			progress++;
 			continue;
 		}
-
 		/* VM_PFNMAP vmas may have vm_ops null but vm_file set */
-		if (!vma->anon_vma || vma->vm_ops || vma->vm_file) {
-			khugepaged_scan.address = vma->vm_end;
-			progress++;
-			continue;
-		}
+		if (!vma->anon_vma || vma->vm_ops || vma->vm_file)
+			goto skip;
+		if (is_vma_temporary_stack(vma))
+			goto skip;
+
 		VM_BUG_ON(is_linear_pfn_mapping(vma) || is_pfn_mapping(vma));
 
 		hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
 		hend = vma->vm_end & HPAGE_PMD_MASK;
-		if (hstart >= hend) {
-			progress++;
-			continue;
-		}
+		if (hstart >= hend)
+			goto skip;
+		if (khugepaged_scan.address > hend)
+			goto skip;
 		if (khugepaged_scan.address < hstart)
 			khugepaged_scan.address = hstart;
-		if (khugepaged_scan.address > hend) {
-			khugepaged_scan.address = hend + HPAGE_PMD_SIZE;
-			progress++;
-			continue;
-		}
-		BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK);
+		VM_BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK);
 
 		while (khugepaged_scan.address < hend) {
 			int ret;
@@ -2081,7 +2093,7 @@ breakouterloop:
 breakouterloop_mmap_sem:
 
 	spin_lock(&khugepaged_mm_lock);
-	BUG_ON(khugepaged_scan.mm_slot != mm_slot);
+	VM_BUG_ON(khugepaged_scan.mm_slot != mm_slot);
 	/*
 	 * Release the current mm_slot if this mm is about to die, or
 	 * if we scanned all vmas of this mm.
@@ -2236,9 +2248,9 @@ static int khugepaged(void *none)
 
 	for (;;) {
 		mutex_unlock(&khugepaged_mutex);
-		BUG_ON(khugepaged_thread != current);
+		VM_BUG_ON(khugepaged_thread != current);
 		khugepaged_loop();
-		BUG_ON(khugepaged_thread != current);
+		VM_BUG_ON(khugepaged_thread != current);
 
 		mutex_lock(&khugepaged_mutex);
 		if (!khugepaged_enabled())
diff --git a/mm/kmemleak-test.c b/mm/kmemleak-test.c
index 177a5169bbde..ff0d9779cec8 100644
--- a/mm/kmemleak-test.c
+++ b/mm/kmemleak-test.c
@@ -75,13 +75,11 @@ static int __init kmemleak_test_init(void)
 	 * after the module is removed.
 	 */
 	for (i = 0; i < 10; i++) {
-		elem = kmalloc(sizeof(*elem), GFP_KERNEL);
-		pr_info("kmemleak: kmalloc(sizeof(*elem)) = %p\n", elem);
+		elem = kzalloc(sizeof(*elem), GFP_KERNEL);
+		pr_info("kmemleak: kzalloc(sizeof(*elem)) = %p\n", elem);
 		if (!elem)
 			return -ENOMEM;
-		memset(elem, 0, sizeof(*elem));
 		INIT_LIST_HEAD(&elem->list);
-
 		list_add_tail(&elem->list, &test_list);
 	}
 
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index bd9bc214091b..84225f3b7190 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -113,7 +113,9 @@
 #define BYTES_PER_POINTER	sizeof(void *)
 
 /* GFP bitmask for kmemleak internal allocations */
-#define GFP_KMEMLEAK_MASK	(GFP_KERNEL | GFP_ATOMIC)
+#define gfp_kmemleak_mask(gfp)	(((gfp) & (GFP_KERNEL | GFP_ATOMIC)) | \
+				 __GFP_NORETRY | __GFP_NOMEMALLOC | \
+				 __GFP_NOWARN)
 
 /* scanning area inside a memory block */
 struct kmemleak_scan_area {
@@ -511,9 +513,10 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size,
 	struct kmemleak_object *object;
 	struct prio_tree_node *node;
 
-	object = kmem_cache_alloc(object_cache, gfp & GFP_KMEMLEAK_MASK);
+	object = kmem_cache_alloc(object_cache, gfp_kmemleak_mask(gfp));
 	if (!object) {
-		kmemleak_stop("Cannot allocate a kmemleak_object structure\n");
+		pr_warning("Cannot allocate a kmemleak_object structure\n");
+		kmemleak_disable();
 		return NULL;
 	}
 
@@ -734,9 +737,9 @@ static void add_scan_area(unsigned long ptr, size_t size, gfp_t gfp)
 		return;
 	}
 
-	area = kmem_cache_alloc(scan_area_cache, gfp & GFP_KMEMLEAK_MASK);
+	area = kmem_cache_alloc(scan_area_cache, gfp_kmemleak_mask(gfp));
 	if (!area) {
-		kmemleak_warn("Cannot allocate a scan area\n");
+		pr_warning("Cannot allocate a scan area\n");
 		goto out;
 	}
 
diff --git a/mm/memblock.c b/mm/memblock.c
index bdba245d8afd..4618fda975a0 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -137,8 +137,6 @@ static phys_addr_t __init_memblock memblock_find_base(phys_addr_t size,
 
 	BUG_ON(0 == size);
 
-	size = memblock_align_up(size, align);
-
 	/* Pump up max_addr */
 	if (end == MEMBLOCK_ALLOC_ACCESSIBLE)
 		end = memblock.current_limit;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index db76ef726293..da53a252b259 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -612,8 +612,10 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
 	/* pagein of a big page is an event. So, ignore page size */
 	if (nr_pages > 0)
 		__this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGIN_COUNT]);
-	else
+	else {
 		__this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGOUT_COUNT]);
+		nr_pages = -nr_pages; /* for event */
+	}
 
 	__this_cpu_add(mem->stat->count[MEM_CGROUP_EVENTS], nr_pages);
 
@@ -1111,6 +1113,23 @@ static bool mem_cgroup_check_under_limit(struct mem_cgroup *mem)
 	return false;
 }
 
+/**
+ * mem_cgroup_check_margin - check if the memory cgroup allows charging
+ * @mem: memory cgroup to check
+ * @bytes: the number of bytes the caller intends to charge
+ *
+ * Returns a boolean value on whether @mem can be charged @bytes or
+ * whether this would exceed the limit.
+ */
+static bool mem_cgroup_check_margin(struct mem_cgroup *mem, unsigned long bytes)
+{
+	if (!res_counter_check_margin(&mem->res, bytes))
+		return false;
+	if (do_swap_account && !res_counter_check_margin(&mem->memsw, bytes))
+		return false;
+	return true;
+}
+
 static unsigned int get_swappiness(struct mem_cgroup *memcg)
 {
 	struct cgroup *cgrp = memcg->css.cgroup;
@@ -1832,27 +1851,39 @@ static int __mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask,
 		if (likely(!ret))
 			return CHARGE_OK;
 
+		res_counter_uncharge(&mem->res, csize);
 		mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw);
 		flags |= MEM_CGROUP_RECLAIM_NOSWAP;
 	} else
 		mem_over_limit = mem_cgroup_from_res_counter(fail_res, res);
-
-	if (csize > PAGE_SIZE) /* change csize and retry */
+	/*
+	 * csize can be either a huge page (HPAGE_SIZE), a batch of
+	 * regular pages (CHARGE_SIZE), or a single regular page
+	 * (PAGE_SIZE).
+	 *
+	 * Never reclaim on behalf of optional batching, retry with a
+	 * single page instead.
+	 */
+	if (csize == CHARGE_SIZE)
 		return CHARGE_RETRY;
 
 	if (!(gfp_mask & __GFP_WAIT))
 		return CHARGE_WOULDBLOCK;
 
 	ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL,
 					      gfp_mask, flags);
+	if (mem_cgroup_check_margin(mem_over_limit, csize))
+		return CHARGE_RETRY;
 	/*
-	 * try_to_free_mem_cgroup_pages() might not give us a full
-	 * picture of reclaim. Some pages are reclaimed and might be
-	 * moved to swap cache or just unmapped from the cgroup.
-	 * Check the limit again to see if the reclaim reduced the
-	 * current usage of the cgroup before giving up
+	 * Even though the limit is exceeded at this point, reclaim
+	 * may have been able to free some pages.  Retry the charge
+	 * before killing the task.
+	 *
+	 * Only for regular pages, though: huge pages are rather
+	 * unlikely to succeed so close to the limit, and we fall back
+	 * to regular pages anyway in case of failure.
 	 */
-	if (ret || mem_cgroup_check_under_limit(mem_over_limit))
+	if (csize == PAGE_SIZE && ret)
 		return CHARGE_RETRY;
 
 	/*
@@ -2144,6 +2175,8 @@ void mem_cgroup_split_huge_fixup(struct page *head, struct page *tail)
 	struct page_cgroup *tail_pc = lookup_page_cgroup(tail);
 	unsigned long flags;
 
+	if (mem_cgroup_disabled())
+		return;
 	/*
 	 * We have no races with charge/uncharge but will have races with
 	 * page state accounting.
@@ -2233,7 +2266,12 @@ static int mem_cgroup_move_account(struct page_cgroup *pc,
 {
 	int ret = -EINVAL;
 	unsigned long flags;
-
+	/*
+	 * The page is isolated from LRU. So, collapse function
+	 * will not handle this page. But page splitting can happen.
+	 * Do this check under compound_page_lock(). The caller should
+	 * hold it.
+	 */
 	if ((charge_size > PAGE_SIZE) && !PageTransHuge(pc->page))
 		return -EBUSY;
 
@@ -2265,7 +2303,7 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc,
 	struct cgroup *cg = child->css.cgroup;
 	struct cgroup *pcg = cg->parent;
 	struct mem_cgroup *parent;
-	int charge = PAGE_SIZE;
+	int page_size = PAGE_SIZE;
 	unsigned long flags;
 	int ret;
 
@@ -2278,23 +2316,26 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc,
 		goto out;
 	if (isolate_lru_page(page))
 		goto put;
-	/* The page is isolated from LRU and we have no race with splitting */
-	charge = PAGE_SIZE << compound_order(page);
+
+	if (PageTransHuge(page))
+		page_size = HPAGE_SIZE;
 
 	parent = mem_cgroup_from_cont(pcg);
-	ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false, charge);
+	ret = __mem_cgroup_try_charge(NULL, gfp_mask,
+				&parent, false, page_size);
 	if (ret || !parent)
 		goto put_back;
 
-	if (charge > PAGE_SIZE)
+	if (page_size > PAGE_SIZE)
 		flags = compound_lock_irqsave(page);
 
-	ret = mem_cgroup_move_account(pc, child, parent, true, charge);
+	ret = mem_cgroup_move_account(pc, child, parent, true, page_size);
 	if (ret)
-		mem_cgroup_cancel_charge(parent, charge);
-put_back:
-	if (charge > PAGE_SIZE)
+		mem_cgroup_cancel_charge(parent, page_size);
+
+	if (page_size > PAGE_SIZE)
 		compound_unlock_irqrestore(page, flags);
+put_back:
 	putback_lru_page(page);
 put:
 	put_page(page);
@@ -2312,13 +2353,19 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
 				gfp_t gfp_mask, enum charge_type ctype)
 {
 	struct mem_cgroup *mem = NULL;
+	int page_size = PAGE_SIZE;
 	struct page_cgroup *pc;
+	bool oom = true;
 	int ret;
-	int page_size = PAGE_SIZE;
 
 	if (PageTransHuge(page)) {
 		page_size <<= compound_order(page);
 		VM_BUG_ON(!PageTransHuge(page));
+		/*
+		 * Never OOM-kill a process for a huge page.  The
+		 * fault handler will fall back to regular pages.
+		 */
+		oom = false;
 	}
 
 	pc = lookup_page_cgroup(page);
@@ -2327,7 +2374,7 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
 		return 0;
 	prefetchw(pc);
 
-	ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true, page_size);
+	ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, oom, page_size);
 	if (ret || !mem)
 		return ret;
 
@@ -5013,9 +5060,9 @@ struct cgroup_subsys mem_cgroup_subsys = {
 static int __init enable_swap_account(char *s)
 {
 	/* consider enabled if no parameter or 1 is given */
-	if (!s || !strcmp(s, "1"))
+	if (!(*s) || !strcmp(s, "=1"))
 		really_do_swap_account = 1;
-	else if (!strcmp(s, "0"))
+	else if (!strcmp(s, "=0"))
 		really_do_swap_account = 0;
 	return 1;
 }
@@ -5023,7 +5070,8 @@ __setup("swapaccount", enable_swap_account);
 
 static int __init disable_swap_account(char *s)
 {
-	enable_swap_account("0");
+	printk_once("noswapaccount is deprecated and will be removed in 2.6.40. Use swapaccount=0 instead\n");
+	enable_swap_account("=0");
 	return 1;
 }
 __setup("noswapaccount", disable_swap_account);
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 548fbd70f026..0207c2f6f8bd 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -233,8 +233,8 @@ void shake_page(struct page *p, int access)
 	}
 
 	/*
-	 * Only all shrink_slab here (which would also
-	 * shrink other caches) if access is not potentially fatal.
+	 * Only call shrink_slab here (which would also shrink other caches) if
+	 * access is not potentially fatal.
 	 */
 	if (access) {
 		int nr;
@@ -386,8 +386,6 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
 	struct task_struct *tsk;
 	struct anon_vma *av;
 
-	if (!PageHuge(page) && unlikely(split_huge_page(page)))
-		return;
 	read_lock(&tasklist_lock);
 	av = page_lock_anon_vma(page);
 	if (av == NULL)	/* Not actually mapped anymore */
@@ -856,6 +854,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
 	int ret;
 	int kill = 1;
 	struct page *hpage = compound_head(p);
+	struct page *ppage;
 
 	if (PageReserved(p) || PageSlab(p))
 		return SWAP_SUCCESS;
@@ -897,6 +896,44 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
 	}
 
 	/*
+	 * ppage: poisoned page
+	 *   if p is regular page(4k page)
+	 *        ppage == real poisoned page;
+	 *   else p is hugetlb or THP, ppage == head page.
+	 */
+	ppage = hpage;
+
+	if (PageTransHuge(hpage)) {
+		/*
+		 * Verify that this isn't a hugetlbfs head page, the check for
+		 * PageAnon is just for avoid tripping a split_huge_page
+		 * internal debug check, as split_huge_page refuses to deal with
+		 * anything that isn't an anon page. PageAnon can't go away fro
+		 * under us because we hold a refcount on the hpage, without a
+		 * refcount on the hpage. split_huge_page can't be safely called
+		 * in the first place, having a refcount on the tail isn't
+		 * enough * to be safe.
+		 */
+		if (!PageHuge(hpage) && PageAnon(hpage)) {
+			if (unlikely(split_huge_page(hpage))) {
+				/*
+				 * FIXME: if splitting THP is failed, it is
+				 * better to stop the following operation rather
+				 * than causing panic by unmapping. System might
+				 * survive if the page is freed later.
+				 */
+				printk(KERN_INFO
+					"MCE %#lx: failed to split THP\n", pfn);
+
+				BUG_ON(!PageHWPoison(p));
+				return SWAP_FAIL;
+			}
+			/* THP is split, so ppage should be the real poisoned page. */
+			ppage = p;
+		}
+	}
+
+	/*
 	 * First collect all the processes that have the page
 	 * mapped in dirty form.  This has to be done before try_to_unmap,
 	 * because ttu takes the rmap data structures down.
@@ -905,12 +942,18 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
 	 * there's nothing that can be done.
 	 */
 	if (kill)
-		collect_procs(hpage, &tokill);
+		collect_procs(ppage, &tokill);
+
+	if (hpage != ppage)
+		lock_page_nosync(ppage);
 
-	ret = try_to_unmap(hpage, ttu);
+	ret = try_to_unmap(ppage, ttu);
 	if (ret != SWAP_SUCCESS)
 		printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n",
-				pfn, page_mapcount(hpage));
+				pfn, page_mapcount(ppage));
+
+	if (hpage != ppage)
+		unlock_page(ppage);
 
 	/*
 	 * Now that the dirty bit has been propagated to the
@@ -921,7 +964,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
 	 * use a more force-full uncatchable kill to prevent
 	 * any accesses to the poisoned memory.
 	 */
-	kill_procs_ao(&tokill, !!PageDirty(hpage), trapno,
+	kill_procs_ao(&tokill, !!PageDirty(ppage), trapno,
 		      ret != SWAP_SUCCESS, p, pfn);
 
 	return ret;
@@ -1022,19 +1065,22 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
 	 * The check (unnecessarily) ignores LRU pages being isolated and
 	 * walked by the page reclaim code, however that's not a big loss.
 	 */
-	if (!PageLRU(p) && !PageHuge(p))
-		shake_page(p, 0);
-	if (!PageLRU(p) && !PageHuge(p)) {
-		/*
-		 * shake_page could have turned it free.
-		 */
-		if (is_free_buddy_page(p)) {
-			action_result(pfn, "free buddy, 2nd try", DELAYED);
-			return 0;
+	if (!PageHuge(p) && !PageTransCompound(p)) {
+		if (!PageLRU(p))
+			shake_page(p, 0);
+		if (!PageLRU(p)) {
+			/*
+			 * shake_page could have turned it free.
+			 */
+			if (is_free_buddy_page(p)) {
+				action_result(pfn, "free buddy, 2nd try",
+						DELAYED);
+				return 0;
+			}
+			action_result(pfn, "non LRU", IGNORED);
+			put_page(p);
+			return -EBUSY;
 		}
-		action_result(pfn, "non LRU", IGNORED);
-		put_page(p);
-		return -EBUSY;
 	}
 
 	/*
@@ -1064,7 +1110,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
 	 * For error on the tail page, we should set PG_hwpoison
 	 * on the head page to show that the hugepage is hwpoisoned
 	 */
-	if (PageTail(p) && TestSetPageHWPoison(hpage)) {
+	if (PageHuge(p) && PageTail(p) && TestSetPageHWPoison(hpage)) {
 		action_result(pfn, "hugepage already hardware poisoned",
 				IGNORED);
 		unlock_page(hpage);
@@ -1295,7 +1341,10 @@ static int soft_offline_huge_page(struct page *page, int flags)
 	ret = migrate_huge_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0,
 				true);
 	if (ret) {
-		putback_lru_pages(&pagelist);
+		struct page *page1, *page2;
+		list_for_each_entry_safe(page1, page2, &pagelist, lru)
+			put_page(page1);
+
 		pr_debug("soft offline: %#lx: migration failed %d, type %lx\n",
 			pfn, ret, page->flags);
 		if (ret > 0)
@@ -1419,6 +1468,7 @@ int soft_offline_page(struct page *page, int flags)
 	ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL,
 								0, true);
 	if (ret) {
+		putback_lru_pages(&pagelist);
 		pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
 			pfn, ret, page->flags);
 		if (ret > 0)
diff --git a/mm/memory.c b/mm/memory.c
index 31250faff390..5823698c2b71 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2219,7 +2219,6 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 						 &ptl);
 		if (!pte_same(*page_table, orig_pte)) {
 			unlock_page(old_page);
-			page_cache_release(old_page);
 			goto unlock;
 		}
 		page_cache_release(old_page);
@@ -2289,7 +2288,6 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 						 &ptl);
 		if (!pte_same(*page_table, orig_pte)) {
 			unlock_page(old_page);
-			page_cache_release(old_page);
 			goto unlock;
 		}
 
@@ -2367,16 +2365,6 @@ gotten:
 	}
 	__SetPageUptodate(new_page);
 
-	/*
-	 * Don't let another task, with possibly unlocked vma,
-	 * keep the mlocked page.
-	 */
-	if ((vma->vm_flags & VM_LOCKED) && old_page) {
-		lock_page(old_page);	/* for LRU manipulation */
-		clear_page_mlock(old_page);
-		unlock_page(old_page);
-	}
-
 	if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))
 		goto oom_free_new;
 
@@ -2444,10 +2432,20 @@ gotten:
 
 	if (new_page)
 		page_cache_release(new_page);
-	if (old_page)
-		page_cache_release(old_page);
 unlock:
 	pte_unmap_unlock(page_table, ptl);
+	if (old_page) {
+		/*
+		 * Don't let another task, with possibly unlocked vma,
+		 * keep the mlocked page.
+		 */
+		if ((ret & VM_FAULT_WRITE) && (vma->vm_flags & VM_LOCKED)) {
+			lock_page(old_page);	/* LRU manipulation */
+			munlock_vma_page(old_page);
+			unlock_page(old_page);
+		}
+		page_cache_release(old_page);
+	}
 	return ret;
 oom_free_new:
 	page_cache_release(new_page);
@@ -2650,6 +2648,7 @@ void unmap_mapping_range(struct address_space *mapping,
 	details.last_index = ULONG_MAX;
 	details.i_mmap_lock = &mapping->i_mmap_lock;
 
+	mutex_lock(&mapping->unmap_mutex);
 	spin_lock(&mapping->i_mmap_lock);
 
 	/* Protect against endless unmapping loops */
@@ -2666,6 +2665,7 @@ void unmap_mapping_range(struct address_space *mapping,
 	if (unlikely(!list_empty(&mapping->i_mmap_nonlinear)))
 		unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details);
 	spin_unlock(&mapping->i_mmap_lock);
+	mutex_unlock(&mapping->unmap_mutex);
 }
 EXPORT_SYMBOL(unmap_mapping_range);
 
@@ -3053,12 +3053,6 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 				goto out;
 			}
 			charged = 1;
-			/*
-			 * Don't let another task, with possibly unlocked vma,
-			 * keep the mlocked page.
-			 */
-			if (vma->vm_flags & VM_LOCKED)
-				clear_page_mlock(vmf.page);
 			copy_user_highpage(page, vmf.page, address, vma);
 			__SetPageUptodate(page);
 		} else {
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 368fc9d23610..b53ec99f1428 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1524,10 +1524,9 @@ static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
 }
 
 /* Return a zonelist indicated by gfp for node representing a mempolicy */
-static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy)
+static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy,
+	int nd)
 {
-	int nd = numa_node_id();
-
 	switch (policy->mode) {
 	case MPOL_PREFERRED:
 		if (!(policy->flags & MPOL_F_LOCAL))
@@ -1679,7 +1678,7 @@ struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
 		zl = node_zonelist(interleave_nid(*mpol, vma, addr,
 				huge_page_shift(hstate_vma(vma))), gfp_flags);
 	} else {
-		zl = policy_zonelist(gfp_flags, *mpol);
+		zl = policy_zonelist(gfp_flags, *mpol, numa_node_id());
 		if ((*mpol)->mode == MPOL_BIND)
 			*nodemask = &(*mpol)->v.nodes;
 	}
@@ -1820,7 +1819,7 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
  */
 struct page *
 alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
-		unsigned long addr)
+		unsigned long addr, int node)
 {
 	struct mempolicy *pol = get_vma_policy(current, vma, addr);
 	struct zonelist *zl;
@@ -1830,13 +1829,13 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
 	if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
 		unsigned nid;
 
-		nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
+		nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
 		mpol_cond_put(pol);
 		page = alloc_page_interleave(gfp, order, nid);
 		put_mems_allowed();
 		return page;
 	}
-	zl = policy_zonelist(gfp, pol);
+	zl = policy_zonelist(gfp, pol, node);
 	if (unlikely(mpol_needs_cond_ref(pol))) {
 		/*
 		 * slow path: ref counted shared policy
@@ -1892,7 +1891,8 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order)
 		page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
 	else
 		page = __alloc_pages_nodemask(gfp, order,
-			policy_zonelist(gfp, pol), policy_nodemask(gfp, pol));
+				policy_zonelist(gfp, pol, numa_node_id()),
+				policy_nodemask(gfp, pol));
 	put_mems_allowed();
 	return page;
 }
diff --git a/mm/migrate.c b/mm/migrate.c
index 46fe8cc13d67..352de555626c 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -772,6 +772,7 @@ uncharge:
 unlock:
 	unlock_page(page);
 
+move_newpage:
 	if (rc != -EAGAIN) {
 		/*
 		 * A page that has been migrated has all references
@@ -785,8 +786,6 @@ unlock:
 		putback_lru_page(page);
 	}
 
-move_newpage:
-
 	/*
 	 * Move the new page to the LRU. If migration was not successful
 	 * then this will free the page.
@@ -888,7 +887,7 @@ out:
  * are movable anymore because to has become empty
  * or no retryable pages exist anymore.
  * Caller should call putback_lru_pages to return pages to the LRU
- * or free list.
+ * or free list only if ret != 0.
  *
  * Return: Number of pages not migrated or error code.
  */
@@ -981,10 +980,6 @@ int migrate_huge_pages(struct list_head *from,
 	}
 	rc = 0;
 out:
-
-	list_for_each_entry_safe(page, page2, from, lru)
-		put_page(page);
-
 	if (rc)
 		return rc;
 
@@ -1292,14 +1287,14 @@ SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages,
 		return -EPERM;
 
 	/* Find the mm_struct */
-	read_lock(&tasklist_lock);
+	rcu_read_lock();
 	task = pid ? find_task_by_vpid(pid) : current;
 	if (!task) {
-		read_unlock(&tasklist_lock);
+		rcu_read_unlock();
 		return -ESRCH;
 	}
 	mm = get_task_mm(task);
-	read_unlock(&tasklist_lock);
+	rcu_read_unlock();
 
 	if (!mm)
 		return -EINVAL;
diff --git a/mm/mlock.c b/mm/mlock.c
index 13e81ee8be9d..c3924c7f00be 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -178,6 +178,13 @@ static long __mlock_vma_pages_range(struct vm_area_struct *vma,
 	if ((vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE)
 		gup_flags |= FOLL_WRITE;
 
+	/*
+	 * We want mlock to succeed for regions that have any permissions
+	 * other than PROT_NONE.
+	 */
+	if (vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC))
+		gup_flags |= FOLL_FORCE;
+
 	if (vma->vm_flags & VM_LOCKED)
 		gup_flags |= FOLL_MLOCK;
 
diff --git a/mm/mremap.c b/mm/mremap.c
index 9925b6391b80..1de98d492ddc 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -94,9 +94,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
 		 */
 		mapping = vma->vm_file->f_mapping;
 		spin_lock(&mapping->i_mmap_lock);
-		if (new_vma->vm_truncate_count &&
-		    new_vma->vm_truncate_count != vma->vm_truncate_count)
-			new_vma->vm_truncate_count = 0;
+		new_vma->vm_truncate_count = 0;
 	}
 
 	/*
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 90c1439549fd..cdef1d4b4e47 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1088,8 +1088,10 @@ static void drain_pages(unsigned int cpu)
 		pset = per_cpu_ptr(zone->pageset, cpu);
 
 		pcp = &pset->pcp;
-		free_pcppages_bulk(zone, pcp->count, pcp);
-		pcp->count = 0;
+		if (pcp->count) {
+			free_pcppages_bulk(zone, pcp->count, pcp);
+			pcp->count = 0;
+		}
 		local_irq_restore(flags);
 	}
 }
@@ -2034,6 +2036,14 @@ restart:
 	 */
 	alloc_flags = gfp_to_alloc_flags(gfp_mask);
 
+	/*
+	 * Find the true preferred zone if the allocation is unconstrained by
+	 * cpusets.
+	 */
+	if (!(alloc_flags & ALLOC_CPUSET) && !nodemask)
+		first_zones_zonelist(zonelist, high_zoneidx, NULL,
+					&preferred_zone);
+
 	/* This is the last chance, in general, before the goto nopage. */
 	page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist,
 			high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS,
@@ -2192,7 +2202,9 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
 
 	get_mems_allowed();
 	/* The preferred zone is used for statistics later */
-	first_zones_zonelist(zonelist, high_zoneidx, nodemask, &preferred_zone);
+	first_zones_zonelist(zonelist, high_zoneidx,
+				nodemask ? : &cpuset_current_mems_allowed,
+				&preferred_zone);
 	if (!preferred_zone) {
 		put_mems_allowed();
 		return NULL;
@@ -5364,10 +5376,9 @@ __count_immobile_pages(struct zone *zone, struct page *page, int count)
 	for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) {
 		unsigned long check = pfn + iter;
 
-		if (!pfn_valid_within(check)) {
-			iter++;
+		if (!pfn_valid_within(check))
 			continue;
-		}
+
 		page = pfn_to_page(check);
 		if (!page_count(page)) {
 			if (PageBuddy(page))
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index 0369f5b3ba1b..eb663fb533e0 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -6,6 +6,7 @@
  * Copyright (C) 2010  Linus Torvalds
  */
 
+#include <linux/pagemap.h>
 #include <asm/tlb.h>
 #include <asm-generic/pgtable.h>
 
diff --git a/mm/slab.c b/mm/slab.c
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -191,22 +191,6 @@ typedef unsigned int kmem_bufctl_t;
 #define	SLAB_LIMIT	(((kmem_bufctl_t)(~0U))-3)
 
 /*
- * struct slab
- *
- * Manages the objs in a slab. Placed either at the beginning of mem allocated
- * for a slab, or allocated from an general cache.
- * Slabs are chained into three list: fully used, partial, fully free slabs.
- */
-struct slab {
-	struct list_head list;
-	unsigned long colouroff;
-	void *s_mem;		/* including colour offset */
-	unsigned int inuse;	/* num of objs active in slab */
-	kmem_bufctl_t free;
-	unsigned short nodeid;
-};
-
-/*
  * struct slab_rcu
  *
  * slab_destroy on a SLAB_DESTROY_BY_RCU cache uses this structure to
@@ -219,8 +203,6 @@ struct slab {
  *
  * rcu_read_lock before reading the address, then rcu_read_unlock after
  * taking the spinlock within the structure expected at that address.
- *
- * We assume struct slab_rcu can overlay struct slab when destroying.
  */
 struct slab_rcu {
 	struct rcu_head head;
@@ -229,6 +211,27 @@ struct slab_rcu {
 };
 
 /*
+ * struct slab
+ *
+ * Manages the objs in a slab. Placed either at the beginning of mem allocated
+ * for a slab, or allocated from an general cache.
+ * Slabs are chained into three list: fully used, partial, fully free slabs.
+ */
+struct slab {
+	union {
+		struct {
+			struct list_head list;
+			unsigned long colouroff;
+			void *s_mem;		/* including colour offset */
+			unsigned int inuse;	/* num of objs active in slab */
+			kmem_bufctl_t free;
+			unsigned short nodeid;
+		};
+		struct slab_rcu __slab_cover_slab_rcu;
+	};
+};
+
+/*
  * struct array_cache
  *
  * Purpose:
@@ -305,11 +305,16 @@ static inline size_t slab_ksize(const struct kmem_cache *s) | |||
305 | return s->size; | 305 | return s->size; |
306 | } | 306 | } |
307 | 307 | ||
308 | static inline int order_objects(int order, unsigned long size, int reserved) | ||
309 | { | ||
310 | return ((PAGE_SIZE << order) - reserved) / size; | ||
311 | } | ||
312 | |||
308 | static inline struct kmem_cache_order_objects oo_make(int order, | 313 | static inline struct kmem_cache_order_objects oo_make(int order, |
309 | unsigned long size) | 314 | unsigned long size, int reserved) |
310 | { | 315 | { |
311 | struct kmem_cache_order_objects x = { | 316 | struct kmem_cache_order_objects x = { |
312 | (order << OO_SHIFT) + (PAGE_SIZE << order) / size | 317 | (order << OO_SHIFT) + order_objects(order, size, reserved) |
313 | }; | 318 | }; |
314 | 319 | ||
315 | return x; | 320 | return x; |
@@ -641,7 +646,7 @@ static int slab_pad_check(struct kmem_cache *s, struct page *page) | |||
641 | return 1; | 646 | return 1; |
642 | 647 | ||
643 | start = page_address(page); | 648 | start = page_address(page); |
644 | length = (PAGE_SIZE << compound_order(page)); | 649 | length = (PAGE_SIZE << compound_order(page)) - s->reserved; |
645 | end = start + length; | 650 | end = start + length; |
646 | remainder = length % s->size; | 651 | remainder = length % s->size; |
647 | if (!remainder) | 652 | if (!remainder) |
@@ -722,7 +727,7 @@ static int check_slab(struct kmem_cache *s, struct page *page) | |||
722 | return 0; | 727 | return 0; |
723 | } | 728 | } |
724 | 729 | ||
725 | maxobj = (PAGE_SIZE << compound_order(page)) / s->size; | 730 | maxobj = order_objects(compound_order(page), s->size, s->reserved); |
726 | if (page->objects > maxobj) { | 731 | if (page->objects > maxobj) { |
727 | slab_err(s, page, "objects %u > max %u", | 732 | slab_err(s, page, "objects %u > max %u", |
728 | s->name, page->objects, maxobj); | 733 | s->name, page->objects, maxobj); |
@@ -772,7 +777,7 @@ static int on_freelist(struct kmem_cache *s, struct page *page, void *search) | |||
772 | nr++; | 777 | nr++; |
773 | } | 778 | } |
774 | 779 | ||
775 | max_objects = (PAGE_SIZE << compound_order(page)) / s->size; | 780 | max_objects = order_objects(compound_order(page), s->size, s->reserved); |
776 | if (max_objects > MAX_OBJS_PER_PAGE) | 781 | if (max_objects > MAX_OBJS_PER_PAGE) |
777 | max_objects = MAX_OBJS_PER_PAGE; | 782 | max_objects = MAX_OBJS_PER_PAGE; |
778 | 783 | ||
@@ -1273,21 +1278,38 @@ static void __free_slab(struct kmem_cache *s, struct page *page) | |||
1273 | __free_pages(page, order); | 1278 | __free_pages(page, order); |
1274 | } | 1279 | } |
1275 | 1280 | ||
1281 | #define need_reserve_slab_rcu \ | ||
1282 | (sizeof(((struct page *)NULL)->lru) < sizeof(struct rcu_head)) | ||
1283 | |||
1276 | static void rcu_free_slab(struct rcu_head *h) | 1284 | static void rcu_free_slab(struct rcu_head *h) |
1277 | { | 1285 | { |
1278 | struct page *page; | 1286 | struct page *page; |
1279 | 1287 | ||
1280 | page = container_of((struct list_head *)h, struct page, lru); | 1288 | if (need_reserve_slab_rcu) |
1289 | page = virt_to_head_page(h); | ||
1290 | else | ||
1291 | page = container_of((struct list_head *)h, struct page, lru); | ||
1292 | |||
1281 | __free_slab(page->slab, page); | 1293 | __free_slab(page->slab, page); |
1282 | } | 1294 | } |
1283 | 1295 | ||
1284 | static void free_slab(struct kmem_cache *s, struct page *page) | 1296 | static void free_slab(struct kmem_cache *s, struct page *page) |
1285 | { | 1297 | { |
1286 | if (unlikely(s->flags & SLAB_DESTROY_BY_RCU)) { | 1298 | if (unlikely(s->flags & SLAB_DESTROY_BY_RCU)) { |
1287 | /* | 1299 | struct rcu_head *head; |
1288 | * RCU free overloads the RCU head over the LRU | 1300 | |
1289 | */ | 1301 | if (need_reserve_slab_rcu) { |
1290 | struct rcu_head *head = (void *)&page->lru; | 1302 | int order = compound_order(page); |
1303 | int offset = (PAGE_SIZE << order) - s->reserved; | ||
1304 | |||
1305 | VM_BUG_ON(s->reserved != sizeof(*head)); | ||
1306 | head = page_address(page) + offset; | ||
1307 | } else { | ||
1308 | /* | ||
1309 | * RCU free overloads the RCU head over the LRU | ||
1310 | */ | ||
1311 | head = (void *)&page->lru; | ||
1312 | } | ||
1291 | 1313 | ||
1292 | call_rcu(head, rcu_free_slab); | 1314 | call_rcu(head, rcu_free_slab); |
1293 | } else | 1315 | } else |
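
With the reservation in place, free_slab() can park the rcu_head in the last s->reserved bytes of the slab itself whenever it no longer fits over page->lru, and rcu_free_slab() gets back to the page from that interior pointer via virt_to_head_page(). The sketch below reproduces the placement and recovery with an ordinary page-aligned buffer standing in for the slab; fake_rcu_head, fake_rcu_free and the direct callback invocation are illustrative stand-ins for rcu_head, the RCU callback and call_rcu().

/* build: cc -std=c99 -Wall tail_rcu.c */
#define _POSIX_C_SOURCE 200112L
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define PAGE_SIZE 4096UL   /* assumed page size */

struct fake_rcu_head {      /* stand-in for struct rcu_head */
    void *next;
    void (*func)(struct fake_rcu_head *);
};

/* "Free" callback: recover the start of the slab from the head's address.
 * The kernel uses virt_to_head_page(); here masking is enough because the
 * buffer below is page-aligned and one page long. */
static void fake_rcu_free(struct fake_rcu_head *head)
{
    void *slab = (void *)((unsigned long)head & ~(PAGE_SIZE - 1));

    printf("callback sees head at %p, slab starts at %p\n",
           (void *)head, slab);
    free(slab);
}

int main(void)
{
    size_t reserved = sizeof(struct fake_rcu_head);
    struct fake_rcu_head *head;
    void *slab;

    /* One order-0 "slab", aligned like a page. */
    if (posix_memalign(&slab, PAGE_SIZE, PAGE_SIZE))
        return 1;
    memset(slab, 0, PAGE_SIZE);

    /* Place the head in the reserved tail, exactly the shape used above:
     * offset = (PAGE_SIZE << order) - reserved, with order = 0 here. */
    head = (struct fake_rcu_head *)((char *)slab + PAGE_SIZE - reserved);
    head->func = fake_rcu_free;

    /* Stand-in for call_rcu(): invoke the callback directly. */
    head->func(head);
    return 0;
}

The VM_BUG_ON in the hunk documents the contract: exactly sizeof(struct rcu_head) is reserved, so the head written at (PAGE_SIZE << order) - s->reserved never overlaps the last object.
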
@@ -2012,13 +2034,13 @@ static int slub_nomerge; | |||
2012 | * the smallest order which will fit the object. | 2034 | * the smallest order which will fit the object. |
2013 | */ | 2035 | */ |
2014 | static inline int slab_order(int size, int min_objects, | 2036 | static inline int slab_order(int size, int min_objects, |
2015 | int max_order, int fract_leftover) | 2037 | int max_order, int fract_leftover, int reserved) |
2016 | { | 2038 | { |
2017 | int order; | 2039 | int order; |
2018 | int rem; | 2040 | int rem; |
2019 | int min_order = slub_min_order; | 2041 | int min_order = slub_min_order; |
2020 | 2042 | ||
2021 | if ((PAGE_SIZE << min_order) / size > MAX_OBJS_PER_PAGE) | 2043 | if (order_objects(min_order, size, reserved) > MAX_OBJS_PER_PAGE) |
2022 | return get_order(size * MAX_OBJS_PER_PAGE) - 1; | 2044 | return get_order(size * MAX_OBJS_PER_PAGE) - 1; |
2023 | 2045 | ||
2024 | for (order = max(min_order, | 2046 | for (order = max(min_order, |
@@ -2027,10 +2049,10 @@ static inline int slab_order(int size, int min_objects, | |||
2027 | 2049 | ||
2028 | unsigned long slab_size = PAGE_SIZE << order; | 2050 | unsigned long slab_size = PAGE_SIZE << order; |
2029 | 2051 | ||
2030 | if (slab_size < min_objects * size) | 2052 | if (slab_size < min_objects * size + reserved) |
2031 | continue; | 2053 | continue; |
2032 | 2054 | ||
2033 | rem = slab_size % size; | 2055 | rem = (slab_size - reserved) % size; |
2034 | 2056 | ||
2035 | if (rem <= slab_size / fract_leftover) | 2057 | if (rem <= slab_size / fract_leftover) |
2036 | break; | 2058 | break; |
@@ -2040,7 +2062,7 @@ static inline int slab_order(int size, int min_objects, | |||
2040 | return order; | 2062 | return order; |
2041 | } | 2063 | } |
2042 | 2064 | ||
2043 | static inline int calculate_order(int size) | 2065 | static inline int calculate_order(int size, int reserved) |
2044 | { | 2066 | { |
2045 | int order; | 2067 | int order; |
2046 | int min_objects; | 2068 | int min_objects; |
@@ -2058,14 +2080,14 @@ static inline int calculate_order(int size) | |||
2058 | min_objects = slub_min_objects; | 2080 | min_objects = slub_min_objects; |
2059 | if (!min_objects) | 2081 | if (!min_objects) |
2060 | min_objects = 4 * (fls(nr_cpu_ids) + 1); | 2082 | min_objects = 4 * (fls(nr_cpu_ids) + 1); |
2061 | max_objects = (PAGE_SIZE << slub_max_order)/size; | 2083 | max_objects = order_objects(slub_max_order, size, reserved); |
2062 | min_objects = min(min_objects, max_objects); | 2084 | min_objects = min(min_objects, max_objects); |
2063 | 2085 | ||
2064 | while (min_objects > 1) { | 2086 | while (min_objects > 1) { |
2065 | fraction = 16; | 2087 | fraction = 16; |
2066 | while (fraction >= 4) { | 2088 | while (fraction >= 4) { |
2067 | order = slab_order(size, min_objects, | 2089 | order = slab_order(size, min_objects, |
2068 | slub_max_order, fraction); | 2090 | slub_max_order, fraction, reserved); |
2069 | if (order <= slub_max_order) | 2091 | if (order <= slub_max_order) |
2070 | return order; | 2092 | return order; |
2071 | fraction /= 2; | 2093 | fraction /= 2; |
@@ -2077,14 +2099,14 @@ static inline int calculate_order(int size) | |||
2077 | * We were unable to place multiple objects in a slab. Now | 2099 | * We were unable to place multiple objects in a slab. Now |
2078 | * let's see if we can place a single object there. | 2100 | * let's see if we can place a single object there. |
2079 | */ | 2101 | */ |
2080 | order = slab_order(size, 1, slub_max_order, 1); | 2102 | order = slab_order(size, 1, slub_max_order, 1, reserved); |
2081 | if (order <= slub_max_order) | 2103 | if (order <= slub_max_order) |
2082 | return order; | 2104 | return order; |
2083 | 2105 | ||
2084 | /* | 2106 | /* |
2085 | * Doh this slab cannot be placed using slub_max_order. | 2107 | * Doh this slab cannot be placed using slub_max_order. |
2086 | */ | 2108 | */ |
2087 | order = slab_order(size, 1, MAX_ORDER, 1); | 2109 | order = slab_order(size, 1, MAX_ORDER, 1, reserved); |
2088 | if (order < MAX_ORDER) | 2110 | if (order < MAX_ORDER) |
2089 | return order; | 2111 | return order; |
2090 | return -ENOSYS; | 2112 | return -ENOSYS; |
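
slab_order() now judges waste after setting the reservation aside: an order is good enough once (slab_size - reserved) % size is no more than slab_size / fract_leftover, and calculate_order() keeps retrying with a laxer fraction and fewer required objects until something fits. The condensed user-space rendering below keeps that shape but not the kernel's exact tunables; PAGE_SIZE, MIN_ORDER, MAX_ORDER_ and the starting min_objects of 8 are assumptions for the example.

/* build: cc -std=c99 -Wall slab_order.c */
#include <stdio.h>

#define PAGE_SIZE  4096UL
#define MIN_ORDER  0       /* assumed slub_min_order */
#define MAX_ORDER_ 3       /* assumed slub_max_order */

/* Smallest order >= MIN_ORDER that holds min_objects and wastes no more
 * than 1/fract_leftover of the slab, once 'reserved' bytes are set aside.
 * Returns max_order + 1 if nothing qualifies, as the kernel version does. */
static int slab_order(int size, int min_objects, int max_order,
                      int fract_leftover, int reserved)
{
    int order;

    for (order = MIN_ORDER; order <= max_order; order++) {
        unsigned long slab_size = PAGE_SIZE << order;
        unsigned long rem;

        if (slab_size < (unsigned long)min_objects * size + reserved)
            continue;
        rem = (slab_size - reserved) % size;
        if (rem <= slab_size / fract_leftover)
            break;
    }
    return order;
}

static int calculate_order(int size, int reserved)
{
    int min_objects, fraction, order;

    for (min_objects = 8; min_objects > 1; min_objects--) {
        for (fraction = 16; fraction >= 4; fraction /= 2) {
            order = slab_order(size, min_objects, MAX_ORDER_,
                               fraction, reserved);
            if (order <= MAX_ORDER_)
                return order;
        }
    }
    /* Fall back to fitting a single object with any amount of waste. */
    return slab_order(size, 1, MAX_ORDER_, 1, reserved);
}

int main(void)
{
    printf("size 2048, reserved  0 -> order %d\n", calculate_order(2048, 0));
    printf("size 2048, reserved 16 -> order %d\n", calculate_order(2048, 16));
    return 0;
}

With these assumed numbers the example prints order 2 for a 2048-byte object without a reservation and order 3 with one, showing how even a small reserved tail can change the slab geometry the heuristic picks.
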
@@ -2335,7 +2357,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) | |||
2335 | if (forced_order >= 0) | 2357 | if (forced_order >= 0) |
2336 | order = forced_order; | 2358 | order = forced_order; |
2337 | else | 2359 | else |
2338 | order = calculate_order(size); | 2360 | order = calculate_order(size, s->reserved); |
2339 | 2361 | ||
2340 | if (order < 0) | 2362 | if (order < 0) |
2341 | return 0; | 2363 | return 0; |
@@ -2353,8 +2375,8 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) | |||
2353 | /* | 2375 | /* |
2354 | * Determine the number of objects per slab | 2376 | * Determine the number of objects per slab |
2355 | */ | 2377 | */ |
2356 | s->oo = oo_make(order, size); | 2378 | s->oo = oo_make(order, size, s->reserved); |
2357 | s->min = oo_make(get_order(size), size); | 2379 | s->min = oo_make(get_order(size), size, s->reserved); |
2358 | if (oo_objects(s->oo) > oo_objects(s->max)) | 2380 | if (oo_objects(s->oo) > oo_objects(s->max)) |
2359 | s->max = s->oo; | 2381 | s->max = s->oo; |
2360 | 2382 | ||
@@ -2373,6 +2395,10 @@ static int kmem_cache_open(struct kmem_cache *s, | |||
2373 | s->objsize = size; | 2395 | s->objsize = size; |
2374 | s->align = align; | 2396 | s->align = align; |
2375 | s->flags = kmem_cache_flags(size, flags, name, ctor); | 2397 | s->flags = kmem_cache_flags(size, flags, name, ctor); |
2398 | s->reserved = 0; | ||
2399 | |||
2400 | if (need_reserve_slab_rcu && (s->flags & SLAB_DESTROY_BY_RCU)) | ||
2401 | s->reserved = sizeof(struct rcu_head); | ||
2376 | 2402 | ||
2377 | if (!calculate_sizes(s, -1)) | 2403 | if (!calculate_sizes(s, -1)) |
2378 | goto error; | 2404 | goto error; |
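
The reservation is only charged when it is genuinely needed: need_reserve_slab_rcu is a compile-time comparison of page->lru against struct rcu_head, and kmem_cache_open() sets s->reserved = sizeof(struct rcu_head) only for SLAB_DESTROY_BY_RCU caches and only when that comparison says the head no longer fits. The same decision, shrunk to user space, is sketched below; fake_list_head, fake_rcu_head (deliberately padded with an extra field so the reservation triggers) and FLAG_DESTROY_BY_RCU are assumptions, not kernel definitions.

/* build: cc -std=c99 -Wall need_reserve.c */
#include <stdio.h>

/* Assumed stand-ins: a two-pointer list head and an RCU head that happens
 * to be larger than it (here only because of the extra debug field). */
struct fake_list_head { void *next, *prev; };
struct fake_rcu_head  { void *next; void (*func)(void *); void *debug; };

/* Mirrors the need_reserve_slab_rcu idea: reserve tail space only when
 * the RCU head can no longer be overlaid on the page's lru field. */
#define need_reserve \
    (sizeof(struct fake_list_head) < sizeof(struct fake_rcu_head))

#define FLAG_DESTROY_BY_RCU 0x1UL

struct fake_cache {
    unsigned long flags;
    int reserved;
};

static void cache_open(struct fake_cache *s, unsigned long flags)
{
    s->flags = flags;
    s->reserved = 0;
    if (need_reserve && (s->flags & FLAG_DESTROY_BY_RCU))
        s->reserved = sizeof(struct fake_rcu_head);
}

int main(void)
{
    struct fake_cache plain, rcu;

    cache_open(&plain, 0);
    cache_open(&rcu, FLAG_DESTROY_BY_RCU);
    printf("plain cache reserved = %d bytes\n", plain.reserved);
    printf("rcu cache   reserved = %d bytes\n", rcu.reserved);
    return 0;
}
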
@@ -4014,6 +4040,12 @@ static ssize_t destroy_by_rcu_show(struct kmem_cache *s, char *buf) | |||
4014 | } | 4040 | } |
4015 | SLAB_ATTR_RO(destroy_by_rcu); | 4041 | SLAB_ATTR_RO(destroy_by_rcu); |
4016 | 4042 | ||
4043 | static ssize_t reserved_show(struct kmem_cache *s, char *buf) | ||
4044 | { | ||
4045 | return sprintf(buf, "%d\n", s->reserved); | ||
4046 | } | ||
4047 | SLAB_ATTR_RO(reserved); | ||
4048 | |||
4017 | #ifdef CONFIG_SLUB_DEBUG | 4049 | #ifdef CONFIG_SLUB_DEBUG |
4018 | static ssize_t slabs_show(struct kmem_cache *s, char *buf) | 4050 | static ssize_t slabs_show(struct kmem_cache *s, char *buf) |
4019 | { | 4051 | { |
@@ -4300,6 +4332,7 @@ static struct attribute *slab_attrs[] = { | |||
4300 | &reclaim_account_attr.attr, | 4332 | &reclaim_account_attr.attr, |
4301 | &destroy_by_rcu_attr.attr, | 4333 | &destroy_by_rcu_attr.attr, |
4302 | &shrink_attr.attr, | 4334 | &shrink_attr.attr, |
4335 | &reserved_attr.attr, | ||
4303 | #ifdef CONFIG_SLUB_DEBUG | 4336 | #ifdef CONFIG_SLUB_DEBUG |
4304 | &total_objects_attr.attr, | 4337 | &total_objects_attr.attr, |
4305 | &slabs_attr.attr, | 4338 | &slabs_attr.attr, |
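
The new read-only "reserved" attribute joins the existing per-cache files under SLUB's sysfs directory, so the reservation can be inspected at run time. A small reader, assuming the usual /sys/kernel/slab/<cache>/ layout:

/* build: cc -std=c99 -Wall read_reserved.c
 * usage: ./read_reserved <slab-cache-name> */
#include <stdio.h>

int main(int argc, char **argv)
{
    char path[256];
    FILE *f;
    int reserved;

    if (argc != 2) {
        fprintf(stderr, "usage: %s <slab-cache-name>\n", argv[0]);
        return 1;
    }
    snprintf(path, sizeof(path), "/sys/kernel/slab/%s/reserved", argv[1]);
    f = fopen(path, "r");
    if (!f) {
        perror(path);
        return 1;
    }
    if (fscanf(f, "%d", &reserved) == 1)
        printf("%s: %d reserved byte(s) per slab\n", argv[1], reserved);
    fclose(f);
    return 0;
}

On caches without SLAB_DESTROY_BY_RCU, or when the rcu_head still fits over page->lru, this should simply print 0.
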
diff --git a/mm/swapfile.c b/mm/swapfile.c index 07a458d72fa8..0341c5700e34 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c | |||
@@ -1940,7 +1940,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) | |||
1940 | 1940 | ||
1941 | error = -EINVAL; | 1941 | error = -EINVAL; |
1942 | if (S_ISBLK(inode->i_mode)) { | 1942 | if (S_ISBLK(inode->i_mode)) { |
1943 | bdev = I_BDEV(inode); | 1943 | bdev = bdgrab(I_BDEV(inode)); |
1944 | error = blkdev_get(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL, | 1944 | error = blkdev_get(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL, |
1945 | sys_swapon); | 1945 | sys_swapon); |
1946 | if (error < 0) { | 1946 | if (error < 0) { |
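
The swapon change follows the usual reference-counting rule of taking your own reference before handing an object to a call that consumes one: blkdev_get() disposes of the bdev reference it is given when it fails, so the caller must pin the device with bdgrab() first. The toy refcount below sketches the shape of the fix; struct obj, grab(), put() and open_consuming() are illustrative, not block-layer APIs.

/* build: cc -std=c99 -Wall grab_before_consume.c */
#include <stdio.h>

struct obj { int refcount; };

static struct obj *grab(struct obj *o)          /* like bdgrab() */
{
    o->refcount++;
    return o;
}

static void put(struct obj *o)                  /* like bdput() */
{
    o->refcount--;
}

/* A call that "consumes" a reference: on failure it drops the reference
 * that was passed in, just as blkdev_get() does. */
static int open_consuming(struct obj *o, int should_fail)
{
    if (should_fail) {
        put(o);
        return -1;
    }
    return 0;
}

int main(void)
{
    struct obj dev = { .refcount = 1 };   /* caller's original reference */

    /* Wrong: passing the only reference; on failure it would be gone.
     * open_consuming(&dev, 1);  -- would leave refcount == 0 */

    /* Right: grab an extra reference first, the shape of
     * bdev = bdgrab(I_BDEV(inode)) before blkdev_get(). */
    if (open_consuming(grab(&dev), 1) < 0)
        printf("open failed, refcount still %d\n", dev.refcount);
    return 0;
}
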
diff --git a/mm/truncate.c b/mm/truncate.c index 49feb46e77b8..d64296be00d3 100644 --- a/mm/truncate.c +++ b/mm/truncate.c | |||
@@ -225,6 +225,7 @@ void truncate_inode_pages_range(struct address_space *mapping, | |||
225 | next = start; | 225 | next = start; |
226 | while (next <= end && | 226 | while (next <= end && |
227 | pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { | 227 | pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { |
228 | mem_cgroup_uncharge_start(); | ||
228 | for (i = 0; i < pagevec_count(&pvec); i++) { | 229 | for (i = 0; i < pagevec_count(&pvec); i++) { |
229 | struct page *page = pvec.pages[i]; | 230 | struct page *page = pvec.pages[i]; |
230 | pgoff_t page_index = page->index; | 231 | pgoff_t page_index = page->index; |
@@ -247,6 +248,7 @@ void truncate_inode_pages_range(struct address_space *mapping, | |||
247 | unlock_page(page); | 248 | unlock_page(page); |
248 | } | 249 | } |
249 | pagevec_release(&pvec); | 250 | pagevec_release(&pvec); |
251 | mem_cgroup_uncharge_end(); | ||
250 | cond_resched(); | 252 | cond_resched(); |
251 | } | 253 | } |
252 | 254 | ||
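
Bracketing the pagevec loop with mem_cgroup_uncharge_start()/mem_cgroup_uncharge_end() lets memcg accumulate the uncharges for a whole batch of truncated pages and apply them to the charge counter once, instead of touching it for every page. The toy counter below sketches only that batching shape; the real memcg accounting is not modeled.

/* build: cc -std=c99 -Wall batch_uncharge.c */
#include <stdio.h>

static int in_batch;
static long pending;      /* uncharges accumulated while batching */
static long res_counter;  /* stand-in for the memcg charge counter */
static long flushes;      /* times the shared counter was actually updated */

static void uncharge_start(void) { in_batch = 1; }

static void uncharge_end(void)
{
    in_batch = 0;
    if (pending) {
        res_counter -= pending;   /* one update for the whole batch */
        flushes++;
        pending = 0;
    }
}

static void uncharge_page(void)
{
    if (in_batch) {
        pending++;                /* deferred until uncharge_end() */
    } else {
        res_counter--;            /* immediate: one update per page */
        flushes++;
    }
}

int main(void)
{
    int i;

    res_counter = 14;

    uncharge_start();
    for (i = 0; i < 14; i++)      /* e.g. one pagevec worth of pages */
        uncharge_page();
    uncharge_end();

    printf("charge left %ld after %ld counter update(s)\n",
           res_counter, flushes);  /* 0 left after a single update */
    return 0;
}
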
diff --git a/mm/vmscan.c b/mm/vmscan.c index f5d90dedebba..6771ea70bfe7 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
@@ -1841,16 +1841,28 @@ static inline bool should_continue_reclaim(struct zone *zone, | |||
1841 | if (!(sc->reclaim_mode & RECLAIM_MODE_COMPACTION)) | 1841 | if (!(sc->reclaim_mode & RECLAIM_MODE_COMPACTION)) |
1842 | return false; | 1842 | return false; |
1843 | 1843 | ||
1844 | /* | 1844 | /* Consider stopping depending on scan and reclaim activity */ |
1845 | * If we failed to reclaim and have scanned the full list, stop. | 1845 | if (sc->gfp_mask & __GFP_REPEAT) { |
1846 | * NOTE: Checking just nr_reclaimed would exit reclaim/compaction far | 1846 | /* |
1847 | * faster but obviously would be less likely to succeed | 1847 | * For __GFP_REPEAT allocations, stop reclaiming if the |
1848 | * allocation. If this is desirable, use GFP_REPEAT to decide | 1848 | * full LRU list has been scanned and we are still failing |
1849 | * if both reclaimed and scanned should be checked or just | 1849 | * to reclaim pages. This full LRU scan is potentially |
1850 | * reclaimed | 1850 | * expensive but a __GFP_REPEAT caller really wants to succeed |
1851 | */ | 1851 | */ |
1852 | if (!nr_reclaimed && !nr_scanned) | 1852 | if (!nr_reclaimed && !nr_scanned) |
1853 | return false; | 1853 | return false; |
1854 | } else { | ||
1855 | /* | ||
1856 | * For non-__GFP_REPEAT allocations which can presumably | ||
1857 | * fail without consequence, stop if we failed to reclaim | ||
1858 | * any pages from the last SWAP_CLUSTER_MAX number of | ||
1859 | * pages that were scanned. This will return to the | ||
1860 | * caller faster, at the risk that reclaim/compaction fails | ||
1861 | * and the resulting allocation attempt fails as well | ||
1862 | */ | ||
1863 | if (!nr_reclaimed) | ||
1864 | return false; | ||
1865 | } | ||
1854 | 1866 | ||
1855 | /* | 1867 | /* |
1856 | * If we have not reclaimed enough pages for compaction and the | 1868 | * If we have not reclaimed enough pages for compaction and the |
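
The rewritten check ties the stop condition to how badly the caller wants the allocation: a __GFP_REPEAT caller keeps reclaiming until a full LRU pass both scans and reclaims nothing, while everyone else bails out as soon as a scan batch reclaims nothing, accepting that compaction and the allocation may then fail. Condensed into a user-space predicate below; FAKE_GFP_REPEAT and the omitted compaction-readiness checks are simplifications of the real function.

/* build: cc -std=c99 -Wall should_continue.c */
#include <stdio.h>
#include <stdbool.h>

#define FAKE_GFP_REPEAT 0x1u   /* stand-in for __GFP_REPEAT */

/* Decide whether reclaim/compaction should keep going, mirroring the
 * branch added above (the later compaction checks are omitted). */
static bool should_continue(unsigned int gfp_mask,
                            unsigned long nr_reclaimed,
                            unsigned long nr_scanned)
{
    if (gfp_mask & FAKE_GFP_REPEAT) {
        /* Willing to pay for a full LRU scan: only stop once a whole
         * pass reclaimed nothing and scanned nothing more. */
        if (!nr_reclaimed && !nr_scanned)
            return false;
    } else {
        /* Cheap callers: stop as soon as a scan batch reclaims nothing,
         * accepting that the allocation attempt may then fail. */
        if (!nr_reclaimed)
            return false;
    }
    return true;
}

int main(void)
{
    printf("repeat caller, scanned 512 but reclaimed 0 -> %d\n",
           should_continue(FAKE_GFP_REPEAT, 0, 512));
    printf("plain caller,  scanned 512 but reclaimed 0 -> %d\n",
           should_continue(0, 0, 512));
    return 0;
}
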
@@ -1882,12 +1894,12 @@ static void shrink_zone(int priority, struct zone *zone, | |||
1882 | unsigned long nr[NR_LRU_LISTS]; | 1894 | unsigned long nr[NR_LRU_LISTS]; |
1883 | unsigned long nr_to_scan; | 1895 | unsigned long nr_to_scan; |
1884 | enum lru_list l; | 1896 | enum lru_list l; |
1885 | unsigned long nr_reclaimed; | 1897 | unsigned long nr_reclaimed, nr_scanned; |
1886 | unsigned long nr_to_reclaim = sc->nr_to_reclaim; | 1898 | unsigned long nr_to_reclaim = sc->nr_to_reclaim; |
1887 | unsigned long nr_scanned = sc->nr_scanned; | ||
1888 | 1899 | ||
1889 | restart: | 1900 | restart: |
1890 | nr_reclaimed = 0; | 1901 | nr_reclaimed = 0; |
1902 | nr_scanned = sc->nr_scanned; | ||
1891 | get_scan_count(zone, sc, nr, priority); | 1903 | get_scan_count(zone, sc, nr, priority); |
1892 | 1904 | ||
1893 | while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || | 1905 | while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || |
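
Moving the read of sc->nr_scanned below the restart label means the baseline is re-taken for every pass, so the scan delta handed to should_continue_reclaim() describes only the pass that just ran rather than everything since shrink_zone() was entered. The sketch below shows that per-pass-delta pattern with an ever-growing counter; the numbers are made up.

/* build: cc -std=c99 -Wall per_pass_delta.c */
#include <stdio.h>

int main(void)
{
    unsigned long total_scanned = 0;   /* like sc->nr_scanned: only grows */
    int pass;

    for (pass = 1; pass <= 3; pass++) {
        /* Take the baseline here, i.e. after the restart label, so the
         * delta below covers just this pass. */
        unsigned long baseline = total_scanned;

        total_scanned += 100 * pass;   /* pretend scanning work */
        printf("pass %d: scanned this pass = %lu (running total %lu)\n",
               pass, total_scanned - baseline, total_scanned);
    }
    return 0;
}
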
@@ -2083,7 +2095,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, | |||
2083 | struct zone *preferred_zone; | 2095 | struct zone *preferred_zone; |
2084 | 2096 | ||
2085 | first_zones_zonelist(zonelist, gfp_zone(sc->gfp_mask), | 2097 | first_zones_zonelist(zonelist, gfp_zone(sc->gfp_mask), |
2086 | NULL, &preferred_zone); | 2098 | &cpuset_current_mems_allowed, |
2099 | &preferred_zone); | ||
2087 | wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/10); | 2100 | wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/10); |
2088 | } | 2101 | } |
2089 | } | 2102 | } |