aboutsummaryrefslogtreecommitdiffstats
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/huge_memory.c43
-rw-r--r--mm/memory.c21
-rw-r--r--mm/mlock.c5
-rw-r--r--mm/mmap.c11
-rw-r--r--mm/oom_kill.c9
-rw-r--r--mm/page_alloc.c57
-rw-r--r--mm/page_cgroup.c2
-rw-r--r--mm/shmem.c149
-rw-r--r--mm/slub.c4
-rw-r--r--mm/swap.c3
-rw-r--r--mm/vmscan.c2
11 files changed, 179 insertions, 127 deletions
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 470dcda10add..83326ad66d9b 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1408,6 +1408,9 @@ out:
1408 return ret; 1408 return ret;
1409} 1409}
1410 1410
1411#define VM_NO_THP (VM_SPECIAL|VM_INSERTPAGE|VM_MIXEDMAP|VM_SAO| \
1412 VM_HUGETLB|VM_SHARED|VM_MAYSHARE)
1413
1411int hugepage_madvise(struct vm_area_struct *vma, 1414int hugepage_madvise(struct vm_area_struct *vma,
1412 unsigned long *vm_flags, int advice) 1415 unsigned long *vm_flags, int advice)
1413{ 1416{
@@ -1416,11 +1419,7 @@ int hugepage_madvise(struct vm_area_struct *vma,
1416 /* 1419 /*
1417 * Be somewhat over-protective like KSM for now! 1420 * Be somewhat over-protective like KSM for now!
1418 */ 1421 */
1419 if (*vm_flags & (VM_HUGEPAGE | 1422 if (*vm_flags & (VM_HUGEPAGE | VM_NO_THP))
1420 VM_SHARED | VM_MAYSHARE |
1421 VM_PFNMAP | VM_IO | VM_DONTEXPAND |
1422 VM_RESERVED | VM_HUGETLB | VM_INSERTPAGE |
1423 VM_MIXEDMAP | VM_SAO))
1424 return -EINVAL; 1423 return -EINVAL;
1425 *vm_flags &= ~VM_NOHUGEPAGE; 1424 *vm_flags &= ~VM_NOHUGEPAGE;
1426 *vm_flags |= VM_HUGEPAGE; 1425 *vm_flags |= VM_HUGEPAGE;
@@ -1436,11 +1435,7 @@ int hugepage_madvise(struct vm_area_struct *vma,
1436 /* 1435 /*
1437 * Be somewhat over-protective like KSM for now! 1436 * Be somewhat over-protective like KSM for now!
1438 */ 1437 */
1439 if (*vm_flags & (VM_NOHUGEPAGE | 1438 if (*vm_flags & (VM_NOHUGEPAGE | VM_NO_THP))
1440 VM_SHARED | VM_MAYSHARE |
1441 VM_PFNMAP | VM_IO | VM_DONTEXPAND |
1442 VM_RESERVED | VM_HUGETLB | VM_INSERTPAGE |
1443 VM_MIXEDMAP | VM_SAO))
1444 return -EINVAL; 1439 return -EINVAL;
1445 *vm_flags &= ~VM_HUGEPAGE; 1440 *vm_flags &= ~VM_HUGEPAGE;
1446 *vm_flags |= VM_NOHUGEPAGE; 1441 *vm_flags |= VM_NOHUGEPAGE;
@@ -1574,10 +1569,14 @@ int khugepaged_enter_vma_merge(struct vm_area_struct *vma)
1574 * page fault if needed. 1569 * page fault if needed.
1575 */ 1570 */
1576 return 0; 1571 return 0;
1577 if (vma->vm_file || vma->vm_ops) 1572 if (vma->vm_ops)
1578 /* khugepaged not yet working on file or special mappings */ 1573 /* khugepaged not yet working on file or special mappings */
1579 return 0; 1574 return 0;
1580 VM_BUG_ON(is_linear_pfn_mapping(vma) || is_pfn_mapping(vma)); 1575 /*
1576 * If is_pfn_mapping() is true is_linear_pfn_mapping() must be
1577 * true too, verify it here.
1578 */
1579 VM_BUG_ON(is_linear_pfn_mapping(vma) || vma->vm_flags & VM_NO_THP);
1581 hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; 1580 hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
1582 hend = vma->vm_end & HPAGE_PMD_MASK; 1581 hend = vma->vm_end & HPAGE_PMD_MASK;
1583 if (hstart < hend) 1582 if (hstart < hend)
@@ -1828,12 +1827,15 @@ static void collapse_huge_page(struct mm_struct *mm,
1828 (vma->vm_flags & VM_NOHUGEPAGE)) 1827 (vma->vm_flags & VM_NOHUGEPAGE))
1829 goto out; 1828 goto out;
1830 1829
1831 /* VM_PFNMAP vmas may have vm_ops null but vm_file set */ 1830 if (!vma->anon_vma || vma->vm_ops)
1832 if (!vma->anon_vma || vma->vm_ops || vma->vm_file)
1833 goto out; 1831 goto out;
1834 if (is_vma_temporary_stack(vma)) 1832 if (is_vma_temporary_stack(vma))
1835 goto out; 1833 goto out;
1836 VM_BUG_ON(is_linear_pfn_mapping(vma) || is_pfn_mapping(vma)); 1834 /*
1835 * If is_pfn_mapping() is true is_linear_pfn_mapping() must be
1836 * true too, verify it here.
1837 */
1838 VM_BUG_ON(is_linear_pfn_mapping(vma) || vma->vm_flags & VM_NO_THP);
1837 1839
1838 pgd = pgd_offset(mm, address); 1840 pgd = pgd_offset(mm, address);
1839 if (!pgd_present(*pgd)) 1841 if (!pgd_present(*pgd))
@@ -2066,13 +2068,16 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
2066 progress++; 2068 progress++;
2067 continue; 2069 continue;
2068 } 2070 }
2069 /* VM_PFNMAP vmas may have vm_ops null but vm_file set */ 2071 if (!vma->anon_vma || vma->vm_ops)
2070 if (!vma->anon_vma || vma->vm_ops || vma->vm_file)
2071 goto skip; 2072 goto skip;
2072 if (is_vma_temporary_stack(vma)) 2073 if (is_vma_temporary_stack(vma))
2073 goto skip; 2074 goto skip;
2074 2075 /*
2075 VM_BUG_ON(is_linear_pfn_mapping(vma) || is_pfn_mapping(vma)); 2076 * If is_pfn_mapping() is true is_linear_pfn_mapping()
2077 * must be true too, verify it here.
2078 */
2079 VM_BUG_ON(is_linear_pfn_mapping(vma) ||
2080 vma->vm_flags & VM_NO_THP);
2076 2081
2077 hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; 2082 hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
2078 hend = vma->vm_end & HPAGE_PMD_MASK; 2083 hend = vma->vm_end & HPAGE_PMD_MASK;
diff --git a/mm/memory.c b/mm/memory.c
index ce22a250926f..61e66f026563 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1359,7 +1359,7 @@ split_fallthrough:
1359 */ 1359 */
1360 mark_page_accessed(page); 1360 mark_page_accessed(page);
1361 } 1361 }
1362 if (flags & FOLL_MLOCK) { 1362 if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
1363 /* 1363 /*
1364 * The preliminary mapping check is mainly to avoid the 1364 * The preliminary mapping check is mainly to avoid the
1365 * pointless overhead of lock_page on the ZERO_PAGE 1365 * pointless overhead of lock_page on the ZERO_PAGE
@@ -1412,9 +1412,8 @@ no_page_table:
1412 1412
1413static inline int stack_guard_page(struct vm_area_struct *vma, unsigned long addr) 1413static inline int stack_guard_page(struct vm_area_struct *vma, unsigned long addr)
1414{ 1414{
1415 return (vma->vm_flags & VM_GROWSDOWN) && 1415 return stack_guard_page_start(vma, addr) ||
1416 (vma->vm_start == addr) && 1416 stack_guard_page_end(vma, addr+PAGE_SIZE);
1417 !vma_stack_continue(vma->vm_prev, addr);
1418} 1417}
1419 1418
1420/** 1419/**
@@ -1551,13 +1550,6 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1551 continue; 1550 continue;
1552 } 1551 }
1553 1552
1554 /*
1555 * If we don't actually want the page itself,
1556 * and it's the stack guard page, just skip it.
1557 */
1558 if (!pages && stack_guard_page(vma, start))
1559 goto next_page;
1560
1561 do { 1553 do {
1562 struct page *page; 1554 struct page *page;
1563 unsigned int foll_flags = gup_flags; 1555 unsigned int foll_flags = gup_flags;
@@ -1574,6 +1566,11 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1574 int ret; 1566 int ret;
1575 unsigned int fault_flags = 0; 1567 unsigned int fault_flags = 0;
1576 1568
1569 /* For mlock, just skip the stack guard page. */
1570 if (foll_flags & FOLL_MLOCK) {
1571 if (stack_guard_page(vma, start))
1572 goto next_page;
1573 }
1577 if (foll_flags & FOLL_WRITE) 1574 if (foll_flags & FOLL_WRITE)
1578 fault_flags |= FAULT_FLAG_WRITE; 1575 fault_flags |= FAULT_FLAG_WRITE;
1579 if (nonblocking) 1576 if (nonblocking)
@@ -3396,7 +3393,7 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3396 * run pte_offset_map on the pmd, if an huge pmd could 3393 * run pte_offset_map on the pmd, if an huge pmd could
3397 * materialize from under us from a different thread. 3394 * materialize from under us from a different thread.
3398 */ 3395 */
3399 if (unlikely(__pte_alloc(mm, vma, pmd, address))) 3396 if (unlikely(pmd_none(*pmd)) && __pte_alloc(mm, vma, pmd, address))
3400 return VM_FAULT_OOM; 3397 return VM_FAULT_OOM;
3401 /* if an huge pmd materialized from under us just retry later */ 3398 /* if an huge pmd materialized from under us just retry later */
3402 if (unlikely(pmd_trans_huge(*pmd))) 3399 if (unlikely(pmd_trans_huge(*pmd)))
diff --git a/mm/mlock.c b/mm/mlock.c
index 6b55e3efe0df..516b2c2ddd5a 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -162,7 +162,7 @@ static long __mlock_vma_pages_range(struct vm_area_struct *vma,
162 VM_BUG_ON(end > vma->vm_end); 162 VM_BUG_ON(end > vma->vm_end);
163 VM_BUG_ON(!rwsem_is_locked(&mm->mmap_sem)); 163 VM_BUG_ON(!rwsem_is_locked(&mm->mmap_sem));
164 164
165 gup_flags = FOLL_TOUCH; 165 gup_flags = FOLL_TOUCH | FOLL_MLOCK;
166 /* 166 /*
167 * We want to touch writable mappings with a write fault in order 167 * We want to touch writable mappings with a write fault in order
168 * to break COW, except for shared mappings because these don't COW 168 * to break COW, except for shared mappings because these don't COW
@@ -178,9 +178,6 @@ static long __mlock_vma_pages_range(struct vm_area_struct *vma,
178 if (vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC)) 178 if (vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC))
179 gup_flags |= FOLL_FORCE; 179 gup_flags |= FOLL_FORCE;
180 180
181 if (vma->vm_flags & VM_LOCKED)
182 gup_flags |= FOLL_MLOCK;
183
184 return __get_user_pages(current, mm, addr, nr_pages, gup_flags, 181 return __get_user_pages(current, mm, addr, nr_pages, gup_flags,
185 NULL, NULL, nonblocking); 182 NULL, NULL, nonblocking);
186} 183}
diff --git a/mm/mmap.c b/mm/mmap.c
index e27e0cf0de03..772140c53ab1 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1767,10 +1767,13 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
1767 size = address - vma->vm_start; 1767 size = address - vma->vm_start;
1768 grow = (address - vma->vm_end) >> PAGE_SHIFT; 1768 grow = (address - vma->vm_end) >> PAGE_SHIFT;
1769 1769
1770 error = acct_stack_growth(vma, size, grow); 1770 error = -ENOMEM;
1771 if (!error) { 1771 if (vma->vm_pgoff + (size >> PAGE_SHIFT) >= vma->vm_pgoff) {
1772 vma->vm_end = address; 1772 error = acct_stack_growth(vma, size, grow);
1773 perf_event_mmap(vma); 1773 if (!error) {
1774 vma->vm_end = address;
1775 perf_event_mmap(vma);
1776 }
1774 } 1777 }
1775 } 1778 }
1776 vma_unlock_anon_vma(vma); 1779 vma_unlock_anon_vma(vma);
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 83fb72c108b7..f52e85c80e8d 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -172,10 +172,13 @@ unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *mem,
172 172
173 /* 173 /*
174 * The baseline for the badness score is the proportion of RAM that each 174 * The baseline for the badness score is the proportion of RAM that each
175 * task's rss and swap space use. 175 * task's rss, pagetable and swap space use.
176 */ 176 */
177 points = (get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS)) * 1000 / 177 points = get_mm_rss(p->mm) + p->mm->nr_ptes;
178 totalpages; 178 points += get_mm_counter(p->mm, MM_SWAPENTS);
179
180 points *= 1000;
181 points /= totalpages;
179 task_unlock(p); 182 task_unlock(p);
180 183
181 /* 184 /*
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 9f8a97b9a350..3f8bce264df6 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2317,6 +2317,21 @@ void free_pages(unsigned long addr, unsigned int order)
2317 2317
2318EXPORT_SYMBOL(free_pages); 2318EXPORT_SYMBOL(free_pages);
2319 2319
2320static void *make_alloc_exact(unsigned long addr, unsigned order, size_t size)
2321{
2322 if (addr) {
2323 unsigned long alloc_end = addr + (PAGE_SIZE << order);
2324 unsigned long used = addr + PAGE_ALIGN(size);
2325
2326 split_page(virt_to_page((void *)addr), order);
2327 while (used < alloc_end) {
2328 free_page(used);
2329 used += PAGE_SIZE;
2330 }
2331 }
2332 return (void *)addr;
2333}
2334
2320/** 2335/**
2321 * alloc_pages_exact - allocate an exact number physically-contiguous pages. 2336 * alloc_pages_exact - allocate an exact number physically-contiguous pages.
2322 * @size: the number of bytes to allocate 2337 * @size: the number of bytes to allocate
@@ -2336,22 +2351,33 @@ void *alloc_pages_exact(size_t size, gfp_t gfp_mask)
2336 unsigned long addr; 2351 unsigned long addr;
2337 2352
2338 addr = __get_free_pages(gfp_mask, order); 2353 addr = __get_free_pages(gfp_mask, order);
2339 if (addr) { 2354 return make_alloc_exact(addr, order, size);
2340 unsigned long alloc_end = addr + (PAGE_SIZE << order);
2341 unsigned long used = addr + PAGE_ALIGN(size);
2342
2343 split_page(virt_to_page((void *)addr), order);
2344 while (used < alloc_end) {
2345 free_page(used);
2346 used += PAGE_SIZE;
2347 }
2348 }
2349
2350 return (void *)addr;
2351} 2355}
2352EXPORT_SYMBOL(alloc_pages_exact); 2356EXPORT_SYMBOL(alloc_pages_exact);
2353 2357
2354/** 2358/**
2359 * alloc_pages_exact_nid - allocate an exact number of physically-contiguous
2360 * pages on a node.
2361 * @nid: the preferred node ID where memory should be allocated
2362 * @size: the number of bytes to allocate
2363 * @gfp_mask: GFP flags for the allocation
2364 *
2365 * Like alloc_pages_exact(), but try to allocate on node nid first before falling
2366 * back.
2367 * Note this is not alloc_pages_exact_node() which allocates on a specific node,
2368 * but is not exact.
2369 */
2370void *alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask)
2371{
2372 unsigned order = get_order(size);
2373 struct page *p = alloc_pages_node(nid, gfp_mask, order);
2374 if (!p)
2375 return NULL;
2376 return make_alloc_exact((unsigned long)page_address(p), order, size);
2377}
2378EXPORT_SYMBOL(alloc_pages_exact_nid);
2379
2380/**
2355 * free_pages_exact - release memory allocated via alloc_pages_exact() 2381 * free_pages_exact - release memory allocated via alloc_pages_exact()
2356 * @virt: the value returned by alloc_pages_exact. 2382 * @virt: the value returned by alloc_pages_exact.
2357 * @size: size of allocation, same value as passed to alloc_pages_exact(). 2383 * @size: size of allocation, same value as passed to alloc_pages_exact().
@@ -3564,7 +3590,7 @@ int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
3564 3590
3565 if (!slab_is_available()) { 3591 if (!slab_is_available()) {
3566 zone->wait_table = (wait_queue_head_t *) 3592 zone->wait_table = (wait_queue_head_t *)
3567 alloc_bootmem_node(pgdat, alloc_size); 3593 alloc_bootmem_node_nopanic(pgdat, alloc_size);
3568 } else { 3594 } else {
3569 /* 3595 /*
3570 * This case means that a zone whose size was 0 gets new memory 3596 * This case means that a zone whose size was 0 gets new memory
@@ -4141,7 +4167,8 @@ static void __init setup_usemap(struct pglist_data *pgdat,
4141 unsigned long usemapsize = usemap_size(zonesize); 4167 unsigned long usemapsize = usemap_size(zonesize);
4142 zone->pageblock_flags = NULL; 4168 zone->pageblock_flags = NULL;
4143 if (usemapsize) 4169 if (usemapsize)
4144 zone->pageblock_flags = alloc_bootmem_node(pgdat, usemapsize); 4170 zone->pageblock_flags = alloc_bootmem_node_nopanic(pgdat,
4171 usemapsize);
4145} 4172}
4146#else 4173#else
4147static inline void setup_usemap(struct pglist_data *pgdat, 4174static inline void setup_usemap(struct pglist_data *pgdat,
@@ -4307,7 +4334,7 @@ static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat)
4307 size = (end - start) * sizeof(struct page); 4334 size = (end - start) * sizeof(struct page);
4308 map = alloc_remap(pgdat->node_id, size); 4335 map = alloc_remap(pgdat->node_id, size);
4309 if (!map) 4336 if (!map)
4310 map = alloc_bootmem_node(pgdat, size); 4337 map = alloc_bootmem_node_nopanic(pgdat, size);
4311 pgdat->node_mem_map = map + (pgdat->node_start_pfn - start); 4338 pgdat->node_mem_map = map + (pgdat->node_start_pfn - start);
4312 } 4339 }
4313#ifndef CONFIG_NEED_MULTIPLE_NODES 4340#ifndef CONFIG_NEED_MULTIPLE_NODES
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
index 99055010cece..2daadc322ba6 100644
--- a/mm/page_cgroup.c
+++ b/mm/page_cgroup.c
@@ -134,7 +134,7 @@ static void *__init_refok alloc_page_cgroup(size_t size, int nid)
134{ 134{
135 void *addr = NULL; 135 void *addr = NULL;
136 136
137 addr = alloc_pages_exact(size, GFP_KERNEL | __GFP_NOWARN); 137 addr = alloc_pages_exact_nid(nid, size, GFP_KERNEL | __GFP_NOWARN);
138 if (addr) 138 if (addr)
139 return addr; 139 return addr;
140 140
diff --git a/mm/shmem.c b/mm/shmem.c
index 8fa27e4e582a..dfc7069102ee 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -852,7 +852,7 @@ static inline int shmem_find_swp(swp_entry_t entry, swp_entry_t *dir, swp_entry_
852 852
853static int shmem_unuse_inode(struct shmem_inode_info *info, swp_entry_t entry, struct page *page) 853static int shmem_unuse_inode(struct shmem_inode_info *info, swp_entry_t entry, struct page *page)
854{ 854{
855 struct inode *inode; 855 struct address_space *mapping;
856 unsigned long idx; 856 unsigned long idx;
857 unsigned long size; 857 unsigned long size;
858 unsigned long limit; 858 unsigned long limit;
@@ -875,8 +875,10 @@ static int shmem_unuse_inode(struct shmem_inode_info *info, swp_entry_t entry, s
875 if (size > SHMEM_NR_DIRECT) 875 if (size > SHMEM_NR_DIRECT)
876 size = SHMEM_NR_DIRECT; 876 size = SHMEM_NR_DIRECT;
877 offset = shmem_find_swp(entry, ptr, ptr+size); 877 offset = shmem_find_swp(entry, ptr, ptr+size);
878 if (offset >= 0) 878 if (offset >= 0) {
879 shmem_swp_balance_unmap();
879 goto found; 880 goto found;
881 }
880 if (!info->i_indirect) 882 if (!info->i_indirect)
881 goto lost2; 883 goto lost2;
882 884
@@ -914,11 +916,11 @@ static int shmem_unuse_inode(struct shmem_inode_info *info, swp_entry_t entry, s
914 if (size > ENTRIES_PER_PAGE) 916 if (size > ENTRIES_PER_PAGE)
915 size = ENTRIES_PER_PAGE; 917 size = ENTRIES_PER_PAGE;
916 offset = shmem_find_swp(entry, ptr, ptr+size); 918 offset = shmem_find_swp(entry, ptr, ptr+size);
917 shmem_swp_unmap(ptr);
918 if (offset >= 0) { 919 if (offset >= 0) {
919 shmem_dir_unmap(dir); 920 shmem_dir_unmap(dir);
920 goto found; 921 goto found;
921 } 922 }
923 shmem_swp_unmap(ptr);
922 } 924 }
923 } 925 }
924lost1: 926lost1:
@@ -928,8 +930,7 @@ lost2:
928 return 0; 930 return 0;
929found: 931found:
930 idx += offset; 932 idx += offset;
931 inode = igrab(&info->vfs_inode); 933 ptr += offset;
932 spin_unlock(&info->lock);
933 934
934 /* 935 /*
935 * Move _head_ to start search for next from here. 936 * Move _head_ to start search for next from here.
@@ -940,37 +941,18 @@ found:
940 */ 941 */
941 if (shmem_swaplist.next != &info->swaplist) 942 if (shmem_swaplist.next != &info->swaplist)
942 list_move_tail(&shmem_swaplist, &info->swaplist); 943 list_move_tail(&shmem_swaplist, &info->swaplist);
943 mutex_unlock(&shmem_swaplist_mutex);
944 944
945 error = 1;
946 if (!inode)
947 goto out;
948 /* 945 /*
949 * Charge page using GFP_KERNEL while we can wait. 946 * We rely on shmem_swaplist_mutex, not only to protect the swaplist,
950 * Charged back to the user(not to caller) when swap account is used. 947 * but also to hold up shmem_evict_inode(): so inode cannot be freed
951 * add_to_page_cache() will be called with GFP_NOWAIT. 948 * beneath us (pagelock doesn't help until the page is in pagecache).
952 */ 949 */
953 error = mem_cgroup_cache_charge(page, current->mm, GFP_KERNEL); 950 mapping = info->vfs_inode.i_mapping;
954 if (error) 951 error = add_to_page_cache_locked(page, mapping, idx, GFP_NOWAIT);
955 goto out; 952 /* which does mem_cgroup_uncharge_cache_page on error */
956 error = radix_tree_preload(GFP_KERNEL);
957 if (error) {
958 mem_cgroup_uncharge_cache_page(page);
959 goto out;
960 }
961 error = 1;
962
963 spin_lock(&info->lock);
964 ptr = shmem_swp_entry(info, idx, NULL);
965 if (ptr && ptr->val == entry.val) {
966 error = add_to_page_cache_locked(page, inode->i_mapping,
967 idx, GFP_NOWAIT);
968 /* does mem_cgroup_uncharge_cache_page on error */
969 } else /* we must compensate for our precharge above */
970 mem_cgroup_uncharge_cache_page(page);
971 953
972 if (error == -EEXIST) { 954 if (error == -EEXIST) {
973 struct page *filepage = find_get_page(inode->i_mapping, idx); 955 struct page *filepage = find_get_page(mapping, idx);
974 error = 1; 956 error = 1;
975 if (filepage) { 957 if (filepage) {
976 /* 958 /*
@@ -990,14 +972,8 @@ found:
990 swap_free(entry); 972 swap_free(entry);
991 error = 1; /* not an error, but entry was found */ 973 error = 1; /* not an error, but entry was found */
992 } 974 }
993 if (ptr) 975 shmem_swp_unmap(ptr);
994 shmem_swp_unmap(ptr);
995 spin_unlock(&info->lock); 976 spin_unlock(&info->lock);
996 radix_tree_preload_end();
997out:
998 unlock_page(page);
999 page_cache_release(page);
1000 iput(inode); /* allows for NULL */
1001 return error; 977 return error;
1002} 978}
1003 979
@@ -1009,6 +985,26 @@ int shmem_unuse(swp_entry_t entry, struct page *page)
1009 struct list_head *p, *next; 985 struct list_head *p, *next;
1010 struct shmem_inode_info *info; 986 struct shmem_inode_info *info;
1011 int found = 0; 987 int found = 0;
988 int error;
989
990 /*
991 * Charge page using GFP_KERNEL while we can wait, before taking
992 * the shmem_swaplist_mutex which might hold up shmem_writepage().
993 * Charged back to the user (not to caller) when swap account is used.
994 * add_to_page_cache() will be called with GFP_NOWAIT.
995 */
996 error = mem_cgroup_cache_charge(page, current->mm, GFP_KERNEL);
997 if (error)
998 goto out;
999 /*
1000 * Try to preload while we can wait, to not make a habit of
1001 * draining atomic reserves; but don't latch on to this cpu,
1002 * it's okay if sometimes we get rescheduled after this.
1003 */
1004 error = radix_tree_preload(GFP_KERNEL);
1005 if (error)
1006 goto uncharge;
1007 radix_tree_preload_end();
1012 1008
1013 mutex_lock(&shmem_swaplist_mutex); 1009 mutex_lock(&shmem_swaplist_mutex);
1014 list_for_each_safe(p, next, &shmem_swaplist) { 1010 list_for_each_safe(p, next, &shmem_swaplist) {
@@ -1016,17 +1012,19 @@ int shmem_unuse(swp_entry_t entry, struct page *page)
1016 found = shmem_unuse_inode(info, entry, page); 1012 found = shmem_unuse_inode(info, entry, page);
1017 cond_resched(); 1013 cond_resched();
1018 if (found) 1014 if (found)
1019 goto out; 1015 break;
1020 } 1016 }
1021 mutex_unlock(&shmem_swaplist_mutex); 1017 mutex_unlock(&shmem_swaplist_mutex);
1022 /* 1018
1023 * Can some race bring us here? We've been holding page lock, 1019uncharge:
1024 * so I think not; but would rather try again later than BUG() 1020 if (!found)
1025 */ 1021 mem_cgroup_uncharge_cache_page(page);
1022 if (found < 0)
1023 error = found;
1024out:
1026 unlock_page(page); 1025 unlock_page(page);
1027 page_cache_release(page); 1026 page_cache_release(page);
1028out: 1027 return error;
1029 return (found < 0) ? found : 0;
1030} 1028}
1031 1029
1032/* 1030/*
@@ -1064,7 +1062,25 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
1064 else 1062 else
1065 swap.val = 0; 1063 swap.val = 0;
1066 1064
1065 /*
1066 * Add inode to shmem_unuse()'s list of swapped-out inodes,
1067 * if it's not already there. Do it now because we cannot take
1068 * mutex while holding spinlock, and must do so before the page
1069 * is moved to swap cache, when its pagelock no longer protects
1070 * the inode from eviction. But don't unlock the mutex until
1071 * we've taken the spinlock, because shmem_unuse_inode() will
1072 * prune a !swapped inode from the swaplist under both locks.
1073 */
1074 if (swap.val) {
1075 mutex_lock(&shmem_swaplist_mutex);
1076 if (list_empty(&info->swaplist))
1077 list_add_tail(&info->swaplist, &shmem_swaplist);
1078 }
1079
1067 spin_lock(&info->lock); 1080 spin_lock(&info->lock);
1081 if (swap.val)
1082 mutex_unlock(&shmem_swaplist_mutex);
1083
1068 if (index >= info->next_index) { 1084 if (index >= info->next_index) {
1069 BUG_ON(!(info->flags & SHMEM_TRUNCATE)); 1085 BUG_ON(!(info->flags & SHMEM_TRUNCATE));
1070 goto unlock; 1086 goto unlock;
@@ -1084,21 +1100,10 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
1084 delete_from_page_cache(page); 1100 delete_from_page_cache(page);
1085 shmem_swp_set(info, entry, swap.val); 1101 shmem_swp_set(info, entry, swap.val);
1086 shmem_swp_unmap(entry); 1102 shmem_swp_unmap(entry);
1087 if (list_empty(&info->swaplist))
1088 inode = igrab(inode);
1089 else
1090 inode = NULL;
1091 spin_unlock(&info->lock); 1103 spin_unlock(&info->lock);
1092 swap_shmem_alloc(swap); 1104 swap_shmem_alloc(swap);
1093 BUG_ON(page_mapped(page)); 1105 BUG_ON(page_mapped(page));
1094 swap_writepage(page, wbc); 1106 swap_writepage(page, wbc);
1095 if (inode) {
1096 mutex_lock(&shmem_swaplist_mutex);
1097 /* move instead of add in case we're racing */
1098 list_move_tail(&info->swaplist, &shmem_swaplist);
1099 mutex_unlock(&shmem_swaplist_mutex);
1100 iput(inode);
1101 }
1102 return 0; 1107 return 0;
1103 } 1108 }
1104 1109
@@ -1400,20 +1405,14 @@ repeat:
1400 if (sbinfo->max_blocks) { 1405 if (sbinfo->max_blocks) {
1401 if (percpu_counter_compare(&sbinfo->used_blocks, 1406 if (percpu_counter_compare(&sbinfo->used_blocks,
1402 sbinfo->max_blocks) >= 0 || 1407 sbinfo->max_blocks) >= 0 ||
1403 shmem_acct_block(info->flags)) { 1408 shmem_acct_block(info->flags))
1404 spin_unlock(&info->lock); 1409 goto nospace;
1405 error = -ENOSPC;
1406 goto failed;
1407 }
1408 percpu_counter_inc(&sbinfo->used_blocks); 1410 percpu_counter_inc(&sbinfo->used_blocks);
1409 spin_lock(&inode->i_lock); 1411 spin_lock(&inode->i_lock);
1410 inode->i_blocks += BLOCKS_PER_PAGE; 1412 inode->i_blocks += BLOCKS_PER_PAGE;
1411 spin_unlock(&inode->i_lock); 1413 spin_unlock(&inode->i_lock);
1412 } else if (shmem_acct_block(info->flags)) { 1414 } else if (shmem_acct_block(info->flags))
1413 spin_unlock(&info->lock); 1415 goto nospace;
1414 error = -ENOSPC;
1415 goto failed;
1416 }
1417 1416
1418 if (!filepage) { 1417 if (!filepage) {
1419 int ret; 1418 int ret;
@@ -1493,6 +1492,24 @@ done:
1493 error = 0; 1492 error = 0;
1494 goto out; 1493 goto out;
1495 1494
1495nospace:
1496 /*
1497 * Perhaps the page was brought in from swap between find_lock_page
1498 * and taking info->lock? We allow for that at add_to_page_cache_lru,
1499 * but must also avoid reporting a spurious ENOSPC while working on a
1500 * full tmpfs. (When filepage has been passed in to shmem_getpage, it
1501 * is already in page cache, which prevents this race from occurring.)
1502 */
1503 if (!filepage) {
1504 struct page *page = find_get_page(mapping, idx);
1505 if (page) {
1506 spin_unlock(&info->lock);
1507 page_cache_release(page);
1508 goto repeat;
1509 }
1510 }
1511 spin_unlock(&info->lock);
1512 error = -ENOSPC;
1496failed: 1513failed:
1497 if (*pagep != filepage) { 1514 if (*pagep != filepage) {
1498 unlock_page(filepage); 1515 unlock_page(filepage);
diff --git a/mm/slub.c b/mm/slub.c
index 94d2a33a866e..9d2e5e46bf09 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1940,7 +1940,7 @@ redo:
1940 * Since this is without lock semantics the protection is only against 1940 * Since this is without lock semantics the protection is only against
1941 * code executing on this cpu *not* from access by other cpus. 1941 * code executing on this cpu *not* from access by other cpus.
1942 */ 1942 */
1943 if (unlikely(!this_cpu_cmpxchg_double( 1943 if (unlikely(!irqsafe_cpu_cmpxchg_double(
1944 s->cpu_slab->freelist, s->cpu_slab->tid, 1944 s->cpu_slab->freelist, s->cpu_slab->tid,
1945 object, tid, 1945 object, tid,
1946 get_freepointer(s, object), next_tid(tid)))) { 1946 get_freepointer(s, object), next_tid(tid)))) {
@@ -2145,7 +2145,7 @@ redo:
2145 set_freepointer(s, object, c->freelist); 2145 set_freepointer(s, object, c->freelist);
2146 2146
2147#ifdef CONFIG_CMPXCHG_LOCAL 2147#ifdef CONFIG_CMPXCHG_LOCAL
2148 if (unlikely(!this_cpu_cmpxchg_double( 2148 if (unlikely(!irqsafe_cpu_cmpxchg_double(
2149 s->cpu_slab->freelist, s->cpu_slab->tid, 2149 s->cpu_slab->freelist, s->cpu_slab->tid,
2150 c->freelist, tid, 2150 c->freelist, tid,
2151 object, next_tid(tid)))) { 2151 object, next_tid(tid)))) {
diff --git a/mm/swap.c b/mm/swap.c
index a448db377cb0..5602f1a1b1e7 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -396,6 +396,9 @@ static void lru_deactivate_fn(struct page *page, void *arg)
396 if (!PageLRU(page)) 396 if (!PageLRU(page))
397 return; 397 return;
398 398
399 if (PageUnevictable(page))
400 return;
401
399 /* Some processes are using the page */ 402 /* Some processes are using the page */
400 if (page_mapped(page)) 403 if (page_mapped(page))
401 return; 404 return;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index f6b435c80079..8bfd45050a61 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -937,7 +937,7 @@ keep_lumpy:
937 * back off and wait for congestion to clear because further reclaim 937 * back off and wait for congestion to clear because further reclaim
938 * will encounter the same problem 938 * will encounter the same problem
939 */ 939 */
940 if (nr_dirty == nr_congested && nr_dirty != 0) 940 if (nr_dirty && nr_dirty == nr_congested && scanning_global_lru(sc))
941 zone_set_flag(zone, ZONE_CONGESTED); 941 zone_set_flag(zone, ZONE_CONGESTED);
942 942
943 free_page_list(&free_pages); 943 free_page_list(&free_pages);