Diffstat (limited to 'mm')
-rw-r--r-- | mm/huge_memory.c | 43
-rw-r--r-- | mm/memory.c | 21
-rw-r--r-- | mm/mlock.c | 5
-rw-r--r-- | mm/mmap.c | 11
-rw-r--r-- | mm/oom_kill.c | 9
-rw-r--r-- | mm/page_alloc.c | 57
-rw-r--r-- | mm/page_cgroup.c | 2
-rw-r--r-- | mm/shmem.c | 149
-rw-r--r-- | mm/slub.c | 4
-rw-r--r-- | mm/swap.c | 3
-rw-r--r-- | mm/vmscan.c | 2
11 files changed, 179 insertions, 127 deletions
diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 470dcda10add..83326ad66d9b 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c | |||
@@ -1408,6 +1408,9 @@ out: | |||
1408 | return ret; | 1408 | return ret; |
1409 | } | 1409 | } |
1410 | 1410 | ||
1411 | #define VM_NO_THP (VM_SPECIAL|VM_INSERTPAGE|VM_MIXEDMAP|VM_SAO| \ | ||
1412 | VM_HUGETLB|VM_SHARED|VM_MAYSHARE) | ||
1413 | |||
1411 | int hugepage_madvise(struct vm_area_struct *vma, | 1414 | int hugepage_madvise(struct vm_area_struct *vma, |
1412 | unsigned long *vm_flags, int advice) | 1415 | unsigned long *vm_flags, int advice) |
1413 | { | 1416 | { |
@@ -1416,11 +1419,7 @@ int hugepage_madvise(struct vm_area_struct *vma, | |||
1416 | /* | 1419 | /* |
1417 | * Be somewhat over-protective like KSM for now! | 1420 | * Be somewhat over-protective like KSM for now! |
1418 | */ | 1421 | */ |
1419 | if (*vm_flags & (VM_HUGEPAGE | | 1422 | if (*vm_flags & (VM_HUGEPAGE | VM_NO_THP)) |
1420 | VM_SHARED | VM_MAYSHARE | | ||
1421 | VM_PFNMAP | VM_IO | VM_DONTEXPAND | | ||
1422 | VM_RESERVED | VM_HUGETLB | VM_INSERTPAGE | | ||
1423 | VM_MIXEDMAP | VM_SAO)) | ||
1424 | return -EINVAL; | 1423 | return -EINVAL; |
1425 | *vm_flags &= ~VM_NOHUGEPAGE; | 1424 | *vm_flags &= ~VM_NOHUGEPAGE; |
1426 | *vm_flags |= VM_HUGEPAGE; | 1425 | *vm_flags |= VM_HUGEPAGE; |
@@ -1436,11 +1435,7 @@ int hugepage_madvise(struct vm_area_struct *vma, | |||
1436 | /* | 1435 | /* |
1437 | * Be somewhat over-protective like KSM for now! | 1436 | * Be somewhat over-protective like KSM for now! |
1438 | */ | 1437 | */ |
1439 | if (*vm_flags & (VM_NOHUGEPAGE | | 1438 | if (*vm_flags & (VM_NOHUGEPAGE | VM_NO_THP)) |
1440 | VM_SHARED | VM_MAYSHARE | | ||
1441 | VM_PFNMAP | VM_IO | VM_DONTEXPAND | | ||
1442 | VM_RESERVED | VM_HUGETLB | VM_INSERTPAGE | | ||
1443 | VM_MIXEDMAP | VM_SAO)) | ||
1444 | return -EINVAL; | 1439 | return -EINVAL; |
1445 | *vm_flags &= ~VM_HUGEPAGE; | 1440 | *vm_flags &= ~VM_HUGEPAGE; |
1446 | *vm_flags |= VM_NOHUGEPAGE; | 1441 | *vm_flags |= VM_NOHUGEPAGE; |
@@ -1574,10 +1569,14 @@ int khugepaged_enter_vma_merge(struct vm_area_struct *vma) | |||
1574 | * page fault if needed. | 1569 | * page fault if needed. |
1575 | */ | 1570 | */ |
1576 | return 0; | 1571 | return 0; |
1577 | if (vma->vm_file || vma->vm_ops) | 1572 | if (vma->vm_ops) |
1578 | /* khugepaged not yet working on file or special mappings */ | 1573 | /* khugepaged not yet working on file or special mappings */ |
1579 | return 0; | 1574 | return 0; |
1580 | VM_BUG_ON(is_linear_pfn_mapping(vma) || is_pfn_mapping(vma)); | 1575 | /* |
1576 | * If is_pfn_mapping() is true is_linear_pfn_mapping() must be | ||
1577 | * true too, verify it here. | ||
1578 | */ | ||
1579 | VM_BUG_ON(is_linear_pfn_mapping(vma) || vma->vm_flags & VM_NO_THP); | ||
1581 | hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; | 1580 | hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; |
1582 | hend = vma->vm_end & HPAGE_PMD_MASK; | 1581 | hend = vma->vm_end & HPAGE_PMD_MASK; |
1583 | if (hstart < hend) | 1582 | if (hstart < hend) |
@@ -1828,12 +1827,15 @@ static void collapse_huge_page(struct mm_struct *mm, | |||
1828 | (vma->vm_flags & VM_NOHUGEPAGE)) | 1827 | (vma->vm_flags & VM_NOHUGEPAGE)) |
1829 | goto out; | 1828 | goto out; |
1830 | 1829 | ||
1831 | /* VM_PFNMAP vmas may have vm_ops null but vm_file set */ | 1830 | if (!vma->anon_vma || vma->vm_ops) |
1832 | if (!vma->anon_vma || vma->vm_ops || vma->vm_file) | ||
1833 | goto out; | 1831 | goto out; |
1834 | if (is_vma_temporary_stack(vma)) | 1832 | if (is_vma_temporary_stack(vma)) |
1835 | goto out; | 1833 | goto out; |
1836 | VM_BUG_ON(is_linear_pfn_mapping(vma) || is_pfn_mapping(vma)); | 1834 | /* |
1835 | * If is_pfn_mapping() is true is_linear_pfn_mapping() must be | ||
1836 | * true too, verify it here. | ||
1837 | */ | ||
1838 | VM_BUG_ON(is_linear_pfn_mapping(vma) || vma->vm_flags & VM_NO_THP); | ||
1837 | 1839 | ||
1838 | pgd = pgd_offset(mm, address); | 1840 | pgd = pgd_offset(mm, address); |
1839 | if (!pgd_present(*pgd)) | 1841 | if (!pgd_present(*pgd)) |
@@ -2066,13 +2068,16 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, | |||
2066 | progress++; | 2068 | progress++; |
2067 | continue; | 2069 | continue; |
2068 | } | 2070 | } |
2069 | /* VM_PFNMAP vmas may have vm_ops null but vm_file set */ | 2071 | if (!vma->anon_vma || vma->vm_ops) |
2070 | if (!vma->anon_vma || vma->vm_ops || vma->vm_file) | ||
2071 | goto skip; | 2072 | goto skip; |
2072 | if (is_vma_temporary_stack(vma)) | 2073 | if (is_vma_temporary_stack(vma)) |
2073 | goto skip; | 2074 | goto skip; |
2074 | 2075 | /* | |
2075 | VM_BUG_ON(is_linear_pfn_mapping(vma) || is_pfn_mapping(vma)); | 2076 | * If is_pfn_mapping() is true is_linear_pfn_mapping()
2077 | * must be true too, verify it here. | ||
2078 | */ | ||
2079 | VM_BUG_ON(is_linear_pfn_mapping(vma) || | ||
2080 | vma->vm_flags & VM_NO_THP); | ||
2076 | 2081 | ||
2077 | hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; | 2082 | hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; |
2078 | hend = vma->vm_end & HPAGE_PMD_MASK; | 2083 | hend = vma->vm_end & HPAGE_PMD_MASK; |
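The VM_NO_THP mask introduced above (VM_SPECIAL, VM_INSERTPAGE, VM_MIXEDMAP, VM_SAO, VM_HUGETLB, VM_SHARED, VM_MAYSHARE) collapses the two long flag lists into one check, so MADV_HUGEPAGE and MADV_NOHUGEPAGE keep succeeding only on private anonymous mappings. A minimal userspace sketch of the accepted case, assuming a THP-enabled kernel; the buffer size is arbitrary:

#include <stdio.h>
#include <sys/mman.h>

#ifndef MADV_HUGEPAGE
#define MADV_HUGEPAGE 14	/* older libc headers may not define it */
#endif

int main(void)
{
	size_t len = 4UL << 20;	/* 4 MiB, arbitrary but larger than one huge page */

	/* Private anonymous mapping: none of the VM_NO_THP flags apply. */
	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (p == MAP_FAILED)
		return 1;

	/* Reaches hugepage_madvise(), which sets VM_HUGEPAGE on the vma;
	 * a MAP_SHARED or PFN mapping would hit VM_NO_THP and get EINVAL. */
	if (madvise(p, len, MADV_HUGEPAGE))
		perror("madvise(MADV_HUGEPAGE)");

	munmap(p, len);
	return 0;
}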
diff --git a/mm/memory.c b/mm/memory.c index ce22a250926f..61e66f026563 100644 --- a/mm/memory.c +++ b/mm/memory.c | |||
@@ -1359,7 +1359,7 @@ split_fallthrough: | |||
1359 | */ | 1359 | */ |
1360 | mark_page_accessed(page); | 1360 | mark_page_accessed(page); |
1361 | } | 1361 | } |
1362 | if (flags & FOLL_MLOCK) { | 1362 | if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) { |
1363 | /* | 1363 | /* |
1364 | * The preliminary mapping check is mainly to avoid the | 1364 | * The preliminary mapping check is mainly to avoid the |
1365 | * pointless overhead of lock_page on the ZERO_PAGE | 1365 | * pointless overhead of lock_page on the ZERO_PAGE |
@@ -1412,9 +1412,8 @@ no_page_table: | |||
1412 | 1412 | ||
1413 | static inline int stack_guard_page(struct vm_area_struct *vma, unsigned long addr) | 1413 | static inline int stack_guard_page(struct vm_area_struct *vma, unsigned long addr) |
1414 | { | 1414 | { |
1415 | return (vma->vm_flags & VM_GROWSDOWN) && | 1415 | return stack_guard_page_start(vma, addr) || |
1416 | (vma->vm_start == addr) && | 1416 | stack_guard_page_end(vma, addr+PAGE_SIZE); |
1417 | !vma_stack_continue(vma->vm_prev, addr); | ||
1418 | } | 1417 | } |
1419 | 1418 | ||
1420 | /** | 1419 | /** |
@@ -1551,13 +1550,6 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | |||
1551 | continue; | 1550 | continue; |
1552 | } | 1551 | } |
1553 | 1552 | ||
1554 | /* | ||
1555 | * If we don't actually want the page itself, | ||
1556 | * and it's the stack guard page, just skip it. | ||
1557 | */ | ||
1558 | if (!pages && stack_guard_page(vma, start)) | ||
1559 | goto next_page; | ||
1560 | |||
1561 | do { | 1553 | do { |
1562 | struct page *page; | 1554 | struct page *page; |
1563 | unsigned int foll_flags = gup_flags; | 1555 | unsigned int foll_flags = gup_flags; |
@@ -1574,6 +1566,11 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | |||
1574 | int ret; | 1566 | int ret; |
1575 | unsigned int fault_flags = 0; | 1567 | unsigned int fault_flags = 0; |
1576 | 1568 | ||
1569 | /* For mlock, just skip the stack guard page. */ | ||
1570 | if (foll_flags & FOLL_MLOCK) { | ||
1571 | if (stack_guard_page(vma, start)) | ||
1572 | goto next_page; | ||
1573 | } | ||
1577 | if (foll_flags & FOLL_WRITE) | 1574 | if (foll_flags & FOLL_WRITE) |
1578 | fault_flags |= FAULT_FLAG_WRITE; | 1575 | fault_flags |= FAULT_FLAG_WRITE; |
1579 | if (nonblocking) | 1576 | if (nonblocking) |
@@ -3396,7 +3393,7 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
3396 | * run pte_offset_map on the pmd, if an huge pmd could | 3393 | * run pte_offset_map on the pmd, if an huge pmd could |
3397 | * materialize from under us from a different thread. | 3394 | * materialize from under us from a different thread. |
3398 | */ | 3395 | */ |
3399 | if (unlikely(__pte_alloc(mm, vma, pmd, address))) | 3396 | if (unlikely(pmd_none(*pmd)) && __pte_alloc(mm, vma, pmd, address)) |
3400 | return VM_FAULT_OOM; | 3397 | return VM_FAULT_OOM; |
3401 | /* if an huge pmd materialized from under us just retry later */ | 3398 | /* if an huge pmd materialized from under us just retry later */ |
3402 | if (unlikely(pmd_trans_huge(*pmd))) | 3399 | if (unlikely(pmd_trans_huge(*pmd))) |
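The handle_mm_fault() change above only calls __pte_alloc() when pmd_none() says the entry is still empty, so the common already-populated case never enters the allocator (which has to recheck under the page-table lock anyway). A simplified standalone sketch of that test-before-allocate pattern; the names are invented for illustration, not the kernel's:

#include <stdlib.h>

struct dir { void **table; };	/* stand-in for a pmd entry pointing to a pte page */

/* Analogue of __pte_alloc(): allocate, then recheck in case another
 * thread populated the entry first, and back out if so. */
static int table_alloc(struct dir *d)
{
	void **new = calloc(512, sizeof(void *));
	if (!new)
		return -1;		/* analogue of returning VM_FAULT_OOM */
	if (!d->table)			/* the real code rechecks under a lock */
		d->table = new;
	else
		free(new);
	return 0;
}

static int handle_fault(struct dir *d)
{
	/* Cheap emptiness test first: if the entry is already populated,
	 * the unlikely allocation path is skipped entirely. */
	if (!d->table && table_alloc(d))
		return -1;
	/* ... continue with d->table ... */
	return 0;
}

int main(void)
{
	struct dir d = { 0 };
	return handle_fault(&d);
}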
diff --git a/mm/mlock.c b/mm/mlock.c index 6b55e3efe0df..516b2c2ddd5a 100644 --- a/mm/mlock.c +++ b/mm/mlock.c | |||
@@ -162,7 +162,7 @@ static long __mlock_vma_pages_range(struct vm_area_struct *vma, | |||
162 | VM_BUG_ON(end > vma->vm_end); | 162 | VM_BUG_ON(end > vma->vm_end); |
163 | VM_BUG_ON(!rwsem_is_locked(&mm->mmap_sem)); | 163 | VM_BUG_ON(!rwsem_is_locked(&mm->mmap_sem)); |
164 | 164 | ||
165 | gup_flags = FOLL_TOUCH; | 165 | gup_flags = FOLL_TOUCH | FOLL_MLOCK; |
166 | /* | 166 | /* |
167 | * We want to touch writable mappings with a write fault in order | 167 | * We want to touch writable mappings with a write fault in order |
168 | * to break COW, except for shared mappings because these don't COW | 168 | * to break COW, except for shared mappings because these don't COW |
@@ -178,9 +178,6 @@ static long __mlock_vma_pages_range(struct vm_area_struct *vma, | |||
178 | if (vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC)) | 178 | if (vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC)) |
179 | gup_flags |= FOLL_FORCE; | 179 | gup_flags |= FOLL_FORCE; |
180 | 180 | ||
181 | if (vma->vm_flags & VM_LOCKED) | ||
182 | gup_flags |= FOLL_MLOCK; | ||
183 | |||
184 | return __get_user_pages(current, mm, addr, nr_pages, gup_flags, | 181 | return __get_user_pages(current, mm, addr, nr_pages, gup_flags, |
185 | NULL, NULL, nonblocking); | 182 | NULL, NULL, nonblocking); |
186 | } | 183 | } |
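With the hunk above, __mlock_vma_pages_range() always passes FOLL_TOUCH | FOLL_MLOCK, and the VM_LOCKED test moved into the mm/memory.c follow-page path, so the flag is simply ignored for vmas that are no longer locked. Userspace behaviour is unchanged; a small sketch of the path being exercised, with arbitrary sizes:

#include <stdio.h>
#include <sys/mman.h>

int main(void)
{
	size_t len = 1UL << 20;		/* 1 MiB, arbitrary */
	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (p == MAP_FAILED)
		return 1;

	/* mlock() populates the range via __mlock_vma_pages_range() ->
	 * __get_user_pages(); with this patch stack guard pages are
	 * skipped inside gup only when FOLL_MLOCK is set. */
	if (mlock(p, len))
		perror("mlock");	/* e.g. ENOMEM when RLIMIT_MEMLOCK is too low */

	munlock(p, len);
	munmap(p, len);
	return 0;
}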
diff --git a/mm/mmap.c b/mm/mmap.c --- a/mm/mmap.c +++ b/mm/mmap.c | |||
@@ -1767,10 +1767,13 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) | |||
1767 | size = address - vma->vm_start; | 1767 | size = address - vma->vm_start; |
1768 | grow = (address - vma->vm_end) >> PAGE_SHIFT; | 1768 | grow = (address - vma->vm_end) >> PAGE_SHIFT; |
1769 | 1769 | ||
1770 | error = acct_stack_growth(vma, size, grow); | 1770 | error = -ENOMEM; |
1771 | if (!error) { | 1771 | if (vma->vm_pgoff + (size >> PAGE_SHIFT) >= vma->vm_pgoff) { |
1772 | vma->vm_end = address; | 1772 | error = acct_stack_growth(vma, size, grow); |
1773 | perf_event_mmap(vma); | 1773 | if (!error) { |
1774 | vma->vm_end = address; | ||
1775 | perf_event_mmap(vma); | ||
1776 | } | ||
1774 | } | 1777 | } |
1775 | } | 1778 | } |
1776 | vma_unlock_anon_vma(vma); | 1779 | vma_unlock_anon_vma(vma); |
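The new test in expand_upwards() above, vma->vm_pgoff + (size >> PAGE_SHIFT) >= vma->vm_pgoff, is the usual unsigned wrap-around check: if the sum comes out smaller than the starting offset, the page offset overflowed and the expansion is refused with -ENOMEM. Illustrated standalone with made-up values:

#include <stdio.h>
#include <limits.h>

/* Returns 1 when off + extra does not wrap an unsigned long, mirroring
 * the guard added to expand_upwards() above. */
static int pgoff_fits(unsigned long off, unsigned long extra)
{
	return off + extra >= off;
}

int main(void)
{
	printf("%d\n", pgoff_fits(100UL, 25UL));		/* 1: fine */
	printf("%d\n", pgoff_fits(ULONG_MAX - 10, 25UL));	/* 0: would wrap, refuse */
	return 0;
}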
diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 83fb72c108b7..f52e85c80e8d 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c | |||
@@ -172,10 +172,13 @@ unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *mem, | |||
172 | 172 | ||
173 | /* | 173 | /* |
174 | * The baseline for the badness score is the proportion of RAM that each | 174 | * The baseline for the badness score is the proportion of RAM that each |
175 | * task's rss and swap space use. | 175 | * task's rss, pagetable and swap space use. |
176 | */ | 176 | */ |
177 | points = (get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS)) * 1000 / | 177 | points = get_mm_rss(p->mm) + p->mm->nr_ptes; |
178 | totalpages; | 178 | points += get_mm_counter(p->mm, MM_SWAPENTS); |
179 | |||
180 | points *= 1000; | ||
181 | points /= totalpages; | ||
179 | task_unlock(p); | 182 | task_unlock(p); |
180 | 183 | ||
181 | /* | 184 | /* |
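The rewritten oom_badness() above counts page-table pages (mm->nr_ptes) alongside rss and swap entries, then multiplies by 1000 and divides by totalpages, so the score becomes a per-mille share of the memory the task is allowed. A worked example with invented numbers:

#include <stdio.h>

/* Simplified core of the new oom_badness() arithmetic; everything is in
 * pages and the values below are made up purely to show the calculation. */
int main(void)
{
	unsigned long rss        = 200000;	/* get_mm_rss() */
	unsigned long nr_ptes    = 500;		/* mm->nr_ptes */
	unsigned long swapents   = 30000;	/* MM_SWAPENTS counter */
	unsigned long totalpages = 2000000;	/* RAM + swap allowed to this task */

	unsigned long points = rss + nr_ptes + swapents;
	points *= 1000;
	points /= totalpages;

	printf("badness = %lu / 1000\n", points);	/* 230500 * 1000 / 2000000 = 115 */
	return 0;
}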
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 9f8a97b9a350..3f8bce264df6 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
@@ -2317,6 +2317,21 @@ void free_pages(unsigned long addr, unsigned int order) | |||
2317 | 2317 | ||
2318 | EXPORT_SYMBOL(free_pages); | 2318 | EXPORT_SYMBOL(free_pages); |
2319 | 2319 | ||
2320 | static void *make_alloc_exact(unsigned long addr, unsigned order, size_t size) | ||
2321 | { | ||
2322 | if (addr) { | ||
2323 | unsigned long alloc_end = addr + (PAGE_SIZE << order); | ||
2324 | unsigned long used = addr + PAGE_ALIGN(size); | ||
2325 | |||
2326 | split_page(virt_to_page((void *)addr), order); | ||
2327 | while (used < alloc_end) { | ||
2328 | free_page(used); | ||
2329 | used += PAGE_SIZE; | ||
2330 | } | ||
2331 | } | ||
2332 | return (void *)addr; | ||
2333 | } | ||
2334 | |||
2320 | /** | 2335 | /** |
2321 | * alloc_pages_exact - allocate an exact number physically-contiguous pages. | 2336 | * alloc_pages_exact - allocate an exact number physically-contiguous pages. |
2322 | * @size: the number of bytes to allocate | 2337 | * @size: the number of bytes to allocate |
@@ -2336,22 +2351,33 @@ void *alloc_pages_exact(size_t size, gfp_t gfp_mask) | |||
2336 | unsigned long addr; | 2351 | unsigned long addr; |
2337 | 2352 | ||
2338 | addr = __get_free_pages(gfp_mask, order); | 2353 | addr = __get_free_pages(gfp_mask, order); |
2339 | if (addr) { | 2354 | return make_alloc_exact(addr, order, size); |
2340 | unsigned long alloc_end = addr + (PAGE_SIZE << order); | ||
2341 | unsigned long used = addr + PAGE_ALIGN(size); | ||
2342 | |||
2343 | split_page(virt_to_page((void *)addr), order); | ||
2344 | while (used < alloc_end) { | ||
2345 | free_page(used); | ||
2346 | used += PAGE_SIZE; | ||
2347 | } | ||
2348 | } | ||
2349 | |||
2350 | return (void *)addr; | ||
2351 | } | 2355 | } |
2352 | EXPORT_SYMBOL(alloc_pages_exact); | 2356 | EXPORT_SYMBOL(alloc_pages_exact); |
2353 | 2357 | ||
2354 | /** | 2358 | /** |
2359 | * alloc_pages_exact_nid - allocate an exact number of physically-contiguous | ||
2360 | * pages on a node. | ||
2361 | * @nid: the preferred node ID where memory should be allocated | ||
2362 | * @size: the number of bytes to allocate | ||
2363 | * @gfp_mask: GFP flags for the allocation | ||
2364 | * | ||
2365 | * Like alloc_pages_exact(), but try to allocate on node nid first before falling | ||
2366 | * back. | ||
2367 | * Note this is not alloc_pages_exact_node() which allocates on a specific node, | ||
2368 | * but is not exact. | ||
2369 | */ | ||
2370 | void *alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask) | ||
2371 | { | ||
2372 | unsigned order = get_order(size); | ||
2373 | struct page *p = alloc_pages_node(nid, gfp_mask, order); | ||
2374 | if (!p) | ||
2375 | return NULL; | ||
2376 | return make_alloc_exact((unsigned long)page_address(p), order, size); | ||
2377 | } | ||
2378 | EXPORT_SYMBOL(alloc_pages_exact_nid); | ||
2379 | |||
2380 | /** | ||
2355 | * free_pages_exact - release memory allocated via alloc_pages_exact() | 2381 | * free_pages_exact - release memory allocated via alloc_pages_exact() |
2356 | * @virt: the value returned by alloc_pages_exact. | 2382 | * @virt: the value returned by alloc_pages_exact. |
2357 | * @size: size of allocation, same value as passed to alloc_pages_exact(). | 2383 | * @size: size of allocation, same value as passed to alloc_pages_exact(). |
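make_alloc_exact(), factored out above, rounds the request up to a power-of-two order, split_page()s the block and frees every page past PAGE_ALIGN(size); the new alloc_pages_exact_nid() reuses it for node-preferred allocations. The arithmetic, shown standalone under an assumed 4 KiB page size (order_for() is only a stand-in for the kernel's get_order()):

#include <stdio.h>

#define PAGE_SHIFT 12			/* assume 4 KiB pages for the example */
#define PAGE_SIZE  (1UL << PAGE_SHIFT)

/* Smallest order such that (PAGE_SIZE << order) >= size, like get_order(). */
static unsigned int order_for(size_t size)
{
	unsigned int order = 0;
	while ((PAGE_SIZE << order) < size)
		order++;
	return order;
}

int main(void)
{
	size_t size = 5 * PAGE_SIZE;			/* caller asks for 5 pages */
	unsigned int order = order_for(size);		/* order 3 -> 8 pages allocated */
	unsigned long allocated = 1UL << order;
	unsigned long used = (size + PAGE_SIZE - 1) / PAGE_SIZE;  /* PAGE_ALIGN(size) in pages */

	/* make_alloc_exact() splits the block and free_page()s the tail one by one. */
	printf("allocated %lu pages, keep %lu, free %lu tail pages\n",
	       allocated, used, allocated - used);	/* 8, 5, 3 */
	return 0;
}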
@@ -3564,7 +3590,7 @@ int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) | |||
3564 | 3590 | ||
3565 | if (!slab_is_available()) { | 3591 | if (!slab_is_available()) { |
3566 | zone->wait_table = (wait_queue_head_t *) | 3592 | zone->wait_table = (wait_queue_head_t *) |
3567 | alloc_bootmem_node(pgdat, alloc_size); | 3593 | alloc_bootmem_node_nopanic(pgdat, alloc_size); |
3568 | } else { | 3594 | } else { |
3569 | /* | 3595 | /* |
3570 | * This case means that a zone whose size was 0 gets new memory | 3596 | * This case means that a zone whose size was 0 gets new memory |
@@ -4141,7 +4167,8 @@ static void __init setup_usemap(struct pglist_data *pgdat, | |||
4141 | unsigned long usemapsize = usemap_size(zonesize); | 4167 | unsigned long usemapsize = usemap_size(zonesize); |
4142 | zone->pageblock_flags = NULL; | 4168 | zone->pageblock_flags = NULL; |
4143 | if (usemapsize) | 4169 | if (usemapsize) |
4144 | zone->pageblock_flags = alloc_bootmem_node(pgdat, usemapsize); | 4170 | zone->pageblock_flags = alloc_bootmem_node_nopanic(pgdat, |
4171 | usemapsize); | ||
4145 | } | 4172 | } |
4146 | #else | 4173 | #else |
4147 | static inline void setup_usemap(struct pglist_data *pgdat, | 4174 | static inline void setup_usemap(struct pglist_data *pgdat, |
@@ -4307,7 +4334,7 @@ static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat) | |||
4307 | size = (end - start) * sizeof(struct page); | 4334 | size = (end - start) * sizeof(struct page); |
4308 | map = alloc_remap(pgdat->node_id, size); | 4335 | map = alloc_remap(pgdat->node_id, size); |
4309 | if (!map) | 4336 | if (!map) |
4310 | map = alloc_bootmem_node(pgdat, size); | 4337 | map = alloc_bootmem_node_nopanic(pgdat, size); |
4311 | pgdat->node_mem_map = map + (pgdat->node_start_pfn - start); | 4338 | pgdat->node_mem_map = map + (pgdat->node_start_pfn - start); |
4312 | } | 4339 | } |
4313 | #ifndef CONFIG_NEED_MULTIPLE_NODES | 4340 | #ifndef CONFIG_NEED_MULTIPLE_NODES |
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c index 99055010cece..2daadc322ba6 100644 --- a/mm/page_cgroup.c +++ b/mm/page_cgroup.c | |||
@@ -134,7 +134,7 @@ static void *__init_refok alloc_page_cgroup(size_t size, int nid) | |||
134 | { | 134 | { |
135 | void *addr = NULL; | 135 | void *addr = NULL; |
136 | 136 | ||
137 | addr = alloc_pages_exact(size, GFP_KERNEL | __GFP_NOWARN); | 137 | addr = alloc_pages_exact_nid(nid, size, GFP_KERNEL | __GFP_NOWARN); |
138 | if (addr) | 138 | if (addr) |
139 | return addr; | 139 | return addr; |
140 | 140 | ||
diff --git a/mm/shmem.c b/mm/shmem.c index 8fa27e4e582a..dfc7069102ee 100644 --- a/mm/shmem.c +++ b/mm/shmem.c | |||
@@ -852,7 +852,7 @@ static inline int shmem_find_swp(swp_entry_t entry, swp_entry_t *dir, swp_entry_ | |||
852 | 852 | ||
853 | static int shmem_unuse_inode(struct shmem_inode_info *info, swp_entry_t entry, struct page *page) | 853 | static int shmem_unuse_inode(struct shmem_inode_info *info, swp_entry_t entry, struct page *page) |
854 | { | 854 | { |
855 | struct inode *inode; | 855 | struct address_space *mapping; |
856 | unsigned long idx; | 856 | unsigned long idx; |
857 | unsigned long size; | 857 | unsigned long size; |
858 | unsigned long limit; | 858 | unsigned long limit; |
@@ -875,8 +875,10 @@ static int shmem_unuse_inode(struct shmem_inode_info *info, swp_entry_t entry, s | |||
875 | if (size > SHMEM_NR_DIRECT) | 875 | if (size > SHMEM_NR_DIRECT) |
876 | size = SHMEM_NR_DIRECT; | 876 | size = SHMEM_NR_DIRECT; |
877 | offset = shmem_find_swp(entry, ptr, ptr+size); | 877 | offset = shmem_find_swp(entry, ptr, ptr+size); |
878 | if (offset >= 0) | 878 | if (offset >= 0) { |
879 | shmem_swp_balance_unmap(); | ||
879 | goto found; | 880 | goto found; |
881 | } | ||
880 | if (!info->i_indirect) | 882 | if (!info->i_indirect) |
881 | goto lost2; | 883 | goto lost2; |
882 | 884 | ||
@@ -914,11 +916,11 @@ static int shmem_unuse_inode(struct shmem_inode_info *info, swp_entry_t entry, s | |||
914 | if (size > ENTRIES_PER_PAGE) | 916 | if (size > ENTRIES_PER_PAGE) |
915 | size = ENTRIES_PER_PAGE; | 917 | size = ENTRIES_PER_PAGE; |
916 | offset = shmem_find_swp(entry, ptr, ptr+size); | 918 | offset = shmem_find_swp(entry, ptr, ptr+size); |
917 | shmem_swp_unmap(ptr); | ||
918 | if (offset >= 0) { | 919 | if (offset >= 0) { |
919 | shmem_dir_unmap(dir); | 920 | shmem_dir_unmap(dir); |
920 | goto found; | 921 | goto found; |
921 | } | 922 | } |
923 | shmem_swp_unmap(ptr); | ||
922 | } | 924 | } |
923 | } | 925 | } |
924 | lost1: | 926 | lost1: |
@@ -928,8 +930,7 @@ lost2: | |||
928 | return 0; | 930 | return 0; |
929 | found: | 931 | found: |
930 | idx += offset; | 932 | idx += offset; |
931 | inode = igrab(&info->vfs_inode); | 933 | ptr += offset; |
932 | spin_unlock(&info->lock); | ||
933 | 934 | ||
934 | /* | 935 | /* |
935 | * Move _head_ to start search for next from here. | 936 | * Move _head_ to start search for next from here. |
@@ -940,37 +941,18 @@ found: | |||
940 | */ | 941 | */ |
941 | if (shmem_swaplist.next != &info->swaplist) | 942 | if (shmem_swaplist.next != &info->swaplist) |
942 | list_move_tail(&shmem_swaplist, &info->swaplist); | 943 | list_move_tail(&shmem_swaplist, &info->swaplist); |
943 | mutex_unlock(&shmem_swaplist_mutex); | ||
944 | 944 | ||
945 | error = 1; | ||
946 | if (!inode) | ||
947 | goto out; | ||
948 | /* | 945 | /* |
949 | * Charge page using GFP_KERNEL while we can wait. | 946 | * We rely on shmem_swaplist_mutex, not only to protect the swaplist, |
950 | * Charged back to the user(not to caller) when swap account is used. | 947 | * but also to hold up shmem_evict_inode(): so inode cannot be freed |
951 | * add_to_page_cache() will be called with GFP_NOWAIT. | 948 | * beneath us (pagelock doesn't help until the page is in pagecache). |
952 | */ | 949 | */ |
953 | error = mem_cgroup_cache_charge(page, current->mm, GFP_KERNEL); | 950 | mapping = info->vfs_inode.i_mapping; |
954 | if (error) | 951 | error = add_to_page_cache_locked(page, mapping, idx, GFP_NOWAIT); |
955 | goto out; | 952 | /* which does mem_cgroup_uncharge_cache_page on error */ |
956 | error = radix_tree_preload(GFP_KERNEL); | ||
957 | if (error) { | ||
958 | mem_cgroup_uncharge_cache_page(page); | ||
959 | goto out; | ||
960 | } | ||
961 | error = 1; | ||
962 | |||
963 | spin_lock(&info->lock); | ||
964 | ptr = shmem_swp_entry(info, idx, NULL); | ||
965 | if (ptr && ptr->val == entry.val) { | ||
966 | error = add_to_page_cache_locked(page, inode->i_mapping, | ||
967 | idx, GFP_NOWAIT); | ||
968 | /* does mem_cgroup_uncharge_cache_page on error */ | ||
969 | } else /* we must compensate for our precharge above */ | ||
970 | mem_cgroup_uncharge_cache_page(page); | ||
971 | 953 | ||
972 | if (error == -EEXIST) { | 954 | if (error == -EEXIST) { |
973 | struct page *filepage = find_get_page(inode->i_mapping, idx); | 955 | struct page *filepage = find_get_page(mapping, idx); |
974 | error = 1; | 956 | error = 1; |
975 | if (filepage) { | 957 | if (filepage) { |
976 | /* | 958 | /* |
@@ -990,14 +972,8 @@ found: | |||
990 | swap_free(entry); | 972 | swap_free(entry); |
991 | error = 1; /* not an error, but entry was found */ | 973 | error = 1; /* not an error, but entry was found */ |
992 | } | 974 | } |
993 | if (ptr) | 975 | shmem_swp_unmap(ptr); |
994 | shmem_swp_unmap(ptr); | ||
995 | spin_unlock(&info->lock); | 976 | spin_unlock(&info->lock); |
996 | radix_tree_preload_end(); | ||
997 | out: | ||
998 | unlock_page(page); | ||
999 | page_cache_release(page); | ||
1000 | iput(inode); /* allows for NULL */ | ||
1001 | return error; | 977 | return error; |
1002 | } | 978 | } |
1003 | 979 | ||
@@ -1009,6 +985,26 @@ int shmem_unuse(swp_entry_t entry, struct page *page) | |||
1009 | struct list_head *p, *next; | 985 | struct list_head *p, *next; |
1010 | struct shmem_inode_info *info; | 986 | struct shmem_inode_info *info; |
1011 | int found = 0; | 987 | int found = 0; |
988 | int error; | ||
989 | |||
990 | /* | ||
991 | * Charge page using GFP_KERNEL while we can wait, before taking | ||
992 | * the shmem_swaplist_mutex which might hold up shmem_writepage(). | ||
993 | * Charged back to the user (not to caller) when swap account is used. | ||
994 | * add_to_page_cache() will be called with GFP_NOWAIT. | ||
995 | */ | ||
996 | error = mem_cgroup_cache_charge(page, current->mm, GFP_KERNEL); | ||
997 | if (error) | ||
998 | goto out; | ||
999 | /* | ||
1000 | * Try to preload while we can wait, to not make a habit of | ||
1001 | * draining atomic reserves; but don't latch on to this cpu, | ||
1002 | * it's okay if sometimes we get rescheduled after this. | ||
1003 | */ | ||
1004 | error = radix_tree_preload(GFP_KERNEL); | ||
1005 | if (error) | ||
1006 | goto uncharge; | ||
1007 | radix_tree_preload_end(); | ||
1012 | 1008 | ||
1013 | mutex_lock(&shmem_swaplist_mutex); | 1009 | mutex_lock(&shmem_swaplist_mutex); |
1014 | list_for_each_safe(p, next, &shmem_swaplist) { | 1010 | list_for_each_safe(p, next, &shmem_swaplist) { |
@@ -1016,17 +1012,19 @@ int shmem_unuse(swp_entry_t entry, struct page *page) | |||
1016 | found = shmem_unuse_inode(info, entry, page); | 1012 | found = shmem_unuse_inode(info, entry, page); |
1017 | cond_resched(); | 1013 | cond_resched(); |
1018 | if (found) | 1014 | if (found) |
1019 | goto out; | 1015 | break; |
1020 | } | 1016 | } |
1021 | mutex_unlock(&shmem_swaplist_mutex); | 1017 | mutex_unlock(&shmem_swaplist_mutex); |
1022 | /* | 1018 | |
1023 | * Can some race bring us here? We've been holding page lock, | 1019 | uncharge: |
1024 | * so I think not; but would rather try again later than BUG() | 1020 | if (!found) |
1025 | */ | 1021 | mem_cgroup_uncharge_cache_page(page); |
1022 | if (found < 0) | ||
1023 | error = found; | ||
1024 | out: | ||
1026 | unlock_page(page); | 1025 | unlock_page(page); |
1027 | page_cache_release(page); | 1026 | page_cache_release(page); |
1028 | out: | 1027 | return error; |
1029 | return (found < 0) ? found : 0; | ||
1030 | } | 1028 | } |
1031 | 1029 | ||
1032 | /* | 1030 | /* |
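shmem_unuse() now does the work that can sleep, the mem_cgroup charge and the radix-tree preload with GFP_KERNEL, before taking shmem_swaplist_mutex, and unwinds through goto labels, uncharging only when no inode ended up taking the page. A generic sketch of that prepare-then-lock shape with the same unwind structure; every name here is invented for illustration:

#include <pthread.h>

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;	/* shmem_swaplist_mutex role */

static int charge(void)       { return 0; }	/* stand-in for mem_cgroup_cache_charge(GFP_KERNEL) */
static void drop_charge(void) { }		/* stand-in for mem_cgroup_uncharge_cache_page() */
static int preload(void)      { return 0; }	/* stand-in for radix_tree_preload(GFP_KERNEL) */
static int scan_list(void)    { return 1; }	/* stand-in for the swaplist walk; nonzero = found */

static int unuse(void)
{
	int found = 0;
	int error;

	/* Sleeping preparations first, before the mutex that writers
	 * (shmem_writepage) may be waiting on. */
	error = charge();
	if (error)
		goto out;
	error = preload();
	if (error)
		goto uncharge;

	pthread_mutex_lock(&list_lock);
	found = scan_list();		/* only short, non-sleeping work under the lock */
	pthread_mutex_unlock(&list_lock);

uncharge:
	if (!found)
		drop_charge();		/* nobody took ownership of the charge */
	if (found < 0)
		error = found;
out:
	return error;
}

int main(void)
{
	return unuse();
}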
@@ -1064,7 +1062,25 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) | |||
1064 | else | 1062 | else |
1065 | swap.val = 0; | 1063 | swap.val = 0; |
1066 | 1064 | ||
1065 | /* | ||
1066 | * Add inode to shmem_unuse()'s list of swapped-out inodes, | ||
1067 | * if it's not already there. Do it now because we cannot take | ||
1068 | * mutex while holding spinlock, and must do so before the page | ||
1069 | * is moved to swap cache, when its pagelock no longer protects | ||
1070 | * the inode from eviction. But don't unlock the mutex until | ||
1071 | * we've taken the spinlock, because shmem_unuse_inode() will | ||
1072 | * prune a !swapped inode from the swaplist under both locks. | ||
1073 | */ | ||
1074 | if (swap.val) { | ||
1075 | mutex_lock(&shmem_swaplist_mutex); | ||
1076 | if (list_empty(&info->swaplist)) | ||
1077 | list_add_tail(&info->swaplist, &shmem_swaplist); | ||
1078 | } | ||
1079 | |||
1067 | spin_lock(&info->lock); | 1080 | spin_lock(&info->lock); |
1081 | if (swap.val) | ||
1082 | mutex_unlock(&shmem_swaplist_mutex); | ||
1083 | |||
1068 | if (index >= info->next_index) { | 1084 | if (index >= info->next_index) { |
1069 | BUG_ON(!(info->flags & SHMEM_TRUNCATE)); | 1085 | BUG_ON(!(info->flags & SHMEM_TRUNCATE)); |
1070 | goto unlock; | 1086 | goto unlock; |
@@ -1084,21 +1100,10 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) | |||
1084 | delete_from_page_cache(page); | 1100 | delete_from_page_cache(page); |
1085 | shmem_swp_set(info, entry, swap.val); | 1101 | shmem_swp_set(info, entry, swap.val); |
1086 | shmem_swp_unmap(entry); | 1102 | shmem_swp_unmap(entry); |
1087 | if (list_empty(&info->swaplist)) | ||
1088 | inode = igrab(inode); | ||
1089 | else | ||
1090 | inode = NULL; | ||
1091 | spin_unlock(&info->lock); | 1103 | spin_unlock(&info->lock); |
1092 | swap_shmem_alloc(swap); | 1104 | swap_shmem_alloc(swap); |
1093 | BUG_ON(page_mapped(page)); | 1105 | BUG_ON(page_mapped(page)); |
1094 | swap_writepage(page, wbc); | 1106 | swap_writepage(page, wbc); |
1095 | if (inode) { | ||
1096 | mutex_lock(&shmem_swaplist_mutex); | ||
1097 | /* move instead of add in case we're racing */ | ||
1098 | list_move_tail(&info->swaplist, &shmem_swaplist); | ||
1099 | mutex_unlock(&shmem_swaplist_mutex); | ||
1100 | iput(inode); | ||
1101 | } | ||
1102 | return 0; | 1107 | return 0; |
1103 | } | 1108 | } |
1104 | 1109 | ||
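The shmem_writepage() hunk above registers the inode on the swaplist under shmem_swaplist_mutex, then takes info->lock and only afterwards drops the mutex, exactly as the added comment explains: shmem_unuse_inode() prunes unswapped inodes while holding both locks, so it can never see the inode in a half-registered state. A generic hand-over-hand sketch of that ordering; pthread mutexes stand in for the kernel's mutex and spinlock, and all names are illustrative:

#include <pthread.h>

static pthread_mutex_t outer = PTHREAD_MUTEX_INITIALIZER;  /* shmem_swaplist_mutex role */
static pthread_mutex_t inner = PTHREAD_MUTEX_INITIALIZER;  /* info->lock role (a spinlock in the kernel) */
static int on_list;					    /* "inode is on the swaplist" */
static int swapped;					    /* "page has been handed to swap" */

static void writepage_side(void)
{
	pthread_mutex_lock(&outer);
	if (!on_list)
		on_list = 1;		/* list_add_tail(&info->swaplist, &shmem_swaplist) */
	pthread_mutex_lock(&inner);
	pthread_mutex_unlock(&outer);	/* only after the inner lock is held */

	swapped = 1;			/* update swap state under the inner lock */

	pthread_mutex_unlock(&inner);
}

/* The pruning side (shmem_unuse_inode) takes outer, then inner, before it may
 * drop an unswapped inode from the list, so it observes either "not on the
 * list yet" or the fully updated state, never the window in between. */

int main(void)
{
	writepage_side();
	return 0;
}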
@@ -1400,20 +1405,14 @@ repeat: | |||
1400 | if (sbinfo->max_blocks) { | 1405 | if (sbinfo->max_blocks) { |
1401 | if (percpu_counter_compare(&sbinfo->used_blocks, | 1406 | if (percpu_counter_compare(&sbinfo->used_blocks, |
1402 | sbinfo->max_blocks) >= 0 || | 1407 | sbinfo->max_blocks) >= 0 || |
1403 | shmem_acct_block(info->flags)) { | 1408 | shmem_acct_block(info->flags)) |
1404 | spin_unlock(&info->lock); | 1409 | goto nospace; |
1405 | error = -ENOSPC; | ||
1406 | goto failed; | ||
1407 | } | ||
1408 | percpu_counter_inc(&sbinfo->used_blocks); | 1410 | percpu_counter_inc(&sbinfo->used_blocks); |
1409 | spin_lock(&inode->i_lock); | 1411 | spin_lock(&inode->i_lock); |
1410 | inode->i_blocks += BLOCKS_PER_PAGE; | 1412 | inode->i_blocks += BLOCKS_PER_PAGE; |
1411 | spin_unlock(&inode->i_lock); | 1413 | spin_unlock(&inode->i_lock); |
1412 | } else if (shmem_acct_block(info->flags)) { | 1414 | } else if (shmem_acct_block(info->flags)) |
1413 | spin_unlock(&info->lock); | 1415 | goto nospace; |
1414 | error = -ENOSPC; | ||
1415 | goto failed; | ||
1416 | } | ||
1417 | 1416 | ||
1418 | if (!filepage) { | 1417 | if (!filepage) { |
1419 | int ret; | 1418 | int ret; |
@@ -1493,6 +1492,24 @@ done: | |||
1493 | error = 0; | 1492 | error = 0; |
1494 | goto out; | 1493 | goto out; |
1495 | 1494 | ||
1495 | nospace: | ||
1496 | /* | ||
1497 | * Perhaps the page was brought in from swap between find_lock_page | ||
1498 | * and taking info->lock? We allow for that at add_to_page_cache_lru, | ||
1499 | * but must also avoid reporting a spurious ENOSPC while working on a | ||
1500 | * full tmpfs. (When filepage has been passed in to shmem_getpage, it | ||
1501 | * is already in page cache, which prevents this race from occurring.) | ||
1502 | */ | ||
1503 | if (!filepage) { | ||
1504 | struct page *page = find_get_page(mapping, idx); | ||
1505 | if (page) { | ||
1506 | spin_unlock(&info->lock); | ||
1507 | page_cache_release(page); | ||
1508 | goto repeat; | ||
1509 | } | ||
1510 | } | ||
1511 | spin_unlock(&info->lock); | ||
1512 | error = -ENOSPC; | ||
1496 | failed: | 1513 | failed: |
1497 | if (*pagep != filepage) { | 1514 | if (*pagep != filepage) { |
1498 | unlock_page(filepage); | 1515 | unlock_page(filepage); |
diff --git a/mm/slub.c b/mm/slub.c --- a/mm/slub.c +++ b/mm/slub.c | |||
@@ -1940,7 +1940,7 @@ redo: | |||
1940 | * Since this is without lock semantics the protection is only against | 1940 | * Since this is without lock semantics the protection is only against |
1941 | * code executing on this cpu *not* from access by other cpus. | 1941 | * code executing on this cpu *not* from access by other cpus. |
1942 | */ | 1942 | */ |
1943 | if (unlikely(!this_cpu_cmpxchg_double( | 1943 | if (unlikely(!irqsafe_cpu_cmpxchg_double( |
1944 | s->cpu_slab->freelist, s->cpu_slab->tid, | 1944 | s->cpu_slab->freelist, s->cpu_slab->tid, |
1945 | object, tid, | 1945 | object, tid, |
1946 | get_freepointer(s, object), next_tid(tid)))) { | 1946 | get_freepointer(s, object), next_tid(tid)))) { |
@@ -2145,7 +2145,7 @@ redo: | |||
2145 | set_freepointer(s, object, c->freelist); | 2145 | set_freepointer(s, object, c->freelist); |
2146 | 2146 | ||
2147 | #ifdef CONFIG_CMPXCHG_LOCAL | 2147 | #ifdef CONFIG_CMPXCHG_LOCAL |
2148 | if (unlikely(!this_cpu_cmpxchg_double( | 2148 | if (unlikely(!irqsafe_cpu_cmpxchg_double( |
2149 | s->cpu_slab->freelist, s->cpu_slab->tid, | 2149 | s->cpu_slab->freelist, s->cpu_slab->tid, |
2150 | c->freelist, tid, | 2150 | c->freelist, tid, |
2151 | object, next_tid(tid)))) { | 2151 | object, next_tid(tid)))) { |
diff --git a/mm/swap.c b/mm/swap.c --- a/mm/swap.c +++ b/mm/swap.c | |||
@@ -396,6 +396,9 @@ static void lru_deactivate_fn(struct page *page, void *arg) | |||
396 | if (!PageLRU(page)) | 396 | if (!PageLRU(page)) |
397 | return; | 397 | return; |
398 | 398 | ||
399 | if (PageUnevictable(page)) | ||
400 | return; | ||
401 | |||
399 | /* Some processes are using the page */ | 402 | /* Some processes are using the page */ |
400 | if (page_mapped(page)) | 403 | if (page_mapped(page)) |
401 | return; | 404 | return; |
diff --git a/mm/vmscan.c b/mm/vmscan.c index f6b435c80079..8bfd45050a61 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
@@ -937,7 +937,7 @@ keep_lumpy: | |||
937 | * back off and wait for congestion to clear because further reclaim | 937 | * back off and wait for congestion to clear because further reclaim |
938 | * will encounter the same problem | 938 | * will encounter the same problem |
939 | */ | 939 | */ |
940 | if (nr_dirty == nr_congested && nr_dirty != 0) | 940 | if (nr_dirty && nr_dirty == nr_congested && scanning_global_lru(sc)) |
941 | zone_set_flag(zone, ZONE_CONGESTED); | 941 | zone_set_flag(zone, ZONE_CONGESTED); |
942 | 942 | ||
943 | free_page_list(&free_pages); | 943 | free_page_list(&free_pages); |