Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig | 25
-rw-r--r--  mm/bootmem.c | 8
-rw-r--r--  mm/bounce.c | 2
-rw-r--r--  mm/compaction.c | 14
-rw-r--r--  mm/filemap.c | 21
-rw-r--r--  mm/filemap_xip.c | 2
-rw-r--r--  mm/huge_memory.c | 404
-rw-r--r--  mm/hugetlb.c | 127
-rw-r--r--  mm/hwpoison-inject.c | 5
-rw-r--r--  mm/kmemleak.c | 4
-rw-r--r--  mm/ksm.c | 4
-rw-r--r--  mm/list_lru.c | 3
-rw-r--r--  mm/madvise.c | 5
-rw-r--r--  mm/memblock.c | 124
-rw-r--r--  mm/memcontrol.c | 931
-rw-r--r--  mm/memory-failure.c | 46
-rw-r--r--  mm/memory.c | 224
-rw-r--r--  mm/memory_hotplug.c | 65
-rw-r--r--  mm/mempolicy.c | 149
-rw-r--r--  mm/migrate.c | 67
-rw-r--r--  mm/mlock.c | 9
-rw-r--r--  mm/mm_init.c | 18
-rw-r--r--  mm/mmap.c | 23
-rw-r--r--  mm/mmzone.c | 14
-rw-r--r--  mm/mprotect.c | 76
-rw-r--r--  mm/mremap.c | 5
-rw-r--r--  mm/nobootmem.c | 25
-rw-r--r--  mm/nommu.c | 5
-rw-r--r--  mm/oom_kill.c | 8
-rw-r--r--  mm/page-writeback.c | 10
-rw-r--r--  mm/page_alloc.c | 42
-rw-r--r--  mm/pagewalk.c | 2
-rw-r--r--  mm/percpu.c | 5
-rw-r--r--  mm/pgtable-generic.c | 16
-rw-r--r--  mm/readahead.c | 8
-rw-r--r--  mm/rmap.c | 15
-rw-r--r--  mm/slab.c | 2
-rw-r--r--  mm/slab.h | 6
-rw-r--r--  mm/slab_common.c | 4
-rw-r--r--  mm/slub.c | 4
-rw-r--r--  mm/sparse.c | 53
-rw-r--r--  mm/swap.c | 3
-rw-r--r--  mm/swapfile.c | 20
-rw-r--r--  mm/util.c | 13
-rw-r--r--  mm/vmalloc.c | 48
-rw-r--r--  mm/vmscan.c | 88
-rw-r--r--  mm/vmstat.c | 22
-rw-r--r--  mm/zswap.c | 199
48 files changed, 1774 insertions, 1199 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 026771a9b097..eb69f352401d 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -20,7 +20,7 @@ config FLATMEM_MANUAL | |||
20 | 20 | ||
21 | Some users of more advanced features like NUMA and | 21 | Some users of more advanced features like NUMA and |
22 | memory hotplug may have different options here. | 22 | memory hotplug may have different options here. |
23 | DISCONTIGMEM is an more mature, better tested system, | 23 | DISCONTIGMEM is a more mature, better tested system, |
24 | but is incompatible with memory hotplug and may suffer | 24 | but is incompatible with memory hotplug and may suffer |
25 | decreased performance over SPARSEMEM. If unsure between | 25 | decreased performance over SPARSEMEM. If unsure between |
26 | "Sparse Memory" and "Discontiguous Memory", choose | 26 | "Sparse Memory" and "Discontiguous Memory", choose |
@@ -153,11 +153,18 @@ config MOVABLE_NODE | |||
153 | help | 153 | help |
154 | Allow a node to have only movable memory. Pages used by the kernel, | 154 | Allow a node to have only movable memory. Pages used by the kernel, |
155 | such as direct mapping pages cannot be migrated. So the corresponding | 155 | such as direct mapping pages cannot be migrated. So the corresponding |
156 | memory device cannot be hotplugged. This option allows users to | 156 | memory device cannot be hotplugged. This option allows the following |
157 | online all the memory of a node as movable memory so that the whole | 157 | two things: |
158 | node can be hotplugged. Users who don't use the memory hotplug | 158 | - When the system is booting, node full of hotpluggable memory can |
159 | feature are fine with this option on since they don't online memory | 159 | be arranged to have only movable memory so that the whole node can |
160 | as movable. | 160 | be hot-removed. (need movable_node boot option specified). |
161 | - After the system is up, the option allows users to online all the | ||
162 | memory of a node as movable memory so that the whole node can be | ||
163 | hot-removed. | ||
164 | |||
165 | Users who don't use the memory hotplug feature are fine with this | ||
166 | option on since they don't specify movable_node boot option or they | ||
167 | don't online memory as movable. | ||
161 | 168 | ||
162 | Say Y here if you want to hotplug a whole node. | 169 | Say Y here if you want to hotplug a whole node. |
163 | Say N here if you want kernel to use memory on all nodes evenly. | 170 | Say N here if you want kernel to use memory on all nodes evenly. |
@@ -183,7 +190,7 @@ config MEMORY_HOTPLUG_SPARSE | |||
183 | config MEMORY_HOTREMOVE | 190 | config MEMORY_HOTREMOVE |
184 | bool "Allow for memory hot remove" | 191 | bool "Allow for memory hot remove" |
185 | select MEMORY_ISOLATION | 192 | select MEMORY_ISOLATION |
186 | select HAVE_BOOTMEM_INFO_NODE if X86_64 | 193 | select HAVE_BOOTMEM_INFO_NODE if (X86_64 || PPC64) |
187 | depends on MEMORY_HOTPLUG && ARCH_ENABLE_MEMORY_HOTREMOVE | 194 | depends on MEMORY_HOTPLUG && ARCH_ENABLE_MEMORY_HOTREMOVE |
188 | depends on MIGRATION | 195 | depends on MIGRATION |
189 | 196 | ||
@@ -211,9 +218,11 @@ config SPLIT_PTLOCK_CPUS | |||
211 | int | 218 | int |
212 | default "999999" if ARM && !CPU_CACHE_VIPT | 219 | default "999999" if ARM && !CPU_CACHE_VIPT |
213 | default "999999" if PARISC && !PA20 | 220 | default "999999" if PARISC && !PA20 |
214 | default "999999" if DEBUG_SPINLOCK || DEBUG_LOCK_ALLOC | ||
215 | default "4" | 221 | default "4" |
216 | 222 | ||
223 | config ARCH_ENABLE_SPLIT_PMD_PTLOCK | ||
224 | boolean | ||
225 | |||
217 | # | 226 | # |
218 | # support for memory balloon compaction | 227 | # support for memory balloon compaction |
219 | config BALLOON_COMPACTION | 228 | config BALLOON_COMPACTION |
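The new ARCH_ENABLE_SPLIT_PMD_PTLOCK symbol is only an architecture opt-in; the huge_memory.c changes further down key off a derived macro rather than testing the config option directly. A minimal sketch of how that macro is expected to be defined (in include/linux/mm_types.h, outside the mm/ diff shown here; quoted from memory, so treat it as an assumption):

/* Assumed definitions, not part of the hunks above: PTE-level split
 * ptlocks are used on large enough SMP builds, and PMD-level split
 * ptlocks additionally require the architecture opt-in. */
#define USE_SPLIT_PTE_PTLOCKS	(NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS)
#define USE_SPLIT_PMD_PTLOCKS	(USE_SPLIT_PTE_PTLOCKS && \
		IS_ENABLED(CONFIG_ARCH_ENABLE_SPLIT_PMD_PTLOCK))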
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 6ab7744e692e..90bd3507b413 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -172,11 +172,12 @@ void __init free_bootmem_late(unsigned long physaddr, unsigned long size) | |||
172 | static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) | 172 | static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) |
173 | { | 173 | { |
174 | struct page *page; | 174 | struct page *page; |
175 | unsigned long start, end, pages, count = 0; | 175 | unsigned long *map, start, end, pages, count = 0; |
176 | 176 | ||
177 | if (!bdata->node_bootmem_map) | 177 | if (!bdata->node_bootmem_map) |
178 | return 0; | 178 | return 0; |
179 | 179 | ||
180 | map = bdata->node_bootmem_map; | ||
180 | start = bdata->node_min_pfn; | 181 | start = bdata->node_min_pfn; |
181 | end = bdata->node_low_pfn; | 182 | end = bdata->node_low_pfn; |
182 | 183 | ||
@@ -184,10 +185,9 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) | |||
184 | bdata - bootmem_node_data, start, end); | 185 | bdata - bootmem_node_data, start, end); |
185 | 186 | ||
186 | while (start < end) { | 187 | while (start < end) { |
187 | unsigned long *map, idx, vec; | 188 | unsigned long idx, vec; |
188 | unsigned shift; | 189 | unsigned shift; |
189 | 190 | ||
190 | map = bdata->node_bootmem_map; | ||
191 | idx = start - bdata->node_min_pfn; | 191 | idx = start - bdata->node_min_pfn; |
192 | shift = idx & (BITS_PER_LONG - 1); | 192 | shift = idx & (BITS_PER_LONG - 1); |
193 | /* | 193 | /* |
@@ -784,7 +784,7 @@ void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size, | |||
784 | return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); | 784 | return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); |
785 | 785 | ||
786 | /* update goal according ...MAX_DMA32_PFN */ | 786 | /* update goal according ...MAX_DMA32_PFN */ |
787 | end_pfn = pgdat->node_start_pfn + pgdat->node_spanned_pages; | 787 | end_pfn = pgdat_end_pfn(pgdat); |
788 | 788 | ||
789 | if (end_pfn > MAX_DMA32_PFN + (128 >> (20 - PAGE_SHIFT)) && | 789 | if (end_pfn > MAX_DMA32_PFN + (128 >> (20 - PAGE_SHIFT)) && |
790 | (goal >> PAGE_SHIFT) < MAX_DMA32_PFN) { | 790 | (goal >> PAGE_SHIFT) < MAX_DMA32_PFN) { |
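For reference, the pgdat_end_pfn() helper that replaces the open-coded sum in __alloc_bootmem_node_high() is expected to be the same arithmetic, just given a name (definition recalled from include/linux/mmzone.h; treat the exact body as an assumption):

/* Assumed helper; equivalent to node_start_pfn + node_spanned_pages,
 * which is exactly the expression the hunk above removes. */
static inline unsigned long pgdat_end_pfn(pg_data_t *pgdat)
{
	return pgdat->node_start_pfn + pgdat->node_spanned_pages;
}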
diff --git a/mm/bounce.c b/mm/bounce.c
index c9f0a4339a7d..5a7d58fb883b 100644
--- a/mm/bounce.c
+++ b/mm/bounce.c
@@ -204,6 +204,8 @@ static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig, | |||
204 | struct bio_vec *to, *from; | 204 | struct bio_vec *to, *from; |
205 | unsigned i; | 205 | unsigned i; |
206 | 206 | ||
207 | if (force) | ||
208 | goto bounce; | ||
207 | bio_for_each_segment(from, *bio_orig, i) | 209 | bio_for_each_segment(from, *bio_orig, i) |
208 | if (page_to_pfn(from->bv_page) > queue_bounce_pfn(q)) | 210 | if (page_to_pfn(from->bv_page) > queue_bounce_pfn(q)) |
209 | goto bounce; | 211 | goto bounce; |
diff --git a/mm/compaction.c b/mm/compaction.c
index c43789388cd8..805165bcd3dd 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -235,10 +235,9 @@ static bool suitable_migration_target(struct page *page) | |||
235 | } | 235 | } |
236 | 236 | ||
237 | /* | 237 | /* |
238 | * Isolate free pages onto a private freelist. Caller must hold zone->lock. | 238 | * Isolate free pages onto a private freelist. If @strict is true, will abort |
239 | * If @strict is true, will abort returning 0 on any invalid PFNs or non-free | 239 | * returning 0 on any invalid PFNs or non-free pages inside of the pageblock |
240 | * pages inside of the pageblock (even though it may still end up isolating | 240 | * (even though it may still end up isolating some pages). |
241 | * some pages). | ||
242 | */ | 241 | */ |
243 | static unsigned long isolate_freepages_block(struct compact_control *cc, | 242 | static unsigned long isolate_freepages_block(struct compact_control *cc, |
244 | unsigned long blockpfn, | 243 | unsigned long blockpfn, |
@@ -677,6 +676,13 @@ static void isolate_freepages(struct zone *zone, | |||
677 | pfn -= pageblock_nr_pages) { | 676 | pfn -= pageblock_nr_pages) { |
678 | unsigned long isolated; | 677 | unsigned long isolated; |
679 | 678 | ||
679 | /* | ||
680 | * This can iterate a massively long zone without finding any | ||
681 | * suitable migration targets, so periodically check if we need | ||
682 | * to schedule. | ||
683 | */ | ||
684 | cond_resched(); | ||
685 | |||
680 | if (!pfn_valid(pfn)) | 686 | if (!pfn_valid(pfn)) |
681 | continue; | 687 | continue; |
682 | 688 | ||
diff --git a/mm/filemap.c b/mm/filemap.c
index 1e6aec4a2d2e..b7749a92021c 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1090,7 +1090,6 @@ static void shrink_readahead_size_eio(struct file *filp, | |||
1090 | * @filp: the file to read | 1090 | * @filp: the file to read |
1091 | * @ppos: current file position | 1091 | * @ppos: current file position |
1092 | * @desc: read_descriptor | 1092 | * @desc: read_descriptor |
1093 | * @actor: read method | ||
1094 | * | 1093 | * |
1095 | * This is a generic file read routine, and uses the | 1094 | * This is a generic file read routine, and uses the |
1096 | * mapping->a_ops->readpage() function for the actual low-level stuff. | 1095 | * mapping->a_ops->readpage() function for the actual low-level stuff. |
@@ -1099,7 +1098,7 @@ static void shrink_readahead_size_eio(struct file *filp, | |||
1099 | * of the logic when it comes to error handling etc. | 1098 | * of the logic when it comes to error handling etc. |
1100 | */ | 1099 | */ |
1101 | static void do_generic_file_read(struct file *filp, loff_t *ppos, | 1100 | static void do_generic_file_read(struct file *filp, loff_t *ppos, |
1102 | read_descriptor_t *desc, read_actor_t actor) | 1101 | read_descriptor_t *desc) |
1103 | { | 1102 | { |
1104 | struct address_space *mapping = filp->f_mapping; | 1103 | struct address_space *mapping = filp->f_mapping; |
1105 | struct inode *inode = mapping->host; | 1104 | struct inode *inode = mapping->host; |
@@ -1200,13 +1199,14 @@ page_ok: | |||
1200 | * Ok, we have the page, and it's up-to-date, so | 1199 | * Ok, we have the page, and it's up-to-date, so |
1201 | * now we can copy it to user space... | 1200 | * now we can copy it to user space... |
1202 | * | 1201 | * |
1203 | * The actor routine returns how many bytes were actually used.. | 1202 | * The file_read_actor routine returns how many bytes were |
1203 | * actually used.. | ||
1204 | * NOTE! This may not be the same as how much of a user buffer | 1204 | * NOTE! This may not be the same as how much of a user buffer |
1205 | * we filled up (we may be padding etc), so we can only update | 1205 | * we filled up (we may be padding etc), so we can only update |
1206 | * "pos" here (the actor routine has to update the user buffer | 1206 | * "pos" here (the actor routine has to update the user buffer |
1207 | * pointers and the remaining count). | 1207 | * pointers and the remaining count). |
1208 | */ | 1208 | */ |
1209 | ret = actor(desc, page, offset, nr); | 1209 | ret = file_read_actor(desc, page, offset, nr); |
1210 | offset += ret; | 1210 | offset += ret; |
1211 | index += offset >> PAGE_CACHE_SHIFT; | 1211 | index += offset >> PAGE_CACHE_SHIFT; |
1212 | offset &= ~PAGE_CACHE_MASK; | 1212 | offset &= ~PAGE_CACHE_MASK; |
@@ -1479,7 +1479,7 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, | |||
1479 | if (desc.count == 0) | 1479 | if (desc.count == 0) |
1480 | continue; | 1480 | continue; |
1481 | desc.error = 0; | 1481 | desc.error = 0; |
1482 | do_generic_file_read(filp, ppos, &desc, file_read_actor); | 1482 | do_generic_file_read(filp, ppos, &desc); |
1483 | retval += desc.written; | 1483 | retval += desc.written; |
1484 | if (desc.error) { | 1484 | if (desc.error) { |
1485 | retval = retval ?: desc.error; | 1485 | retval = retval ?: desc.error; |
@@ -1616,7 +1616,6 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
1616 | struct inode *inode = mapping->host; | 1616 | struct inode *inode = mapping->host; |
1617 | pgoff_t offset = vmf->pgoff; | 1617 | pgoff_t offset = vmf->pgoff; |
1618 | struct page *page; | 1618 | struct page *page; |
1619 | bool memcg_oom; | ||
1620 | pgoff_t size; | 1619 | pgoff_t size; |
1621 | int ret = 0; | 1620 | int ret = 0; |
1622 | 1621 | ||
@@ -1625,11 +1624,7 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
1625 | return VM_FAULT_SIGBUS; | 1624 | return VM_FAULT_SIGBUS; |
1626 | 1625 | ||
1627 | /* | 1626 | /* |
1628 | * Do we have something in the page cache already? Either | 1627 | * Do we have something in the page cache already? |
1629 | * way, try readahead, but disable the memcg OOM killer for it | ||
1630 | * as readahead is optional and no errors are propagated up | ||
1631 | * the fault stack. The OOM killer is enabled while trying to | ||
1632 | * instantiate the faulting page individually below. | ||
1633 | */ | 1628 | */ |
1634 | page = find_get_page(mapping, offset); | 1629 | page = find_get_page(mapping, offset); |
1635 | if (likely(page) && !(vmf->flags & FAULT_FLAG_TRIED)) { | 1630 | if (likely(page) && !(vmf->flags & FAULT_FLAG_TRIED)) { |
@@ -1637,14 +1632,10 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
1637 | * We found the page, so try async readahead before | 1632 | * We found the page, so try async readahead before |
1638 | * waiting for the lock. | 1633 | * waiting for the lock. |
1639 | */ | 1634 | */ |
1640 | memcg_oom = mem_cgroup_toggle_oom(false); | ||
1641 | do_async_mmap_readahead(vma, ra, file, page, offset); | 1635 | do_async_mmap_readahead(vma, ra, file, page, offset); |
1642 | mem_cgroup_toggle_oom(memcg_oom); | ||
1643 | } else if (!page) { | 1636 | } else if (!page) { |
1644 | /* No page in the page cache at all */ | 1637 | /* No page in the page cache at all */ |
1645 | memcg_oom = mem_cgroup_toggle_oom(false); | ||
1646 | do_sync_mmap_readahead(vma, ra, file, offset); | 1638 | do_sync_mmap_readahead(vma, ra, file, offset); |
1647 | mem_cgroup_toggle_oom(memcg_oom); | ||
1648 | count_vm_event(PGMAJFAULT); | 1639 | count_vm_event(PGMAJFAULT); |
1649 | mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); | 1640 | mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); |
1650 | ret = VM_FAULT_MAJOR; | 1641 | ret = VM_FAULT_MAJOR; |
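do_generic_file_read() can drop its actor argument because generic_file_aio_read() is its only remaining caller and always passed file_read_actor. For context, a sketch of the descriptor type and actor prototype the hard-wired call relies on (recalled from include/linux/fs.h of this era; the exact field layout is an assumption):

/* Assumed declarations backing the file_read_actor() call above. */
typedef struct {
	size_t written;			/* bytes copied out so far   */
	size_t count;			/* bytes still requested     */
	union {
		char __user *buf;	/* destination user buffer   */
		void *data;
	} arg;
	int error;			/* sticky error for the caller */
} read_descriptor_t;

int file_read_actor(read_descriptor_t *desc, struct page *page,
		    unsigned long offset, unsigned long size);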
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index 28fe26b64f8a..d8d9fe3f685c 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -26,7 +26,7 @@ | |||
26 | * of ZERO_PAGE(), such as /dev/zero | 26 | * of ZERO_PAGE(), such as /dev/zero |
27 | */ | 27 | */ |
28 | static DEFINE_MUTEX(xip_sparse_mutex); | 28 | static DEFINE_MUTEX(xip_sparse_mutex); |
29 | static seqcount_t xip_sparse_seq = SEQCNT_ZERO; | 29 | static seqcount_t xip_sparse_seq = SEQCNT_ZERO(xip_sparse_seq); |
30 | static struct page *__xip_sparse_page; | 30 | static struct page *__xip_sparse_page; |
31 | 31 | ||
32 | /* called under xip_sparse_mutex */ | 32 | /* called under xip_sparse_mutex */ |
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 7489884682d8..bccd5a628ea6 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -27,11 +27,12 @@ | |||
27 | #include "internal.h" | 27 | #include "internal.h" |
28 | 28 | ||
29 | /* | 29 | /* |
30 | * By default transparent hugepage support is enabled for all mappings | 30 | * By default transparent hugepage support is disabled in order that avoid |
31 | * and khugepaged scans all mappings. Defrag is only invoked by | 31 | * to risk increase the memory footprint of applications without a guaranteed |
32 | * khugepaged hugepage allocations and by page faults inside | 32 | * benefit. When transparent hugepage support is enabled, is for all mappings, |
33 | * MADV_HUGEPAGE regions to avoid the risk of slowing down short lived | 33 | * and khugepaged scans all mappings. |
34 | * allocations. | 34 | * Defrag is invoked by khugepaged hugepage allocations and by page faults |
35 | * for all hugepage allocations. | ||
35 | */ | 36 | */ |
36 | unsigned long transparent_hugepage_flags __read_mostly = | 37 | unsigned long transparent_hugepage_flags __read_mostly = |
37 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS | 38 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS |
@@ -709,6 +710,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, | |||
709 | struct page *page) | 710 | struct page *page) |
710 | { | 711 | { |
711 | pgtable_t pgtable; | 712 | pgtable_t pgtable; |
713 | spinlock_t *ptl; | ||
712 | 714 | ||
713 | VM_BUG_ON(!PageCompound(page)); | 715 | VM_BUG_ON(!PageCompound(page)); |
714 | pgtable = pte_alloc_one(mm, haddr); | 716 | pgtable = pte_alloc_one(mm, haddr); |
@@ -723,9 +725,9 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, | |||
723 | */ | 725 | */ |
724 | __SetPageUptodate(page); | 726 | __SetPageUptodate(page); |
725 | 727 | ||
726 | spin_lock(&mm->page_table_lock); | 728 | ptl = pmd_lock(mm, pmd); |
727 | if (unlikely(!pmd_none(*pmd))) { | 729 | if (unlikely(!pmd_none(*pmd))) { |
728 | spin_unlock(&mm->page_table_lock); | 730 | spin_unlock(ptl); |
729 | mem_cgroup_uncharge_page(page); | 731 | mem_cgroup_uncharge_page(page); |
730 | put_page(page); | 732 | put_page(page); |
731 | pte_free(mm, pgtable); | 733 | pte_free(mm, pgtable); |
@@ -737,8 +739,8 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, | |||
737 | pgtable_trans_huge_deposit(mm, pmd, pgtable); | 739 | pgtable_trans_huge_deposit(mm, pmd, pgtable); |
738 | set_pmd_at(mm, haddr, pmd, entry); | 740 | set_pmd_at(mm, haddr, pmd, entry); |
739 | add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR); | 741 | add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR); |
740 | mm->nr_ptes++; | 742 | atomic_long_inc(&mm->nr_ptes); |
741 | spin_unlock(&mm->page_table_lock); | 743 | spin_unlock(ptl); |
742 | } | 744 | } |
743 | 745 | ||
744 | return 0; | 746 | return 0; |
@@ -758,14 +760,7 @@ static inline struct page *alloc_hugepage_vma(int defrag, | |||
758 | HPAGE_PMD_ORDER, vma, haddr, nd); | 760 | HPAGE_PMD_ORDER, vma, haddr, nd); |
759 | } | 761 | } |
760 | 762 | ||
761 | #ifndef CONFIG_NUMA | 763 | /* Caller must hold page table lock. */ |
762 | static inline struct page *alloc_hugepage(int defrag) | ||
763 | { | ||
764 | return alloc_pages(alloc_hugepage_gfpmask(defrag, 0), | ||
765 | HPAGE_PMD_ORDER); | ||
766 | } | ||
767 | #endif | ||
768 | |||
769 | static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm, | 764 | static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm, |
770 | struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd, | 765 | struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd, |
771 | struct page *zero_page) | 766 | struct page *zero_page) |
@@ -778,7 +773,7 @@ static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm, | |||
778 | entry = pmd_mkhuge(entry); | 773 | entry = pmd_mkhuge(entry); |
779 | pgtable_trans_huge_deposit(mm, pmd, pgtable); | 774 | pgtable_trans_huge_deposit(mm, pmd, pgtable); |
780 | set_pmd_at(mm, haddr, pmd, entry); | 775 | set_pmd_at(mm, haddr, pmd, entry); |
781 | mm->nr_ptes++; | 776 | atomic_long_inc(&mm->nr_ptes); |
782 | return true; | 777 | return true; |
783 | } | 778 | } |
784 | 779 | ||
@@ -797,6 +792,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
797 | return VM_FAULT_OOM; | 792 | return VM_FAULT_OOM; |
798 | if (!(flags & FAULT_FLAG_WRITE) && | 793 | if (!(flags & FAULT_FLAG_WRITE) && |
799 | transparent_hugepage_use_zero_page()) { | 794 | transparent_hugepage_use_zero_page()) { |
795 | spinlock_t *ptl; | ||
800 | pgtable_t pgtable; | 796 | pgtable_t pgtable; |
801 | struct page *zero_page; | 797 | struct page *zero_page; |
802 | bool set; | 798 | bool set; |
@@ -809,10 +805,10 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
809 | count_vm_event(THP_FAULT_FALLBACK); | 805 | count_vm_event(THP_FAULT_FALLBACK); |
810 | return VM_FAULT_FALLBACK; | 806 | return VM_FAULT_FALLBACK; |
811 | } | 807 | } |
812 | spin_lock(&mm->page_table_lock); | 808 | ptl = pmd_lock(mm, pmd); |
813 | set = set_huge_zero_page(pgtable, mm, vma, haddr, pmd, | 809 | set = set_huge_zero_page(pgtable, mm, vma, haddr, pmd, |
814 | zero_page); | 810 | zero_page); |
815 | spin_unlock(&mm->page_table_lock); | 811 | spin_unlock(ptl); |
816 | if (!set) { | 812 | if (!set) { |
817 | pte_free(mm, pgtable); | 813 | pte_free(mm, pgtable); |
818 | put_huge_zero_page(); | 814 | put_huge_zero_page(); |
@@ -845,6 +841,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, | |||
845 | pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr, | 841 | pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr, |
846 | struct vm_area_struct *vma) | 842 | struct vm_area_struct *vma) |
847 | { | 843 | { |
844 | spinlock_t *dst_ptl, *src_ptl; | ||
848 | struct page *src_page; | 845 | struct page *src_page; |
849 | pmd_t pmd; | 846 | pmd_t pmd; |
850 | pgtable_t pgtable; | 847 | pgtable_t pgtable; |
@@ -855,8 +852,9 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, | |||
855 | if (unlikely(!pgtable)) | 852 | if (unlikely(!pgtable)) |
856 | goto out; | 853 | goto out; |
857 | 854 | ||
858 | spin_lock(&dst_mm->page_table_lock); | 855 | dst_ptl = pmd_lock(dst_mm, dst_pmd); |
859 | spin_lock_nested(&src_mm->page_table_lock, SINGLE_DEPTH_NESTING); | 856 | src_ptl = pmd_lockptr(src_mm, src_pmd); |
857 | spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); | ||
860 | 858 | ||
861 | ret = -EAGAIN; | 859 | ret = -EAGAIN; |
862 | pmd = *src_pmd; | 860 | pmd = *src_pmd; |
@@ -865,7 +863,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, | |||
865 | goto out_unlock; | 863 | goto out_unlock; |
866 | } | 864 | } |
867 | /* | 865 | /* |
868 | * mm->page_table_lock is enough to be sure that huge zero pmd is not | 866 | * When page table lock is held, the huge zero pmd should not be |
869 | * under splitting since we don't split the page itself, only pmd to | 867 | * under splitting since we don't split the page itself, only pmd to |
870 | * a page table. | 868 | * a page table. |
871 | */ | 869 | */ |
@@ -886,8 +884,8 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, | |||
886 | } | 884 | } |
887 | if (unlikely(pmd_trans_splitting(pmd))) { | 885 | if (unlikely(pmd_trans_splitting(pmd))) { |
888 | /* split huge page running from under us */ | 886 | /* split huge page running from under us */ |
889 | spin_unlock(&src_mm->page_table_lock); | 887 | spin_unlock(src_ptl); |
890 | spin_unlock(&dst_mm->page_table_lock); | 888 | spin_unlock(dst_ptl); |
891 | pte_free(dst_mm, pgtable); | 889 | pte_free(dst_mm, pgtable); |
892 | 890 | ||
893 | wait_split_huge_page(vma->anon_vma, src_pmd); /* src_vma */ | 891 | wait_split_huge_page(vma->anon_vma, src_pmd); /* src_vma */ |
@@ -903,12 +901,12 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, | |||
903 | pmd = pmd_mkold(pmd_wrprotect(pmd)); | 901 | pmd = pmd_mkold(pmd_wrprotect(pmd)); |
904 | pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable); | 902 | pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable); |
905 | set_pmd_at(dst_mm, addr, dst_pmd, pmd); | 903 | set_pmd_at(dst_mm, addr, dst_pmd, pmd); |
906 | dst_mm->nr_ptes++; | 904 | atomic_long_inc(&dst_mm->nr_ptes); |
907 | 905 | ||
908 | ret = 0; | 906 | ret = 0; |
909 | out_unlock: | 907 | out_unlock: |
910 | spin_unlock(&src_mm->page_table_lock); | 908 | spin_unlock(src_ptl); |
911 | spin_unlock(&dst_mm->page_table_lock); | 909 | spin_unlock(dst_ptl); |
912 | out: | 910 | out: |
913 | return ret; | 911 | return ret; |
914 | } | 912 | } |
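Since different PMDs in one mm can now be guarded by different spinlocks, the nr_ptes counter can no longer rely on mm->page_table_lock for serialisation; that is why the hunks above switch it to atomic_long_* operations. A minimal usage sketch (the read side is an assumption about callers such as the OOM accounting code, which are changed elsewhere in this commit):

	atomic_long_inc(&mm->nr_ptes);		/* page table deposited */
	atomic_long_dec(&mm->nr_ptes);		/* page table withdrawn */
	long nr = atomic_long_read(&mm->nr_ptes);	/* unlocked read, e.g. accounting */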
@@ -919,10 +917,11 @@ void huge_pmd_set_accessed(struct mm_struct *mm, | |||
919 | pmd_t *pmd, pmd_t orig_pmd, | 917 | pmd_t *pmd, pmd_t orig_pmd, |
920 | int dirty) | 918 | int dirty) |
921 | { | 919 | { |
920 | spinlock_t *ptl; | ||
922 | pmd_t entry; | 921 | pmd_t entry; |
923 | unsigned long haddr; | 922 | unsigned long haddr; |
924 | 923 | ||
925 | spin_lock(&mm->page_table_lock); | 924 | ptl = pmd_lock(mm, pmd); |
926 | if (unlikely(!pmd_same(*pmd, orig_pmd))) | 925 | if (unlikely(!pmd_same(*pmd, orig_pmd))) |
927 | goto unlock; | 926 | goto unlock; |
928 | 927 | ||
@@ -932,13 +931,14 @@ void huge_pmd_set_accessed(struct mm_struct *mm, | |||
932 | update_mmu_cache_pmd(vma, address, pmd); | 931 | update_mmu_cache_pmd(vma, address, pmd); |
933 | 932 | ||
934 | unlock: | 933 | unlock: |
935 | spin_unlock(&mm->page_table_lock); | 934 | spin_unlock(ptl); |
936 | } | 935 | } |
937 | 936 | ||
938 | static int do_huge_pmd_wp_zero_page_fallback(struct mm_struct *mm, | 937 | static int do_huge_pmd_wp_zero_page_fallback(struct mm_struct *mm, |
939 | struct vm_area_struct *vma, unsigned long address, | 938 | struct vm_area_struct *vma, unsigned long address, |
940 | pmd_t *pmd, pmd_t orig_pmd, unsigned long haddr) | 939 | pmd_t *pmd, pmd_t orig_pmd, unsigned long haddr) |
941 | { | 940 | { |
941 | spinlock_t *ptl; | ||
942 | pgtable_t pgtable; | 942 | pgtable_t pgtable; |
943 | pmd_t _pmd; | 943 | pmd_t _pmd; |
944 | struct page *page; | 944 | struct page *page; |
@@ -965,7 +965,7 @@ static int do_huge_pmd_wp_zero_page_fallback(struct mm_struct *mm, | |||
965 | mmun_end = haddr + HPAGE_PMD_SIZE; | 965 | mmun_end = haddr + HPAGE_PMD_SIZE; |
966 | mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); | 966 | mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); |
967 | 967 | ||
968 | spin_lock(&mm->page_table_lock); | 968 | ptl = pmd_lock(mm, pmd); |
969 | if (unlikely(!pmd_same(*pmd, orig_pmd))) | 969 | if (unlikely(!pmd_same(*pmd, orig_pmd))) |
970 | goto out_free_page; | 970 | goto out_free_page; |
971 | 971 | ||
@@ -992,7 +992,7 @@ static int do_huge_pmd_wp_zero_page_fallback(struct mm_struct *mm, | |||
992 | } | 992 | } |
993 | smp_wmb(); /* make pte visible before pmd */ | 993 | smp_wmb(); /* make pte visible before pmd */ |
994 | pmd_populate(mm, pmd, pgtable); | 994 | pmd_populate(mm, pmd, pgtable); |
995 | spin_unlock(&mm->page_table_lock); | 995 | spin_unlock(ptl); |
996 | put_huge_zero_page(); | 996 | put_huge_zero_page(); |
997 | inc_mm_counter(mm, MM_ANONPAGES); | 997 | inc_mm_counter(mm, MM_ANONPAGES); |
998 | 998 | ||
@@ -1002,7 +1002,7 @@ static int do_huge_pmd_wp_zero_page_fallback(struct mm_struct *mm, | |||
1002 | out: | 1002 | out: |
1003 | return ret; | 1003 | return ret; |
1004 | out_free_page: | 1004 | out_free_page: |
1005 | spin_unlock(&mm->page_table_lock); | 1005 | spin_unlock(ptl); |
1006 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); | 1006 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); |
1007 | mem_cgroup_uncharge_page(page); | 1007 | mem_cgroup_uncharge_page(page); |
1008 | put_page(page); | 1008 | put_page(page); |
@@ -1016,6 +1016,7 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, | |||
1016 | struct page *page, | 1016 | struct page *page, |
1017 | unsigned long haddr) | 1017 | unsigned long haddr) |
1018 | { | 1018 | { |
1019 | spinlock_t *ptl; | ||
1019 | pgtable_t pgtable; | 1020 | pgtable_t pgtable; |
1020 | pmd_t _pmd; | 1021 | pmd_t _pmd; |
1021 | int ret = 0, i; | 1022 | int ret = 0, i; |
@@ -1062,7 +1063,7 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, | |||
1062 | mmun_end = haddr + HPAGE_PMD_SIZE; | 1063 | mmun_end = haddr + HPAGE_PMD_SIZE; |
1063 | mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); | 1064 | mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); |
1064 | 1065 | ||
1065 | spin_lock(&mm->page_table_lock); | 1066 | ptl = pmd_lock(mm, pmd); |
1066 | if (unlikely(!pmd_same(*pmd, orig_pmd))) | 1067 | if (unlikely(!pmd_same(*pmd, orig_pmd))) |
1067 | goto out_free_pages; | 1068 | goto out_free_pages; |
1068 | VM_BUG_ON(!PageHead(page)); | 1069 | VM_BUG_ON(!PageHead(page)); |
@@ -1088,7 +1089,7 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, | |||
1088 | smp_wmb(); /* make pte visible before pmd */ | 1089 | smp_wmb(); /* make pte visible before pmd */ |
1089 | pmd_populate(mm, pmd, pgtable); | 1090 | pmd_populate(mm, pmd, pgtable); |
1090 | page_remove_rmap(page); | 1091 | page_remove_rmap(page); |
1091 | spin_unlock(&mm->page_table_lock); | 1092 | spin_unlock(ptl); |
1092 | 1093 | ||
1093 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); | 1094 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); |
1094 | 1095 | ||
@@ -1099,7 +1100,7 @@ out: | |||
1099 | return ret; | 1100 | return ret; |
1100 | 1101 | ||
1101 | out_free_pages: | 1102 | out_free_pages: |
1102 | spin_unlock(&mm->page_table_lock); | 1103 | spin_unlock(ptl); |
1103 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); | 1104 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); |
1104 | mem_cgroup_uncharge_start(); | 1105 | mem_cgroup_uncharge_start(); |
1105 | for (i = 0; i < HPAGE_PMD_NR; i++) { | 1106 | for (i = 0; i < HPAGE_PMD_NR; i++) { |
@@ -1114,17 +1115,19 @@ out_free_pages: | |||
1114 | int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | 1115 | int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, |
1115 | unsigned long address, pmd_t *pmd, pmd_t orig_pmd) | 1116 | unsigned long address, pmd_t *pmd, pmd_t orig_pmd) |
1116 | { | 1117 | { |
1118 | spinlock_t *ptl; | ||
1117 | int ret = 0; | 1119 | int ret = 0; |
1118 | struct page *page = NULL, *new_page; | 1120 | struct page *page = NULL, *new_page; |
1119 | unsigned long haddr; | 1121 | unsigned long haddr; |
1120 | unsigned long mmun_start; /* For mmu_notifiers */ | 1122 | unsigned long mmun_start; /* For mmu_notifiers */ |
1121 | unsigned long mmun_end; /* For mmu_notifiers */ | 1123 | unsigned long mmun_end; /* For mmu_notifiers */ |
1122 | 1124 | ||
1125 | ptl = pmd_lockptr(mm, pmd); | ||
1123 | VM_BUG_ON(!vma->anon_vma); | 1126 | VM_BUG_ON(!vma->anon_vma); |
1124 | haddr = address & HPAGE_PMD_MASK; | 1127 | haddr = address & HPAGE_PMD_MASK; |
1125 | if (is_huge_zero_pmd(orig_pmd)) | 1128 | if (is_huge_zero_pmd(orig_pmd)) |
1126 | goto alloc; | 1129 | goto alloc; |
1127 | spin_lock(&mm->page_table_lock); | 1130 | spin_lock(ptl); |
1128 | if (unlikely(!pmd_same(*pmd, orig_pmd))) | 1131 | if (unlikely(!pmd_same(*pmd, orig_pmd))) |
1129 | goto out_unlock; | 1132 | goto out_unlock; |
1130 | 1133 | ||
@@ -1140,7 +1143,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
1140 | goto out_unlock; | 1143 | goto out_unlock; |
1141 | } | 1144 | } |
1142 | get_page(page); | 1145 | get_page(page); |
1143 | spin_unlock(&mm->page_table_lock); | 1146 | spin_unlock(ptl); |
1144 | alloc: | 1147 | alloc: |
1145 | if (transparent_hugepage_enabled(vma) && | 1148 | if (transparent_hugepage_enabled(vma) && |
1146 | !transparent_hugepage_debug_cow()) | 1149 | !transparent_hugepage_debug_cow()) |
@@ -1187,11 +1190,11 @@ alloc: | |||
1187 | mmun_end = haddr + HPAGE_PMD_SIZE; | 1190 | mmun_end = haddr + HPAGE_PMD_SIZE; |
1188 | mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); | 1191 | mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); |
1189 | 1192 | ||
1190 | spin_lock(&mm->page_table_lock); | 1193 | spin_lock(ptl); |
1191 | if (page) | 1194 | if (page) |
1192 | put_page(page); | 1195 | put_page(page); |
1193 | if (unlikely(!pmd_same(*pmd, orig_pmd))) { | 1196 | if (unlikely(!pmd_same(*pmd, orig_pmd))) { |
1194 | spin_unlock(&mm->page_table_lock); | 1197 | spin_unlock(ptl); |
1195 | mem_cgroup_uncharge_page(new_page); | 1198 | mem_cgroup_uncharge_page(new_page); |
1196 | put_page(new_page); | 1199 | put_page(new_page); |
1197 | goto out_mn; | 1200 | goto out_mn; |
@@ -1213,13 +1216,13 @@ alloc: | |||
1213 | } | 1216 | } |
1214 | ret |= VM_FAULT_WRITE; | 1217 | ret |= VM_FAULT_WRITE; |
1215 | } | 1218 | } |
1216 | spin_unlock(&mm->page_table_lock); | 1219 | spin_unlock(ptl); |
1217 | out_mn: | 1220 | out_mn: |
1218 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); | 1221 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); |
1219 | out: | 1222 | out: |
1220 | return ret; | 1223 | return ret; |
1221 | out_unlock: | 1224 | out_unlock: |
1222 | spin_unlock(&mm->page_table_lock); | 1225 | spin_unlock(ptl); |
1223 | return ret; | 1226 | return ret; |
1224 | } | 1227 | } |
1225 | 1228 | ||
@@ -1231,7 +1234,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, | |||
1231 | struct mm_struct *mm = vma->vm_mm; | 1234 | struct mm_struct *mm = vma->vm_mm; |
1232 | struct page *page = NULL; | 1235 | struct page *page = NULL; |
1233 | 1236 | ||
1234 | assert_spin_locked(&mm->page_table_lock); | 1237 | assert_spin_locked(pmd_lockptr(mm, pmd)); |
1235 | 1238 | ||
1236 | if (flags & FOLL_WRITE && !pmd_write(*pmd)) | 1239 | if (flags & FOLL_WRITE && !pmd_write(*pmd)) |
1237 | goto out; | 1240 | goto out; |
@@ -1278,73 +1281,116 @@ out: | |||
1278 | int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, | 1281 | int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, |
1279 | unsigned long addr, pmd_t pmd, pmd_t *pmdp) | 1282 | unsigned long addr, pmd_t pmd, pmd_t *pmdp) |
1280 | { | 1283 | { |
1284 | spinlock_t *ptl; | ||
1285 | struct anon_vma *anon_vma = NULL; | ||
1281 | struct page *page; | 1286 | struct page *page; |
1282 | unsigned long haddr = addr & HPAGE_PMD_MASK; | 1287 | unsigned long haddr = addr & HPAGE_PMD_MASK; |
1283 | int target_nid; | 1288 | int page_nid = -1, this_nid = numa_node_id(); |
1284 | int current_nid = -1; | 1289 | int target_nid, last_cpupid = -1; |
1285 | bool migrated; | 1290 | bool page_locked; |
1291 | bool migrated = false; | ||
1292 | int flags = 0; | ||
1286 | 1293 | ||
1287 | spin_lock(&mm->page_table_lock); | 1294 | ptl = pmd_lock(mm, pmdp); |
1288 | if (unlikely(!pmd_same(pmd, *pmdp))) | 1295 | if (unlikely(!pmd_same(pmd, *pmdp))) |
1289 | goto out_unlock; | 1296 | goto out_unlock; |
1290 | 1297 | ||
1291 | page = pmd_page(pmd); | 1298 | page = pmd_page(pmd); |
1292 | get_page(page); | 1299 | BUG_ON(is_huge_zero_page(page)); |
1293 | current_nid = page_to_nid(page); | 1300 | page_nid = page_to_nid(page); |
1301 | last_cpupid = page_cpupid_last(page); | ||
1294 | count_vm_numa_event(NUMA_HINT_FAULTS); | 1302 | count_vm_numa_event(NUMA_HINT_FAULTS); |
1295 | if (current_nid == numa_node_id()) | 1303 | if (page_nid == this_nid) { |
1296 | count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL); | 1304 | count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL); |
1305 | flags |= TNF_FAULT_LOCAL; | ||
1306 | } | ||
1297 | 1307 | ||
1308 | /* | ||
1309 | * Avoid grouping on DSO/COW pages in specific and RO pages | ||
1310 | * in general, RO pages shouldn't hurt as much anyway since | ||
1311 | * they can be in shared cache state. | ||
1312 | */ | ||
1313 | if (!pmd_write(pmd)) | ||
1314 | flags |= TNF_NO_GROUP; | ||
1315 | |||
1316 | /* | ||
1317 | * Acquire the page lock to serialise THP migrations but avoid dropping | ||
1318 | * page_table_lock if at all possible | ||
1319 | */ | ||
1320 | page_locked = trylock_page(page); | ||
1298 | target_nid = mpol_misplaced(page, vma, haddr); | 1321 | target_nid = mpol_misplaced(page, vma, haddr); |
1299 | if (target_nid == -1) { | 1322 | if (target_nid == -1) { |
1300 | put_page(page); | 1323 | /* If the page was locked, there are no parallel migrations */ |
1301 | goto clear_pmdnuma; | 1324 | if (page_locked) |
1325 | goto clear_pmdnuma; | ||
1326 | |||
1327 | /* | ||
1328 | * Otherwise wait for potential migrations and retry. We do | ||
1329 | * relock and check_same as the page may no longer be mapped. | ||
1330 | * As the fault is being retried, do not account for it. | ||
1331 | */ | ||
1332 | spin_unlock(ptl); | ||
1333 | wait_on_page_locked(page); | ||
1334 | page_nid = -1; | ||
1335 | goto out; | ||
1302 | } | 1336 | } |
1303 | 1337 | ||
1304 | /* Acquire the page lock to serialise THP migrations */ | 1338 | /* Page is misplaced, serialise migrations and parallel THP splits */ |
1305 | spin_unlock(&mm->page_table_lock); | 1339 | get_page(page); |
1306 | lock_page(page); | 1340 | spin_unlock(ptl); |
1341 | if (!page_locked) | ||
1342 | lock_page(page); | ||
1343 | anon_vma = page_lock_anon_vma_read(page); | ||
1307 | 1344 | ||
1308 | /* Confirm the PTE did not while locked */ | 1345 | /* Confirm the PMD did not change while page_table_lock was released */ |
1309 | spin_lock(&mm->page_table_lock); | 1346 | spin_lock(ptl); |
1310 | if (unlikely(!pmd_same(pmd, *pmdp))) { | 1347 | if (unlikely(!pmd_same(pmd, *pmdp))) { |
1311 | unlock_page(page); | 1348 | unlock_page(page); |
1312 | put_page(page); | 1349 | put_page(page); |
1350 | page_nid = -1; | ||
1313 | goto out_unlock; | 1351 | goto out_unlock; |
1314 | } | 1352 | } |
1315 | spin_unlock(&mm->page_table_lock); | ||
1316 | 1353 | ||
1317 | /* Migrate the THP to the requested node */ | 1354 | /* |
1355 | * Migrate the THP to the requested node, returns with page unlocked | ||
1356 | * and pmd_numa cleared. | ||
1357 | */ | ||
1358 | spin_unlock(ptl); | ||
1318 | migrated = migrate_misplaced_transhuge_page(mm, vma, | 1359 | migrated = migrate_misplaced_transhuge_page(mm, vma, |
1319 | pmdp, pmd, addr, page, target_nid); | 1360 | pmdp, pmd, addr, page, target_nid); |
1320 | if (!migrated) | 1361 | if (migrated) { |
1321 | goto check_same; | 1362 | flags |= TNF_MIGRATED; |
1322 | 1363 | page_nid = target_nid; | |
1323 | task_numa_fault(target_nid, HPAGE_PMD_NR, true); | 1364 | } |
1324 | return 0; | ||
1325 | 1365 | ||
1326 | check_same: | 1366 | goto out; |
1327 | spin_lock(&mm->page_table_lock); | ||
1328 | if (unlikely(!pmd_same(pmd, *pmdp))) | ||
1329 | goto out_unlock; | ||
1330 | clear_pmdnuma: | 1367 | clear_pmdnuma: |
1368 | BUG_ON(!PageLocked(page)); | ||
1331 | pmd = pmd_mknonnuma(pmd); | 1369 | pmd = pmd_mknonnuma(pmd); |
1332 | set_pmd_at(mm, haddr, pmdp, pmd); | 1370 | set_pmd_at(mm, haddr, pmdp, pmd); |
1333 | VM_BUG_ON(pmd_numa(*pmdp)); | 1371 | VM_BUG_ON(pmd_numa(*pmdp)); |
1334 | update_mmu_cache_pmd(vma, addr, pmdp); | 1372 | update_mmu_cache_pmd(vma, addr, pmdp); |
1373 | unlock_page(page); | ||
1335 | out_unlock: | 1374 | out_unlock: |
1336 | spin_unlock(&mm->page_table_lock); | 1375 | spin_unlock(ptl); |
1337 | if (current_nid != -1) | 1376 | |
1338 | task_numa_fault(current_nid, HPAGE_PMD_NR, false); | 1377 | out: |
1378 | if (anon_vma) | ||
1379 | page_unlock_anon_vma_read(anon_vma); | ||
1380 | |||
1381 | if (page_nid != -1) | ||
1382 | task_numa_fault(last_cpupid, page_nid, HPAGE_PMD_NR, flags); | ||
1383 | |||
1339 | return 0; | 1384 | return 0; |
1340 | } | 1385 | } |
1341 | 1386 | ||
1342 | int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, | 1387 | int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, |
1343 | pmd_t *pmd, unsigned long addr) | 1388 | pmd_t *pmd, unsigned long addr) |
1344 | { | 1389 | { |
1390 | spinlock_t *ptl; | ||
1345 | int ret = 0; | 1391 | int ret = 0; |
1346 | 1392 | ||
1347 | if (__pmd_trans_huge_lock(pmd, vma) == 1) { | 1393 | if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { |
1348 | struct page *page; | 1394 | struct page *page; |
1349 | pgtable_t pgtable; | 1395 | pgtable_t pgtable; |
1350 | pmd_t orig_pmd; | 1396 | pmd_t orig_pmd; |
@@ -1358,8 +1404,8 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, | |||
1358 | tlb_remove_pmd_tlb_entry(tlb, pmd, addr); | 1404 | tlb_remove_pmd_tlb_entry(tlb, pmd, addr); |
1359 | pgtable = pgtable_trans_huge_withdraw(tlb->mm, pmd); | 1405 | pgtable = pgtable_trans_huge_withdraw(tlb->mm, pmd); |
1360 | if (is_huge_zero_pmd(orig_pmd)) { | 1406 | if (is_huge_zero_pmd(orig_pmd)) { |
1361 | tlb->mm->nr_ptes--; | 1407 | atomic_long_dec(&tlb->mm->nr_ptes); |
1362 | spin_unlock(&tlb->mm->page_table_lock); | 1408 | spin_unlock(ptl); |
1363 | put_huge_zero_page(); | 1409 | put_huge_zero_page(); |
1364 | } else { | 1410 | } else { |
1365 | page = pmd_page(orig_pmd); | 1411 | page = pmd_page(orig_pmd); |
@@ -1367,8 +1413,8 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, | |||
1367 | VM_BUG_ON(page_mapcount(page) < 0); | 1413 | VM_BUG_ON(page_mapcount(page) < 0); |
1368 | add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR); | 1414 | add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR); |
1369 | VM_BUG_ON(!PageHead(page)); | 1415 | VM_BUG_ON(!PageHead(page)); |
1370 | tlb->mm->nr_ptes--; | 1416 | atomic_long_dec(&tlb->mm->nr_ptes); |
1371 | spin_unlock(&tlb->mm->page_table_lock); | 1417 | spin_unlock(ptl); |
1372 | tlb_remove_page(tlb, page); | 1418 | tlb_remove_page(tlb, page); |
1373 | } | 1419 | } |
1374 | pte_free(tlb->mm, pgtable); | 1420 | pte_free(tlb->mm, pgtable); |
@@ -1381,14 +1427,15 @@ int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, | |||
1381 | unsigned long addr, unsigned long end, | 1427 | unsigned long addr, unsigned long end, |
1382 | unsigned char *vec) | 1428 | unsigned char *vec) |
1383 | { | 1429 | { |
1430 | spinlock_t *ptl; | ||
1384 | int ret = 0; | 1431 | int ret = 0; |
1385 | 1432 | ||
1386 | if (__pmd_trans_huge_lock(pmd, vma) == 1) { | 1433 | if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { |
1387 | /* | 1434 | /* |
1388 | * All logical pages in the range are present | 1435 | * All logical pages in the range are present |
1389 | * if backed by a huge page. | 1436 | * if backed by a huge page. |
1390 | */ | 1437 | */ |
1391 | spin_unlock(&vma->vm_mm->page_table_lock); | 1438 | spin_unlock(ptl); |
1392 | memset(vec, 1, (end - addr) >> PAGE_SHIFT); | 1439 | memset(vec, 1, (end - addr) >> PAGE_SHIFT); |
1393 | ret = 1; | 1440 | ret = 1; |
1394 | } | 1441 | } |
@@ -1401,6 +1448,7 @@ int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma, | |||
1401 | unsigned long new_addr, unsigned long old_end, | 1448 | unsigned long new_addr, unsigned long old_end, |
1402 | pmd_t *old_pmd, pmd_t *new_pmd) | 1449 | pmd_t *old_pmd, pmd_t *new_pmd) |
1403 | { | 1450 | { |
1451 | spinlock_t *old_ptl, *new_ptl; | ||
1404 | int ret = 0; | 1452 | int ret = 0; |
1405 | pmd_t pmd; | 1453 | pmd_t pmd; |
1406 | 1454 | ||
@@ -1421,41 +1469,69 @@ int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma, | |||
1421 | goto out; | 1469 | goto out; |
1422 | } | 1470 | } |
1423 | 1471 | ||
1424 | ret = __pmd_trans_huge_lock(old_pmd, vma); | 1472 | /* |
1473 | * We don't have to worry about the ordering of src and dst | ||
1474 | * ptlocks because exclusive mmap_sem prevents deadlock. | ||
1475 | */ | ||
1476 | ret = __pmd_trans_huge_lock(old_pmd, vma, &old_ptl); | ||
1425 | if (ret == 1) { | 1477 | if (ret == 1) { |
1478 | new_ptl = pmd_lockptr(mm, new_pmd); | ||
1479 | if (new_ptl != old_ptl) | ||
1480 | spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING); | ||
1426 | pmd = pmdp_get_and_clear(mm, old_addr, old_pmd); | 1481 | pmd = pmdp_get_and_clear(mm, old_addr, old_pmd); |
1427 | VM_BUG_ON(!pmd_none(*new_pmd)); | 1482 | VM_BUG_ON(!pmd_none(*new_pmd)); |
1428 | set_pmd_at(mm, new_addr, new_pmd, pmd_mksoft_dirty(pmd)); | 1483 | set_pmd_at(mm, new_addr, new_pmd, pmd_mksoft_dirty(pmd)); |
1429 | spin_unlock(&mm->page_table_lock); | 1484 | if (new_ptl != old_ptl) |
1485 | spin_unlock(new_ptl); | ||
1486 | spin_unlock(old_ptl); | ||
1430 | } | 1487 | } |
1431 | out: | 1488 | out: |
1432 | return ret; | 1489 | return ret; |
1433 | } | 1490 | } |
1434 | 1491 | ||
1492 | /* | ||
1493 | * Returns | ||
1494 | * - 0 if PMD could not be locked | ||
1495 | * - 1 if PMD was locked but protections unchange and TLB flush unnecessary | ||
1496 | * - HPAGE_PMD_NR is protections changed and TLB flush necessary | ||
1497 | */ | ||
1435 | int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, | 1498 | int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, |
1436 | unsigned long addr, pgprot_t newprot, int prot_numa) | 1499 | unsigned long addr, pgprot_t newprot, int prot_numa) |
1437 | { | 1500 | { |
1438 | struct mm_struct *mm = vma->vm_mm; | 1501 | struct mm_struct *mm = vma->vm_mm; |
1502 | spinlock_t *ptl; | ||
1439 | int ret = 0; | 1503 | int ret = 0; |
1440 | 1504 | ||
1441 | if (__pmd_trans_huge_lock(pmd, vma) == 1) { | 1505 | if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { |
1442 | pmd_t entry; | 1506 | pmd_t entry; |
1443 | entry = pmdp_get_and_clear(mm, addr, pmd); | 1507 | ret = 1; |
1444 | if (!prot_numa) { | 1508 | if (!prot_numa) { |
1509 | entry = pmdp_get_and_clear(mm, addr, pmd); | ||
1445 | entry = pmd_modify(entry, newprot); | 1510 | entry = pmd_modify(entry, newprot); |
1511 | ret = HPAGE_PMD_NR; | ||
1446 | BUG_ON(pmd_write(entry)); | 1512 | BUG_ON(pmd_write(entry)); |
1447 | } else { | 1513 | } else { |
1448 | struct page *page = pmd_page(*pmd); | 1514 | struct page *page = pmd_page(*pmd); |
1449 | 1515 | ||
1450 | /* only check non-shared pages */ | 1516 | /* |
1451 | if (page_mapcount(page) == 1 && | 1517 | * Do not trap faults against the zero page. The |
1518 | * read-only data is likely to be read-cached on the | ||
1519 | * local CPU cache and it is less useful to know about | ||
1520 | * local vs remote hits on the zero page. | ||
1521 | */ | ||
1522 | if (!is_huge_zero_page(page) && | ||
1452 | !pmd_numa(*pmd)) { | 1523 | !pmd_numa(*pmd)) { |
1524 | entry = pmdp_get_and_clear(mm, addr, pmd); | ||
1453 | entry = pmd_mknuma(entry); | 1525 | entry = pmd_mknuma(entry); |
1526 | ret = HPAGE_PMD_NR; | ||
1454 | } | 1527 | } |
1455 | } | 1528 | } |
1456 | set_pmd_at(mm, addr, pmd, entry); | 1529 | |
1457 | spin_unlock(&vma->vm_mm->page_table_lock); | 1530 | /* Set PMD if cleared earlier */ |
1458 | ret = 1; | 1531 | if (ret == HPAGE_PMD_NR) |
1532 | set_pmd_at(mm, addr, pmd, entry); | ||
1533 | |||
1534 | spin_unlock(ptl); | ||
1459 | } | 1535 | } |
1460 | 1536 | ||
1461 | return ret; | 1537 | return ret; |
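The comment block added above documents the new return convention (0: could not lock, 1: locked but nothing changed, HPAGE_PMD_NR: protections changed and a TLB flush is needed), which lets the caller count changed pages without a second scan. A hypothetical sketch of how the mprotect path is expected to consume it; the real code lives in mm/mprotect.c (changed in this commit but not shown here), so treat the exact shape as an assumption:

/* Hypothetical fragment of change_pmd_range(); not taken from the diff. */
if (pmd_trans_huge(*pmd)) {
	if (next - addr != HPAGE_PMD_SIZE)
		split_huge_page_pmd(vma, addr, pmd);
	else {
		int nr_ptes = change_huge_pmd(vma, pmd, addr,
					      newprot, prot_numa);
		if (nr_ptes) {
			if (nr_ptes == HPAGE_PMD_NR)
				pages += HPAGE_PMD_NR;	/* all subpages changed */
			continue;	/* ret == 1: handled, nothing to flush */
		}
		/* ret == 0: not a huge pmd any more, fall through to ptes */
	}
}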
@@ -1468,12 +1544,13 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, | |||
1468 | * Note that if it returns 1, this routine returns without unlocking page | 1544 | * Note that if it returns 1, this routine returns without unlocking page |
1469 | * table locks. So callers must unlock them. | 1545 | * table locks. So callers must unlock them. |
1470 | */ | 1546 | */ |
1471 | int __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma) | 1547 | int __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma, |
1548 | spinlock_t **ptl) | ||
1472 | { | 1549 | { |
1473 | spin_lock(&vma->vm_mm->page_table_lock); | 1550 | *ptl = pmd_lock(vma->vm_mm, pmd); |
1474 | if (likely(pmd_trans_huge(*pmd))) { | 1551 | if (likely(pmd_trans_huge(*pmd))) { |
1475 | if (unlikely(pmd_trans_splitting(*pmd))) { | 1552 | if (unlikely(pmd_trans_splitting(*pmd))) { |
1476 | spin_unlock(&vma->vm_mm->page_table_lock); | 1553 | spin_unlock(*ptl); |
1477 | wait_split_huge_page(vma->anon_vma, pmd); | 1554 | wait_split_huge_page(vma->anon_vma, pmd); |
1478 | return -1; | 1555 | return -1; |
1479 | } else { | 1556 | } else { |
@@ -1482,27 +1559,37 @@ int __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma) | |||
1482 | return 1; | 1559 | return 1; |
1483 | } | 1560 | } |
1484 | } | 1561 | } |
1485 | spin_unlock(&vma->vm_mm->page_table_lock); | 1562 | spin_unlock(*ptl); |
1486 | return 0; | 1563 | return 0; |
1487 | } | 1564 | } |
1488 | 1565 | ||
1566 | /* | ||
1567 | * This function returns whether a given @page is mapped onto the @address | ||
1568 | * in the virtual space of @mm. | ||
1569 | * | ||
1570 | * When it's true, this function returns *pmd with holding the page table lock | ||
1571 | * and passing it back to the caller via @ptl. | ||
1572 | * If it's false, returns NULL without holding the page table lock. | ||
1573 | */ | ||
1489 | pmd_t *page_check_address_pmd(struct page *page, | 1574 | pmd_t *page_check_address_pmd(struct page *page, |
1490 | struct mm_struct *mm, | 1575 | struct mm_struct *mm, |
1491 | unsigned long address, | 1576 | unsigned long address, |
1492 | enum page_check_address_pmd_flag flag) | 1577 | enum page_check_address_pmd_flag flag, |
1578 | spinlock_t **ptl) | ||
1493 | { | 1579 | { |
1494 | pmd_t *pmd, *ret = NULL; | 1580 | pmd_t *pmd; |
1495 | 1581 | ||
1496 | if (address & ~HPAGE_PMD_MASK) | 1582 | if (address & ~HPAGE_PMD_MASK) |
1497 | goto out; | 1583 | return NULL; |
1498 | 1584 | ||
1499 | pmd = mm_find_pmd(mm, address); | 1585 | pmd = mm_find_pmd(mm, address); |
1500 | if (!pmd) | 1586 | if (!pmd) |
1501 | goto out; | 1587 | return NULL; |
1588 | *ptl = pmd_lock(mm, pmd); | ||
1502 | if (pmd_none(*pmd)) | 1589 | if (pmd_none(*pmd)) |
1503 | goto out; | 1590 | goto unlock; |
1504 | if (pmd_page(*pmd) != page) | 1591 | if (pmd_page(*pmd) != page) |
1505 | goto out; | 1592 | goto unlock; |
1506 | /* | 1593 | /* |
1507 | * split_vma() may create temporary aliased mappings. There is | 1594 | * split_vma() may create temporary aliased mappings. There is |
1508 | * no risk as long as all huge pmd are found and have their | 1595 | * no risk as long as all huge pmd are found and have their |
@@ -1512,14 +1599,15 @@ pmd_t *page_check_address_pmd(struct page *page, | |||
1512 | */ | 1599 | */ |
1513 | if (flag == PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG && | 1600 | if (flag == PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG && |
1514 | pmd_trans_splitting(*pmd)) | 1601 | pmd_trans_splitting(*pmd)) |
1515 | goto out; | 1602 | goto unlock; |
1516 | if (pmd_trans_huge(*pmd)) { | 1603 | if (pmd_trans_huge(*pmd)) { |
1517 | VM_BUG_ON(flag == PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG && | 1604 | VM_BUG_ON(flag == PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG && |
1518 | !pmd_trans_splitting(*pmd)); | 1605 | !pmd_trans_splitting(*pmd)); |
1519 | ret = pmd; | 1606 | return pmd; |
1520 | } | 1607 | } |
1521 | out: | 1608 | unlock: |
1522 | return ret; | 1609 | spin_unlock(*ptl); |
1610 | return NULL; | ||
1523 | } | 1611 | } |
1524 | 1612 | ||
1525 | static int __split_huge_page_splitting(struct page *page, | 1613 | static int __split_huge_page_splitting(struct page *page, |
@@ -1527,6 +1615,7 @@ static int __split_huge_page_splitting(struct page *page, | |||
1527 | unsigned long address) | 1615 | unsigned long address) |
1528 | { | 1616 | { |
1529 | struct mm_struct *mm = vma->vm_mm; | 1617 | struct mm_struct *mm = vma->vm_mm; |
1618 | spinlock_t *ptl; | ||
1530 | pmd_t *pmd; | 1619 | pmd_t *pmd; |
1531 | int ret = 0; | 1620 | int ret = 0; |
1532 | /* For mmu_notifiers */ | 1621 | /* For mmu_notifiers */ |
@@ -1534,9 +1623,8 @@ static int __split_huge_page_splitting(struct page *page, | |||
1534 | const unsigned long mmun_end = address + HPAGE_PMD_SIZE; | 1623 | const unsigned long mmun_end = address + HPAGE_PMD_SIZE; |
1535 | 1624 | ||
1536 | mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); | 1625 | mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); |
1537 | spin_lock(&mm->page_table_lock); | ||
1538 | pmd = page_check_address_pmd(page, mm, address, | 1626 | pmd = page_check_address_pmd(page, mm, address, |
1539 | PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG); | 1627 | PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG, &ptl); |
1540 | if (pmd) { | 1628 | if (pmd) { |
1541 | /* | 1629 | /* |
1542 | * We can't temporarily set the pmd to null in order | 1630 | * We can't temporarily set the pmd to null in order |
@@ -1547,8 +1635,8 @@ static int __split_huge_page_splitting(struct page *page, | |||
1547 | */ | 1635 | */ |
1548 | pmdp_splitting_flush(vma, address, pmd); | 1636 | pmdp_splitting_flush(vma, address, pmd); |
1549 | ret = 1; | 1637 | ret = 1; |
1638 | spin_unlock(ptl); | ||
1550 | } | 1639 | } |
1551 | spin_unlock(&mm->page_table_lock); | ||
1552 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); | 1640 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); |
1553 | 1641 | ||
1554 | return ret; | 1642 | return ret; |
@@ -1636,7 +1724,7 @@ static void __split_huge_page_refcount(struct page *page, | |||
1636 | page_tail->mapping = page->mapping; | 1724 | page_tail->mapping = page->mapping; |
1637 | 1725 | ||
1638 | page_tail->index = page->index + i; | 1726 | page_tail->index = page->index + i; |
1639 | page_nid_xchg_last(page_tail, page_nid_last(page)); | 1727 | page_cpupid_xchg_last(page_tail, page_cpupid_last(page)); |
1640 | 1728 | ||
1641 | BUG_ON(!PageAnon(page_tail)); | 1729 | BUG_ON(!PageAnon(page_tail)); |
1642 | BUG_ON(!PageUptodate(page_tail)); | 1730 | BUG_ON(!PageUptodate(page_tail)); |
@@ -1679,14 +1767,14 @@ static int __split_huge_page_map(struct page *page, | |||
1679 | unsigned long address) | 1767 | unsigned long address) |
1680 | { | 1768 | { |
1681 | struct mm_struct *mm = vma->vm_mm; | 1769 | struct mm_struct *mm = vma->vm_mm; |
1770 | spinlock_t *ptl; | ||
1682 | pmd_t *pmd, _pmd; | 1771 | pmd_t *pmd, _pmd; |
1683 | int ret = 0, i; | 1772 | int ret = 0, i; |
1684 | pgtable_t pgtable; | 1773 | pgtable_t pgtable; |
1685 | unsigned long haddr; | 1774 | unsigned long haddr; |
1686 | 1775 | ||
1687 | spin_lock(&mm->page_table_lock); | ||
1688 | pmd = page_check_address_pmd(page, mm, address, | 1776 | pmd = page_check_address_pmd(page, mm, address, |
1689 | PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG); | 1777 | PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG, &ptl); |
1690 | if (pmd) { | 1778 | if (pmd) { |
1691 | pgtable = pgtable_trans_huge_withdraw(mm, pmd); | 1779 | pgtable = pgtable_trans_huge_withdraw(mm, pmd); |
1692 | pmd_populate(mm, &_pmd, pgtable); | 1780 | pmd_populate(mm, &_pmd, pgtable); |
@@ -1741,8 +1829,8 @@ static int __split_huge_page_map(struct page *page, | |||
1741 | pmdp_invalidate(vma, address, pmd); | 1829 | pmdp_invalidate(vma, address, pmd); |
1742 | pmd_populate(mm, pmd, pgtable); | 1830 | pmd_populate(mm, pmd, pgtable); |
1743 | ret = 1; | 1831 | ret = 1; |
1832 | spin_unlock(ptl); | ||
1744 | } | 1833 | } |
1745 | spin_unlock(&mm->page_table_lock); | ||
1746 | 1834 | ||
1747 | return ret; | 1835 | return ret; |
1748 | } | 1836 | } |
@@ -2139,7 +2227,34 @@ static void khugepaged_alloc_sleep(void) | |||
2139 | msecs_to_jiffies(khugepaged_alloc_sleep_millisecs)); | 2227 | msecs_to_jiffies(khugepaged_alloc_sleep_millisecs)); |
2140 | } | 2228 | } |
2141 | 2229 | ||
2230 | static int khugepaged_node_load[MAX_NUMNODES]; | ||
2231 | |||
2142 | #ifdef CONFIG_NUMA | 2232 | #ifdef CONFIG_NUMA |
2233 | static int khugepaged_find_target_node(void) | ||
2234 | { | ||
2235 | static int last_khugepaged_target_node = NUMA_NO_NODE; | ||
2236 | int nid, target_node = 0, max_value = 0; | ||
2237 | |||
2238 | /* find first node with max normal pages hit */ | ||
2239 | for (nid = 0; nid < MAX_NUMNODES; nid++) | ||
2240 | if (khugepaged_node_load[nid] > max_value) { | ||
2241 | max_value = khugepaged_node_load[nid]; | ||
2242 | target_node = nid; | ||
2243 | } | ||
2244 | |||
2245 | /* do some balance if several nodes have the same hit record */ | ||
2246 | if (target_node <= last_khugepaged_target_node) | ||
2247 | for (nid = last_khugepaged_target_node + 1; nid < MAX_NUMNODES; | ||
2248 | nid++) | ||
2249 | if (max_value == khugepaged_node_load[nid]) { | ||
2250 | target_node = nid; | ||
2251 | break; | ||
2252 | } | ||
2253 | |||
2254 | last_khugepaged_target_node = target_node; | ||
2255 | return target_node; | ||
2256 | } | ||
2257 | |||
2143 | static bool khugepaged_prealloc_page(struct page **hpage, bool *wait) | 2258 | static bool khugepaged_prealloc_page(struct page **hpage, bool *wait) |
2144 | { | 2259 | { |
2145 | if (IS_ERR(*hpage)) { | 2260 | if (IS_ERR(*hpage)) { |
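
The hunk above introduces khugepaged_node_load[] and khugepaged_find_target_node(): during a scan, khugepaged counts how many of the scanned PTEs map a page from each NUMA node, then allocates the collapsed hugepage on the node with the highest count, rotating among tied nodes so no single node is always preferred. A minimal userspace sketch of the same selection logic (the array size, the hit values and the helper names are made up for the example):

    #include <stdio.h>

    #define MAX_NUMNODES 8
    #define NUMA_NO_NODE (-1)

    static int node_load[MAX_NUMNODES];           /* hits recorded during the PTE scan */
    static int last_target_node = NUMA_NO_NODE;   /* remembers the previous choice */

    /* Pick the node with the most hits; on ties, rotate past the last choice. */
    static int find_target_node(void)
    {
        int nid, target_node = 0, max_value = 0;

        for (nid = 0; nid < MAX_NUMNODES; nid++)
            if (node_load[nid] > max_value) {
                max_value = node_load[nid];
                target_node = nid;
            }

        /* balance between nodes that share the same maximum hit count */
        if (target_node <= last_target_node)
            for (nid = last_target_node + 1; nid < MAX_NUMNODES; nid++)
                if (node_load[nid] == max_value) {
                    target_node = nid;
                    break;
                }

        last_target_node = target_node;
        return target_node;
    }

    int main(void)
    {
        node_load[1] = 300;
        node_load[3] = 300;                   /* two nodes tie at 300 hits */
        printf("%d\n", find_target_node());   /* picks node 1 */
        printf("%d\n", find_target_node());   /* the next call rotates to node 3 */
        return 0;
    }
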
@@ -2173,9 +2288,8 @@ static struct page | |||
2173 | * mmap_sem in read mode is a good idea also to allow greater | 2288 | * mmap_sem in read mode is a good idea also to allow greater |
2174 | * scalability. | 2289 | * scalability. |
2175 | */ | 2290 | */ |
2176 | *hpage = alloc_hugepage_vma(khugepaged_defrag(), vma, address, | 2291 | *hpage = alloc_pages_exact_node(node, alloc_hugepage_gfpmask( |
2177 | node, __GFP_OTHER_NODE); | 2292 | khugepaged_defrag(), __GFP_OTHER_NODE), HPAGE_PMD_ORDER); |
2178 | |||
2179 | /* | 2293 | /* |
2180 | * After allocating the hugepage, release the mmap_sem read lock in | 2294 | * After allocating the hugepage, release the mmap_sem read lock in |
2181 | * preparation for taking it in write mode. | 2295 | * preparation for taking it in write mode. |
@@ -2191,6 +2305,17 @@ static struct page | |||
2191 | return *hpage; | 2305 | return *hpage; |
2192 | } | 2306 | } |
2193 | #else | 2307 | #else |
2308 | static int khugepaged_find_target_node(void) | ||
2309 | { | ||
2310 | return 0; | ||
2311 | } | ||
2312 | |||
2313 | static inline struct page *alloc_hugepage(int defrag) | ||
2314 | { | ||
2315 | return alloc_pages(alloc_hugepage_gfpmask(defrag, 0), | ||
2316 | HPAGE_PMD_ORDER); | ||
2317 | } | ||
2318 | |||
2194 | static struct page *khugepaged_alloc_hugepage(bool *wait) | 2319 | static struct page *khugepaged_alloc_hugepage(bool *wait) |
2195 | { | 2320 | { |
2196 | struct page *hpage; | 2321 | struct page *hpage; |
@@ -2257,7 +2382,7 @@ static void collapse_huge_page(struct mm_struct *mm, | |||
2257 | pte_t *pte; | 2382 | pte_t *pte; |
2258 | pgtable_t pgtable; | 2383 | pgtable_t pgtable; |
2259 | struct page *new_page; | 2384 | struct page *new_page; |
2260 | spinlock_t *ptl; | 2385 | spinlock_t *pmd_ptl, *pte_ptl; |
2261 | int isolated; | 2386 | int isolated; |
2262 | unsigned long hstart, hend; | 2387 | unsigned long hstart, hend; |
2263 | unsigned long mmun_start; /* For mmu_notifiers */ | 2388 | unsigned long mmun_start; /* For mmu_notifiers */ |
@@ -2300,12 +2425,12 @@ static void collapse_huge_page(struct mm_struct *mm, | |||
2300 | anon_vma_lock_write(vma->anon_vma); | 2425 | anon_vma_lock_write(vma->anon_vma); |
2301 | 2426 | ||
2302 | pte = pte_offset_map(pmd, address); | 2427 | pte = pte_offset_map(pmd, address); |
2303 | ptl = pte_lockptr(mm, pmd); | 2428 | pte_ptl = pte_lockptr(mm, pmd); |
2304 | 2429 | ||
2305 | mmun_start = address; | 2430 | mmun_start = address; |
2306 | mmun_end = address + HPAGE_PMD_SIZE; | 2431 | mmun_end = address + HPAGE_PMD_SIZE; |
2307 | mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); | 2432 | mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); |
2308 | spin_lock(&mm->page_table_lock); /* probably unnecessary */ | 2433 | pmd_ptl = pmd_lock(mm, pmd); /* probably unnecessary */ |
2309 | /* | 2434 | /* |
2310 | * After this gup_fast can't run anymore. This also removes | 2435 | * After this gup_fast can't run anymore. This also removes |
2311 | * any huge TLB entry from the CPU so we won't allow | 2436 | * any huge TLB entry from the CPU so we won't allow |
@@ -2313,16 +2438,16 @@ static void collapse_huge_page(struct mm_struct *mm, | |||
2313 | * to avoid the risk of CPU bugs in that area. | 2438 | * to avoid the risk of CPU bugs in that area. |
2314 | */ | 2439 | */ |
2315 | _pmd = pmdp_clear_flush(vma, address, pmd); | 2440 | _pmd = pmdp_clear_flush(vma, address, pmd); |
2316 | spin_unlock(&mm->page_table_lock); | 2441 | spin_unlock(pmd_ptl); |
2317 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); | 2442 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); |
2318 | 2443 | ||
2319 | spin_lock(ptl); | 2444 | spin_lock(pte_ptl); |
2320 | isolated = __collapse_huge_page_isolate(vma, address, pte); | 2445 | isolated = __collapse_huge_page_isolate(vma, address, pte); |
2321 | spin_unlock(ptl); | 2446 | spin_unlock(pte_ptl); |
2322 | 2447 | ||
2323 | if (unlikely(!isolated)) { | 2448 | if (unlikely(!isolated)) { |
2324 | pte_unmap(pte); | 2449 | pte_unmap(pte); |
2325 | spin_lock(&mm->page_table_lock); | 2450 | spin_lock(pmd_ptl); |
2326 | BUG_ON(!pmd_none(*pmd)); | 2451 | BUG_ON(!pmd_none(*pmd)); |
2327 | /* | 2452 | /* |
2328 | * We can only use set_pmd_at when establishing | 2453 | * We can only use set_pmd_at when establishing |
@@ -2330,7 +2455,7 @@ static void collapse_huge_page(struct mm_struct *mm, | |||
2330 | * points to regular pagetables. Use pmd_populate for that | 2455 | * points to regular pagetables. Use pmd_populate for that |
2331 | */ | 2456 | */ |
2332 | pmd_populate(mm, pmd, pmd_pgtable(_pmd)); | 2457 | pmd_populate(mm, pmd, pmd_pgtable(_pmd)); |
2333 | spin_unlock(&mm->page_table_lock); | 2458 | spin_unlock(pmd_ptl); |
2334 | anon_vma_unlock_write(vma->anon_vma); | 2459 | anon_vma_unlock_write(vma->anon_vma); |
2335 | goto out; | 2460 | goto out; |
2336 | } | 2461 | } |
@@ -2341,7 +2466,7 @@ static void collapse_huge_page(struct mm_struct *mm, | |||
2341 | */ | 2466 | */ |
2342 | anon_vma_unlock_write(vma->anon_vma); | 2467 | anon_vma_unlock_write(vma->anon_vma); |
2343 | 2468 | ||
2344 | __collapse_huge_page_copy(pte, new_page, vma, address, ptl); | 2469 | __collapse_huge_page_copy(pte, new_page, vma, address, pte_ptl); |
2345 | pte_unmap(pte); | 2470 | pte_unmap(pte); |
2346 | __SetPageUptodate(new_page); | 2471 | __SetPageUptodate(new_page); |
2347 | pgtable = pmd_pgtable(_pmd); | 2472 | pgtable = pmd_pgtable(_pmd); |
@@ -2356,13 +2481,13 @@ static void collapse_huge_page(struct mm_struct *mm, | |||
2356 | */ | 2481 | */ |
2357 | smp_wmb(); | 2482 | smp_wmb(); |
2358 | 2483 | ||
2359 | spin_lock(&mm->page_table_lock); | 2484 | spin_lock(pmd_ptl); |
2360 | BUG_ON(!pmd_none(*pmd)); | 2485 | BUG_ON(!pmd_none(*pmd)); |
2361 | page_add_new_anon_rmap(new_page, vma, address); | 2486 | page_add_new_anon_rmap(new_page, vma, address); |
2362 | pgtable_trans_huge_deposit(mm, pmd, pgtable); | 2487 | pgtable_trans_huge_deposit(mm, pmd, pgtable); |
2363 | set_pmd_at(mm, address, pmd, _pmd); | 2488 | set_pmd_at(mm, address, pmd, _pmd); |
2364 | update_mmu_cache_pmd(vma, address, pmd); | 2489 | update_mmu_cache_pmd(vma, address, pmd); |
2365 | spin_unlock(&mm->page_table_lock); | 2490 | spin_unlock(pmd_ptl); |
2366 | 2491 | ||
2367 | *hpage = NULL; | 2492 | *hpage = NULL; |
2368 | 2493 | ||
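
From here on the single mm->page_table_lock is replaced by two finer-grained locks: pmd_ptl from pmd_lock() guards the PMD entry while it is cleared and later repopulated, and pte_ptl from pte_lockptr() guards the PTE page while the small pages are isolated and copied. The sketch below is only a userspace analogue, with pthread mutexes, of why the split helps: threads touching different tables no longer contend on one process-wide lock. All names in it are invented for the illustration.

    #include <pthread.h>
    #include <stdio.h>

    /* Illustrative only: one lock per "page table" instead of one per "mm". */
    struct page_table {
        pthread_mutex_t ptl;   /* analogue of the split per-table spinlock */
        long entries[512];
    };

    static struct page_table tables[2] = {
        { PTHREAD_MUTEX_INITIALIZER, {0} },
        { PTHREAD_MUTEX_INITIALIZER, {0} },
    };

    static void *update(void *arg)
    {
        struct page_table *pt = arg;

        /* Each thread serializes only against users of the same table. */
        pthread_mutex_lock(&pt->ptl);
        for (int i = 0; i < 512; i++)
            pt->entries[i]++;
        pthread_mutex_unlock(&pt->ptl);
        return NULL;
    }

    int main(void)
    {
        pthread_t a, b;

        pthread_create(&a, NULL, update, &tables[0]);
        pthread_create(&b, NULL, update, &tables[1]);
        pthread_join(a, NULL);
        pthread_join(b, NULL);
        printf("%ld %ld\n", tables[0].entries[0], tables[1].entries[0]);
        return 0;
    }
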
@@ -2397,6 +2522,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, | |||
2397 | if (pmd_trans_huge(*pmd)) | 2522 | if (pmd_trans_huge(*pmd)) |
2398 | goto out; | 2523 | goto out; |
2399 | 2524 | ||
2525 | memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load)); | ||
2400 | pte = pte_offset_map_lock(mm, pmd, address, &ptl); | 2526 | pte = pte_offset_map_lock(mm, pmd, address, &ptl); |
2401 | for (_address = address, _pte = pte; _pte < pte+HPAGE_PMD_NR; | 2527 | for (_address = address, _pte = pte; _pte < pte+HPAGE_PMD_NR; |
2402 | _pte++, _address += PAGE_SIZE) { | 2528 | _pte++, _address += PAGE_SIZE) { |
@@ -2413,12 +2539,13 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, | |||
2413 | if (unlikely(!page)) | 2539 | if (unlikely(!page)) |
2414 | goto out_unmap; | 2540 | goto out_unmap; |
2415 | /* | 2541 | /* |
2416 | * Chose the node of the first page. This could | 2542 | * Record which node the original page is from and save this |
2417 | * be more sophisticated and look at more pages, | 2543 | * information to khugepaged_node_load[]. |
2418 | * but isn't for now. | 2544 | * Khugepaged will allocate a hugepage from the node that has |
2545 | * the max hit count. | ||
2419 | */ | 2546 | */ |
2420 | if (node == NUMA_NO_NODE) | 2547 | node = page_to_nid(page); |
2421 | node = page_to_nid(page); | 2548 | khugepaged_node_load[node]++; |
2422 | VM_BUG_ON(PageCompound(page)); | 2549 | VM_BUG_ON(PageCompound(page)); |
2423 | if (!PageLRU(page) || PageLocked(page) || !PageAnon(page)) | 2550 | if (!PageLRU(page) || PageLocked(page) || !PageAnon(page)) |
2424 | goto out_unmap; | 2551 | goto out_unmap; |
@@ -2433,9 +2560,11 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, | |||
2433 | ret = 1; | 2560 | ret = 1; |
2434 | out_unmap: | 2561 | out_unmap: |
2435 | pte_unmap_unlock(pte, ptl); | 2562 | pte_unmap_unlock(pte, ptl); |
2436 | if (ret) | 2563 | if (ret) { |
2564 | node = khugepaged_find_target_node(); | ||
2437 | /* collapse_huge_page will return with the mmap_sem released */ | 2565 | /* collapse_huge_page will return with the mmap_sem released */ |
2438 | collapse_huge_page(mm, address, hpage, vma, node); | 2566 | collapse_huge_page(mm, address, hpage, vma, node); |
2567 | } | ||
2439 | out: | 2568 | out: |
2440 | return ret; | 2569 | return ret; |
2441 | } | 2570 | } |
@@ -2687,6 +2816,7 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma, | |||
2687 | void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address, | 2816 | void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address, |
2688 | pmd_t *pmd) | 2817 | pmd_t *pmd) |
2689 | { | 2818 | { |
2819 | spinlock_t *ptl; | ||
2690 | struct page *page; | 2820 | struct page *page; |
2691 | struct mm_struct *mm = vma->vm_mm; | 2821 | struct mm_struct *mm = vma->vm_mm; |
2692 | unsigned long haddr = address & HPAGE_PMD_MASK; | 2822 | unsigned long haddr = address & HPAGE_PMD_MASK; |
@@ -2697,29 +2827,37 @@ void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address, | |||
2697 | 2827 | ||
2698 | mmun_start = haddr; | 2828 | mmun_start = haddr; |
2699 | mmun_end = haddr + HPAGE_PMD_SIZE; | 2829 | mmun_end = haddr + HPAGE_PMD_SIZE; |
2830 | again: | ||
2700 | mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); | 2831 | mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); |
2701 | spin_lock(&mm->page_table_lock); | 2832 | ptl = pmd_lock(mm, pmd); |
2702 | if (unlikely(!pmd_trans_huge(*pmd))) { | 2833 | if (unlikely(!pmd_trans_huge(*pmd))) { |
2703 | spin_unlock(&mm->page_table_lock); | 2834 | spin_unlock(ptl); |
2704 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); | 2835 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); |
2705 | return; | 2836 | return; |
2706 | } | 2837 | } |
2707 | if (is_huge_zero_pmd(*pmd)) { | 2838 | if (is_huge_zero_pmd(*pmd)) { |
2708 | __split_huge_zero_page_pmd(vma, haddr, pmd); | 2839 | __split_huge_zero_page_pmd(vma, haddr, pmd); |
2709 | spin_unlock(&mm->page_table_lock); | 2840 | spin_unlock(ptl); |
2710 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); | 2841 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); |
2711 | return; | 2842 | return; |
2712 | } | 2843 | } |
2713 | page = pmd_page(*pmd); | 2844 | page = pmd_page(*pmd); |
2714 | VM_BUG_ON(!page_count(page)); | 2845 | VM_BUG_ON(!page_count(page)); |
2715 | get_page(page); | 2846 | get_page(page); |
2716 | spin_unlock(&mm->page_table_lock); | 2847 | spin_unlock(ptl); |
2717 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); | 2848 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); |
2718 | 2849 | ||
2719 | split_huge_page(page); | 2850 | split_huge_page(page); |
2720 | 2851 | ||
2721 | put_page(page); | 2852 | put_page(page); |
2722 | BUG_ON(pmd_trans_huge(*pmd)); | 2853 | |
2854 | /* | ||
2855 | * We don't always have down_write of mmap_sem here: a racing | ||
2856 | * do_huge_pmd_wp_page() might have copied-on-write to another | ||
2857 | * huge page before our split_huge_page() got the anon_vma lock. | ||
2858 | */ | ||
2859 | if (unlikely(pmd_trans_huge(*pmd))) | ||
2860 | goto again; | ||
2723 | } | 2861 | } |
2724 | 2862 | ||
2725 | void split_huge_page_pmd_mm(struct mm_struct *mm, unsigned long address, | 2863 | void split_huge_page_pmd_mm(struct mm_struct *mm, unsigned long address, |
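
The BUG_ON(pmd_trans_huge(*pmd)) at the end of __split_huge_page_pmd() becomes a recheck-and-retry: without mmap_sem held for write, a racing do_huge_pmd_wp_page() can install a fresh huge PMD after split_huge_page() returns, so the code jumps back to the again: label instead of treating that state as impossible. A toy, single-process sketch of the same pattern; the simulated race is an assumption of the example:

    #include <stdbool.h>
    #include <stdio.h>

    static int races_left = 1;   /* pretend one racing write fault re-creates the huge mapping */
    static bool huge = true;

    /* Stands in for split_huge_page(): afterwards a racer may have remapped it huge. */
    static void split_once(void)
    {
        huge = false;
        if (races_left-- > 0)
            huge = true;         /* the simulated racing fault won this round */
    }

    int main(void)
    {
        int attempts = 0;

        do {
            split_once();
            attempts++;
        } while (huge);          /* recheck instead of asserting the state is impossible */

        printf("split settled after %d attempt(s)\n", attempts);   /* prints 2 */
        return 0;
    }
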
diff --git a/mm/hugetlb.c b/mm/hugetlb.c index b49579c7f2a5..7d57af21f49e 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c | |||
@@ -653,6 +653,7 @@ static void free_huge_page(struct page *page) | |||
653 | BUG_ON(page_count(page)); | 653 | BUG_ON(page_count(page)); |
654 | BUG_ON(page_mapcount(page)); | 654 | BUG_ON(page_mapcount(page)); |
655 | restore_reserve = PagePrivate(page); | 655 | restore_reserve = PagePrivate(page); |
656 | ClearPagePrivate(page); | ||
656 | 657 | ||
657 | spin_lock(&hugetlb_lock); | 658 | spin_lock(&hugetlb_lock); |
658 | hugetlb_cgroup_uncharge_page(hstate_index(h), | 659 | hugetlb_cgroup_uncharge_page(hstate_index(h), |
@@ -695,8 +696,22 @@ static void prep_compound_gigantic_page(struct page *page, unsigned long order) | |||
695 | /* we rely on prep_new_huge_page to set the destructor */ | 696 | /* we rely on prep_new_huge_page to set the destructor */ |
696 | set_compound_order(page, order); | 697 | set_compound_order(page, order); |
697 | __SetPageHead(page); | 698 | __SetPageHead(page); |
699 | __ClearPageReserved(page); | ||
698 | for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) { | 700 | for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) { |
699 | __SetPageTail(p); | 701 | __SetPageTail(p); |
702 | /* | ||
703 | * For gigantic hugepages allocated through bootmem at | ||
704 | * boot, it's safer to be consistent with the not-gigantic | ||
705 | * hugepages and clear the PG_reserved bit from all tail pages | ||
706 | * too. Otherwise drivers using get_user_pages() to access tail | ||
707 | * pages may get the reference counting wrong if they see | ||
708 | * PG_reserved set on a tail page (despite the head page not | ||
709 | * having PG_reserved set). Enforcing this consistency between | ||
710 | * head and tail pages allows drivers to optimize away a check | ||
711 | * on the head page when they need to know if put_page() is needed | ||
712 | * after get_user_pages(). | ||
713 | */ | ||
714 | __ClearPageReserved(p); | ||
700 | set_page_count(p, 0); | 715 | set_page_count(p, 0); |
701 | p->first_page = page; | 716 | p->first_page = page; |
702 | } | 717 | } |
@@ -1329,9 +1344,9 @@ static void __init gather_bootmem_prealloc(void) | |||
1329 | #else | 1344 | #else |
1330 | page = virt_to_page(m); | 1345 | page = virt_to_page(m); |
1331 | #endif | 1346 | #endif |
1332 | __ClearPageReserved(page); | ||
1333 | WARN_ON(page_count(page) != 1); | 1347 | WARN_ON(page_count(page) != 1); |
1334 | prep_compound_huge_page(page, h->order); | 1348 | prep_compound_huge_page(page, h->order); |
1349 | WARN_ON(PageReserved(page)); | ||
1335 | prep_new_huge_page(h, page, page_to_nid(page)); | 1350 | prep_new_huge_page(h, page, page_to_nid(page)); |
1336 | /* | 1351 | /* |
1337 | * If we had gigantic hugepages allocated at boot time, we need | 1352 | * If we had gigantic hugepages allocated at boot time, we need |
@@ -2361,6 +2376,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, | |||
2361 | cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; | 2376 | cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; |
2362 | 2377 | ||
2363 | for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) { | 2378 | for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) { |
2379 | spinlock_t *src_ptl, *dst_ptl; | ||
2364 | src_pte = huge_pte_offset(src, addr); | 2380 | src_pte = huge_pte_offset(src, addr); |
2365 | if (!src_pte) | 2381 | if (!src_pte) |
2366 | continue; | 2382 | continue; |
@@ -2372,8 +2388,9 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, | |||
2372 | if (dst_pte == src_pte) | 2388 | if (dst_pte == src_pte) |
2373 | continue; | 2389 | continue; |
2374 | 2390 | ||
2375 | spin_lock(&dst->page_table_lock); | 2391 | dst_ptl = huge_pte_lock(h, dst, dst_pte); |
2376 | spin_lock_nested(&src->page_table_lock, SINGLE_DEPTH_NESTING); | 2392 | src_ptl = huge_pte_lockptr(h, src, src_pte); |
2393 | spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); | ||
2377 | if (!huge_pte_none(huge_ptep_get(src_pte))) { | 2394 | if (!huge_pte_none(huge_ptep_get(src_pte))) { |
2378 | if (cow) | 2395 | if (cow) |
2379 | huge_ptep_set_wrprotect(src, addr, src_pte); | 2396 | huge_ptep_set_wrprotect(src, addr, src_pte); |
@@ -2383,8 +2400,8 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, | |||
2383 | page_dup_rmap(ptepage); | 2400 | page_dup_rmap(ptepage); |
2384 | set_huge_pte_at(dst, addr, dst_pte, entry); | 2401 | set_huge_pte_at(dst, addr, dst_pte, entry); |
2385 | } | 2402 | } |
2386 | spin_unlock(&src->page_table_lock); | 2403 | spin_unlock(src_ptl); |
2387 | spin_unlock(&dst->page_table_lock); | 2404 | spin_unlock(dst_ptl); |
2388 | } | 2405 | } |
2389 | return 0; | 2406 | return 0; |
2390 | 2407 | ||
@@ -2427,6 +2444,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, | |||
2427 | unsigned long address; | 2444 | unsigned long address; |
2428 | pte_t *ptep; | 2445 | pte_t *ptep; |
2429 | pte_t pte; | 2446 | pte_t pte; |
2447 | spinlock_t *ptl; | ||
2430 | struct page *page; | 2448 | struct page *page; |
2431 | struct hstate *h = hstate_vma(vma); | 2449 | struct hstate *h = hstate_vma(vma); |
2432 | unsigned long sz = huge_page_size(h); | 2450 | unsigned long sz = huge_page_size(h); |
@@ -2440,25 +2458,25 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, | |||
2440 | tlb_start_vma(tlb, vma); | 2458 | tlb_start_vma(tlb, vma); |
2441 | mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); | 2459 | mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); |
2442 | again: | 2460 | again: |
2443 | spin_lock(&mm->page_table_lock); | ||
2444 | for (address = start; address < end; address += sz) { | 2461 | for (address = start; address < end; address += sz) { |
2445 | ptep = huge_pte_offset(mm, address); | 2462 | ptep = huge_pte_offset(mm, address); |
2446 | if (!ptep) | 2463 | if (!ptep) |
2447 | continue; | 2464 | continue; |
2448 | 2465 | ||
2466 | ptl = huge_pte_lock(h, mm, ptep); | ||
2449 | if (huge_pmd_unshare(mm, &address, ptep)) | 2467 | if (huge_pmd_unshare(mm, &address, ptep)) |
2450 | continue; | 2468 | goto unlock; |
2451 | 2469 | ||
2452 | pte = huge_ptep_get(ptep); | 2470 | pte = huge_ptep_get(ptep); |
2453 | if (huge_pte_none(pte)) | 2471 | if (huge_pte_none(pte)) |
2454 | continue; | 2472 | goto unlock; |
2455 | 2473 | ||
2456 | /* | 2474 | /* |
2457 | * HWPoisoned hugepage is already unmapped and dropped reference | 2475 | * HWPoisoned hugepage is already unmapped and dropped reference |
2458 | */ | 2476 | */ |
2459 | if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) { | 2477 | if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) { |
2460 | huge_pte_clear(mm, address, ptep); | 2478 | huge_pte_clear(mm, address, ptep); |
2461 | continue; | 2479 | goto unlock; |
2462 | } | 2480 | } |
2463 | 2481 | ||
2464 | page = pte_page(pte); | 2482 | page = pte_page(pte); |
@@ -2469,7 +2487,7 @@ again: | |||
2469 | */ | 2487 | */ |
2470 | if (ref_page) { | 2488 | if (ref_page) { |
2471 | if (page != ref_page) | 2489 | if (page != ref_page) |
2472 | continue; | 2490 | goto unlock; |
2473 | 2491 | ||
2474 | /* | 2492 | /* |
2475 | * Mark the VMA as having unmapped its page so that | 2493 | * Mark the VMA as having unmapped its page so that |
@@ -2486,13 +2504,18 @@ again: | |||
2486 | 2504 | ||
2487 | page_remove_rmap(page); | 2505 | page_remove_rmap(page); |
2488 | force_flush = !__tlb_remove_page(tlb, page); | 2506 | force_flush = !__tlb_remove_page(tlb, page); |
2489 | if (force_flush) | 2507 | if (force_flush) { |
2508 | spin_unlock(ptl); | ||
2490 | break; | 2509 | break; |
2510 | } | ||
2491 | /* Bail out after unmapping reference page if supplied */ | 2511 | /* Bail out after unmapping reference page if supplied */ |
2492 | if (ref_page) | 2512 | if (ref_page) { |
2513 | spin_unlock(ptl); | ||
2493 | break; | 2514 | break; |
2515 | } | ||
2516 | unlock: | ||
2517 | spin_unlock(ptl); | ||
2494 | } | 2518 | } |
2495 | spin_unlock(&mm->page_table_lock); | ||
2496 | /* | 2519 | /* |
2497 | * mmu_gather ran out of room to batch pages, we break out of | 2520 | * mmu_gather ran out of room to batch pages, we break out of |
2498 | * the PTE lock to avoid doing the potential expensive TLB invalidate | 2521 | * the PTE lock to avoid doing the potential expensive TLB invalidate |
@@ -2598,7 +2621,7 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2598 | */ | 2621 | */ |
2599 | static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, | 2622 | static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, |
2600 | unsigned long address, pte_t *ptep, pte_t pte, | 2623 | unsigned long address, pte_t *ptep, pte_t pte, |
2601 | struct page *pagecache_page) | 2624 | struct page *pagecache_page, spinlock_t *ptl) |
2602 | { | 2625 | { |
2603 | struct hstate *h = hstate_vma(vma); | 2626 | struct hstate *h = hstate_vma(vma); |
2604 | struct page *old_page, *new_page; | 2627 | struct page *old_page, *new_page; |
@@ -2632,8 +2655,8 @@ retry_avoidcopy: | |||
2632 | 2655 | ||
2633 | page_cache_get(old_page); | 2656 | page_cache_get(old_page); |
2634 | 2657 | ||
2635 | /* Drop page_table_lock as buddy allocator may be called */ | 2658 | /* Drop page table lock as buddy allocator may be called */ |
2636 | spin_unlock(&mm->page_table_lock); | 2659 | spin_unlock(ptl); |
2637 | new_page = alloc_huge_page(vma, address, outside_reserve); | 2660 | new_page = alloc_huge_page(vma, address, outside_reserve); |
2638 | 2661 | ||
2639 | if (IS_ERR(new_page)) { | 2662 | if (IS_ERR(new_page)) { |
@@ -2651,13 +2674,13 @@ retry_avoidcopy: | |||
2651 | BUG_ON(huge_pte_none(pte)); | 2674 | BUG_ON(huge_pte_none(pte)); |
2652 | if (unmap_ref_private(mm, vma, old_page, address)) { | 2675 | if (unmap_ref_private(mm, vma, old_page, address)) { |
2653 | BUG_ON(huge_pte_none(pte)); | 2676 | BUG_ON(huge_pte_none(pte)); |
2654 | spin_lock(&mm->page_table_lock); | 2677 | spin_lock(ptl); |
2655 | ptep = huge_pte_offset(mm, address & huge_page_mask(h)); | 2678 | ptep = huge_pte_offset(mm, address & huge_page_mask(h)); |
2656 | if (likely(pte_same(huge_ptep_get(ptep), pte))) | 2679 | if (likely(pte_same(huge_ptep_get(ptep), pte))) |
2657 | goto retry_avoidcopy; | 2680 | goto retry_avoidcopy; |
2658 | /* | 2681 | /* |
2659 | * race occurs while re-acquiring page_table_lock, and | 2682 | * race occurs while re-acquiring page table |
2660 | * our job is done. | 2683 | * lock, and our job is done. |
2661 | */ | 2684 | */ |
2662 | return 0; | 2685 | return 0; |
2663 | } | 2686 | } |
@@ -2665,7 +2688,7 @@ retry_avoidcopy: | |||
2665 | } | 2688 | } |
2666 | 2689 | ||
2667 | /* Caller expects lock to be held */ | 2690 | /* Caller expects lock to be held */ |
2668 | spin_lock(&mm->page_table_lock); | 2691 | spin_lock(ptl); |
2669 | if (err == -ENOMEM) | 2692 | if (err == -ENOMEM) |
2670 | return VM_FAULT_OOM; | 2693 | return VM_FAULT_OOM; |
2671 | else | 2694 | else |
@@ -2680,7 +2703,7 @@ retry_avoidcopy: | |||
2680 | page_cache_release(new_page); | 2703 | page_cache_release(new_page); |
2681 | page_cache_release(old_page); | 2704 | page_cache_release(old_page); |
2682 | /* Caller expects lock to be held */ | 2705 | /* Caller expects lock to be held */ |
2683 | spin_lock(&mm->page_table_lock); | 2706 | spin_lock(ptl); |
2684 | return VM_FAULT_OOM; | 2707 | return VM_FAULT_OOM; |
2685 | } | 2708 | } |
2686 | 2709 | ||
@@ -2692,10 +2715,10 @@ retry_avoidcopy: | |||
2692 | mmun_end = mmun_start + huge_page_size(h); | 2715 | mmun_end = mmun_start + huge_page_size(h); |
2693 | mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); | 2716 | mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); |
2694 | /* | 2717 | /* |
2695 | * Retake the page_table_lock to check for racing updates | 2718 | * Retake the page table lock to check for racing updates |
2696 | * before the page tables are altered | 2719 | * before the page tables are altered |
2697 | */ | 2720 | */ |
2698 | spin_lock(&mm->page_table_lock); | 2721 | spin_lock(ptl); |
2699 | ptep = huge_pte_offset(mm, address & huge_page_mask(h)); | 2722 | ptep = huge_pte_offset(mm, address & huge_page_mask(h)); |
2700 | if (likely(pte_same(huge_ptep_get(ptep), pte))) { | 2723 | if (likely(pte_same(huge_ptep_get(ptep), pte))) { |
2701 | ClearPagePrivate(new_page); | 2724 | ClearPagePrivate(new_page); |
@@ -2709,13 +2732,13 @@ retry_avoidcopy: | |||
2709 | /* Make the old page be freed below */ | 2732 | /* Make the old page be freed below */ |
2710 | new_page = old_page; | 2733 | new_page = old_page; |
2711 | } | 2734 | } |
2712 | spin_unlock(&mm->page_table_lock); | 2735 | spin_unlock(ptl); |
2713 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); | 2736 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); |
2714 | page_cache_release(new_page); | 2737 | page_cache_release(new_page); |
2715 | page_cache_release(old_page); | 2738 | page_cache_release(old_page); |
2716 | 2739 | ||
2717 | /* Caller expects lock to be held */ | 2740 | /* Caller expects lock to be held */ |
2718 | spin_lock(&mm->page_table_lock); | 2741 | spin_lock(ptl); |
2719 | return 0; | 2742 | return 0; |
2720 | } | 2743 | } |
2721 | 2744 | ||
@@ -2763,6 +2786,7 @@ static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2763 | struct page *page; | 2786 | struct page *page; |
2764 | struct address_space *mapping; | 2787 | struct address_space *mapping; |
2765 | pte_t new_pte; | 2788 | pte_t new_pte; |
2789 | spinlock_t *ptl; | ||
2766 | 2790 | ||
2767 | /* | 2791 | /* |
2768 | * Currently, we are forced to kill the process in the event the | 2792 | * Currently, we are forced to kill the process in the event the |
@@ -2849,7 +2873,8 @@ retry: | |||
2849 | goto backout_unlocked; | 2873 | goto backout_unlocked; |
2850 | } | 2874 | } |
2851 | 2875 | ||
2852 | spin_lock(&mm->page_table_lock); | 2876 | ptl = huge_pte_lockptr(h, mm, ptep); |
2877 | spin_lock(ptl); | ||
2853 | size = i_size_read(mapping->host) >> huge_page_shift(h); | 2878 | size = i_size_read(mapping->host) >> huge_page_shift(h); |
2854 | if (idx >= size) | 2879 | if (idx >= size) |
2855 | goto backout; | 2880 | goto backout; |
@@ -2870,16 +2895,16 @@ retry: | |||
2870 | 2895 | ||
2871 | if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) { | 2896 | if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) { |
2872 | /* Optimization, do the COW without a second fault */ | 2897 | /* Optimization, do the COW without a second fault */ |
2873 | ret = hugetlb_cow(mm, vma, address, ptep, new_pte, page); | 2898 | ret = hugetlb_cow(mm, vma, address, ptep, new_pte, page, ptl); |
2874 | } | 2899 | } |
2875 | 2900 | ||
2876 | spin_unlock(&mm->page_table_lock); | 2901 | spin_unlock(ptl); |
2877 | unlock_page(page); | 2902 | unlock_page(page); |
2878 | out: | 2903 | out: |
2879 | return ret; | 2904 | return ret; |
2880 | 2905 | ||
2881 | backout: | 2906 | backout: |
2882 | spin_unlock(&mm->page_table_lock); | 2907 | spin_unlock(ptl); |
2883 | backout_unlocked: | 2908 | backout_unlocked: |
2884 | unlock_page(page); | 2909 | unlock_page(page); |
2885 | put_page(page); | 2910 | put_page(page); |
@@ -2891,6 +2916,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2891 | { | 2916 | { |
2892 | pte_t *ptep; | 2917 | pte_t *ptep; |
2893 | pte_t entry; | 2918 | pte_t entry; |
2919 | spinlock_t *ptl; | ||
2894 | int ret; | 2920 | int ret; |
2895 | struct page *page = NULL; | 2921 | struct page *page = NULL; |
2896 | struct page *pagecache_page = NULL; | 2922 | struct page *pagecache_page = NULL; |
@@ -2903,7 +2929,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2903 | if (ptep) { | 2929 | if (ptep) { |
2904 | entry = huge_ptep_get(ptep); | 2930 | entry = huge_ptep_get(ptep); |
2905 | if (unlikely(is_hugetlb_entry_migration(entry))) { | 2931 | if (unlikely(is_hugetlb_entry_migration(entry))) { |
2906 | migration_entry_wait_huge(mm, ptep); | 2932 | migration_entry_wait_huge(vma, mm, ptep); |
2907 | return 0; | 2933 | return 0; |
2908 | } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) | 2934 | } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) |
2909 | return VM_FAULT_HWPOISON_LARGE | | 2935 | return VM_FAULT_HWPOISON_LARGE | |
@@ -2959,17 +2985,18 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2959 | if (page != pagecache_page) | 2985 | if (page != pagecache_page) |
2960 | lock_page(page); | 2986 | lock_page(page); |
2961 | 2987 | ||
2962 | spin_lock(&mm->page_table_lock); | 2988 | ptl = huge_pte_lockptr(h, mm, ptep); |
2989 | spin_lock(ptl); | ||
2963 | /* Check for a racing update before calling hugetlb_cow */ | 2990 | /* Check for a racing update before calling hugetlb_cow */ |
2964 | if (unlikely(!pte_same(entry, huge_ptep_get(ptep)))) | 2991 | if (unlikely(!pte_same(entry, huge_ptep_get(ptep)))) |
2965 | goto out_page_table_lock; | 2992 | goto out_ptl; |
2966 | 2993 | ||
2967 | 2994 | ||
2968 | if (flags & FAULT_FLAG_WRITE) { | 2995 | if (flags & FAULT_FLAG_WRITE) { |
2969 | if (!huge_pte_write(entry)) { | 2996 | if (!huge_pte_write(entry)) { |
2970 | ret = hugetlb_cow(mm, vma, address, ptep, entry, | 2997 | ret = hugetlb_cow(mm, vma, address, ptep, entry, |
2971 | pagecache_page); | 2998 | pagecache_page, ptl); |
2972 | goto out_page_table_lock; | 2999 | goto out_ptl; |
2973 | } | 3000 | } |
2974 | entry = huge_pte_mkdirty(entry); | 3001 | entry = huge_pte_mkdirty(entry); |
2975 | } | 3002 | } |
@@ -2978,8 +3005,8 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2978 | flags & FAULT_FLAG_WRITE)) | 3005 | flags & FAULT_FLAG_WRITE)) |
2979 | update_mmu_cache(vma, address, ptep); | 3006 | update_mmu_cache(vma, address, ptep); |
2980 | 3007 | ||
2981 | out_page_table_lock: | 3008 | out_ptl: |
2982 | spin_unlock(&mm->page_table_lock); | 3009 | spin_unlock(ptl); |
2983 | 3010 | ||
2984 | if (pagecache_page) { | 3011 | if (pagecache_page) { |
2985 | unlock_page(pagecache_page); | 3012 | unlock_page(pagecache_page); |
@@ -3005,9 +3032,9 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
3005 | unsigned long remainder = *nr_pages; | 3032 | unsigned long remainder = *nr_pages; |
3006 | struct hstate *h = hstate_vma(vma); | 3033 | struct hstate *h = hstate_vma(vma); |
3007 | 3034 | ||
3008 | spin_lock(&mm->page_table_lock); | ||
3009 | while (vaddr < vma->vm_end && remainder) { | 3035 | while (vaddr < vma->vm_end && remainder) { |
3010 | pte_t *pte; | 3036 | pte_t *pte; |
3037 | spinlock_t *ptl = NULL; | ||
3011 | int absent; | 3038 | int absent; |
3012 | struct page *page; | 3039 | struct page *page; |
3013 | 3040 | ||
@@ -3015,8 +3042,12 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
3015 | * Some archs (sparc64, sh*) have multiple pte_ts to | 3042 | * Some archs (sparc64, sh*) have multiple pte_ts to |
3016 | * each hugepage. We have to make sure we get the | 3043 | * each hugepage. We have to make sure we get the |
3017 | * first, for the page indexing below to work. | 3044 | * first, for the page indexing below to work. |
3045 | * | ||
3046 | * Note that page table lock is not held when pte is null. | ||
3018 | */ | 3047 | */ |
3019 | pte = huge_pte_offset(mm, vaddr & huge_page_mask(h)); | 3048 | pte = huge_pte_offset(mm, vaddr & huge_page_mask(h)); |
3049 | if (pte) | ||
3050 | ptl = huge_pte_lock(h, mm, pte); | ||
3020 | absent = !pte || huge_pte_none(huge_ptep_get(pte)); | 3051 | absent = !pte || huge_pte_none(huge_ptep_get(pte)); |
3021 | 3052 | ||
3022 | /* | 3053 | /* |
@@ -3028,6 +3059,8 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
3028 | */ | 3059 | */ |
3029 | if (absent && (flags & FOLL_DUMP) && | 3060 | if (absent && (flags & FOLL_DUMP) && |
3030 | !hugetlbfs_pagecache_present(h, vma, vaddr)) { | 3061 | !hugetlbfs_pagecache_present(h, vma, vaddr)) { |
3062 | if (pte) | ||
3063 | spin_unlock(ptl); | ||
3031 | remainder = 0; | 3064 | remainder = 0; |
3032 | break; | 3065 | break; |
3033 | } | 3066 | } |
@@ -3047,10 +3080,10 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
3047 | !huge_pte_write(huge_ptep_get(pte)))) { | 3080 | !huge_pte_write(huge_ptep_get(pte)))) { |
3048 | int ret; | 3081 | int ret; |
3049 | 3082 | ||
3050 | spin_unlock(&mm->page_table_lock); | 3083 | if (pte) |
3084 | spin_unlock(ptl); | ||
3051 | ret = hugetlb_fault(mm, vma, vaddr, | 3085 | ret = hugetlb_fault(mm, vma, vaddr, |
3052 | (flags & FOLL_WRITE) ? FAULT_FLAG_WRITE : 0); | 3086 | (flags & FOLL_WRITE) ? FAULT_FLAG_WRITE : 0); |
3053 | spin_lock(&mm->page_table_lock); | ||
3054 | if (!(ret & VM_FAULT_ERROR)) | 3087 | if (!(ret & VM_FAULT_ERROR)) |
3055 | continue; | 3088 | continue; |
3056 | 3089 | ||
@@ -3081,8 +3114,8 @@ same_page: | |||
3081 | */ | 3114 | */ |
3082 | goto same_page; | 3115 | goto same_page; |
3083 | } | 3116 | } |
3117 | spin_unlock(ptl); | ||
3084 | } | 3118 | } |
3085 | spin_unlock(&mm->page_table_lock); | ||
3086 | *nr_pages = remainder; | 3119 | *nr_pages = remainder; |
3087 | *position = vaddr; | 3120 | *position = vaddr; |
3088 | 3121 | ||
@@ -3103,13 +3136,15 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, | |||
3103 | flush_cache_range(vma, address, end); | 3136 | flush_cache_range(vma, address, end); |
3104 | 3137 | ||
3105 | mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex); | 3138 | mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex); |
3106 | spin_lock(&mm->page_table_lock); | ||
3107 | for (; address < end; address += huge_page_size(h)) { | 3139 | for (; address < end; address += huge_page_size(h)) { |
3140 | spinlock_t *ptl; | ||
3108 | ptep = huge_pte_offset(mm, address); | 3141 | ptep = huge_pte_offset(mm, address); |
3109 | if (!ptep) | 3142 | if (!ptep) |
3110 | continue; | 3143 | continue; |
3144 | ptl = huge_pte_lock(h, mm, ptep); | ||
3111 | if (huge_pmd_unshare(mm, &address, ptep)) { | 3145 | if (huge_pmd_unshare(mm, &address, ptep)) { |
3112 | pages++; | 3146 | pages++; |
3147 | spin_unlock(ptl); | ||
3113 | continue; | 3148 | continue; |
3114 | } | 3149 | } |
3115 | if (!huge_pte_none(huge_ptep_get(ptep))) { | 3150 | if (!huge_pte_none(huge_ptep_get(ptep))) { |
@@ -3119,8 +3154,8 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, | |||
3119 | set_huge_pte_at(mm, address, ptep, pte); | 3154 | set_huge_pte_at(mm, address, ptep, pte); |
3120 | pages++; | 3155 | pages++; |
3121 | } | 3156 | } |
3157 | spin_unlock(ptl); | ||
3122 | } | 3158 | } |
3123 | spin_unlock(&mm->page_table_lock); | ||
3124 | /* | 3159 | /* |
3125 | * Must flush TLB before releasing i_mmap_mutex: x86's huge_pmd_unshare | 3160 | * Must flush TLB before releasing i_mmap_mutex: x86's huge_pmd_unshare |
3126 | * may have cleared our pud entry and done put_page on the page table: | 3161 | * may have cleared our pud entry and done put_page on the page table: |
@@ -3283,6 +3318,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) | |||
3283 | unsigned long saddr; | 3318 | unsigned long saddr; |
3284 | pte_t *spte = NULL; | 3319 | pte_t *spte = NULL; |
3285 | pte_t *pte; | 3320 | pte_t *pte; |
3321 | spinlock_t *ptl; | ||
3286 | 3322 | ||
3287 | if (!vma_shareable(vma, addr)) | 3323 | if (!vma_shareable(vma, addr)) |
3288 | return (pte_t *)pmd_alloc(mm, pud, addr); | 3324 | return (pte_t *)pmd_alloc(mm, pud, addr); |
@@ -3305,13 +3341,14 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) | |||
3305 | if (!spte) | 3341 | if (!spte) |
3306 | goto out; | 3342 | goto out; |
3307 | 3343 | ||
3308 | spin_lock(&mm->page_table_lock); | 3344 | ptl = huge_pte_lockptr(hstate_vma(vma), mm, spte); |
3345 | spin_lock(ptl); | ||
3309 | if (pud_none(*pud)) | 3346 | if (pud_none(*pud)) |
3310 | pud_populate(mm, pud, | 3347 | pud_populate(mm, pud, |
3311 | (pmd_t *)((unsigned long)spte & PAGE_MASK)); | 3348 | (pmd_t *)((unsigned long)spte & PAGE_MASK)); |
3312 | else | 3349 | else |
3313 | put_page(virt_to_page(spte)); | 3350 | put_page(virt_to_page(spte)); |
3314 | spin_unlock(&mm->page_table_lock); | 3351 | spin_unlock(ptl); |
3315 | out: | 3352 | out: |
3316 | pte = (pte_t *)pmd_alloc(mm, pud, addr); | 3353 | pte = (pte_t *)pmd_alloc(mm, pud, addr); |
3317 | mutex_unlock(&mapping->i_mmap_mutex); | 3354 | mutex_unlock(&mapping->i_mmap_mutex); |
@@ -3325,7 +3362,7 @@ out: | |||
3325 | * indicated by page_count > 1, unmap is achieved by clearing pud and | 3362 | * indicated by page_count > 1, unmap is achieved by clearing pud and |
3326 | * decrementing the ref count. If count == 1, the pte page is not shared. | 3363 | * decrementing the ref count. If count == 1, the pte page is not shared. |
3327 | * | 3364 | * |
3328 | * called with vma->vm_mm->page_table_lock held. | 3365 | * called with page table lock held. |
3329 | * | 3366 | * |
3330 | * returns: 1 successfully unmapped a shared pte page | 3367 | * returns: 1 successfully unmapped a shared pte page |
3331 | * 0 the underlying pte page is not shared, or it is the last user | 3368 | * 0 the underlying pte page is not shared, or it is the last user |
diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c index afc2daa91c60..4c84678371eb 100644 --- a/mm/hwpoison-inject.c +++ b/mm/hwpoison-inject.c | |||
@@ -20,8 +20,6 @@ static int hwpoison_inject(void *data, u64 val) | |||
20 | if (!capable(CAP_SYS_ADMIN)) | 20 | if (!capable(CAP_SYS_ADMIN)) |
21 | return -EPERM; | 21 | return -EPERM; |
22 | 22 | ||
23 | if (!hwpoison_filter_enable) | ||
24 | goto inject; | ||
25 | if (!pfn_valid(pfn)) | 23 | if (!pfn_valid(pfn)) |
26 | return -ENXIO; | 24 | return -ENXIO; |
27 | 25 | ||
@@ -33,6 +31,9 @@ static int hwpoison_inject(void *data, u64 val) | |||
33 | if (!get_page_unless_zero(hpage)) | 31 | if (!get_page_unless_zero(hpage)) |
34 | return 0; | 32 | return 0; |
35 | 33 | ||
34 | if (!hwpoison_filter_enable) | ||
35 | goto inject; | ||
36 | |||
36 | if (!PageLRU(p) && !PageHuge(p)) | 37 | if (!PageLRU(p) && !PageHuge(p)) |
37 | shake_page(p, 0); | 38 | shake_page(p, 0); |
38 | /* | 39 | /* |
diff --git a/mm/kmemleak.c b/mm/kmemleak.c index e126b0ef9ad2..31f01c5011e5 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c | |||
@@ -753,7 +753,9 @@ static void add_scan_area(unsigned long ptr, size_t size, gfp_t gfp) | |||
753 | } | 753 | } |
754 | 754 | ||
755 | spin_lock_irqsave(&object->lock, flags); | 755 | spin_lock_irqsave(&object->lock, flags); |
756 | if (ptr + size > object->pointer + object->size) { | 756 | if (size == SIZE_MAX) { |
757 | size = object->pointer + object->size - ptr; | ||
758 | } else if (ptr + size > object->pointer + object->size) { | ||
757 | kmemleak_warn("Scan area larger than object 0x%08lx\n", ptr); | 759 | kmemleak_warn("Scan area larger than object 0x%08lx\n", ptr); |
758 | dump_object_info(object); | 760 | dump_object_info(object); |
759 | kmem_cache_free(scan_area_cache, area); | 761 | kmem_cache_free(scan_area_cache, area); |
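
With this kmemleak change, a scan-area size of SIZE_MAX now means "scan from ptr to the end of the enclosing object", and the size is clamped instead of triggering the "Scan area larger than object" warning. The clamping arithmetic in isolation, with illustrative parameter names:

    #include <stdint.h>
    #include <stdio.h>

    /* Clamp a requested scan area to the enclosing object's bounds. */
    static size_t scan_area_size(uintptr_t object_start, size_t object_size,
                                 uintptr_t ptr, size_t size)
    {
        if (size == SIZE_MAX)                        /* "rest of the object" */
            return object_start + object_size - ptr;
        if (ptr + size > object_start + object_size)
            return 0;                                /* the caller would warn here */
        return size;
    }

    int main(void)
    {
        /* 64-byte object at 0x1000; ask for "everything" from offset 16 */
        printf("%zu\n", scan_area_size(0x1000, 64, 0x1010, SIZE_MAX));   /* 48 */
        return 0;
    }
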
diff --git a/mm/ksm.c b/mm/ksm.c --- a/mm/ksm.c +++ b/mm/ksm.c | |||
@@ -2309,8 +2309,8 @@ static ssize_t merge_across_nodes_store(struct kobject *kobj, | ||
2309 | * Allocate stable and unstable together: | 2309 | * Allocate stable and unstable together: |
2310 | * MAXSMP NODES_SHIFT 10 will use 16kB. | 2310 | * MAXSMP NODES_SHIFT 10 will use 16kB. |
2311 | */ | 2311 | */ |
2312 | buf = kcalloc(nr_node_ids + nr_node_ids, | 2312 | buf = kcalloc(nr_node_ids + nr_node_ids, sizeof(*buf), |
2313 | sizeof(*buf), GFP_KERNEL | __GFP_ZERO); | 2313 | GFP_KERNEL); |
2314 | /* Let us assume that RB_ROOT is NULL is zero */ | 2314 | /* Let us assume that RB_ROOT is NULL is zero */ |
2315 | if (!buf) | 2315 | if (!buf) |
2316 | err = -ENOMEM; | 2316 | err = -ENOMEM; |
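
The ksm.c hunk drops __GFP_ZERO from the kcalloc() call because kcalloc() already returns zeroed memory, so the flag was redundant rather than wrong. The userspace equivalent of that guarantee is calloc():

    #include <stdio.h>
    #include <stdlib.h>

    int main(void)
    {
        /* calloc(), like kcalloc(), hands back zero-filled memory. */
        unsigned int *buf = calloc(2 * 4, sizeof(*buf));

        if (!buf)
            return 1;
        printf("%u\n", buf[0]);   /* prints 0 without any explicit memset */
        free(buf);
        return 0;
    }
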
diff --git a/mm/list_lru.c b/mm/list_lru.c index 72467914b856..72f9decb0104 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c | |||
@@ -81,8 +81,9 @@ restart: | |||
81 | * decrement nr_to_walk first so that we don't livelock if we | 81 | * decrement nr_to_walk first so that we don't livelock if we |
82 | * get stuck on large numbers of LRU_RETRY items | 82 | * get stuck on large numbers of LRU_RETRY items |
83 | */ | 83 | */ |
84 | if (--(*nr_to_walk) == 0) | 84 | if (!*nr_to_walk) |
85 | break; | 85 | break; |
86 | --*nr_to_walk; | ||
86 | 87 | ||
87 | ret = isolate(item, &nlru->lock, cb_arg); | 88 | ret = isolate(item, &nlru->lock, cb_arg); |
88 | switch (ret) { | 89 | switch (ret) { |
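
The list_lru walk previously decremented *nr_to_walk before testing it, so a caller asking for N items had the loop break after N-1 isolate() calls; testing first and decrementing afterwards walks exactly N. A toy counter-only sketch of the two orderings (the real loop also stops when the list runs out):

    #include <stdio.h>

    /* Old ordering: decrement, then test -> walks one item too few. */
    static unsigned long walk_old(unsigned long nr_to_walk)
    {
        unsigned long walked = 0;

        for (;;) {
            if (--nr_to_walk == 0)
                break;
            walked++;            /* stands in for the isolate() call */
        }
        return walked;
    }

    /* New ordering: test, then decrement -> walks exactly nr_to_walk items. */
    static unsigned long walk_new(unsigned long nr_to_walk)
    {
        unsigned long walked = 0;

        for (;;) {
            if (!nr_to_walk)
                break;
            nr_to_walk--;
            walked++;
        }
        return walked;
    }

    int main(void)
    {
        printf("old: %lu, new: %lu\n", walk_old(3), walk_new(3));   /* old: 2, new: 3 */
        return 0;
    }
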
diff --git a/mm/madvise.c b/mm/madvise.c index 6975bc812542..539eeb96b323 100644 --- a/mm/madvise.c +++ b/mm/madvise.c | |||
@@ -343,10 +343,11 @@ static long madvise_remove(struct vm_area_struct *vma, | |||
343 | */ | 343 | */ |
344 | static int madvise_hwpoison(int bhv, unsigned long start, unsigned long end) | 344 | static int madvise_hwpoison(int bhv, unsigned long start, unsigned long end) |
345 | { | 345 | { |
346 | struct page *p; | ||
346 | if (!capable(CAP_SYS_ADMIN)) | 347 | if (!capable(CAP_SYS_ADMIN)) |
347 | return -EPERM; | 348 | return -EPERM; |
348 | for (; start < end; start += PAGE_SIZE) { | 349 | for (; start < end; start += PAGE_SIZE << |
349 | struct page *p; | 350 | compound_order(compound_head(p))) { |
350 | int ret; | 351 | int ret; |
351 | 352 | ||
352 | ret = get_user_pages_fast(start, 1, 0, &p); | 353 | ret = get_user_pages_fast(start, 1, 0, &p); |
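
madvise_hwpoison() now advances by the size of the compound page it just pinned (PAGE_SIZE << compound_order(compound_head(p))) rather than a fixed PAGE_SIZE, so a huge page gets a single injection instead of one per base page. The stepping arithmetic on its own, with assumed page size and orders:

    #include <stdio.h>

    #define PAGE_SIZE 4096UL

    /* Step through [start, end) by whole (possibly compound) pages. */
    static unsigned long count_injections(unsigned long start, unsigned long end,
                                          unsigned int order)
    {
        unsigned long injections = 0;

        for (; start < end; start += PAGE_SIZE << order)
            injections++;        /* one poison injection per head page */

        return injections;
    }

    int main(void)
    {
        /* 4MB range: 1024 base pages, but only 2 order-9 (2MB) huge pages */
        printf("%lu\n", count_injections(0, 4UL << 20, 0));   /* 1024 */
        printf("%lu\n", count_injections(0, 4UL << 20, 9));   /* 2 */
        return 0;
    }
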
diff --git a/mm/memblock.c b/mm/memblock.c index 0ac412a0a7ee..53e477bb5558 100644 --- a/mm/memblock.c +++ b/mm/memblock.c | |||
@@ -20,6 +20,8 @@ | |||
20 | #include <linux/seq_file.h> | 20 | #include <linux/seq_file.h> |
21 | #include <linux/memblock.h> | 21 | #include <linux/memblock.h> |
22 | 22 | ||
23 | #include <asm-generic/sections.h> | ||
24 | |||
23 | static struct memblock_region memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock; | 25 | static struct memblock_region memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock; |
24 | static struct memblock_region memblock_reserved_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock; | 26 | static struct memblock_region memblock_reserved_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock; |
25 | 27 | ||
@@ -32,6 +34,7 @@ struct memblock memblock __initdata_memblock = { | |||
32 | .reserved.cnt = 1, /* empty dummy entry */ | 34 | .reserved.cnt = 1, /* empty dummy entry */ |
33 | .reserved.max = INIT_MEMBLOCK_REGIONS, | 35 | .reserved.max = INIT_MEMBLOCK_REGIONS, |
34 | 36 | ||
37 | .bottom_up = false, | ||
35 | .current_limit = MEMBLOCK_ALLOC_ANYWHERE, | 38 | .current_limit = MEMBLOCK_ALLOC_ANYWHERE, |
36 | }; | 39 | }; |
37 | 40 | ||
@@ -82,6 +85,73 @@ static long __init_memblock memblock_overlaps_region(struct memblock_type *type, | |||
82 | return (i < type->cnt) ? i : -1; | 85 | return (i < type->cnt) ? i : -1; |
83 | } | 86 | } |
84 | 87 | ||
88 | /** | ||
89 | * __memblock_find_range_bottom_up - find free area utility in bottom-up | ||
90 | * @start: start of candidate range | ||
91 | * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE} | ||
92 | * @size: size of free area to find | ||
93 | * @align: alignment of free area to find | ||
94 | * @nid: nid of the free area to find, %MAX_NUMNODES for any node | ||
95 | * | ||
96 | * Utility called from memblock_find_in_range_node(), find free area bottom-up. | ||
97 | * | ||
98 | * RETURNS: | ||
99 | * Found address on success, 0 on failure. | ||
100 | */ | ||
101 | static phys_addr_t __init_memblock | ||
102 | __memblock_find_range_bottom_up(phys_addr_t start, phys_addr_t end, | ||
103 | phys_addr_t size, phys_addr_t align, int nid) | ||
104 | { | ||
105 | phys_addr_t this_start, this_end, cand; | ||
106 | u64 i; | ||
107 | |||
108 | for_each_free_mem_range(i, nid, &this_start, &this_end, NULL) { | ||
109 | this_start = clamp(this_start, start, end); | ||
110 | this_end = clamp(this_end, start, end); | ||
111 | |||
112 | cand = round_up(this_start, align); | ||
113 | if (cand < this_end && this_end - cand >= size) | ||
114 | return cand; | ||
115 | } | ||
116 | |||
117 | return 0; | ||
118 | } | ||
119 | |||
120 | /** | ||
121 | * __memblock_find_range_top_down - find free area utility, in top-down | ||
122 | * @start: start of candidate range | ||
123 | * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE} | ||
124 | * @size: size of free area to find | ||
125 | * @align: alignment of free area to find | ||
126 | * @nid: nid of the free area to find, %MAX_NUMNODES for any node | ||
127 | * | ||
128 | * Utility called from memblock_find_in_range_node(), find free area top-down. | ||
129 | * | ||
130 | * RETURNS: | ||
131 | * Found address on success, 0 on failure. | ||
132 | */ | ||
133 | static phys_addr_t __init_memblock | ||
134 | __memblock_find_range_top_down(phys_addr_t start, phys_addr_t end, | ||
135 | phys_addr_t size, phys_addr_t align, int nid) | ||
136 | { | ||
137 | phys_addr_t this_start, this_end, cand; | ||
138 | u64 i; | ||
139 | |||
140 | for_each_free_mem_range_reverse(i, nid, &this_start, &this_end, NULL) { | ||
141 | this_start = clamp(this_start, start, end); | ||
142 | this_end = clamp(this_end, start, end); | ||
143 | |||
144 | if (this_end < size) | ||
145 | continue; | ||
146 | |||
147 | cand = round_down(this_end - size, align); | ||
148 | if (cand >= this_start) | ||
149 | return cand; | ||
150 | } | ||
151 | |||
152 | return 0; | ||
153 | } | ||
154 | |||
85 | /** | 155 | /** |
86 | * memblock_find_in_range_node - find free area in given range and node | 156 | * memblock_find_in_range_node - find free area in given range and node |
87 | * @start: start of candidate range | 157 | * @start: start of candidate range |
@@ -92,15 +162,23 @@ static long __init_memblock memblock_overlaps_region(struct memblock_type *type, | |||
92 | * | 162 | * |
93 | * Find @size free area aligned to @align in the specified range and node. | 163 | * Find @size free area aligned to @align in the specified range and node. |
94 | * | 164 | * |
165 | * When allocation direction is bottom-up, the @start should be greater | ||
166 | * than the end of the kernel image. Otherwise, it will be trimmed. The | ||
167 | * reason is that we want the bottom-up allocation just near the kernel | ||
168 | * image so it is highly likely that the allocated memory and the kernel | ||
169 | * will reside in the same node. | ||
170 | * | ||
171 | * If bottom-up allocation failed, will try to allocate memory top-down. | ||
172 | * | ||
95 | * RETURNS: | 173 | * RETURNS: |
96 | * Found address on success, %0 on failure. | 174 | * Found address on success, 0 on failure. |
97 | */ | 175 | */ |
98 | phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t start, | 176 | phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t start, |
99 | phys_addr_t end, phys_addr_t size, | 177 | phys_addr_t end, phys_addr_t size, |
100 | phys_addr_t align, int nid) | 178 | phys_addr_t align, int nid) |
101 | { | 179 | { |
102 | phys_addr_t this_start, this_end, cand; | 180 | int ret; |
103 | u64 i; | 181 | phys_addr_t kernel_end; |
104 | 182 | ||
105 | /* pump up @end */ | 183 | /* pump up @end */ |
106 | if (end == MEMBLOCK_ALLOC_ACCESSIBLE) | 184 | if (end == MEMBLOCK_ALLOC_ACCESSIBLE) |
@@ -109,19 +187,39 @@ phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t start, | |||
109 | /* avoid allocating the first page */ | 187 | /* avoid allocating the first page */ |
110 | start = max_t(phys_addr_t, start, PAGE_SIZE); | 188 | start = max_t(phys_addr_t, start, PAGE_SIZE); |
111 | end = max(start, end); | 189 | end = max(start, end); |
190 | kernel_end = __pa_symbol(_end); | ||
112 | 191 | ||
113 | for_each_free_mem_range_reverse(i, nid, &this_start, &this_end, NULL) { | 192 | /* |
114 | this_start = clamp(this_start, start, end); | 193 | * try bottom-up allocation only when bottom-up mode |
115 | this_end = clamp(this_end, start, end); | 194 | * is set and @end is above the kernel image. |
195 | */ | ||
196 | if (memblock_bottom_up() && end > kernel_end) { | ||
197 | phys_addr_t bottom_up_start; | ||
116 | 198 | ||
117 | if (this_end < size) | 199 | /* make sure we will allocate above the kernel */ |
118 | continue; | 200 | bottom_up_start = max(start, kernel_end); |
119 | 201 | ||
120 | cand = round_down(this_end - size, align); | 202 | /* ok, try bottom-up allocation first */ |
121 | if (cand >= this_start) | 203 | ret = __memblock_find_range_bottom_up(bottom_up_start, end, |
122 | return cand; | 204 | size, align, nid); |
205 | if (ret) | ||
206 | return ret; | ||
207 | |||
208 | /* | ||
209 | * we always limit bottom-up allocation above the kernel, | ||
210 | * but top-down allocation doesn't have the limit, so | ||
211 | * retrying top-down allocation may succeed when bottom-up | ||
212 | * allocation failed. | ||
213 | * | ||
214 | * bottom-up allocation is expected to fail very rarely, | ||
215 | * so we use WARN_ONCE() here to see the stack trace if | ||
216 | * that ever happens. | ||
217 | */ | ||
218 | WARN_ONCE(1, "memblock: bottom-up allocation failed, " | ||
219 | "memory hotunplug may be affected\n"); | ||
123 | } | 220 | } |
124 | return 0; | 221 | |
222 | return __memblock_find_range_top_down(start, end, size, align, nid); | ||
125 | } | 223 | } |
126 | 224 | ||
127 | /** | 225 | /** |
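
memblock_find_in_range_node() now tries a bottom-up search first, but only above the kernel image (so early allocations stay close to the kernel's node and hot-pluggable ranges higher up remain untouched), and falls back to the traditional top-down search when that fails. A simplified, runnable model of the two searches and the fallback; the free-region table, kernel end and sizes below are invented for the example, whereas the real code iterates memblock's own region arrays:

    #include <stdio.h>

    typedef unsigned long long phys_addr_t;

    struct region { phys_addr_t start, end; };

    /* Pretend free memory: [16MB,64MB) and [1GB,2GB); the kernel ends at 32MB. */
    static const struct region free_mem[] = {
        { 16ULL << 20, 64ULL << 20 },
        {  1ULL << 30,  2ULL << 30 },
    };
    #define NR_REGIONS (sizeof(free_mem) / sizeof(free_mem[0]))

    static phys_addr_t round_up_to(phys_addr_t x, phys_addr_t a)   { return (x + a - 1) / a * a; }
    static phys_addr_t round_down_to(phys_addr_t x, phys_addr_t a) { return x / a * a; }
    static phys_addr_t max_pa(phys_addr_t a, phys_addr_t b) { return a > b ? a : b; }
    static phys_addr_t min_pa(phys_addr_t a, phys_addr_t b) { return a < b ? a : b; }

    static phys_addr_t find_bottom_up(phys_addr_t start, phys_addr_t end,
                                      phys_addr_t size, phys_addr_t align)
    {
        for (unsigned int i = 0; i < NR_REGIONS; i++) {
            phys_addr_t lo = max_pa(free_mem[i].start, start);
            phys_addr_t hi = min_pa(free_mem[i].end, end);
            phys_addr_t cand = round_up_to(lo, align);

            if (cand < hi && hi - cand >= size)
                return cand;
        }
        return 0;
    }

    static phys_addr_t find_top_down(phys_addr_t start, phys_addr_t end,
                                     phys_addr_t size, phys_addr_t align)
    {
        for (int i = NR_REGIONS - 1; i >= 0; i--) {
            phys_addr_t lo = max_pa(free_mem[i].start, start);
            phys_addr_t hi = min_pa(free_mem[i].end, end);
            phys_addr_t cand;

            if (hi < size)
                continue;
            cand = round_down_to(hi - size, align);
            if (cand >= lo)
                return cand;
        }
        return 0;
    }

    static phys_addr_t find_in_range(phys_addr_t start, phys_addr_t end,
                                     phys_addr_t size, phys_addr_t align,
                                     int bottom_up, phys_addr_t kernel_end)
    {
        if (bottom_up && end > kernel_end) {
            /* never hand out bottom-up memory below the kernel image */
            phys_addr_t ret = find_bottom_up(max_pa(start, kernel_end),
                                             end, size, align);
            if (ret)
                return ret;
            /* fall back to the classic top-down search */
        }
        return find_top_down(start, end, size, align);
    }

    int main(void)
    {
        phys_addr_t kernel_end = 32ULL << 20;

        printf("top-down : %#llx\n",
               find_in_range(0, ~0ULL, 1 << 20, 1 << 20, 0, kernel_end));
        printf("bottom-up: %#llx\n",
               find_in_range(0, ~0ULL, 1 << 20, 1 << 20, 1, kernel_end));
        return 0;
    }

Compiled as C99, this prints 0x7ff00000 for top-down and 0x2000000 for bottom-up, matching the intent that bottom-up results land just above the kernel image.
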
@@ -134,7 +232,7 @@ phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t start, | |||
134 | * Find @size free area aligned to @align in the specified range. | 232 | * Find @size free area aligned to @align in the specified range. |
135 | * | 233 | * |
136 | * RETURNS: | 234 | * RETURNS: |
137 | * Found address on success, %0 on failure. | 235 | * Found address on success, 0 on failure. |
138 | */ | 236 | */ |
139 | phys_addr_t __init_memblock memblock_find_in_range(phys_addr_t start, | 237 | phys_addr_t __init_memblock memblock_find_in_range(phys_addr_t start, |
140 | phys_addr_t end, phys_addr_t size, | 238 | phys_addr_t end, phys_addr_t size, |
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index d5ff3ce13029..f1a0ae6e11b8 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c | |||
@@ -39,6 +39,7 @@ | |||
39 | #include <linux/limits.h> | 39 | #include <linux/limits.h> |
40 | #include <linux/export.h> | 40 | #include <linux/export.h> |
41 | #include <linux/mutex.h> | 41 | #include <linux/mutex.h> |
42 | #include <linux/rbtree.h> | ||
42 | #include <linux/slab.h> | 43 | #include <linux/slab.h> |
43 | #include <linux/swap.h> | 44 | #include <linux/swap.h> |
44 | #include <linux/swapops.h> | 45 | #include <linux/swapops.h> |
@@ -53,10 +54,12 @@ | |||
53 | #include <linux/page_cgroup.h> | 54 | #include <linux/page_cgroup.h> |
54 | #include <linux/cpu.h> | 55 | #include <linux/cpu.h> |
55 | #include <linux/oom.h> | 56 | #include <linux/oom.h> |
57 | #include <linux/lockdep.h> | ||
56 | #include "internal.h" | 58 | #include "internal.h" |
57 | #include <net/sock.h> | 59 | #include <net/sock.h> |
58 | #include <net/ip.h> | 60 | #include <net/ip.h> |
59 | #include <net/tcp_memcontrol.h> | 61 | #include <net/tcp_memcontrol.h> |
62 | #include "slab.h" | ||
60 | 63 | ||
61 | #include <asm/uaccess.h> | 64 | #include <asm/uaccess.h> |
62 | 65 | ||
@@ -160,6 +163,10 @@ struct mem_cgroup_per_zone { | |||
160 | 163 | ||
161 | struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1]; | 164 | struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1]; |
162 | 165 | ||
166 | struct rb_node tree_node; /* RB tree node */ | ||
167 | unsigned long long usage_in_excess;/* Set to the value by which */ | ||
168 | /* the soft limit is exceeded*/ | ||
169 | bool on_tree; | ||
163 | struct mem_cgroup *memcg; /* Back pointer, we cannot */ | 170 | struct mem_cgroup *memcg; /* Back pointer, we cannot */ |
164 | /* use container_of */ | 171 | /* use container_of */ |
165 | }; | 172 | }; |
@@ -168,6 +175,26 @@ struct mem_cgroup_per_node { | |||
168 | struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES]; | 175 | struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES]; |
169 | }; | 176 | }; |
170 | 177 | ||
178 | /* | ||
179 | * Cgroups above their limits are maintained in a RB-Tree, independent of | ||
180 | * their hierarchy representation | ||
181 | */ | ||
182 | |||
183 | struct mem_cgroup_tree_per_zone { | ||
184 | struct rb_root rb_root; | ||
185 | spinlock_t lock; | ||
186 | }; | ||
187 | |||
188 | struct mem_cgroup_tree_per_node { | ||
189 | struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES]; | ||
190 | }; | ||
191 | |||
192 | struct mem_cgroup_tree { | ||
193 | struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES]; | ||
194 | }; | ||
195 | |||
196 | static struct mem_cgroup_tree soft_limit_tree __read_mostly; | ||
197 | |||
171 | struct mem_cgroup_threshold { | 198 | struct mem_cgroup_threshold { |
172 | struct eventfd_ctx *eventfd; | 199 | struct eventfd_ctx *eventfd; |
173 | u64 threshold; | 200 | u64 threshold; |
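
This hunk brings back a per-node, per-zone RB-tree of memcgs that exceed their soft limit, keyed by usage_in_excess, so soft-limit reclaim can find the biggest offender without walking the whole hierarchy. The key is simply how far usage is above the soft limit; the toy below computes it and picks the worst group with a linear scan, where the kernel keeps an RB-tree per zone (all numbers and names are made up):

    #include <stdio.h>

    struct memcg_stub {
        const char *name;
        unsigned long long usage;
        unsigned long long soft_limit;
    };

    /* usage_in_excess: by how much the group is over its soft limit (0 if under) */
    static unsigned long long excess(const struct memcg_stub *m)
    {
        return m->usage > m->soft_limit ? m->usage - m->soft_limit : 0;
    }

    int main(void)
    {
        struct memcg_stub groups[] = {
            { "a", 100ULL << 20, 200ULL << 20 },   /* under its soft limit */
            { "b", 500ULL << 20, 200ULL << 20 },   /* 300MB over */
            { "c", 250ULL << 20, 200ULL << 20 },   /* 50MB over */
        };
        const struct memcg_stub *victim = NULL;

        /* the kernel only queues groups whose excess is non-zero */
        for (unsigned int i = 0; i < 3; i++)
            if (!victim || excess(&groups[i]) > excess(victim))
                victim = &groups[i];

        printf("reclaim from %s (%llu bytes over)\n", victim->name, excess(victim));
        return 0;
    }
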
@@ -286,7 +313,7 @@ struct mem_cgroup { | |||
286 | 313 | ||
287 | atomic_t dead_count; | 314 | atomic_t dead_count; |
288 | #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET) | 315 | #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET) |
289 | struct tcp_memcontrol tcp_mem; | 316 | struct cg_proto tcp_mem; |
290 | #endif | 317 | #endif |
291 | #if defined(CONFIG_MEMCG_KMEM) | 318 | #if defined(CONFIG_MEMCG_KMEM) |
292 | /* analogous to slab_common's slab_caches list. per-memcg */ | 319 | /* analogous to slab_common's slab_caches list. per-memcg */ |
@@ -303,22 +330,6 @@ struct mem_cgroup { | |||
303 | atomic_t numainfo_events; | 330 | atomic_t numainfo_events; |
304 | atomic_t numainfo_updating; | 331 | atomic_t numainfo_updating; |
305 | #endif | 332 | #endif |
306 | /* | ||
307 | * Protects soft_contributed transitions. | ||
308 | * See mem_cgroup_update_soft_limit | ||
309 | */ | ||
310 | spinlock_t soft_lock; | ||
311 | |||
312 | /* | ||
313 | * If true then this group has increased parents' children_in_excess | ||
314 | * when it got over the soft limit. | ||
315 | * When a group falls bellow the soft limit, parents' children_in_excess | ||
316 | * is decreased and soft_contributed changed to false. | ||
317 | */ | ||
318 | bool soft_contributed; | ||
319 | |||
320 | /* Number of children that are in soft limit excess */ | ||
321 | atomic_t children_in_excess; | ||
322 | 333 | ||
323 | struct mem_cgroup_per_node *nodeinfo[0]; | 334 | struct mem_cgroup_per_node *nodeinfo[0]; |
324 | /* WARNING: nodeinfo must be the last member here */ | 335 | /* WARNING: nodeinfo must be the last member here */ |
@@ -422,6 +433,7 @@ static bool move_file(void) | |||
422 | * limit reclaim to prevent infinite loops, if they ever occur. | 433 | * limit reclaim to prevent infinite loops, if they ever occur. |
423 | */ | 434 | */ |
424 | #define MEM_CGROUP_MAX_RECLAIM_LOOPS 100 | 435 | #define MEM_CGROUP_MAX_RECLAIM_LOOPS 100 |
436 | #define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS 2 | ||
425 | 437 | ||
426 | enum charge_type { | 438 | enum charge_type { |
427 | MEM_CGROUP_CHARGE_TYPE_CACHE = 0, | 439 | MEM_CGROUP_CHARGE_TYPE_CACHE = 0, |
@@ -488,6 +500,29 @@ static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) | |||
488 | return (memcg == root_mem_cgroup); | 500 | return (memcg == root_mem_cgroup); |
489 | } | 501 | } |
490 | 502 | ||
503 | /* | ||
504 | * We restrict the id in the range of [1, 65535], so it can fit into | ||
505 | * an unsigned short. | ||
506 | */ | ||
507 | #define MEM_CGROUP_ID_MAX USHRT_MAX | ||
508 | |||
509 | static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg) | ||
510 | { | ||
511 | /* | ||
512 | * The ID of the root cgroup is 0, but memcg treats 0 as an | ||
513 | * invalid ID, so we return (cgroup_id + 1). | ||
514 | */ | ||
515 | return memcg->css.cgroup->id + 1; | ||
516 | } | ||
517 | |||
518 | static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id) | ||
519 | { | ||
520 | struct cgroup_subsys_state *css; | ||
521 | |||
522 | css = css_from_id(id - 1, &mem_cgroup_subsys); | ||
523 | return mem_cgroup_from_css(css); | ||
524 | } | ||
525 | |||
491 | /* Writing them here to avoid exposing memcg's inner layout */ | 526 | /* Writing them here to avoid exposing memcg's inner layout */ |
492 | #if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM) | 527 | #if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM) |
493 | 528 | ||
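
mem_cgroup_id() packs the cgroup id into an unsigned short with a +1 offset: the root cgroup has id 0, but memcg reserves 0 as the "no cgroup" value (for example in swap records), so the stored id is cgroup_id + 1 and mem_cgroup_from_id() subtracts it again before calling css_from_id(). The round trip reduced to its arithmetic (the function names below are illustrative, not kernel API):

    #include <assert.h>
    #include <limits.h>
    #include <stdio.h>

    #define MEM_CGROUP_ID_MAX USHRT_MAX   /* ids must fit in an unsigned short */

    /* Store: shift the cgroup id up by one so 0 can mean "none". */
    static unsigned short memcg_id_from_cgroup_id(int cgroup_id)
    {
        assert(cgroup_id + 1 <= MEM_CGROUP_ID_MAX);
        return (unsigned short)(cgroup_id + 1);
    }

    /* Look up: undo the shift before asking the cgroup layer for the css. */
    static int cgroup_id_from_memcg_id(unsigned short id)
    {
        assert(id != 0);                  /* 0 is the "invalid id" sentinel */
        return id - 1;
    }

    int main(void)
    {
        unsigned short id = memcg_id_from_cgroup_id(0);   /* root cgroup */

        printf("stored id %u -> cgroup id %d\n", (unsigned)id,
               cgroup_id_from_memcg_id(id));
        return 0;
    }
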
@@ -540,13 +575,13 @@ struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg) | |||
540 | if (!memcg || mem_cgroup_is_root(memcg)) | 575 | if (!memcg || mem_cgroup_is_root(memcg)) |
541 | return NULL; | 576 | return NULL; |
542 | 577 | ||
543 | return &memcg->tcp_mem.cg_proto; | 578 | return &memcg->tcp_mem; |
544 | } | 579 | } |
545 | EXPORT_SYMBOL(tcp_proto_cgroup); | 580 | EXPORT_SYMBOL(tcp_proto_cgroup); |
546 | 581 | ||
547 | static void disarm_sock_keys(struct mem_cgroup *memcg) | 582 | static void disarm_sock_keys(struct mem_cgroup *memcg) |
548 | { | 583 | { |
549 | if (!memcg_proto_activated(&memcg->tcp_mem.cg_proto)) | 584 | if (!memcg_proto_activated(&memcg->tcp_mem)) |
550 | return; | 585 | return; |
551 | static_key_slow_dec(&memcg_socket_limit_enabled); | 586 | static_key_slow_dec(&memcg_socket_limit_enabled); |
552 | } | 587 | } |
@@ -559,16 +594,11 @@ static void disarm_sock_keys(struct mem_cgroup *memcg) | |||
559 | #ifdef CONFIG_MEMCG_KMEM | 594 | #ifdef CONFIG_MEMCG_KMEM |
560 | /* | 595 | /* |
561 | * This will be the memcg's index in each cache's ->memcg_params->memcg_caches. | 596 | * This will be the memcg's index in each cache's ->memcg_params->memcg_caches. |
562 | * There are two main reasons for not using the css_id for this: | 597 | * The main reason for not using cgroup id for this: |
563 | * 1) this works better in sparse environments, where we have a lot of memcgs, | 598 | * this works better in sparse environments, where we have a lot of memcgs, |
564 | * but only a few kmem-limited. Or also, if we have, for instance, 200 | 599 | * but only a few kmem-limited. Or also, if we have, for instance, 200 |
565 | * memcgs, and none but the 200th is kmem-limited, we'd have to have a | 600 | * memcgs, and none but the 200th is kmem-limited, we'd have to have a |
566 | * 200 entry array for that. | 601 | * 200 entry array for that. |
567 | * | ||
568 | * 2) In order not to violate the cgroup API, we would like to do all memory | ||
569 | * allocation in ->create(). At that point, we haven't yet allocated the | ||
570 | * css_id. Having a separate index prevents us from messing with the cgroup | ||
571 | * core for this | ||
572 | * | 602 | * |
573 | * The current size of the caches array is stored in | 603 | * The current size of the caches array is stored in |
574 | * memcg_limited_groups_array_size. It will double each time we have to | 604 | * memcg_limited_groups_array_size. It will double each time we have to |
@@ -583,14 +613,14 @@ int memcg_limited_groups_array_size; | |||
583 | * cgroups is a reasonable guess. In the future, it could be a parameter or | 613 | * cgroups is a reasonable guess. In the future, it could be a parameter or |
584 | * tunable, but that is strictly not necessary. | 614 | * tunable, but that is strictly not necessary. |
585 | * | 615 | * |
586 | * MAX_SIZE should be as large as the number of css_ids. Ideally, we could get | 616 | * MAX_SIZE should be as large as the number of cgrp_ids. Ideally, we could get |
587 | * this constant directly from cgroup, but it is understandable that this is | 617 | * this constant directly from cgroup, but it is understandable that this is |
588 | * better kept as an internal representation in cgroup.c. In any case, the | 618 | * better kept as an internal representation in cgroup.c. In any case, the |
589 | * css_id space is not getting any smaller, and we don't have to necessarily | 619 | * cgrp_id space is not getting any smaller, and we don't have to necessarily |
590 | * increase ours as well if it increases. | 620 | * increase ours as well if it increases. |
591 | */ | 621 | */ |
592 | #define MEMCG_CACHES_MIN_SIZE 4 | 622 | #define MEMCG_CACHES_MIN_SIZE 4 |
593 | #define MEMCG_CACHES_MAX_SIZE 65535 | 623 | #define MEMCG_CACHES_MAX_SIZE MEM_CGROUP_ID_MAX |
594 | 624 | ||
595 | /* | 625 | /* |
596 | * A lot of the calls to the cache allocation functions are expected to be | 626 | * A lot of the calls to the cache allocation functions are expected to be |
@@ -648,6 +678,164 @@ page_cgroup_zoneinfo(struct mem_cgroup *memcg, struct page *page) | |||
648 | return mem_cgroup_zoneinfo(memcg, nid, zid); | 678 | return mem_cgroup_zoneinfo(memcg, nid, zid); |
649 | } | 679 | } |
650 | 680 | ||
681 | static struct mem_cgroup_tree_per_zone * | ||
682 | soft_limit_tree_node_zone(int nid, int zid) | ||
683 | { | ||
684 | return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid]; | ||
685 | } | ||
686 | |||
687 | static struct mem_cgroup_tree_per_zone * | ||
688 | soft_limit_tree_from_page(struct page *page) | ||
689 | { | ||
690 | int nid = page_to_nid(page); | ||
691 | int zid = page_zonenum(page); | ||
692 | |||
693 | return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid]; | ||
694 | } | ||
695 | |||
696 | static void | ||
697 | __mem_cgroup_insert_exceeded(struct mem_cgroup *memcg, | ||
698 | struct mem_cgroup_per_zone *mz, | ||
699 | struct mem_cgroup_tree_per_zone *mctz, | ||
700 | unsigned long long new_usage_in_excess) | ||
701 | { | ||
702 | struct rb_node **p = &mctz->rb_root.rb_node; | ||
703 | struct rb_node *parent = NULL; | ||
704 | struct mem_cgroup_per_zone *mz_node; | ||
705 | |||
706 | if (mz->on_tree) | ||
707 | return; | ||
708 | |||
709 | mz->usage_in_excess = new_usage_in_excess; | ||
710 | if (!mz->usage_in_excess) | ||
711 | return; | ||
712 | while (*p) { | ||
713 | parent = *p; | ||
714 | mz_node = rb_entry(parent, struct mem_cgroup_per_zone, | ||
715 | tree_node); | ||
716 | if (mz->usage_in_excess < mz_node->usage_in_excess) | ||
717 | p = &(*p)->rb_left; | ||
718 | /* | ||
719 | * We can't avoid mem cgroups that are over their soft | ||
720 | * limit by the same amount | ||
721 | */ | ||
722 | else if (mz->usage_in_excess >= mz_node->usage_in_excess) | ||
723 | p = &(*p)->rb_right; | ||
724 | } | ||
725 | rb_link_node(&mz->tree_node, parent, p); | ||
726 | rb_insert_color(&mz->tree_node, &mctz->rb_root); | ||
727 | mz->on_tree = true; | ||
728 | } | ||
729 | |||
730 | static void | ||
731 | __mem_cgroup_remove_exceeded(struct mem_cgroup *memcg, | ||
732 | struct mem_cgroup_per_zone *mz, | ||
733 | struct mem_cgroup_tree_per_zone *mctz) | ||
734 | { | ||
735 | if (!mz->on_tree) | ||
736 | return; | ||
737 | rb_erase(&mz->tree_node, &mctz->rb_root); | ||
738 | mz->on_tree = false; | ||
739 | } | ||
740 | |||
741 | static void | ||
742 | mem_cgroup_remove_exceeded(struct mem_cgroup *memcg, | ||
743 | struct mem_cgroup_per_zone *mz, | ||
744 | struct mem_cgroup_tree_per_zone *mctz) | ||
745 | { | ||
746 | spin_lock(&mctz->lock); | ||
747 | __mem_cgroup_remove_exceeded(memcg, mz, mctz); | ||
748 | spin_unlock(&mctz->lock); | ||
749 | } | ||
750 | |||
751 | |||
752 | static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page) | ||
753 | { | ||
754 | unsigned long long excess; | ||
755 | struct mem_cgroup_per_zone *mz; | ||
756 | struct mem_cgroup_tree_per_zone *mctz; | ||
757 | int nid = page_to_nid(page); | ||
758 | int zid = page_zonenum(page); | ||
759 | mctz = soft_limit_tree_from_page(page); | ||
760 | |||
761 | /* | ||
762 | * Necessary to update all ancestors when hierarchy is used, | ||
763 | * because their event counter is not touched. | ||
764 | */ | ||
765 | for (; memcg; memcg = parent_mem_cgroup(memcg)) { | ||
766 | mz = mem_cgroup_zoneinfo(memcg, nid, zid); | ||
767 | excess = res_counter_soft_limit_excess(&memcg->res); | ||
768 | /* | ||
769 | * We have to update the tree if mz is on RB-tree or | ||
770 | * mem is over its softlimit. | ||
771 | */ | ||
772 | if (excess || mz->on_tree) { | ||
773 | spin_lock(&mctz->lock); | ||
774 | /* if on-tree, remove it */ | ||
775 | if (mz->on_tree) | ||
776 | __mem_cgroup_remove_exceeded(memcg, mz, mctz); | ||
777 | /* | ||
778 | * Insert again. mz->usage_in_excess will be updated. | ||
779 | * If excess is 0, no tree ops. | ||
780 | */ | ||
781 | __mem_cgroup_insert_exceeded(memcg, mz, mctz, excess); | ||
782 | spin_unlock(&mctz->lock); | ||
783 | } | ||
784 | } | ||
785 | } | ||
786 | |||
787 | static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg) | ||
788 | { | ||
789 | int node, zone; | ||
790 | struct mem_cgroup_per_zone *mz; | ||
791 | struct mem_cgroup_tree_per_zone *mctz; | ||
792 | |||
793 | for_each_node(node) { | ||
794 | for (zone = 0; zone < MAX_NR_ZONES; zone++) { | ||
795 | mz = mem_cgroup_zoneinfo(memcg, node, zone); | ||
796 | mctz = soft_limit_tree_node_zone(node, zone); | ||
797 | mem_cgroup_remove_exceeded(memcg, mz, mctz); | ||
798 | } | ||
799 | } | ||
800 | } | ||
801 | |||
802 | static struct mem_cgroup_per_zone * | ||
803 | __mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) | ||
804 | { | ||
805 | struct rb_node *rightmost = NULL; | ||
806 | struct mem_cgroup_per_zone *mz; | ||
807 | |||
808 | retry: | ||
809 | mz = NULL; | ||
810 | rightmost = rb_last(&mctz->rb_root); | ||
811 | if (!rightmost) | ||
812 | goto done; /* Nothing to reclaim from */ | ||
813 | |||
814 | mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node); | ||
815 | /* | ||
816 | * Remove the node now but someone else can add it back, | ||
817 | * we will add it back at the end of reclaim to its correct | ||
818 | * position in the tree. | ||
819 | */ | ||
820 | __mem_cgroup_remove_exceeded(mz->memcg, mz, mctz); | ||
821 | if (!res_counter_soft_limit_excess(&mz->memcg->res) || | ||
822 | !css_tryget(&mz->memcg->css)) | ||
823 | goto retry; | ||
824 | done: | ||
825 | return mz; | ||
826 | } | ||
827 | |||
828 | static struct mem_cgroup_per_zone * | ||
829 | mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) | ||
830 | { | ||
831 | struct mem_cgroup_per_zone *mz; | ||
832 | |||
833 | spin_lock(&mctz->lock); | ||
834 | mz = __mem_cgroup_largest_soft_limit_node(mctz); | ||
835 | spin_unlock(&mctz->lock); | ||
836 | return mz; | ||
837 | } | ||
838 | |||
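The restored soft-limit machinery keys each mem_cgroup_per_zone by usage_in_excess, sends equal keys to the right in __mem_cgroup_insert_exceeded(), and lets reclaim take the rightmost node (rb_last()) as the biggest offender. A toy binary search tree in plain C illustrating the same insertion rule and rightmost lookup; this is only a sketch and has no relation to the kernel's rbtree API:

#include <stdio.h>
#include <stdlib.h>

struct node {
	unsigned long long excess;	/* usage above the soft limit */
	struct node *left, *right;
};

/* smaller excess goes left, equal or larger goes right, as in
 * __mem_cgroup_insert_exceeded(); duplicates are therefore kept */
static struct node *insert(struct node *root, unsigned long long excess)
{
	if (!root) {
		struct node *n = calloc(1, sizeof(*n));
		n->excess = excess;
		return n;
	}
	if (excess < root->excess)
		root->left = insert(root->left, excess);
	else
		root->right = insert(root->right, excess);
	return root;
}

/* the rightmost node is the group farthest over its soft limit */
static struct node *largest(struct node *root)
{
	while (root && root->right)
		root = root->right;
	return root;
}

int main(void)
{
	struct node *root = NULL;
	unsigned long long samples[] = { 4096, 128, 65536, 128, 0 };

	for (int i = 0; i < 5; i++) {
		if (samples[i])		/* excess == 0 is never inserted */
			root = insert(root, samples[i]);
	}

	printf("reclaim target excess: %llu\n", largest(root)->excess);
	return 0;
}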
651 | /* | 839 | /* |
652 | * Implementation Note: reading percpu statistics for memcg. | 840 | * Implementation Note: reading percpu statistics for memcg. |
653 | * | 841 | * |
@@ -698,6 +886,7 @@ static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg, | |||
698 | unsigned long val = 0; | 886 | unsigned long val = 0; |
699 | int cpu; | 887 | int cpu; |
700 | 888 | ||
889 | get_online_cpus(); | ||
701 | for_each_online_cpu(cpu) | 890 | for_each_online_cpu(cpu) |
702 | val += per_cpu(memcg->stat->events[idx], cpu); | 891 | val += per_cpu(memcg->stat->events[idx], cpu); |
703 | #ifdef CONFIG_HOTPLUG_CPU | 892 | #ifdef CONFIG_HOTPLUG_CPU |
@@ -705,6 +894,7 @@ static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg, | |||
705 | val += memcg->nocpu_base.events[idx]; | 894 | val += memcg->nocpu_base.events[idx]; |
706 | spin_unlock(&memcg->pcp_counter_lock); | 895 | spin_unlock(&memcg->pcp_counter_lock); |
707 | #endif | 896 | #endif |
897 | put_online_cpus(); | ||
708 | return val; | 898 | return val; |
709 | } | 899 | } |
710 | 900 | ||
@@ -822,48 +1012,6 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg, | |||
822 | } | 1012 | } |
823 | 1013 | ||
824 | /* | 1014 | /* |
825 | * Called from rate-limited memcg_check_events when enough | ||
826 | * MEM_CGROUP_TARGET_SOFTLIMIT events are accumulated and it makes sure | ||
827 | * that all the parents up the hierarchy will be notified that this group | ||
828 | * is in excess or that it is not in excess anymore. mmecg->soft_contributed | ||
829 | * makes the transition a single action whenever the state flips from one to | ||
830 | * the other. | ||
831 | */ | ||
832 | static void mem_cgroup_update_soft_limit(struct mem_cgroup *memcg) | ||
833 | { | ||
834 | unsigned long long excess = res_counter_soft_limit_excess(&memcg->res); | ||
835 | struct mem_cgroup *parent = memcg; | ||
836 | int delta = 0; | ||
837 | |||
838 | spin_lock(&memcg->soft_lock); | ||
839 | if (excess) { | ||
840 | if (!memcg->soft_contributed) { | ||
841 | delta = 1; | ||
842 | memcg->soft_contributed = true; | ||
843 | } | ||
844 | } else { | ||
845 | if (memcg->soft_contributed) { | ||
846 | delta = -1; | ||
847 | memcg->soft_contributed = false; | ||
848 | } | ||
849 | } | ||
850 | |||
851 | /* | ||
852 | * Necessary to update all ancestors when hierarchy is used | ||
853 | * because their event counter is not touched. | ||
854 | * We track children even outside the hierarchy for the root | ||
855 | * cgroup because tree walk starting at root should visit | ||
856 | * all cgroups and we want to prevent from pointless tree | ||
857 | * walk if no children is below the limit. | ||
858 | */ | ||
859 | while (delta && (parent = parent_mem_cgroup(parent))) | ||
860 | atomic_add(delta, &parent->children_in_excess); | ||
861 | if (memcg != root_mem_cgroup && !root_mem_cgroup->use_hierarchy) | ||
862 | atomic_add(delta, &root_mem_cgroup->children_in_excess); | ||
863 | spin_unlock(&memcg->soft_lock); | ||
864 | } | ||
865 | |||
866 | /* | ||
867 | * Check events in order. | 1015 | * Check events in order. |
868 | * | 1016 | * |
869 | */ | 1017 | */ |
@@ -886,7 +1034,7 @@ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page) | |||
886 | 1034 | ||
887 | mem_cgroup_threshold(memcg); | 1035 | mem_cgroup_threshold(memcg); |
888 | if (unlikely(do_softlimit)) | 1036 | if (unlikely(do_softlimit)) |
889 | mem_cgroup_update_soft_limit(memcg); | 1037 | mem_cgroup_update_tree(memcg, page); |
890 | #if MAX_NUMNODES > 1 | 1038 | #if MAX_NUMNODES > 1 |
891 | if (unlikely(do_numainfo)) | 1039 | if (unlikely(do_numainfo)) |
892 | atomic_inc(&memcg->numainfo_events); | 1040 | atomic_inc(&memcg->numainfo_events); |
@@ -929,15 +1077,6 @@ struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) | |||
929 | return memcg; | 1077 | return memcg; |
930 | } | 1078 | } |
931 | 1079 | ||
932 | static enum mem_cgroup_filter_t | ||
933 | mem_cgroup_filter(struct mem_cgroup *memcg, struct mem_cgroup *root, | ||
934 | mem_cgroup_iter_filter cond) | ||
935 | { | ||
936 | if (!cond) | ||
937 | return VISIT; | ||
938 | return cond(memcg, root); | ||
939 | } | ||
940 | |||
941 | /* | 1080 | /* |
942 | * Returns a next (in a pre-order walk) alive memcg (with elevated css | 1081 | * Returns a next (in a pre-order walk) alive memcg (with elevated css |
943 | * ref. count) or NULL if the whole root's subtree has been visited. | 1082 | * ref. count) or NULL if the whole root's subtree has been visited. |
@@ -945,7 +1084,7 @@ mem_cgroup_filter(struct mem_cgroup *memcg, struct mem_cgroup *root, | |||
945 | * helper function to be used by mem_cgroup_iter | 1084 | * helper function to be used by mem_cgroup_iter |
946 | */ | 1085 | */ |
947 | static struct mem_cgroup *__mem_cgroup_iter_next(struct mem_cgroup *root, | 1086 | static struct mem_cgroup *__mem_cgroup_iter_next(struct mem_cgroup *root, |
948 | struct mem_cgroup *last_visited, mem_cgroup_iter_filter cond) | 1087 | struct mem_cgroup *last_visited) |
949 | { | 1088 | { |
950 | struct cgroup_subsys_state *prev_css, *next_css; | 1089 | struct cgroup_subsys_state *prev_css, *next_css; |
951 | 1090 | ||
@@ -963,31 +1102,11 @@ skip_node: | |||
963 | if (next_css) { | 1102 | if (next_css) { |
964 | struct mem_cgroup *mem = mem_cgroup_from_css(next_css); | 1103 | struct mem_cgroup *mem = mem_cgroup_from_css(next_css); |
965 | 1104 | ||
966 | switch (mem_cgroup_filter(mem, root, cond)) { | 1105 | if (css_tryget(&mem->css)) |
967 | case SKIP: | 1106 | return mem; |
1107 | else { | ||
968 | prev_css = next_css; | 1108 | prev_css = next_css; |
969 | goto skip_node; | 1109 | goto skip_node; |
970 | case SKIP_TREE: | ||
971 | if (mem == root) | ||
972 | return NULL; | ||
973 | /* | ||
974 | * css_rightmost_descendant is not an optimal way to | ||
975 | * skip through a subtree (especially for imbalanced | ||
976 | * trees leaning to right) but that's what we have right | ||
977 | * now. More effective solution would be traversing | ||
978 | * right-up for first non-NULL without calling | ||
979 | * css_next_descendant_pre afterwards. | ||
980 | */ | ||
981 | prev_css = css_rightmost_descendant(next_css); | ||
982 | goto skip_node; | ||
983 | case VISIT: | ||
984 | if (css_tryget(&mem->css)) | ||
985 | return mem; | ||
986 | else { | ||
987 | prev_css = next_css; | ||
988 | goto skip_node; | ||
989 | } | ||
990 | break; | ||
991 | } | 1110 | } |
992 | } | 1111 | } |
993 | 1112 | ||
@@ -1051,7 +1170,6 @@ static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter, | |||
1051 | * @root: hierarchy root | 1170 | * @root: hierarchy root |
1052 | * @prev: previously returned memcg, NULL on first invocation | 1171 | * @prev: previously returned memcg, NULL on first invocation |
1053 | * @reclaim: cookie for shared reclaim walks, NULL for full walks | 1172 | * @reclaim: cookie for shared reclaim walks, NULL for full walks |
1054 | * @cond: filter for visited nodes, NULL for no filter | ||
1055 | * | 1173 | * |
1056 | * Returns references to children of the hierarchy below @root, or | 1174 | * Returns references to children of the hierarchy below @root, or |
1057 | * @root itself, or %NULL after a full round-trip. | 1175 | * @root itself, or %NULL after a full round-trip. |
@@ -1064,18 +1182,15 @@ static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter, | |||
1064 | * divide up the memcgs in the hierarchy among all concurrent | 1182 | * divide up the memcgs in the hierarchy among all concurrent |
1065 | * reclaimers operating on the same zone and priority. | 1183 | * reclaimers operating on the same zone and priority. |
1066 | */ | 1184 | */ |
1067 | struct mem_cgroup *mem_cgroup_iter_cond(struct mem_cgroup *root, | 1185 | struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, |
1068 | struct mem_cgroup *prev, | 1186 | struct mem_cgroup *prev, |
1069 | struct mem_cgroup_reclaim_cookie *reclaim, | 1187 | struct mem_cgroup_reclaim_cookie *reclaim) |
1070 | mem_cgroup_iter_filter cond) | ||
1071 | { | 1188 | { |
1072 | struct mem_cgroup *memcg = NULL; | 1189 | struct mem_cgroup *memcg = NULL; |
1073 | struct mem_cgroup *last_visited = NULL; | 1190 | struct mem_cgroup *last_visited = NULL; |
1074 | 1191 | ||
1075 | if (mem_cgroup_disabled()) { | 1192 | if (mem_cgroup_disabled()) |
1076 | /* first call must return non-NULL, second return NULL */ | 1193 | return NULL; |
1077 | return (struct mem_cgroup *)(unsigned long)!prev; | ||
1078 | } | ||
1079 | 1194 | ||
1080 | if (!root) | 1195 | if (!root) |
1081 | root = root_mem_cgroup; | 1196 | root = root_mem_cgroup; |
@@ -1086,9 +1201,7 @@ struct mem_cgroup *mem_cgroup_iter_cond(struct mem_cgroup *root, | |||
1086 | if (!root->use_hierarchy && root != root_mem_cgroup) { | 1201 | if (!root->use_hierarchy && root != root_mem_cgroup) { |
1087 | if (prev) | 1202 | if (prev) |
1088 | goto out_css_put; | 1203 | goto out_css_put; |
1089 | if (mem_cgroup_filter(root, root, cond) == VISIT) | 1204 | return root; |
1090 | return root; | ||
1091 | return NULL; | ||
1092 | } | 1205 | } |
1093 | 1206 | ||
1094 | rcu_read_lock(); | 1207 | rcu_read_lock(); |
@@ -1111,7 +1224,7 @@ struct mem_cgroup *mem_cgroup_iter_cond(struct mem_cgroup *root, | |||
1111 | last_visited = mem_cgroup_iter_load(iter, root, &seq); | 1224 | last_visited = mem_cgroup_iter_load(iter, root, &seq); |
1112 | } | 1225 | } |
1113 | 1226 | ||
1114 | memcg = __mem_cgroup_iter_next(root, last_visited, cond); | 1227 | memcg = __mem_cgroup_iter_next(root, last_visited); |
1115 | 1228 | ||
1116 | if (reclaim) { | 1229 | if (reclaim) { |
1117 | mem_cgroup_iter_update(iter, last_visited, memcg, seq); | 1230 | mem_cgroup_iter_update(iter, last_visited, memcg, seq); |
@@ -1122,11 +1235,7 @@ struct mem_cgroup *mem_cgroup_iter_cond(struct mem_cgroup *root, | |||
1122 | reclaim->generation = iter->generation; | 1235 | reclaim->generation = iter->generation; |
1123 | } | 1236 | } |
1124 | 1237 | ||
1125 | /* | 1238 | if (prev && !memcg) |
1126 | * We have finished the whole tree walk or no group has been | ||
1127 | * visited because filter told us to skip the root node. | ||
1128 | */ | ||
1129 | if (!memcg && (prev || (cond && !last_visited))) | ||
1130 | goto out_unlock; | 1239 | goto out_unlock; |
1131 | } | 1240 | } |
1132 | out_unlock: | 1241 | out_unlock: |
@@ -1318,7 +1427,7 @@ bool __mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg, | |||
1318 | return true; | 1427 | return true; |
1319 | if (!root_memcg->use_hierarchy || !memcg) | 1428 | if (!root_memcg->use_hierarchy || !memcg) |
1320 | return false; | 1429 | return false; |
1321 | return css_is_ancestor(&memcg->css, &root_memcg->css); | 1430 | return cgroup_is_descendant(memcg->css.cgroup, root_memcg->css.cgroup); |
1322 | } | 1431 | } |
1323 | 1432 | ||
1324 | static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg, | 1433 | static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg, |
@@ -1767,7 +1876,6 @@ static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg, | |||
1767 | return total; | 1876 | return total; |
1768 | } | 1877 | } |
1769 | 1878 | ||
1770 | #if MAX_NUMNODES > 1 | ||
1771 | /** | 1879 | /** |
1772 | * test_mem_cgroup_node_reclaimable | 1880 | * test_mem_cgroup_node_reclaimable |
1773 | * @memcg: the target memcg | 1881 | * @memcg: the target memcg |
@@ -1790,6 +1898,7 @@ static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg, | |||
1790 | return false; | 1898 | return false; |
1791 | 1899 | ||
1792 | } | 1900 | } |
1901 | #if MAX_NUMNODES > 1 | ||
1793 | 1902 | ||
1794 | /* | 1903 | /* |
1795 | * Always updating the nodemask is not very good - even if we have an empty | 1904 | * Always updating the nodemask is not very good - even if we have an empty |
@@ -1857,52 +1966,112 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *memcg) | |||
1857 | return node; | 1966 | return node; |
1858 | } | 1967 | } |
1859 | 1968 | ||
1969 | /* | ||
1970 | * Check all nodes whether they contain reclaimable pages or not. | ||
1971 | * For quick scan, we make use of scan_nodes. This will allow us to skip | ||
1972 | * unused nodes. But scan_nodes is lazily updated and may not cotain | ||
1973 | * enough new information. We need to do double check. | ||
1974 | */ | ||
1975 | static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap) | ||
1976 | { | ||
1977 | int nid; | ||
1978 | |||
1979 | /* | ||
1980 | * quick check...making use of scan_node. | ||
1981 | * We can skip unused nodes. | ||
1982 | */ | ||
1983 | if (!nodes_empty(memcg->scan_nodes)) { | ||
1984 | for (nid = first_node(memcg->scan_nodes); | ||
1985 | nid < MAX_NUMNODES; | ||
1986 | nid = next_node(nid, memcg->scan_nodes)) { | ||
1987 | |||
1988 | if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap)) | ||
1989 | return true; | ||
1990 | } | ||
1991 | } | ||
1992 | /* | ||
1993 | * Check rest of nodes. | ||
1994 | */ | ||
1995 | for_each_node_state(nid, N_MEMORY) { | ||
1996 | if (node_isset(nid, memcg->scan_nodes)) | ||
1997 | continue; | ||
1998 | if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap)) | ||
1999 | return true; | ||
2000 | } | ||
2001 | return false; | ||
2002 | } | ||
2003 | |||
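mem_cgroup_reclaimable() first walks the cached scan_nodes mask and only then double-checks the remaining N_MEMORY nodes, since the mask is updated lazily. A small user-space sketch of that quick-scan-then-full-scan shape, with a plain bitmask standing in for the nodemask (all names below are made up):

#include <stdbool.h>
#include <stdio.h>

#define MAX_NODES 8

/* stand-in for test_mem_cgroup_node_reclaimable() */
static bool node_reclaimable(const unsigned long pages[], int nid)
{
	return pages[nid] != 0;
}

/* quick pass over the cached mask, then a full pass over the rest */
static bool any_reclaimable(const unsigned long pages[MAX_NODES],
			    unsigned int cached_mask)
{
	for (int nid = 0; nid < MAX_NODES; nid++)
		if ((cached_mask & (1u << nid)) && node_reclaimable(pages, nid))
			return true;

	/* the cache is only a hint: double-check the nodes it skipped */
	for (int nid = 0; nid < MAX_NODES; nid++)
		if (!(cached_mask & (1u << nid)) && node_reclaimable(pages, nid))
			return true;

	return false;
}

int main(void)
{
	unsigned long pages[MAX_NODES] = { 0, 0, 0, 17, 0, 0, 0, 0 };
	unsigned int stale_mask = 1u << 1;	/* cache points at an empty node */

	printf("reclaimable: %s\n",
	       any_reclaimable(pages, stale_mask) ? "yes" : "no");
	return 0;
}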
1860 | #else | 2004 | #else |
1861 | int mem_cgroup_select_victim_node(struct mem_cgroup *memcg) | 2005 | int mem_cgroup_select_victim_node(struct mem_cgroup *memcg) |
1862 | { | 2006 | { |
1863 | return 0; | 2007 | return 0; |
1864 | } | 2008 | } |
1865 | 2009 | ||
1866 | #endif | 2010 | static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap) |
1867 | |||
1868 | /* | ||
1869 | * A group is eligible for the soft limit reclaim under the given root | ||
1870 | * hierarchy if | ||
1871 | * a) it is over its soft limit | ||
1872 | * b) any parent up the hierarchy is over its soft limit | ||
1873 | * | ||
1874 | * If the given group doesn't have any children over the limit then it | ||
1875 | * doesn't make any sense to iterate its subtree. | ||
1876 | */ | ||
1877 | enum mem_cgroup_filter_t | ||
1878 | mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg, | ||
1879 | struct mem_cgroup *root) | ||
1880 | { | 2011 | { |
1881 | struct mem_cgroup *parent; | 2012 | return test_mem_cgroup_node_reclaimable(memcg, 0, noswap); |
1882 | 2013 | } | |
1883 | if (!memcg) | 2014 | #endif |
1884 | memcg = root_mem_cgroup; | ||
1885 | parent = memcg; | ||
1886 | |||
1887 | if (res_counter_soft_limit_excess(&memcg->res)) | ||
1888 | return VISIT; | ||
1889 | 2015 | ||
1890 | /* | 2016 | static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg, |
1891 | * If any parent up to the root in the hierarchy is over its soft limit | 2017 | struct zone *zone, |
1892 | * then we have to obey and reclaim from this group as well. | 2018 | gfp_t gfp_mask, |
1893 | */ | 2019 | unsigned long *total_scanned) |
1894 | while ((parent = parent_mem_cgroup(parent))) { | 2020 | { |
1895 | if (res_counter_soft_limit_excess(&parent->res)) | 2021 | struct mem_cgroup *victim = NULL; |
1896 | return VISIT; | 2022 | int total = 0; |
1897 | if (parent == root) | 2023 | int loop = 0; |
2024 | unsigned long excess; | ||
2025 | unsigned long nr_scanned; | ||
2026 | struct mem_cgroup_reclaim_cookie reclaim = { | ||
2027 | .zone = zone, | ||
2028 | .priority = 0, | ||
2029 | }; | ||
2030 | |||
2031 | excess = res_counter_soft_limit_excess(&root_memcg->res) >> PAGE_SHIFT; | ||
2032 | |||
2033 | while (1) { | ||
2034 | victim = mem_cgroup_iter(root_memcg, victim, &reclaim); | ||
2035 | if (!victim) { | ||
2036 | loop++; | ||
2037 | if (loop >= 2) { | ||
2038 | /* | ||
2039 | * If we have not been able to reclaim | ||
2040 | * anything, it might be because there are | ||
2041 | * no reclaimable pages under this hierarchy | ||
2042 | */ | ||
2043 | if (!total) | ||
2044 | break; | ||
2045 | /* | ||
2046 | * We want to do more targeted reclaim. | ||
2047 | * excess >> 2 is not too excessive, so we don't | ||
2048 | * reclaim too much, nor too little, so we don't keep | ||
2049 | * coming back to reclaim from this cgroup | ||
2050 | */ | ||
2051 | if (total >= (excess >> 2) || | ||
2052 | (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS)) | ||
2053 | break; | ||
2054 | } | ||
2055 | continue; | ||
2056 | } | ||
2057 | if (!mem_cgroup_reclaimable(victim, false)) | ||
2058 | continue; | ||
2059 | total += mem_cgroup_shrink_node_zone(victim, gfp_mask, false, | ||
2060 | zone, &nr_scanned); | ||
2061 | *total_scanned += nr_scanned; | ||
2062 | if (!res_counter_soft_limit_excess(&root_memcg->res)) | ||
1898 | break; | 2063 | break; |
1899 | } | 2064 | } |
1900 | 2065 | mem_cgroup_iter_break(root_memcg, victim); | |
1901 | if (!atomic_read(&memcg->children_in_excess)) | 2066 | return total; |
1902 | return SKIP_TREE; | ||
1903 | return SKIP; | ||
1904 | } | 2067 | } |
1905 | 2068 | ||
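mem_cgroup_soft_reclaim() keeps shrinking the victim hierarchy until it has recovered roughly a quarter of the soft-limit excess (excess >> 2) or it has looped MEM_CGROUP_MAX_RECLAIM_LOOPS times without progress. Stripped of the memcg iterator and res_counter details, the control flow is roughly the following user-space sketch, where shrink_one() is a made-up stand-in for mem_cgroup_shrink_node_zone():

#include <stdio.h>

#define MAX_RECLAIM_LOOPS 100

/* made-up stand-in: pretend each pass over the hierarchy frees pages */
static unsigned long shrink_one(void)
{
	return 32;
}

static unsigned long soft_reclaim(unsigned long excess_pages)
{
	unsigned long total = 0;
	int loop = 0;

	while (1) {
		total += shrink_one();

		/* stop once we are close enough: excess >> 2 keeps the
		 * pass short without forcing an immediate revisit */
		if (total >= (excess_pages >> 2))
			break;
		if (++loop > MAX_RECLAIM_LOOPS)
			break;
	}
	return total;
}

int main(void)
{
	printf("reclaimed %lu of 1024 excess pages\n", soft_reclaim(1024));
	return 0;
}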
2069 | #ifdef CONFIG_LOCKDEP | ||
2070 | static struct lockdep_map memcg_oom_lock_dep_map = { | ||
2071 | .name = "memcg_oom_lock", | ||
2072 | }; | ||
2073 | #endif | ||
2074 | |||
1906 | static DEFINE_SPINLOCK(memcg_oom_lock); | 2075 | static DEFINE_SPINLOCK(memcg_oom_lock); |
1907 | 2076 | ||
1908 | /* | 2077 | /* |
@@ -1940,7 +2109,8 @@ static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg) | |||
1940 | } | 2109 | } |
1941 | iter->oom_lock = false; | 2110 | iter->oom_lock = false; |
1942 | } | 2111 | } |
1943 | } | 2112 | } else |
2113 | mutex_acquire(&memcg_oom_lock_dep_map, 0, 1, _RET_IP_); | ||
1944 | 2114 | ||
1945 | spin_unlock(&memcg_oom_lock); | 2115 | spin_unlock(&memcg_oom_lock); |
1946 | 2116 | ||
@@ -1952,6 +2122,7 @@ static void mem_cgroup_oom_unlock(struct mem_cgroup *memcg) | |||
1952 | struct mem_cgroup *iter; | 2122 | struct mem_cgroup *iter; |
1953 | 2123 | ||
1954 | spin_lock(&memcg_oom_lock); | 2124 | spin_lock(&memcg_oom_lock); |
2125 | mutex_release(&memcg_oom_lock_dep_map, 1, _RET_IP_); | ||
1955 | for_each_mem_cgroup_tree(iter, memcg) | 2126 | for_each_mem_cgroup_tree(iter, memcg) |
1956 | iter->oom_lock = false; | 2127 | iter->oom_lock = false; |
1957 | spin_unlock(&memcg_oom_lock); | 2128 | spin_unlock(&memcg_oom_lock); |
@@ -2018,110 +2189,59 @@ static void memcg_oom_recover(struct mem_cgroup *memcg) | |||
2018 | memcg_wakeup_oom(memcg); | 2189 | memcg_wakeup_oom(memcg); |
2019 | } | 2190 | } |
2020 | 2191 | ||
2021 | /* | ||
2022 | * try to call OOM killer | ||
2023 | */ | ||
2024 | static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order) | 2192 | static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order) |
2025 | { | 2193 | { |
2026 | bool locked; | ||
2027 | int wakeups; | ||
2028 | |||
2029 | if (!current->memcg_oom.may_oom) | 2194 | if (!current->memcg_oom.may_oom) |
2030 | return; | 2195 | return; |
2031 | |||
2032 | current->memcg_oom.in_memcg_oom = 1; | ||
2033 | |||
2034 | /* | 2196 | /* |
2035 | * As with any blocking lock, a contender needs to start | 2197 | * We are in the middle of the charge context here, so we |
2036 | * listening for wakeups before attempting the trylock, | 2198 | * don't want to block when potentially sitting on a callstack |
2037 | * otherwise it can miss the wakeup from the unlock and sleep | 2199 | * that holds all kinds of filesystem and mm locks. |
2038 | * indefinitely. This is just open-coded because our locking | 2200 | * |
2039 | * is so particular to memcg hierarchies. | 2201 | * Also, the caller may handle a failed allocation gracefully |
2202 | * (like optional page cache readahead) and so an OOM killer | ||
2203 | * invocation might not even be necessary. | ||
2204 | * | ||
2205 | * That's why we don't do anything here except remember the | ||
2206 | * OOM context and then deal with it at the end of the page | ||
2207 | * fault when the stack is unwound, the locks are released, | ||
2208 | * and when we know whether the fault was overall successful. | ||
2040 | */ | 2209 | */ |
2041 | wakeups = atomic_read(&memcg->oom_wakeups); | 2210 | css_get(&memcg->css); |
2042 | mem_cgroup_mark_under_oom(memcg); | 2211 | current->memcg_oom.memcg = memcg; |
2043 | 2212 | current->memcg_oom.gfp_mask = mask; | |
2044 | locked = mem_cgroup_oom_trylock(memcg); | 2213 | current->memcg_oom.order = order; |
2045 | |||
2046 | if (locked) | ||
2047 | mem_cgroup_oom_notify(memcg); | ||
2048 | |||
2049 | if (locked && !memcg->oom_kill_disable) { | ||
2050 | mem_cgroup_unmark_under_oom(memcg); | ||
2051 | mem_cgroup_out_of_memory(memcg, mask, order); | ||
2052 | mem_cgroup_oom_unlock(memcg); | ||
2053 | /* | ||
2054 | * There is no guarantee that an OOM-lock contender | ||
2055 | * sees the wakeups triggered by the OOM kill | ||
2056 | * uncharges. Wake any sleepers explicitely. | ||
2057 | */ | ||
2058 | memcg_oom_recover(memcg); | ||
2059 | } else { | ||
2060 | /* | ||
2061 | * A system call can just return -ENOMEM, but if this | ||
2062 | * is a page fault and somebody else is handling the | ||
2063 | * OOM already, we need to sleep on the OOM waitqueue | ||
2064 | * for this memcg until the situation is resolved. | ||
2065 | * Which can take some time because it might be | ||
2066 | * handled by a userspace task. | ||
2067 | * | ||
2068 | * However, this is the charge context, which means | ||
2069 | * that we may sit on a large call stack and hold | ||
2070 | * various filesystem locks, the mmap_sem etc. and we | ||
2071 | * don't want the OOM handler to deadlock on them | ||
2072 | * while we sit here and wait. Store the current OOM | ||
2073 | * context in the task_struct, then return -ENOMEM. | ||
2074 | * At the end of the page fault handler, with the | ||
2075 | * stack unwound, pagefault_out_of_memory() will check | ||
2076 | * back with us by calling | ||
2077 | * mem_cgroup_oom_synchronize(), possibly putting the | ||
2078 | * task to sleep. | ||
2079 | */ | ||
2080 | current->memcg_oom.oom_locked = locked; | ||
2081 | current->memcg_oom.wakeups = wakeups; | ||
2082 | css_get(&memcg->css); | ||
2083 | current->memcg_oom.wait_on_memcg = memcg; | ||
2084 | } | ||
2085 | } | 2214 | } |
2086 | 2215 | ||
2087 | /** | 2216 | /** |
2088 | * mem_cgroup_oom_synchronize - complete memcg OOM handling | 2217 | * mem_cgroup_oom_synchronize - complete memcg OOM handling |
2218 | * @handle: actually kill/wait or just clean up the OOM state | ||
2089 | * | 2219 | * |
2090 | * This has to be called at the end of a page fault if the the memcg | 2220 | * This has to be called at the end of a page fault if the memcg OOM |
2091 | * OOM handler was enabled and the fault is returning %VM_FAULT_OOM. | 2221 | * handler was enabled. |
2092 | * | 2222 | * |
2093 | * Memcg supports userspace OOM handling, so failed allocations must | 2223 | * Memcg supports userspace OOM handling where failed allocations must |
2094 | * sleep on a waitqueue until the userspace task resolves the | 2224 | * sleep on a waitqueue until the userspace task resolves the |
2095 | * situation. Sleeping directly in the charge context with all kinds | 2225 | * situation. Sleeping directly in the charge context with all kinds |
2096 | * of locks held is not a good idea, instead we remember an OOM state | 2226 | * of locks held is not a good idea, instead we remember an OOM state |
2097 | * in the task and mem_cgroup_oom_synchronize() has to be called at | 2227 | * in the task and mem_cgroup_oom_synchronize() has to be called at |
2098 | * the end of the page fault to put the task to sleep and clean up the | 2228 | * the end of the page fault to complete the OOM handling. |
2099 | * OOM state. | ||
2100 | * | 2229 | * |
2101 | * Returns %true if an ongoing memcg OOM situation was detected and | 2230 | * Returns %true if an ongoing memcg OOM situation was detected and |
2102 | * finalized, %false otherwise. | 2231 | * completed, %false otherwise. |
2103 | */ | 2232 | */ |
2104 | bool mem_cgroup_oom_synchronize(void) | 2233 | bool mem_cgroup_oom_synchronize(bool handle) |
2105 | { | 2234 | { |
2235 | struct mem_cgroup *memcg = current->memcg_oom.memcg; | ||
2106 | struct oom_wait_info owait; | 2236 | struct oom_wait_info owait; |
2107 | struct mem_cgroup *memcg; | 2237 | bool locked; |
2108 | 2238 | ||
2109 | /* OOM is global, do not handle */ | 2239 | /* OOM is global, do not handle */ |
2110 | if (!current->memcg_oom.in_memcg_oom) | ||
2111 | return false; | ||
2112 | |||
2113 | /* | ||
2114 | * We invoked the OOM killer but there is a chance that a kill | ||
2115 | * did not free up any charges. Everybody else might already | ||
2116 | * be sleeping, so restart the fault and keep the rampage | ||
2117 | * going until some charges are released. | ||
2118 | */ | ||
2119 | memcg = current->memcg_oom.wait_on_memcg; | ||
2120 | if (!memcg) | 2240 | if (!memcg) |
2121 | goto out; | 2241 | return false; |
2122 | 2242 | ||
2123 | if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current)) | 2243 | if (!handle) |
2124 | goto out_memcg; | 2244 | goto cleanup; |
2125 | 2245 | ||
2126 | owait.memcg = memcg; | 2246 | owait.memcg = memcg; |
2127 | owait.wait.flags = 0; | 2247 | owait.wait.flags = 0; |
@@ -2130,13 +2250,25 @@ bool mem_cgroup_oom_synchronize(void) | |||
2130 | INIT_LIST_HEAD(&owait.wait.task_list); | 2250 | INIT_LIST_HEAD(&owait.wait.task_list); |
2131 | 2251 | ||
2132 | prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE); | 2252 | prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE); |
2133 | /* Only sleep if we didn't miss any wakeups since OOM */ | 2253 | mem_cgroup_mark_under_oom(memcg); |
2134 | if (atomic_read(&memcg->oom_wakeups) == current->memcg_oom.wakeups) | 2254 | |
2255 | locked = mem_cgroup_oom_trylock(memcg); | ||
2256 | |||
2257 | if (locked) | ||
2258 | mem_cgroup_oom_notify(memcg); | ||
2259 | |||
2260 | if (locked && !memcg->oom_kill_disable) { | ||
2261 | mem_cgroup_unmark_under_oom(memcg); | ||
2262 | finish_wait(&memcg_oom_waitq, &owait.wait); | ||
2263 | mem_cgroup_out_of_memory(memcg, current->memcg_oom.gfp_mask, | ||
2264 | current->memcg_oom.order); | ||
2265 | } else { | ||
2135 | schedule(); | 2266 | schedule(); |
2136 | finish_wait(&memcg_oom_waitq, &owait.wait); | 2267 | mem_cgroup_unmark_under_oom(memcg); |
2137 | out_memcg: | 2268 | finish_wait(&memcg_oom_waitq, &owait.wait); |
2138 | mem_cgroup_unmark_under_oom(memcg); | 2269 | } |
2139 | if (current->memcg_oom.oom_locked) { | 2270 | |
2271 | if (locked) { | ||
2140 | mem_cgroup_oom_unlock(memcg); | 2272 | mem_cgroup_oom_unlock(memcg); |
2141 | /* | 2273 | /* |
2142 | * There is no guarantee that an OOM-lock contender | 2274 | * There is no guarantee that an OOM-lock contender |
@@ -2145,10 +2277,9 @@ out_memcg: | |||
2145 | */ | 2277 | */ |
2146 | memcg_oom_recover(memcg); | 2278 | memcg_oom_recover(memcg); |
2147 | } | 2279 | } |
2280 | cleanup: | ||
2281 | current->memcg_oom.memcg = NULL; | ||
2148 | css_put(&memcg->css); | 2282 | css_put(&memcg->css); |
2149 | current->memcg_oom.wait_on_memcg = NULL; | ||
2150 | out: | ||
2151 | current->memcg_oom.in_memcg_oom = 0; | ||
2152 | return true; | 2283 | return true; |
2153 | } | 2284 | } |
2154 | 2285 | ||
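The rewritten OOM path no longer blocks inside the charge context: mem_cgroup_oom() only records the memcg, gfp_mask and order in current->memcg_oom, and mem_cgroup_oom_synchronize(bool handle) acts on that state once the page fault has unwound its locks. A self-contained user-space sketch of the same remember-now-handle-later shape; the struct and helpers below are illustrative, not the kernel's:

#include <stdbool.h>
#include <stdio.h>

/* per-task OOM context, analogous to current->memcg_oom */
struct oom_context {
	bool pending;
	unsigned int gfp_mask;
	int order;
};

static _Thread_local struct oom_context task_oom;

/* charge path: deep in locks, so only remember that we hit OOM */
static void note_oom(unsigned int gfp_mask, int order)
{
	task_oom.pending = true;
	task_oom.gfp_mask = gfp_mask;
	task_oom.order = order;
}

/* end of the page fault: locks dropped, safe to kill or to wait */
static bool oom_synchronize(bool handle)
{
	if (!task_oom.pending)
		return false;
	if (handle)
		printf("invoking OOM handling (gfp=%#x, order=%d)\n",
		       task_oom.gfp_mask, task_oom.order);
	task_oom.pending = false;	/* always clean up the state */
	return true;
}

int main(void)
{
	note_oom(0xd0, 0);		/* would happen during the charge */
	oom_synchronize(true);		/* would happen in the fault handler */
	return 0;
}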
@@ -2562,6 +2693,9 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, | |||
2562 | || fatal_signal_pending(current))) | 2693 | || fatal_signal_pending(current))) |
2563 | goto bypass; | 2694 | goto bypass; |
2564 | 2695 | ||
2696 | if (unlikely(task_in_memcg_oom(current))) | ||
2697 | goto bypass; | ||
2698 | |||
2565 | /* | 2699 | /* |
2566 | * We always charge the cgroup the mm_struct belongs to. | 2700 | * We always charge the cgroup the mm_struct belongs to. |
2567 | * The mm_struct's mem_cgroup changes on task migration if the | 2701 | * The mm_struct's mem_cgroup changes on task migration if the |
@@ -2659,8 +2793,10 @@ done: | |||
2659 | *ptr = memcg; | 2793 | *ptr = memcg; |
2660 | return 0; | 2794 | return 0; |
2661 | nomem: | 2795 | nomem: |
2662 | *ptr = NULL; | 2796 | if (!(gfp_mask & __GFP_NOFAIL)) { |
2663 | return -ENOMEM; | 2797 | *ptr = NULL; |
2798 | return -ENOMEM; | ||
2799 | } | ||
2664 | bypass: | 2800 | bypass: |
2665 | *ptr = root_mem_cgroup; | 2801 | *ptr = root_mem_cgroup; |
2666 | return -EINTR; | 2802 | return -EINTR; |
@@ -2709,15 +2845,10 @@ static void __mem_cgroup_cancel_local_charge(struct mem_cgroup *memcg, | |||
2709 | */ | 2845 | */ |
2710 | static struct mem_cgroup *mem_cgroup_lookup(unsigned short id) | 2846 | static struct mem_cgroup *mem_cgroup_lookup(unsigned short id) |
2711 | { | 2847 | { |
2712 | struct cgroup_subsys_state *css; | ||
2713 | |||
2714 | /* ID 0 is unused ID */ | 2848 | /* ID 0 is unused ID */ |
2715 | if (!id) | 2849 | if (!id) |
2716 | return NULL; | 2850 | return NULL; |
2717 | css = css_lookup(&mem_cgroup_subsys, id); | 2851 | return mem_cgroup_from_id(id); |
2718 | if (!css) | ||
2719 | return NULL; | ||
2720 | return mem_cgroup_from_css(css); | ||
2721 | } | 2852 | } |
2722 | 2853 | ||
2723 | struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) | 2854 | struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) |
@@ -2812,7 +2943,9 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, | |||
2812 | unlock_page_cgroup(pc); | 2943 | unlock_page_cgroup(pc); |
2813 | 2944 | ||
2814 | /* | 2945 | /* |
2815 | * "charge_statistics" updated event counter. | 2946 | * "charge_statistics" updated event counter. Then, check it. |
2947 | * Insert the ancestor (and its ancestors) into the softlimit RB-tree | ||
2948 | * if they exceed the softlimit. | ||
2816 | */ | 2949 | */ |
2817 | memcg_check_events(memcg, page); | 2950 | memcg_check_events(memcg, page); |
2818 | } | 2951 | } |
@@ -2836,7 +2969,7 @@ static struct kmem_cache *memcg_params_to_cache(struct memcg_cache_params *p) | |||
2836 | 2969 | ||
2837 | VM_BUG_ON(p->is_root_cache); | 2970 | VM_BUG_ON(p->is_root_cache); |
2838 | cachep = p->root_cache; | 2971 | cachep = p->root_cache; |
2839 | return cachep->memcg_params->memcg_caches[memcg_cache_id(p->memcg)]; | 2972 | return cache_from_memcg_idx(cachep, memcg_cache_id(p->memcg)); |
2840 | } | 2973 | } |
2841 | 2974 | ||
2842 | #ifdef CONFIG_SLABINFO | 2975 | #ifdef CONFIG_SLABINFO |
@@ -2865,21 +2998,14 @@ static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size) | |||
2865 | struct res_counter *fail_res; | 2998 | struct res_counter *fail_res; |
2866 | struct mem_cgroup *_memcg; | 2999 | struct mem_cgroup *_memcg; |
2867 | int ret = 0; | 3000 | int ret = 0; |
2868 | bool may_oom; | ||
2869 | 3001 | ||
2870 | ret = res_counter_charge(&memcg->kmem, size, &fail_res); | 3002 | ret = res_counter_charge(&memcg->kmem, size, &fail_res); |
2871 | if (ret) | 3003 | if (ret) |
2872 | return ret; | 3004 | return ret; |
2873 | 3005 | ||
2874 | /* | ||
2875 | * Conditions under which we can wait for the oom_killer. Those are | ||
2876 | * the same conditions tested by the core page allocator | ||
2877 | */ | ||
2878 | may_oom = (gfp & __GFP_FS) && !(gfp & __GFP_NORETRY); | ||
2879 | |||
2880 | _memcg = memcg; | 3006 | _memcg = memcg; |
2881 | ret = __mem_cgroup_try_charge(NULL, gfp, size >> PAGE_SHIFT, | 3007 | ret = __mem_cgroup_try_charge(NULL, gfp, size >> PAGE_SHIFT, |
2882 | &_memcg, may_oom); | 3008 | &_memcg, oom_gfp_allowed(gfp)); |
2883 | 3009 | ||
2884 | if (ret == -EINTR) { | 3010 | if (ret == -EINTR) { |
2885 | /* | 3011 | /* |
@@ -3019,7 +3145,7 @@ int memcg_update_cache_size(struct kmem_cache *s, int num_groups) | |||
3019 | { | 3145 | { |
3020 | struct memcg_cache_params *cur_params = s->memcg_params; | 3146 | struct memcg_cache_params *cur_params = s->memcg_params; |
3021 | 3147 | ||
3022 | VM_BUG_ON(s->memcg_params && !s->memcg_params->is_root_cache); | 3148 | VM_BUG_ON(!is_root_cache(s)); |
3023 | 3149 | ||
3024 | if (num_groups > memcg_limited_groups_array_size) { | 3150 | if (num_groups > memcg_limited_groups_array_size) { |
3025 | int i; | 3151 | int i; |
@@ -3280,7 +3406,7 @@ static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg, | |||
3280 | idx = memcg_cache_id(memcg); | 3406 | idx = memcg_cache_id(memcg); |
3281 | 3407 | ||
3282 | mutex_lock(&memcg_cache_mutex); | 3408 | mutex_lock(&memcg_cache_mutex); |
3283 | new_cachep = cachep->memcg_params->memcg_caches[idx]; | 3409 | new_cachep = cache_from_memcg_idx(cachep, idx); |
3284 | if (new_cachep) { | 3410 | if (new_cachep) { |
3285 | css_put(&memcg->css); | 3411 | css_put(&memcg->css); |
3286 | goto out; | 3412 | goto out; |
@@ -3326,8 +3452,8 @@ void kmem_cache_destroy_memcg_children(struct kmem_cache *s) | |||
3326 | * we'll take the set_limit_mutex to protect ourselves against this. | 3452 | * we'll take the set_limit_mutex to protect ourselves against this. |
3327 | */ | 3453 | */ |
3328 | mutex_lock(&set_limit_mutex); | 3454 | mutex_lock(&set_limit_mutex); |
3329 | for (i = 0; i < memcg_limited_groups_array_size; i++) { | 3455 | for_each_memcg_cache_index(i) { |
3330 | c = s->memcg_params->memcg_caches[i]; | 3456 | c = cache_from_memcg_idx(s, i); |
3331 | if (!c) | 3457 | if (!c) |
3332 | continue; | 3458 | continue; |
3333 | 3459 | ||
@@ -3460,8 +3586,8 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep, | |||
3460 | * code updating memcg_caches will issue a write barrier to match this. | 3586 | * code updating memcg_caches will issue a write barrier to match this. |
3461 | */ | 3587 | */ |
3462 | read_barrier_depends(); | 3588 | read_barrier_depends(); |
3463 | if (likely(cachep->memcg_params->memcg_caches[idx])) { | 3589 | if (likely(cache_from_memcg_idx(cachep, idx))) { |
3464 | cachep = cachep->memcg_params->memcg_caches[idx]; | 3590 | cachep = cache_from_memcg_idx(cachep, idx); |
3465 | goto out; | 3591 | goto out; |
3466 | } | 3592 | } |
3467 | 3593 | ||
@@ -3663,8 +3789,7 @@ void mem_cgroup_move_account_page_stat(struct mem_cgroup *from, | |||
3663 | { | 3789 | { |
3664 | /* Update stat data for mem_cgroup */ | 3790 | /* Update stat data for mem_cgroup */ |
3665 | preempt_disable(); | 3791 | preempt_disable(); |
3666 | WARN_ON_ONCE(from->stat->count[idx] < nr_pages); | 3792 | __this_cpu_sub(from->stat->count[idx], nr_pages); |
3667 | __this_cpu_add(from->stat->count[idx], -nr_pages); | ||
3668 | __this_cpu_add(to->stat->count[idx], nr_pages); | 3793 | __this_cpu_add(to->stat->count[idx], nr_pages); |
3669 | preempt_enable(); | 3794 | preempt_enable(); |
3670 | } | 3795 | } |
@@ -4232,7 +4357,7 @@ mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout) | |||
4232 | * css_get() was called in uncharge(). | 4357 | * css_get() was called in uncharge(). |
4233 | */ | 4358 | */ |
4234 | if (do_swap_account && swapout && memcg) | 4359 | if (do_swap_account && swapout && memcg) |
4235 | swap_cgroup_record(ent, css_id(&memcg->css)); | 4360 | swap_cgroup_record(ent, mem_cgroup_id(memcg)); |
4236 | } | 4361 | } |
4237 | #endif | 4362 | #endif |
4238 | 4363 | ||
@@ -4284,8 +4409,8 @@ static int mem_cgroup_move_swap_account(swp_entry_t entry, | |||
4284 | { | 4409 | { |
4285 | unsigned short old_id, new_id; | 4410 | unsigned short old_id, new_id; |
4286 | 4411 | ||
4287 | old_id = css_id(&from->css); | 4412 | old_id = mem_cgroup_id(from); |
4288 | new_id = css_id(&to->css); | 4413 | new_id = mem_cgroup_id(to); |
4289 | 4414 | ||
4290 | if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) { | 4415 | if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) { |
4291 | mem_cgroup_swap_statistics(from, false); | 4416 | mem_cgroup_swap_statistics(from, false); |
@@ -4647,6 +4772,98 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, | |||
4647 | return ret; | 4772 | return ret; |
4648 | } | 4773 | } |
4649 | 4774 | ||
4775 | unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, | ||
4776 | gfp_t gfp_mask, | ||
4777 | unsigned long *total_scanned) | ||
4778 | { | ||
4779 | unsigned long nr_reclaimed = 0; | ||
4780 | struct mem_cgroup_per_zone *mz, *next_mz = NULL; | ||
4781 | unsigned long reclaimed; | ||
4782 | int loop = 0; | ||
4783 | struct mem_cgroup_tree_per_zone *mctz; | ||
4784 | unsigned long long excess; | ||
4785 | unsigned long nr_scanned; | ||
4786 | |||
4787 | if (order > 0) | ||
4788 | return 0; | ||
4789 | |||
4790 | mctz = soft_limit_tree_node_zone(zone_to_nid(zone), zone_idx(zone)); | ||
4791 | /* | ||
4792 | * This loop can run a while, especially if mem_cgroups continuously | ||
4793 | * keep exceeding their soft limit and putting the system under | ||
4794 | * pressure | ||
4795 | */ | ||
4796 | do { | ||
4797 | if (next_mz) | ||
4798 | mz = next_mz; | ||
4799 | else | ||
4800 | mz = mem_cgroup_largest_soft_limit_node(mctz); | ||
4801 | if (!mz) | ||
4802 | break; | ||
4803 | |||
4804 | nr_scanned = 0; | ||
4805 | reclaimed = mem_cgroup_soft_reclaim(mz->memcg, zone, | ||
4806 | gfp_mask, &nr_scanned); | ||
4807 | nr_reclaimed += reclaimed; | ||
4808 | *total_scanned += nr_scanned; | ||
4809 | spin_lock(&mctz->lock); | ||
4810 | |||
4811 | /* | ||
4812 | * If we failed to reclaim anything from this memory cgroup | ||
4813 | * it is time to move on to the next cgroup | ||
4814 | */ | ||
4815 | next_mz = NULL; | ||
4816 | if (!reclaimed) { | ||
4817 | do { | ||
4818 | /* | ||
4819 | * Loop until we find yet another one. | ||
4820 | * | ||
4821 | * By the time we get the soft_limit lock | ||
4822 | * again, someone might have added the | ||
4823 | * group back on the RB tree. Iterate to | ||
4824 | * make sure we get a different mem. | ||
4825 | * mem_cgroup_largest_soft_limit_node returns | ||
4826 | * NULL if no other cgroup is present on | ||
4827 | * the tree | ||
4828 | */ | ||
4829 | next_mz = | ||
4830 | __mem_cgroup_largest_soft_limit_node(mctz); | ||
4831 | if (next_mz == mz) | ||
4832 | css_put(&next_mz->memcg->css); | ||
4833 | else /* next_mz == NULL or other memcg */ | ||
4834 | break; | ||
4835 | } while (1); | ||
4836 | } | ||
4837 | __mem_cgroup_remove_exceeded(mz->memcg, mz, mctz); | ||
4838 | excess = res_counter_soft_limit_excess(&mz->memcg->res); | ||
4839 | /* | ||
4840 | * One school of thought says that we should not add | ||
4841 | * back the node to the tree if reclaim returns 0. | ||
4842 | * But our reclaim could return 0 simply because, due | ||
4843 | * to the reclaim priority, we are exposing a smaller subset of | ||
4844 | * memory to reclaim from. Consider this as a longer | ||
4845 | * term TODO. | ||
4846 | */ | ||
4847 | /* If excess == 0, no tree ops */ | ||
4848 | __mem_cgroup_insert_exceeded(mz->memcg, mz, mctz, excess); | ||
4849 | spin_unlock(&mctz->lock); | ||
4850 | css_put(&mz->memcg->css); | ||
4851 | loop++; | ||
4852 | /* | ||
4853 | * Could not reclaim anything and there are no more | ||
4854 | * mem cgroups to try or we seem to be looping without | ||
4855 | * reclaiming anything. | ||
4856 | */ | ||
4857 | if (!nr_reclaimed && | ||
4858 | (next_mz == NULL || | ||
4859 | loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS)) | ||
4860 | break; | ||
4861 | } while (!nr_reclaimed); | ||
4862 | if (next_mz) | ||
4863 | css_put(&next_mz->memcg->css); | ||
4864 | return nr_reclaimed; | ||
4865 | } | ||
4866 | |||
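At the top level, mem_cgroup_soft_limit_reclaim() repeatedly takes the group with the largest excess off the per-zone tree, reclaims from it, and reinserts it with its recomputed excess, bailing out after MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS fruitless passes. A condensed user-space sketch of that pick/reclaim/reinsert loop, with an array standing in for the tree and made-up page counts:

#include <stdio.h>

#define NGROUPS 3
#define MAX_SOFT_LIMIT_RECLAIM_LOOPS 2

/* excess above the soft limit, per group; 0 means "not on the tree" */
static unsigned long excess[NGROUPS] = { 300, 120, 0 };

/* stand-in for mem_cgroup_largest_soft_limit_node() */
static int largest(void)
{
	int best = -1;
	for (int i = 0; i < NGROUPS; i++)
		if (excess[i] && (best < 0 || excess[i] > excess[best]))
			best = i;
	return best;
}

/* stand-in for mem_cgroup_soft_reclaim(): frees up to 100 pages and
 * leaves the group "reinserted" with its reduced excess */
static unsigned long reclaim_from(int g)
{
	unsigned long freed = excess[g] < 100 ? excess[g] : 100;
	excess[g] -= freed;
	return freed;
}

int main(void)
{
	unsigned long reclaimed = 0;
	int loop = 0;

	do {
		int g = largest();
		if (g < 0)
			break;				/* tree is empty */
		reclaimed += reclaim_from(g);
		/* mirrors the kernel's bail-out when nothing can be freed */
		if (++loop > MAX_SOFT_LIMIT_RECLAIM_LOOPS && !reclaimed)
			break;
	} while (!reclaimed);

	printf("soft limit reclaim freed %lu pages\n", reclaimed);
	return 0;
}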
4650 | /** | 4867 | /** |
4651 | * mem_cgroup_force_empty_list - clears LRU of a group | 4868 | * mem_cgroup_force_empty_list - clears LRU of a group |
4652 | * @memcg: group to clear | 4869 | * @memcg: group to clear |
@@ -4748,31 +4965,18 @@ static void mem_cgroup_reparent_charges(struct mem_cgroup *memcg) | |||
4748 | } while (usage > 0); | 4965 | } while (usage > 0); |
4749 | } | 4966 | } |
4750 | 4967 | ||
4751 | /* | ||
4752 | * This mainly exists for tests during the setting of set of use_hierarchy. | ||
4753 | * Since this is the very setting we are changing, the current hierarchy value | ||
4754 | * is meaningless | ||
4755 | */ | ||
4756 | static inline bool __memcg_has_children(struct mem_cgroup *memcg) | ||
4757 | { | ||
4758 | struct cgroup_subsys_state *pos; | ||
4759 | |||
4760 | /* bounce at first found */ | ||
4761 | css_for_each_child(pos, &memcg->css) | ||
4762 | return true; | ||
4763 | return false; | ||
4764 | } | ||
4765 | |||
4766 | /* | ||
4767 | * Must be called with memcg_create_mutex held, unless the cgroup is guaranteed | ||
4768 | * to be already dead (as in mem_cgroup_force_empty, for instance). This is | ||
4769 | * from mem_cgroup_count_children(), in the sense that we don't really care how | ||
4770 | * many children we have; we only need to know if we have any. It also counts | ||
4771 | * any memcg without hierarchy as infertile. | ||
4772 | */ | ||
4773 | static inline bool memcg_has_children(struct mem_cgroup *memcg) | 4968 | static inline bool memcg_has_children(struct mem_cgroup *memcg) |
4774 | { | 4969 | { |
4775 | return memcg->use_hierarchy && __memcg_has_children(memcg); | 4970 | lockdep_assert_held(&memcg_create_mutex); |
4971 | /* | ||
4972 | * The lock does not prevent addition or deletion to the list | ||
4973 | * of children, but it prevents a new child from being | ||
4974 | * initialized based on this parent in css_online(), so it's | ||
4975 | * enough to decide whether hierarchically inherited | ||
4976 | * attributes can still be changed or not. | ||
4977 | */ | ||
4978 | return memcg->use_hierarchy && | ||
4979 | !list_empty(&memcg->css.cgroup->children); | ||
4776 | } | 4980 | } |
4777 | 4981 | ||
4778 | /* | 4982 | /* |
@@ -4852,7 +5056,7 @@ static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css, | |||
4852 | */ | 5056 | */ |
4853 | if ((!parent_memcg || !parent_memcg->use_hierarchy) && | 5057 | if ((!parent_memcg || !parent_memcg->use_hierarchy) && |
4854 | (val == 1 || val == 0)) { | 5058 | (val == 1 || val == 0)) { |
4855 | if (!__memcg_has_children(memcg)) | 5059 | if (list_empty(&memcg->css.cgroup->children)) |
4856 | memcg->use_hierarchy = val; | 5060 | memcg->use_hierarchy = val; |
4857 | else | 5061 | else |
4858 | retval = -EBUSY; | 5062 | retval = -EBUSY; |
@@ -5179,45 +5383,50 @@ static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css, | |||
5179 | static int memcg_numa_stat_show(struct cgroup_subsys_state *css, | 5383 | static int memcg_numa_stat_show(struct cgroup_subsys_state *css, |
5180 | struct cftype *cft, struct seq_file *m) | 5384 | struct cftype *cft, struct seq_file *m) |
5181 | { | 5385 | { |
5386 | struct numa_stat { | ||
5387 | const char *name; | ||
5388 | unsigned int lru_mask; | ||
5389 | }; | ||
5390 | |||
5391 | static const struct numa_stat stats[] = { | ||
5392 | { "total", LRU_ALL }, | ||
5393 | { "file", LRU_ALL_FILE }, | ||
5394 | { "anon", LRU_ALL_ANON }, | ||
5395 | { "unevictable", BIT(LRU_UNEVICTABLE) }, | ||
5396 | }; | ||
5397 | const struct numa_stat *stat; | ||
5182 | int nid; | 5398 | int nid; |
5183 | unsigned long total_nr, file_nr, anon_nr, unevictable_nr; | 5399 | unsigned long nr; |
5184 | unsigned long node_nr; | ||
5185 | struct mem_cgroup *memcg = mem_cgroup_from_css(css); | 5400 | struct mem_cgroup *memcg = mem_cgroup_from_css(css); |
5186 | 5401 | ||
5187 | total_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL); | 5402 | for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) { |
5188 | seq_printf(m, "total=%lu", total_nr); | 5403 | nr = mem_cgroup_nr_lru_pages(memcg, stat->lru_mask); |
5189 | for_each_node_state(nid, N_MEMORY) { | 5404 | seq_printf(m, "%s=%lu", stat->name, nr); |
5190 | node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL); | 5405 | for_each_node_state(nid, N_MEMORY) { |
5191 | seq_printf(m, " N%d=%lu", nid, node_nr); | 5406 | nr = mem_cgroup_node_nr_lru_pages(memcg, nid, |
5192 | } | 5407 | stat->lru_mask); |
5193 | seq_putc(m, '\n'); | 5408 | seq_printf(m, " N%d=%lu", nid, nr); |
5194 | 5409 | } | |
5195 | file_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL_FILE); | 5410 | seq_putc(m, '\n'); |
5196 | seq_printf(m, "file=%lu", file_nr); | ||
5197 | for_each_node_state(nid, N_MEMORY) { | ||
5198 | node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, | ||
5199 | LRU_ALL_FILE); | ||
5200 | seq_printf(m, " N%d=%lu", nid, node_nr); | ||
5201 | } | 5411 | } |
5202 | seq_putc(m, '\n'); | ||
5203 | 5412 | ||
5204 | anon_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL_ANON); | 5413 | for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) { |
5205 | seq_printf(m, "anon=%lu", anon_nr); | 5414 | struct mem_cgroup *iter; |
5206 | for_each_node_state(nid, N_MEMORY) { | 5415 | |
5207 | node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, | 5416 | nr = 0; |
5208 | LRU_ALL_ANON); | 5417 | for_each_mem_cgroup_tree(iter, memcg) |
5209 | seq_printf(m, " N%d=%lu", nid, node_nr); | 5418 | nr += mem_cgroup_nr_lru_pages(iter, stat->lru_mask); |
5419 | seq_printf(m, "hierarchical_%s=%lu", stat->name, nr); | ||
5420 | for_each_node_state(nid, N_MEMORY) { | ||
5421 | nr = 0; | ||
5422 | for_each_mem_cgroup_tree(iter, memcg) | ||
5423 | nr += mem_cgroup_node_nr_lru_pages( | ||
5424 | iter, nid, stat->lru_mask); | ||
5425 | seq_printf(m, " N%d=%lu", nid, nr); | ||
5426 | } | ||
5427 | seq_putc(m, '\n'); | ||
5210 | } | 5428 | } |
5211 | seq_putc(m, '\n'); | ||
5212 | 5429 | ||
5213 | unevictable_nr = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_UNEVICTABLE)); | ||
5214 | seq_printf(m, "unevictable=%lu", unevictable_nr); | ||
5215 | for_each_node_state(nid, N_MEMORY) { | ||
5216 | node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, | ||
5217 | BIT(LRU_UNEVICTABLE)); | ||
5218 | seq_printf(m, " N%d=%lu", nid, node_nr); | ||
5219 | } | ||
5220 | seq_putc(m, '\n'); | ||
5221 | return 0; | 5430 | return 0; |
5222 | } | 5431 | } |
5223 | #endif /* CONFIG_NUMA */ | 5432 | #endif /* CONFIG_NUMA */ |
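The memcg_numa_stat_show() rewrite replaces four nearly identical blocks with a static { name, lru_mask } table and one generic loop, which is also what makes the extra hierarchical_* rows cheap to emit. The same table-driven pattern in miniature, with made-up per-node counts:

#include <stdio.h>

#define NNODES 2

enum { LRU_ANON, LRU_FILE, LRU_UNEVICTABLE, NR_LRU };

/* pages per node and per LRU list, made-up numbers */
static const unsigned long pages[NNODES][NR_LRU] = {
	{ 100, 400, 3 },
	{  50, 200, 0 },
};

struct numa_stat {
	const char *name;
	unsigned int lru_mask;
};

static unsigned long count(int nid, unsigned int mask)
{
	unsigned long nr = 0;
	for (int lru = 0; lru < NR_LRU; lru++)
		if (mask & (1u << lru))
			nr += pages[nid][lru];
	return nr;
}

int main(void)
{
	static const struct numa_stat stats[] = {
		{ "total", (1u << NR_LRU) - 1 },
		{ "file",  1u << LRU_FILE },
		{ "anon",  1u << LRU_ANON },
		{ "unevictable", 1u << LRU_UNEVICTABLE },
	};

	/* one loop replaces one hand-written block per statistic */
	for (size_t i = 0; i < sizeof(stats) / sizeof(stats[0]); i++) {
		unsigned long nr = 0;
		for (int nid = 0; nid < NNODES; nid++)
			nr += count(nid, stats[i].lru_mask);
		printf("%s=%lu", stats[i].name, nr);
		for (int nid = 0; nid < NNODES; nid++)
			printf(" N%d=%lu", nid, count(nid, stats[i].lru_mask));
		printf("\n");
	}
	return 0;
}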
@@ -5911,6 +6120,8 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) | |||
5911 | for (zone = 0; zone < MAX_NR_ZONES; zone++) { | 6120 | for (zone = 0; zone < MAX_NR_ZONES; zone++) { |
5912 | mz = &pn->zoneinfo[zone]; | 6121 | mz = &pn->zoneinfo[zone]; |
5913 | lruvec_init(&mz->lruvec); | 6122 | lruvec_init(&mz->lruvec); |
6123 | mz->usage_in_excess = 0; | ||
6124 | mz->on_tree = false; | ||
5914 | mz->memcg = memcg; | 6125 | mz->memcg = memcg; |
5915 | } | 6126 | } |
5916 | memcg->nodeinfo[node] = pn; | 6127 | memcg->nodeinfo[node] = pn; |
@@ -5966,7 +6177,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg) | |||
5966 | int node; | 6177 | int node; |
5967 | size_t size = memcg_size(); | 6178 | size_t size = memcg_size(); |
5968 | 6179 | ||
5969 | free_css_id(&mem_cgroup_subsys, &memcg->css); | 6180 | mem_cgroup_remove_from_trees(memcg); |
5970 | 6181 | ||
5971 | for_each_node(node) | 6182 | for_each_node(node) |
5972 | free_mem_cgroup_per_zone_info(memcg, node); | 6183 | free_mem_cgroup_per_zone_info(memcg, node); |
@@ -6002,6 +6213,29 @@ struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg) | |||
6002 | } | 6213 | } |
6003 | EXPORT_SYMBOL(parent_mem_cgroup); | 6214 | EXPORT_SYMBOL(parent_mem_cgroup); |
6004 | 6215 | ||
6216 | static void __init mem_cgroup_soft_limit_tree_init(void) | ||
6217 | { | ||
6218 | struct mem_cgroup_tree_per_node *rtpn; | ||
6219 | struct mem_cgroup_tree_per_zone *rtpz; | ||
6220 | int tmp, node, zone; | ||
6221 | |||
6222 | for_each_node(node) { | ||
6223 | tmp = node; | ||
6224 | if (!node_state(node, N_NORMAL_MEMORY)) | ||
6225 | tmp = -1; | ||
6226 | rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp); | ||
6227 | BUG_ON(!rtpn); | ||
6228 | |||
6229 | soft_limit_tree.rb_tree_per_node[node] = rtpn; | ||
6230 | |||
6231 | for (zone = 0; zone < MAX_NR_ZONES; zone++) { | ||
6232 | rtpz = &rtpn->rb_tree_per_zone[zone]; | ||
6233 | rtpz->rb_root = RB_ROOT; | ||
6234 | spin_lock_init(&rtpz->lock); | ||
6235 | } | ||
6236 | } | ||
6237 | } | ||
6238 | |||
6005 | static struct cgroup_subsys_state * __ref | 6239 | static struct cgroup_subsys_state * __ref |
6006 | mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) | 6240 | mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) |
6007 | { | 6241 | { |
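The mem_cgroup_soft_limit_tree_init() added above builds one rb-tree root plus lock per (node, zone) pair at boot. Below is a minimal user-space sketch of that data-structure shape only, with plain C stand-ins for rb_root and spinlock_t; the type names, sizes and NR_NODES value are illustrative assumptions, not the kernel's.

#include <stdio.h>
#include <stdlib.h>

#define NR_NODES     2   /* assumption: toy topology for the sketch */
#define MAX_NR_ZONES 3

struct tree_per_zone {
	void *rb_root;   /* stand-in for struct rb_root (NULL == empty tree) */
	int   lock;      /* stand-in for spinlock_t */
};

struct tree_per_node {
	struct tree_per_zone rb_tree_per_zone[MAX_NR_ZONES];
};

static struct tree_per_node *soft_limit_tree[NR_NODES];

int main(void)
{
	for (int node = 0; node < NR_NODES; node++) {
		/* kzalloc_node() in the kernel; plain calloc() here */
		struct tree_per_node *rtpn = calloc(1, sizeof(*rtpn));

		if (!rtpn)
			abort();          /* the kernel BUG_ON()s instead */
		soft_limit_tree[node] = rtpn;

		for (int zone = 0; zone < MAX_NR_ZONES; zone++) {
			rtpn->rb_tree_per_zone[zone].rb_root = NULL;
			rtpn->rb_tree_per_zone[zone].lock = 0;
		}
	}
	printf("per-(node,zone) soft limit trees: %d x %d\n",
	       NR_NODES, MAX_NR_ZONES);
	return 0;
}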
@@ -6031,7 +6265,6 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) | |||
6031 | mutex_init(&memcg->thresholds_lock); | 6265 | mutex_init(&memcg->thresholds_lock); |
6032 | spin_lock_init(&memcg->move_lock); | 6266 | spin_lock_init(&memcg->move_lock); |
6033 | vmpressure_init(&memcg->vmpressure); | 6267 | vmpressure_init(&memcg->vmpressure); |
6034 | spin_lock_init(&memcg->soft_lock); | ||
6035 | 6268 | ||
6036 | return &memcg->css; | 6269 | return &memcg->css; |
6037 | 6270 | ||
@@ -6047,6 +6280,9 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css) | |||
6047 | struct mem_cgroup *parent = mem_cgroup_from_css(css_parent(css)); | 6280 | struct mem_cgroup *parent = mem_cgroup_from_css(css_parent(css)); |
6048 | int error = 0; | 6281 | int error = 0; |
6049 | 6282 | ||
6283 | if (css->cgroup->id > MEM_CGROUP_ID_MAX) | ||
6284 | return -ENOSPC; | ||
6285 | |||
6050 | if (!parent) | 6286 | if (!parent) |
6051 | return 0; | 6287 | return 0; |
6052 | 6288 | ||
@@ -6109,13 +6345,6 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) | |||
6109 | 6345 | ||
6110 | mem_cgroup_invalidate_reclaim_iterators(memcg); | 6346 | mem_cgroup_invalidate_reclaim_iterators(memcg); |
6111 | mem_cgroup_reparent_charges(memcg); | 6347 | mem_cgroup_reparent_charges(memcg); |
6112 | if (memcg->soft_contributed) { | ||
6113 | while ((memcg = parent_mem_cgroup(memcg))) | ||
6114 | atomic_dec(&memcg->children_in_excess); | ||
6115 | |||
6116 | if (memcg != root_mem_cgroup && !root_mem_cgroup->use_hierarchy) | ||
6117 | atomic_dec(&root_mem_cgroup->children_in_excess); | ||
6118 | } | ||
6119 | mem_cgroup_destroy_all_caches(memcg); | 6348 | mem_cgroup_destroy_all_caches(memcg); |
6120 | vmpressure_cleanup(&memcg->vmpressure); | 6349 | vmpressure_cleanup(&memcg->vmpressure); |
6121 | } | 6350 | } |
@@ -6325,7 +6554,7 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma, | |||
6325 | } | 6554 | } |
6326 | /* There is a swap entry and a page doesn't exist or isn't charged */ | 6555 | /* There is a swap entry and a page doesn't exist or isn't charged */ |
6327 | if (ent.val && !ret && | 6556 | if (ent.val && !ret && |
6328 | css_id(&mc.from->css) == lookup_swap_cgroup_id(ent)) { | 6557 | mem_cgroup_id(mc.from) == lookup_swap_cgroup_id(ent)) { |
6329 | ret = MC_TARGET_SWAP; | 6558 | ret = MC_TARGET_SWAP; |
6330 | if (target) | 6559 | if (target) |
6331 | target->ent = ent; | 6560 | target->ent = ent; |
@@ -6376,10 +6605,10 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd, | |||
6376 | pte_t *pte; | 6605 | pte_t *pte; |
6377 | spinlock_t *ptl; | 6606 | spinlock_t *ptl; |
6378 | 6607 | ||
6379 | if (pmd_trans_huge_lock(pmd, vma) == 1) { | 6608 | if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { |
6380 | if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE) | 6609 | if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE) |
6381 | mc.precharge += HPAGE_PMD_NR; | 6610 | mc.precharge += HPAGE_PMD_NR; |
6382 | spin_unlock(&vma->vm_mm->page_table_lock); | 6611 | spin_unlock(ptl); |
6383 | return 0; | 6612 | return 0; |
6384 | } | 6613 | } |
6385 | 6614 | ||
@@ -6568,9 +6797,9 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, | |||
6568 | * to be unlocked in __split_huge_page_splitting(), where the main | 6797 | * to be unlocked in __split_huge_page_splitting(), where the main |
6569 | * part of thp split is not executed yet. | 6798 | * part of thp split is not executed yet. |
6570 | */ | 6799 | */ |
6571 | if (pmd_trans_huge_lock(pmd, vma) == 1) { | 6800 | if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { |
6572 | if (mc.precharge < HPAGE_PMD_NR) { | 6801 | if (mc.precharge < HPAGE_PMD_NR) { |
6573 | spin_unlock(&vma->vm_mm->page_table_lock); | 6802 | spin_unlock(ptl); |
6574 | return 0; | 6803 | return 0; |
6575 | } | 6804 | } |
6576 | target_type = get_mctgt_type_thp(vma, addr, *pmd, &target); | 6805 | target_type = get_mctgt_type_thp(vma, addr, *pmd, &target); |
@@ -6587,7 +6816,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, | |||
6587 | } | 6816 | } |
6588 | put_page(page); | 6817 | put_page(page); |
6589 | } | 6818 | } |
6590 | spin_unlock(&vma->vm_mm->page_table_lock); | 6819 | spin_unlock(ptl); |
6591 | return 0; | 6820 | return 0; |
6592 | } | 6821 | } |
6593 | 6822 | ||
@@ -6745,7 +6974,6 @@ struct cgroup_subsys mem_cgroup_subsys = { | |||
6745 | .bind = mem_cgroup_bind, | 6974 | .bind = mem_cgroup_bind, |
6746 | .base_cftypes = mem_cgroup_files, | 6975 | .base_cftypes = mem_cgroup_files, |
6747 | .early_init = 0, | 6976 | .early_init = 0, |
6748 | .use_id = 1, | ||
6749 | }; | 6977 | }; |
6750 | 6978 | ||
6751 | #ifdef CONFIG_MEMCG_SWAP | 6979 | #ifdef CONFIG_MEMCG_SWAP |
@@ -6790,6 +7018,7 @@ static int __init mem_cgroup_init(void) | |||
6790 | { | 7018 | { |
6791 | hotcpu_notifier(memcg_cpu_hotplug_callback, 0); | 7019 | hotcpu_notifier(memcg_cpu_hotplug_callback, 0); |
6792 | enable_swap_cgroup(); | 7020 | enable_swap_cgroup(); |
7021 | mem_cgroup_soft_limit_tree_init(); | ||
6793 | memcg_stock_init(); | 7022 | memcg_stock_init(); |
6794 | return 0; | 7023 | return 0; |
6795 | } | 7024 | } |
diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 947ed5413279..b7c171602ba1 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c | |||
@@ -1114,8 +1114,10 @@ int memory_failure(unsigned long pfn, int trapno, int flags) | |||
1114 | * shake_page could have turned it free. | 1114 | * shake_page could have turned it free. |
1115 | */ | 1115 | */ |
1116 | if (is_free_buddy_page(p)) { | 1116 | if (is_free_buddy_page(p)) { |
1117 | action_result(pfn, "free buddy, 2nd try", | 1117 | if (flags & MF_COUNT_INCREASED) |
1118 | DELAYED); | 1118 | action_result(pfn, "free buddy", DELAYED); |
1119 | else | ||
1120 | action_result(pfn, "free buddy, 2nd try", DELAYED); | ||
1119 | return 0; | 1121 | return 0; |
1120 | } | 1122 | } |
1121 | action_result(pfn, "non LRU", IGNORED); | 1123 | action_result(pfn, "non LRU", IGNORED); |
@@ -1267,7 +1269,7 @@ void memory_failure_queue(unsigned long pfn, int trapno, int flags) | |||
1267 | 1269 | ||
1268 | mf_cpu = &get_cpu_var(memory_failure_cpu); | 1270 | mf_cpu = &get_cpu_var(memory_failure_cpu); |
1269 | spin_lock_irqsave(&mf_cpu->lock, proc_flags); | 1271 | spin_lock_irqsave(&mf_cpu->lock, proc_flags); |
1270 | if (kfifo_put(&mf_cpu->fifo, &entry)) | 1272 | if (kfifo_put(&mf_cpu->fifo, entry)) |
1271 | schedule_work_on(smp_processor_id(), &mf_cpu->work); | 1273 | schedule_work_on(smp_processor_id(), &mf_cpu->work); |
1272 | else | 1274 | else |
1273 | pr_err("Memory failure: buffer overflow when queuing memory failure at %#lx\n", | 1275 | pr_err("Memory failure: buffer overflow when queuing memory failure at %#lx\n", |
@@ -1349,7 +1351,7 @@ int unpoison_memory(unsigned long pfn) | |||
1349 | * worked by memory_failure() and the page lock is not held yet. | 1351 | * worked by memory_failure() and the page lock is not held yet. |
1350 | * In such case, we yield to memory_failure() and make unpoison fail. | 1352 | * In such case, we yield to memory_failure() and make unpoison fail. |
1351 | */ | 1353 | */ |
1352 | if (PageTransHuge(page)) { | 1354 | if (!PageHuge(page) && PageTransHuge(page)) { |
1353 | pr_info("MCE: Memory failure is now running on %#lx\n", pfn); | 1355 | pr_info("MCE: Memory failure is now running on %#lx\n", pfn); |
1354 | return 0; | 1356 | return 0; |
1355 | } | 1357 | } |
@@ -1421,19 +1423,6 @@ static int __get_any_page(struct page *p, unsigned long pfn, int flags) | |||
1421 | return 1; | 1423 | return 1; |
1422 | 1424 | ||
1423 | /* | 1425 | /* |
1424 | * The lock_memory_hotplug prevents a race with memory hotplug. | ||
1425 | * This is a big hammer, a better would be nicer. | ||
1426 | */ | ||
1427 | lock_memory_hotplug(); | ||
1428 | |||
1429 | /* | ||
1430 | * Isolate the page, so that it doesn't get reallocated if it | ||
1431 | * was free. This flag should be kept set until the source page | ||
1432 | * is freed and PG_hwpoison on it is set. | ||
1433 | */ | ||
1434 | if (get_pageblock_migratetype(p) != MIGRATE_ISOLATE) | ||
1435 | set_migratetype_isolate(p, true); | ||
1436 | /* | ||
1437 | * When the target page is a free hugepage, just remove it | 1426 | * When the target page is a free hugepage, just remove it |
1438 | * from free hugepage list. | 1427 | * from free hugepage list. |
1439 | */ | 1428 | */ |
@@ -1453,7 +1442,6 @@ static int __get_any_page(struct page *p, unsigned long pfn, int flags) | |||
1453 | /* Not a free page */ | 1442 | /* Not a free page */ |
1454 | ret = 1; | 1443 | ret = 1; |
1455 | } | 1444 | } |
1456 | unlock_memory_hotplug(); | ||
1457 | return ret; | 1445 | return ret; |
1458 | } | 1446 | } |
1459 | 1447 | ||
@@ -1652,15 +1640,28 @@ int soft_offline_page(struct page *page, int flags) | |||
1652 | } | 1640 | } |
1653 | } | 1641 | } |
1654 | 1642 | ||
1643 | /* | ||
1644 | * The lock_memory_hotplug prevents a race with memory hotplug. | ||
1645 | * This is a big hammer, a better would be nicer. | ||
1646 | */ | ||
1647 | lock_memory_hotplug(); | ||
1648 | |||
1649 | /* | ||
1650 | * Isolate the page, so that it doesn't get reallocated if it | ||
1651 | * was free. This flag should be kept set until the source page | ||
1652 | * is freed and PG_hwpoison on it is set. | ||
1653 | */ | ||
1654 | if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE) | ||
1655 | set_migratetype_isolate(page, true); | ||
1656 | |||
1655 | ret = get_any_page(page, pfn, flags); | 1657 | ret = get_any_page(page, pfn, flags); |
1656 | if (ret < 0) | 1658 | unlock_memory_hotplug(); |
1657 | goto unset; | 1659 | if (ret > 0) { /* for in-use pages */ |
1658 | if (ret) { /* for in-use pages */ | ||
1659 | if (PageHuge(page)) | 1660 | if (PageHuge(page)) |
1660 | ret = soft_offline_huge_page(page, flags); | 1661 | ret = soft_offline_huge_page(page, flags); |
1661 | else | 1662 | else |
1662 | ret = __soft_offline_page(page, flags); | 1663 | ret = __soft_offline_page(page, flags); |
1663 | } else { /* for free pages */ | 1664 | } else if (ret == 0) { /* for free pages */ |
1664 | if (PageHuge(page)) { | 1665 | if (PageHuge(page)) { |
1665 | set_page_hwpoison_huge_page(hpage); | 1666 | set_page_hwpoison_huge_page(hpage); |
1666 | dequeue_hwpoisoned_huge_page(hpage); | 1667 | dequeue_hwpoisoned_huge_page(hpage); |
@@ -1671,7 +1672,6 @@ int soft_offline_page(struct page *page, int flags) | |||
1671 | atomic_long_inc(&num_poisoned_pages); | 1672 | atomic_long_inc(&num_poisoned_pages); |
1672 | } | 1673 | } |
1673 | } | 1674 | } |
1674 | unset: | ||
1675 | unset_migratetype_isolate(page, MIGRATE_MOVABLE); | 1675 | unset_migratetype_isolate(page, MIGRATE_MOVABLE); |
1676 | return ret; | 1676 | return ret; |
1677 | } | 1677 | } |
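The reordered soft_offline_page() path above now takes the hotplug lock and isolates the pageblock before looking the page up, drops the lock before doing the slow offlining work, and branches on the get_any_page() result (positive: in-use, zero: free, negative: error). A hedged control-flow sketch of just that ordering follows; every helper is a local stub standing in for the kernel function named in its comment.

/* Stub helpers; the real kernel functions take a page/pfn and flags. */
static void lock_memory_hotplug(void)      { }
static void unlock_memory_hotplug(void)    { }
static void isolate_pageblock(void)        { }  /* set_migratetype_isolate()   */
static void unisolate_pageblock(void)      { }  /* unset_migratetype_isolate() */
static int  get_any_page_stub(void)        { return 1; }
static int  offline_in_use_page(void)      { return 0; }
static int  mark_free_page_poisoned(void)  { return 0; }

static int soft_offline_flow(void)
{
	int ret;

	lock_memory_hotplug();
	isolate_pageblock();          /* keep the page from being reallocated */
	ret = get_any_page_stub();
	unlock_memory_hotplug();      /* dropped before the slow work below */

	if (ret > 0)                  /* in-use page */
		ret = offline_in_use_page();
	else if (ret == 0)            /* free page */
		ret = mark_free_page_poisoned();
	/* ret < 0: error from get_any_page(), fall through */

	unisolate_pageblock();        /* always undone, success or failure */
	return ret;
}

int main(void) { return soft_offline_flow(); }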
diff --git a/mm/memory.c b/mm/memory.c index ca0003947115..0409e8f43fa0 100644 --- a/mm/memory.c +++ b/mm/memory.c | |||
@@ -69,8 +69,8 @@ | |||
69 | 69 | ||
70 | #include "internal.h" | 70 | #include "internal.h" |
71 | 71 | ||
72 | #ifdef LAST_NID_NOT_IN_PAGE_FLAGS | 72 | #ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS |
73 | #warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_nid. | 73 | #warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_cpupid. |
74 | #endif | 74 | #endif |
75 | 75 | ||
76 | #ifndef CONFIG_NEED_MULTIPLE_NODES | 76 | #ifndef CONFIG_NEED_MULTIPLE_NODES |
@@ -382,7 +382,7 @@ static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd, | |||
382 | pgtable_t token = pmd_pgtable(*pmd); | 382 | pgtable_t token = pmd_pgtable(*pmd); |
383 | pmd_clear(pmd); | 383 | pmd_clear(pmd); |
384 | pte_free_tlb(tlb, token, addr); | 384 | pte_free_tlb(tlb, token, addr); |
385 | tlb->mm->nr_ptes--; | 385 | atomic_long_dec(&tlb->mm->nr_ptes); |
386 | } | 386 | } |
387 | 387 | ||
388 | static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud, | 388 | static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud, |
@@ -453,8 +453,6 @@ static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd, | |||
453 | 453 | ||
454 | /* | 454 | /* |
455 | * This function frees user-level page tables of a process. | 455 | * This function frees user-level page tables of a process. |
456 | * | ||
457 | * Must be called with pagetable lock held. | ||
458 | */ | 456 | */ |
459 | void free_pgd_range(struct mmu_gather *tlb, | 457 | void free_pgd_range(struct mmu_gather *tlb, |
460 | unsigned long addr, unsigned long end, | 458 | unsigned long addr, unsigned long end, |
@@ -552,6 +550,7 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma, | |||
552 | int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, | 550 | int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, |
553 | pmd_t *pmd, unsigned long address) | 551 | pmd_t *pmd, unsigned long address) |
554 | { | 552 | { |
553 | spinlock_t *ptl; | ||
555 | pgtable_t new = pte_alloc_one(mm, address); | 554 | pgtable_t new = pte_alloc_one(mm, address); |
556 | int wait_split_huge_page; | 555 | int wait_split_huge_page; |
557 | if (!new) | 556 | if (!new) |
@@ -572,15 +571,15 @@ int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, | |||
572 | */ | 571 | */ |
573 | smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */ | 572 | smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */ |
574 | 573 | ||
575 | spin_lock(&mm->page_table_lock); | 574 | ptl = pmd_lock(mm, pmd); |
576 | wait_split_huge_page = 0; | 575 | wait_split_huge_page = 0; |
577 | if (likely(pmd_none(*pmd))) { /* Has another populated it ? */ | 576 | if (likely(pmd_none(*pmd))) { /* Has another populated it ? */ |
578 | mm->nr_ptes++; | 577 | atomic_long_inc(&mm->nr_ptes); |
579 | pmd_populate(mm, pmd, new); | 578 | pmd_populate(mm, pmd, new); |
580 | new = NULL; | 579 | new = NULL; |
581 | } else if (unlikely(pmd_trans_splitting(*pmd))) | 580 | } else if (unlikely(pmd_trans_splitting(*pmd))) |
582 | wait_split_huge_page = 1; | 581 | wait_split_huge_page = 1; |
583 | spin_unlock(&mm->page_table_lock); | 582 | spin_unlock(ptl); |
584 | if (new) | 583 | if (new) |
585 | pte_free(mm, new); | 584 | pte_free(mm, new); |
586 | if (wait_split_huge_page) | 585 | if (wait_split_huge_page) |
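The __pte_alloc() hunk keeps the long-standing idiom of allocating the page table outside the lock (allocation may sleep), then re-checking pmd_none() under the now per-pmd lock and freeing the table if another thread populated the entry first. A generic user-space sketch of that allocate-then-recheck pattern, with a pthread mutex standing in for the pmd lock; the names are illustrative only.

#include <pthread.h>
#include <stdlib.h>

static pthread_mutex_t pmd_lock_stub = PTHREAD_MUTEX_INITIALIZER;
static void *populated;              /* stands in for the pmd entry */

static int pte_alloc_sketch(void)
{
	void *new = malloc(4096);    /* allocate outside the lock (may sleep) */

	if (!new)
		return -1;

	pthread_mutex_lock(&pmd_lock_stub);
	if (!populated) {            /* pmd_none(): nobody populated it yet */
		populated = new;
		new = NULL;
	}
	pthread_mutex_unlock(&pmd_lock_stub);

	free(new);                   /* lost the race: drop our copy; free(NULL) is a no-op */
	return 0;
}

int main(void) { return pte_alloc_sketch(); }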
@@ -681,7 +680,7 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, | |||
681 | if (vma->vm_ops) | 680 | if (vma->vm_ops) |
682 | printk(KERN_ALERT "vma->vm_ops->fault: %pSR\n", | 681 | printk(KERN_ALERT "vma->vm_ops->fault: %pSR\n", |
683 | vma->vm_ops->fault); | 682 | vma->vm_ops->fault); |
684 | if (vma->vm_file && vma->vm_file->f_op) | 683 | if (vma->vm_file) |
685 | printk(KERN_ALERT "vma->vm_file->f_op->mmap: %pSR\n", | 684 | printk(KERN_ALERT "vma->vm_file->f_op->mmap: %pSR\n", |
686 | vma->vm_file->f_op->mmap); | 685 | vma->vm_file->f_op->mmap); |
687 | dump_stack(); | 686 | dump_stack(); |
@@ -837,6 +836,8 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, | |||
837 | */ | 836 | */ |
838 | make_migration_entry_read(&entry); | 837 | make_migration_entry_read(&entry); |
839 | pte = swp_entry_to_pte(entry); | 838 | pte = swp_entry_to_pte(entry); |
839 | if (pte_swp_soft_dirty(*src_pte)) | ||
840 | pte = pte_swp_mksoft_dirty(pte); | ||
840 | set_pte_at(src_mm, addr, src_pte, pte); | 841 | set_pte_at(src_mm, addr, src_pte, pte); |
841 | } | 842 | } |
842 | } | 843 | } |
@@ -1516,20 +1517,20 @@ struct page *follow_page_mask(struct vm_area_struct *vma, | |||
1516 | split_huge_page_pmd(vma, address, pmd); | 1517 | split_huge_page_pmd(vma, address, pmd); |
1517 | goto split_fallthrough; | 1518 | goto split_fallthrough; |
1518 | } | 1519 | } |
1519 | spin_lock(&mm->page_table_lock); | 1520 | ptl = pmd_lock(mm, pmd); |
1520 | if (likely(pmd_trans_huge(*pmd))) { | 1521 | if (likely(pmd_trans_huge(*pmd))) { |
1521 | if (unlikely(pmd_trans_splitting(*pmd))) { | 1522 | if (unlikely(pmd_trans_splitting(*pmd))) { |
1522 | spin_unlock(&mm->page_table_lock); | 1523 | spin_unlock(ptl); |
1523 | wait_split_huge_page(vma->anon_vma, pmd); | 1524 | wait_split_huge_page(vma->anon_vma, pmd); |
1524 | } else { | 1525 | } else { |
1525 | page = follow_trans_huge_pmd(vma, address, | 1526 | page = follow_trans_huge_pmd(vma, address, |
1526 | pmd, flags); | 1527 | pmd, flags); |
1527 | spin_unlock(&mm->page_table_lock); | 1528 | spin_unlock(ptl); |
1528 | *page_mask = HPAGE_PMD_NR - 1; | 1529 | *page_mask = HPAGE_PMD_NR - 1; |
1529 | goto out; | 1530 | goto out; |
1530 | } | 1531 | } |
1531 | } else | 1532 | } else |
1532 | spin_unlock(&mm->page_table_lock); | 1533 | spin_unlock(ptl); |
1533 | /* fall through */ | 1534 | /* fall through */ |
1534 | } | 1535 | } |
1535 | split_fallthrough: | 1536 | split_fallthrough: |
@@ -2719,6 +2720,14 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2719 | get_page(dirty_page); | 2720 | get_page(dirty_page); |
2720 | 2721 | ||
2721 | reuse: | 2722 | reuse: |
2723 | /* | ||
2724 | * Clear the page's cpupid information as the existing | ||
2725 | * information potentially belongs to a now completely | ||
2726 | * unrelated process. | ||
2727 | */ | ||
2728 | if (old_page) | ||
2729 | page_cpupid_xchg_last(old_page, (1 << LAST_CPUPID_SHIFT) - 1); | ||
2730 | |||
2722 | flush_cache_page(vma, address, pte_pfn(orig_pte)); | 2731 | flush_cache_page(vma, address, pte_pfn(orig_pte)); |
2723 | entry = pte_mkyoung(orig_pte); | 2732 | entry = pte_mkyoung(orig_pte); |
2724 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); | 2733 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); |
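do_wp_page() now resets the page's last-cpupid tracking to the all-ones value (1 << LAST_CPUPID_SHIFT) - 1 so stale NUMA ownership hints do not survive a copy-on-write reuse. The stand-alone sketch below shows what a packed cpu+pid field and its "unset" sentinel look like; the bit widths are assumptions for illustration and do not match the kernel's exact layout.

#include <assert.h>
#include <stdio.h>

#define CPU_BITS      8                              /* assumed width */
#define PID_BITS      8                              /* assumed width */
#define CPUPID_SHIFT  (CPU_BITS + PID_BITS)
#define CPUPID_UNSET  ((1 << CPUPID_SHIFT) - 1)      /* all ones == "no owner" */

static int make_cpupid(int cpu, int pid)
{
	return ((cpu & ((1 << CPU_BITS) - 1)) << PID_BITS) |
	       (pid & ((1 << PID_BITS) - 1));
}

int main(void)
{
	int cpupid = make_cpupid(3, 42);

	printf("packed cpupid: %#x\n", cpupid);

	cpupid = CPUPID_UNSET;               /* what the COW reuse path stores */
	assert(cpupid == (1 << CPUPID_SHIFT) - 1);
	printf("reset cpupid:  %#x\n", cpupid);
	return 0;
}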
@@ -3519,13 +3528,16 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
3519 | } | 3528 | } |
3520 | 3529 | ||
3521 | int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, | 3530 | int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, |
3522 | unsigned long addr, int current_nid) | 3531 | unsigned long addr, int page_nid, |
3532 | int *flags) | ||
3523 | { | 3533 | { |
3524 | get_page(page); | 3534 | get_page(page); |
3525 | 3535 | ||
3526 | count_vm_numa_event(NUMA_HINT_FAULTS); | 3536 | count_vm_numa_event(NUMA_HINT_FAULTS); |
3527 | if (current_nid == numa_node_id()) | 3537 | if (page_nid == numa_node_id()) { |
3528 | count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL); | 3538 | count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL); |
3539 | *flags |= TNF_FAULT_LOCAL; | ||
3540 | } | ||
3529 | 3541 | ||
3530 | return mpol_misplaced(page, vma, addr); | 3542 | return mpol_misplaced(page, vma, addr); |
3531 | } | 3543 | } |
@@ -3535,9 +3547,11 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
3535 | { | 3547 | { |
3536 | struct page *page = NULL; | 3548 | struct page *page = NULL; |
3537 | spinlock_t *ptl; | 3549 | spinlock_t *ptl; |
3538 | int current_nid = -1; | 3550 | int page_nid = -1; |
3551 | int last_cpupid; | ||
3539 | int target_nid; | 3552 | int target_nid; |
3540 | bool migrated = false; | 3553 | bool migrated = false; |
3554 | int flags = 0; | ||
3541 | 3555 | ||
3542 | /* | 3556 | /* |
3543 | * The "pte" at this point cannot be used safely without | 3557 | * The "pte" at this point cannot be used safely without |
@@ -3564,123 +3578,44 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
3564 | pte_unmap_unlock(ptep, ptl); | 3578 | pte_unmap_unlock(ptep, ptl); |
3565 | return 0; | 3579 | return 0; |
3566 | } | 3580 | } |
3581 | BUG_ON(is_zero_pfn(page_to_pfn(page))); | ||
3567 | 3582 | ||
3568 | current_nid = page_to_nid(page); | 3583 | /* |
3569 | target_nid = numa_migrate_prep(page, vma, addr, current_nid); | 3584 | * Avoid grouping on DSO/COW pages in specific and RO pages |
3585 | * in general, RO pages shouldn't hurt as much anyway since | ||
3586 | * they can be in shared cache state. | ||
3587 | */ | ||
3588 | if (!pte_write(pte)) | ||
3589 | flags |= TNF_NO_GROUP; | ||
3590 | |||
3591 | /* | ||
3592 | * Flag if the page is shared between multiple address spaces. This | ||
3593 | * is later used when determining whether to group tasks together | ||
3594 | */ | ||
3595 | if (page_mapcount(page) > 1 && (vma->vm_flags & VM_SHARED)) | ||
3596 | flags |= TNF_SHARED; | ||
3597 | |||
3598 | last_cpupid = page_cpupid_last(page); | ||
3599 | page_nid = page_to_nid(page); | ||
3600 | target_nid = numa_migrate_prep(page, vma, addr, page_nid, &flags); | ||
3570 | pte_unmap_unlock(ptep, ptl); | 3601 | pte_unmap_unlock(ptep, ptl); |
3571 | if (target_nid == -1) { | 3602 | if (target_nid == -1) { |
3572 | /* | ||
3573 | * Account for the fault against the current node if it not | ||
3574 | * being replaced regardless of where the page is located. | ||
3575 | */ | ||
3576 | current_nid = numa_node_id(); | ||
3577 | put_page(page); | 3603 | put_page(page); |
3578 | goto out; | 3604 | goto out; |
3579 | } | 3605 | } |
3580 | 3606 | ||
3581 | /* Migrate to the requested node */ | 3607 | /* Migrate to the requested node */ |
3582 | migrated = migrate_misplaced_page(page, target_nid); | 3608 | migrated = migrate_misplaced_page(page, vma, target_nid); |
3583 | if (migrated) | 3609 | if (migrated) { |
3584 | current_nid = target_nid; | 3610 | page_nid = target_nid; |
3585 | 3611 | flags |= TNF_MIGRATED; | |
3586 | out: | ||
3587 | if (current_nid != -1) | ||
3588 | task_numa_fault(current_nid, 1, migrated); | ||
3589 | return 0; | ||
3590 | } | ||
3591 | |||
3592 | /* NUMA hinting page fault entry point for regular pmds */ | ||
3593 | #ifdef CONFIG_NUMA_BALANCING | ||
3594 | static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, | ||
3595 | unsigned long addr, pmd_t *pmdp) | ||
3596 | { | ||
3597 | pmd_t pmd; | ||
3598 | pte_t *pte, *orig_pte; | ||
3599 | unsigned long _addr = addr & PMD_MASK; | ||
3600 | unsigned long offset; | ||
3601 | spinlock_t *ptl; | ||
3602 | bool numa = false; | ||
3603 | int local_nid = numa_node_id(); | ||
3604 | |||
3605 | spin_lock(&mm->page_table_lock); | ||
3606 | pmd = *pmdp; | ||
3607 | if (pmd_numa(pmd)) { | ||
3608 | set_pmd_at(mm, _addr, pmdp, pmd_mknonnuma(pmd)); | ||
3609 | numa = true; | ||
3610 | } | ||
3611 | spin_unlock(&mm->page_table_lock); | ||
3612 | |||
3613 | if (!numa) | ||
3614 | return 0; | ||
3615 | |||
3616 | /* we're in a page fault so some vma must be in the range */ | ||
3617 | BUG_ON(!vma); | ||
3618 | BUG_ON(vma->vm_start >= _addr + PMD_SIZE); | ||
3619 | offset = max(_addr, vma->vm_start) & ~PMD_MASK; | ||
3620 | VM_BUG_ON(offset >= PMD_SIZE); | ||
3621 | orig_pte = pte = pte_offset_map_lock(mm, pmdp, _addr, &ptl); | ||
3622 | pte += offset >> PAGE_SHIFT; | ||
3623 | for (addr = _addr + offset; addr < _addr + PMD_SIZE; pte++, addr += PAGE_SIZE) { | ||
3624 | pte_t pteval = *pte; | ||
3625 | struct page *page; | ||
3626 | int curr_nid = local_nid; | ||
3627 | int target_nid; | ||
3628 | bool migrated; | ||
3629 | if (!pte_present(pteval)) | ||
3630 | continue; | ||
3631 | if (!pte_numa(pteval)) | ||
3632 | continue; | ||
3633 | if (addr >= vma->vm_end) { | ||
3634 | vma = find_vma(mm, addr); | ||
3635 | /* there's a pte present so there must be a vma */ | ||
3636 | BUG_ON(!vma); | ||
3637 | BUG_ON(addr < vma->vm_start); | ||
3638 | } | ||
3639 | if (pte_numa(pteval)) { | ||
3640 | pteval = pte_mknonnuma(pteval); | ||
3641 | set_pte_at(mm, addr, pte, pteval); | ||
3642 | } | ||
3643 | page = vm_normal_page(vma, addr, pteval); | ||
3644 | if (unlikely(!page)) | ||
3645 | continue; | ||
3646 | /* only check non-shared pages */ | ||
3647 | if (unlikely(page_mapcount(page) != 1)) | ||
3648 | continue; | ||
3649 | |||
3650 | /* | ||
3651 | * Note that the NUMA fault is later accounted to either | ||
3652 | * the node that is currently running or where the page is | ||
3653 | * migrated to. | ||
3654 | */ | ||
3655 | curr_nid = local_nid; | ||
3656 | target_nid = numa_migrate_prep(page, vma, addr, | ||
3657 | page_to_nid(page)); | ||
3658 | if (target_nid == -1) { | ||
3659 | put_page(page); | ||
3660 | continue; | ||
3661 | } | ||
3662 | |||
3663 | /* Migrate to the requested node */ | ||
3664 | pte_unmap_unlock(pte, ptl); | ||
3665 | migrated = migrate_misplaced_page(page, target_nid); | ||
3666 | if (migrated) | ||
3667 | curr_nid = target_nid; | ||
3668 | task_numa_fault(curr_nid, 1, migrated); | ||
3669 | |||
3670 | pte = pte_offset_map_lock(mm, pmdp, addr, &ptl); | ||
3671 | } | 3612 | } |
3672 | pte_unmap_unlock(orig_pte, ptl); | ||
3673 | 3613 | ||
3614 | out: | ||
3615 | if (page_nid != -1) | ||
3616 | task_numa_fault(last_cpupid, page_nid, 1, flags); | ||
3674 | return 0; | 3617 | return 0; |
3675 | } | 3618 | } |
3676 | #else | ||
3677 | static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, | ||
3678 | unsigned long addr, pmd_t *pmdp) | ||
3679 | { | ||
3680 | BUG(); | ||
3681 | return 0; | ||
3682 | } | ||
3683 | #endif /* CONFIG_NUMA_BALANCING */ | ||
3684 | 3619 | ||
3685 | /* | 3620 | /* |
3686 | * These routines also need to handle stuff like marking pages dirty | 3621 | * These routines also need to handle stuff like marking pages dirty |
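The rewritten do_numa_page() above accumulates per-fault flags (fault-local, no-group, shared, migrated) and reports them in a single task_numa_fault() call instead of tracking only a node id. A tiny sketch of that flag-accumulation style; the TNF_* values here are made up for the example, the real constants are defined elsewhere in the kernel.

#include <stdio.h>

enum {                            /* illustrative values only */
	TNF_MIGRATED    = 0x01,
	TNF_NO_GROUP    = 0x02,
	TNF_SHARED      = 0x04,
	TNF_FAULT_LOCAL = 0x08,
};

static void report_numa_fault(int node, int pages, int flags)
{
	printf("node=%d pages=%d%s%s\n", node, pages,
	       (flags & TNF_MIGRATED)    ? " migrated" : "",
	       (flags & TNF_FAULT_LOCAL) ? " local"    : "");
}

int main(void)
{
	int flags = 0;
	int page_nid = 1;
	int pte_writable = 0, shared_mapping = 0, migrated = 1;

	if (!pte_writable)       /* RO page: don't use it for task grouping */
		flags |= TNF_NO_GROUP;
	if (shared_mapping)      /* mapped by several address spaces */
		flags |= TNF_SHARED;
	if (migrated)
		flags |= TNF_MIGRATED;

	report_numa_fault(page_nid, 1, flags);
	return 0;
}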
@@ -3820,8 +3755,8 @@ retry: | |||
3820 | } | 3755 | } |
3821 | } | 3756 | } |
3822 | 3757 | ||
3823 | if (pmd_numa(*pmd)) | 3758 | /* THP should already have been handled */ |
3824 | return do_pmd_numa_page(mm, vma, address, pmd); | 3759 | BUG_ON(pmd_numa(*pmd)); |
3825 | 3760 | ||
3826 | /* | 3761 | /* |
3827 | * Use __pte_alloc instead of pte_alloc_map, because we can't | 3762 | * Use __pte_alloc instead of pte_alloc_map, because we can't |
@@ -3863,15 +3798,21 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
3863 | * space. Kernel faults are handled more gracefully. | 3798 | * space. Kernel faults are handled more gracefully. |
3864 | */ | 3799 | */ |
3865 | if (flags & FAULT_FLAG_USER) | 3800 | if (flags & FAULT_FLAG_USER) |
3866 | mem_cgroup_enable_oom(); | 3801 | mem_cgroup_oom_enable(); |
3867 | 3802 | ||
3868 | ret = __handle_mm_fault(mm, vma, address, flags); | 3803 | ret = __handle_mm_fault(mm, vma, address, flags); |
3869 | 3804 | ||
3870 | if (flags & FAULT_FLAG_USER) | 3805 | if (flags & FAULT_FLAG_USER) { |
3871 | mem_cgroup_disable_oom(); | 3806 | mem_cgroup_oom_disable(); |
3872 | 3807 | /* | |
3873 | if (WARN_ON(task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM))) | 3808 | * The task may have entered a memcg OOM situation but |
3874 | mem_cgroup_oom_synchronize(); | 3809 | * if the allocation error was handled gracefully (no |
3810 | * VM_FAULT_OOM), there is no need to kill anything. | ||
3811 | * Just clean up the OOM state peacefully. | ||
3812 | */ | ||
3813 | if (task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM)) | ||
3814 | mem_cgroup_oom_synchronize(false); | ||
3815 | } | ||
3875 | 3816 | ||
3876 | return ret; | 3817 | return ret; |
3877 | } | 3818 | } |
@@ -4329,3 +4270,28 @@ void copy_user_huge_page(struct page *dst, struct page *src, | |||
4329 | } | 4270 | } |
4330 | } | 4271 | } |
4331 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */ | 4272 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */ |
4273 | |||
4274 | #if USE_SPLIT_PTE_PTLOCKS && BLOATED_SPINLOCKS | ||
4275 | static struct kmem_cache *page_ptl_cachep; | ||
4276 | void __init ptlock_cache_init(void) | ||
4277 | { | ||
4278 | page_ptl_cachep = kmem_cache_create("page->ptl", sizeof(spinlock_t), 0, | ||
4279 | SLAB_PANIC, NULL); | ||
4280 | } | ||
4281 | |||
4282 | bool ptlock_alloc(struct page *page) | ||
4283 | { | ||
4284 | spinlock_t *ptl; | ||
4285 | |||
4286 | ptl = kmalloc(sizeof(spinlock_t), GFP_KERNEL); | ||
4287 | if (!ptl) | ||
4288 | return false; | ||
4289 | page->ptl = ptl; | ||
4290 | return true; | ||
4291 | } | ||
4292 | |||
4293 | void ptlock_free(struct page *page) | ||
4294 | { | ||
4295 | kfree(page->ptl); | ||
4296 | } | ||
4297 | #endif | ||
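The new ptlock_alloc()/ptlock_free() pair above kmalloc()s a separate lock when spinlock_t is too big to embed in struct page (the USE_SPLIT_PTE_PTLOCKS && BLOATED_SPINLOCKS case). A user-space sketch of the same attach-a-lock-to-the-page idea, with pthread_mutex_t standing in for spinlock_t and a toy struct page; this is a model of the concept, not the kernel implementation.

#include <pthread.h>
#include <stdbool.h>
#include <stdlib.h>

struct toy_page {
	pthread_mutex_t *ptl;        /* a pointer, because the lock is "bloated" */
};

static bool ptlock_alloc_sketch(struct toy_page *page)
{
	pthread_mutex_t *ptl = malloc(sizeof(*ptl));

	if (!ptl)
		return false;
	pthread_mutex_init(ptl, NULL);
	page->ptl = ptl;
	return true;
}

static void ptlock_free_sketch(struct toy_page *page)
{
	if (page->ptl)
		pthread_mutex_destroy(page->ptl);
	free(page->ptl);
	page->ptl = NULL;
}

int main(void)
{
	struct toy_page page = { 0 };

	if (!ptlock_alloc_sketch(&page))
		return 1;
	pthread_mutex_lock(page.ptl);     /* where pmd_lock()/pte lock would nest */
	pthread_mutex_unlock(page.ptl);
	ptlock_free_sketch(&page);
	return 0;
}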
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index ed85fe3870e2..489f235502db 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c | |||
@@ -31,6 +31,7 @@ | |||
31 | #include <linux/firmware-map.h> | 31 | #include <linux/firmware-map.h> |
32 | #include <linux/stop_machine.h> | 32 | #include <linux/stop_machine.h> |
33 | #include <linux/hugetlb.h> | 33 | #include <linux/hugetlb.h> |
34 | #include <linux/memblock.h> | ||
34 | 35 | ||
35 | #include <asm/tlbflush.h> | 36 | #include <asm/tlbflush.h> |
36 | 37 | ||
@@ -365,8 +366,7 @@ out_fail: | |||
365 | static void grow_pgdat_span(struct pglist_data *pgdat, unsigned long start_pfn, | 366 | static void grow_pgdat_span(struct pglist_data *pgdat, unsigned long start_pfn, |
366 | unsigned long end_pfn) | 367 | unsigned long end_pfn) |
367 | { | 368 | { |
368 | unsigned long old_pgdat_end_pfn = | 369 | unsigned long old_pgdat_end_pfn = pgdat_end_pfn(pgdat); |
369 | pgdat->node_start_pfn + pgdat->node_spanned_pages; | ||
370 | 370 | ||
371 | if (!pgdat->node_spanned_pages || start_pfn < pgdat->node_start_pfn) | 371 | if (!pgdat->node_spanned_pages || start_pfn < pgdat->node_start_pfn) |
372 | pgdat->node_start_pfn = start_pfn; | 372 | pgdat->node_start_pfn = start_pfn; |
@@ -402,13 +402,12 @@ static int __meminit __add_zone(struct zone *zone, unsigned long phys_start_pfn) | |||
402 | static int __meminit __add_section(int nid, struct zone *zone, | 402 | static int __meminit __add_section(int nid, struct zone *zone, |
403 | unsigned long phys_start_pfn) | 403 | unsigned long phys_start_pfn) |
404 | { | 404 | { |
405 | int nr_pages = PAGES_PER_SECTION; | ||
406 | int ret; | 405 | int ret; |
407 | 406 | ||
408 | if (pfn_valid(phys_start_pfn)) | 407 | if (pfn_valid(phys_start_pfn)) |
409 | return -EEXIST; | 408 | return -EEXIST; |
410 | 409 | ||
411 | ret = sparse_add_one_section(zone, phys_start_pfn, nr_pages); | 410 | ret = sparse_add_one_section(zone, phys_start_pfn); |
412 | 411 | ||
413 | if (ret < 0) | 412 | if (ret < 0) |
414 | return ret; | 413 | return ret; |
@@ -579,9 +578,9 @@ static void shrink_zone_span(struct zone *zone, unsigned long start_pfn, | |||
579 | static void shrink_pgdat_span(struct pglist_data *pgdat, | 578 | static void shrink_pgdat_span(struct pglist_data *pgdat, |
580 | unsigned long start_pfn, unsigned long end_pfn) | 579 | unsigned long start_pfn, unsigned long end_pfn) |
581 | { | 580 | { |
582 | unsigned long pgdat_start_pfn = pgdat->node_start_pfn; | 581 | unsigned long pgdat_start_pfn = pgdat->node_start_pfn; |
583 | unsigned long pgdat_end_pfn = | 582 | unsigned long p = pgdat_end_pfn(pgdat); /* pgdat_end_pfn namespace clash */ |
584 | pgdat->node_start_pfn + pgdat->node_spanned_pages; | 583 | unsigned long pgdat_end_pfn = p; |
585 | unsigned long pfn; | 584 | unsigned long pfn; |
586 | struct mem_section *ms; | 585 | struct mem_section *ms; |
587 | int nid = pgdat->node_id; | 586 | int nid = pgdat->node_id; |
@@ -935,7 +934,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ | |||
935 | arg.nr_pages = nr_pages; | 934 | arg.nr_pages = nr_pages; |
936 | node_states_check_changes_online(nr_pages, zone, &arg); | 935 | node_states_check_changes_online(nr_pages, zone, &arg); |
937 | 936 | ||
938 | nid = page_to_nid(pfn_to_page(pfn)); | 937 | nid = pfn_to_nid(pfn); |
939 | 938 | ||
940 | ret = memory_notify(MEM_GOING_ONLINE, &arg); | 939 | ret = memory_notify(MEM_GOING_ONLINE, &arg); |
941 | ret = notifier_to_errno(ret); | 940 | ret = notifier_to_errno(ret); |
@@ -1044,17 +1043,23 @@ static void rollback_node_hotadd(int nid, pg_data_t *pgdat) | |||
1044 | } | 1043 | } |
1045 | 1044 | ||
1046 | 1045 | ||
1047 | /* | 1046 | /** |
1047 | * try_online_node - online a node if offlined | ||
1048 | * | ||
1048 | * called by cpu_up() to online a node without onlined memory. | 1049 | * called by cpu_up() to online a node without onlined memory. |
1049 | */ | 1050 | */ |
1050 | int mem_online_node(int nid) | 1051 | int try_online_node(int nid) |
1051 | { | 1052 | { |
1052 | pg_data_t *pgdat; | 1053 | pg_data_t *pgdat; |
1053 | int ret; | 1054 | int ret; |
1054 | 1055 | ||
1056 | if (node_online(nid)) | ||
1057 | return 0; | ||
1058 | |||
1055 | lock_memory_hotplug(); | 1059 | lock_memory_hotplug(); |
1056 | pgdat = hotadd_new_pgdat(nid, 0); | 1060 | pgdat = hotadd_new_pgdat(nid, 0); |
1057 | if (!pgdat) { | 1061 | if (!pgdat) { |
1062 | pr_err("Cannot online node %d due to NULL pgdat\n", nid); | ||
1058 | ret = -ENOMEM; | 1063 | ret = -ENOMEM; |
1059 | goto out; | 1064 | goto out; |
1060 | } | 1065 | } |
@@ -1062,6 +1067,12 @@ int mem_online_node(int nid) | |||
1062 | ret = register_one_node(nid); | 1067 | ret = register_one_node(nid); |
1063 | BUG_ON(ret); | 1068 | BUG_ON(ret); |
1064 | 1069 | ||
1070 | if (pgdat->node_zonelists->_zonerefs->zone == NULL) { | ||
1071 | mutex_lock(&zonelists_mutex); | ||
1072 | build_all_zonelists(NULL, NULL); | ||
1073 | mutex_unlock(&zonelists_mutex); | ||
1074 | } | ||
1075 | |||
1065 | out: | 1076 | out: |
1066 | unlock_memory_hotplug(); | 1077 | unlock_memory_hotplug(); |
1067 | return ret; | 1078 | return ret; |
@@ -1412,6 +1423,36 @@ static bool can_offline_normal(struct zone *zone, unsigned long nr_pages) | |||
1412 | } | 1423 | } |
1413 | #endif /* CONFIG_MOVABLE_NODE */ | 1424 | #endif /* CONFIG_MOVABLE_NODE */ |
1414 | 1425 | ||
1426 | static int __init cmdline_parse_movable_node(char *p) | ||
1427 | { | ||
1428 | #ifdef CONFIG_MOVABLE_NODE | ||
1429 | /* | ||
1430 | * Memory used by the kernel cannot be hot-removed because Linux | ||
1431 | * cannot migrate the kernel pages. When memory hotplug is | ||
1432 | * enabled, we should prevent memblock from allocating memory | ||
1433 | * for the kernel. | ||
1434 | * | ||
1435 | * ACPI SRAT records all hotpluggable memory ranges. But before | ||
1436 | * SRAT is parsed, we don't know about it. | ||
1437 | * | ||
1438 | * The kernel image is loaded into memory at very early time. We | ||
1439 | * cannot prevent this anyway. So on NUMA system, we set any | ||
1440 | * node the kernel resides in as un-hotpluggable. | ||
1441 | * | ||
1442 | * Since on modern servers, one node could have double-digit | ||
1443 | * gigabytes of memory, we can assume the memory around the kernel | ||
1444 | * image is also un-hotpluggable. So before SRAT is parsed, just | ||
1445 | * allocate memory near the kernel image to try our best to keep | ||
1446 | * the kernel away from hotpluggable memory. | ||
1447 | */ | ||
1448 | memblock_set_bottom_up(true); | ||
1449 | #else | ||
1450 | pr_warn("movable_node option not supported\n"); | ||
1451 | #endif | ||
1452 | return 0; | ||
1453 | } | ||
1454 | early_param("movable_node", cmdline_parse_movable_node); | ||
1455 | |||
1415 | /* check which state of node_states will be changed when offline memory */ | 1456 | /* check which state of node_states will be changed when offline memory */ |
1416 | static void node_states_check_changes_offline(unsigned long nr_pages, | 1457 | static void node_states_check_changes_offline(unsigned long nr_pages, |
1417 | struct zone *zone, struct memory_notify *arg) | 1458 | struct zone *zone, struct memory_notify *arg) |
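As the comment in cmdline_parse_movable_node() explains, all the option does is flip memblock into bottom-up mode so early allocations land near the (non-hotpluggable) kernel image until SRAT is parsed. The toy allocator below illustrates only what "allocate from the bottom of the range instead of the top" means; the real memblock allocator is considerably more involved.

#include <stdbool.h>
#include <stdio.h>

static bool bottom_up;                       /* memblock_set_bottom_up(true) */
static unsigned long range_lo = 0x100000;    /* toy 1 MiB .. 16 MiB range */
static unsigned long range_hi = 0x1000000;

static unsigned long toy_alloc(unsigned long size)
{
	unsigned long addr;

	if (bottom_up) {               /* grab memory just above the kernel */
		addr = range_lo;
		range_lo += size;
	} else {                       /* default: carve from the top down */
		range_hi -= size;
		addr = range_hi;
	}
	return addr;
}

int main(void)
{
	printf("top-down:  %#lx\n", toy_alloc(0x1000));
	bottom_up = true;              /* what movable_node requests */
	printf("bottom-up: %#lx\n", toy_alloc(0x1000));
	return 0;
}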
@@ -1702,7 +1743,7 @@ int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn, | |||
1702 | } | 1743 | } |
1703 | 1744 | ||
1704 | #ifdef CONFIG_MEMORY_HOTREMOVE | 1745 | #ifdef CONFIG_MEMORY_HOTREMOVE |
1705 | static int is_memblock_offlined_cb(struct memory_block *mem, void *arg) | 1746 | static int check_memblock_offlined_cb(struct memory_block *mem, void *arg) |
1706 | { | 1747 | { |
1707 | int ret = !is_memblock_offlined(mem); | 1748 | int ret = !is_memblock_offlined(mem); |
1708 | 1749 | ||
@@ -1854,7 +1895,7 @@ void __ref remove_memory(int nid, u64 start, u64 size) | |||
1854 | * if this is not the case. | 1895 | * if this is not the case. |
1855 | */ | 1896 | */ |
1856 | ret = walk_memory_range(PFN_DOWN(start), PFN_UP(start + size - 1), NULL, | 1897 | ret = walk_memory_range(PFN_DOWN(start), PFN_UP(start + size - 1), NULL, |
1857 | is_memblock_offlined_cb); | 1898 | check_memblock_offlined_cb); |
1858 | if (ret) { | 1899 | if (ret) { |
1859 | unlock_memory_hotplug(); | 1900 | unlock_memory_hotplug(); |
1860 | BUG(); | 1901 | BUG(); |
diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 04729647f359..c4403cdf3433 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c | |||
@@ -525,8 +525,9 @@ static void queue_pages_hugetlb_pmd_range(struct vm_area_struct *vma, | |||
525 | #ifdef CONFIG_HUGETLB_PAGE | 525 | #ifdef CONFIG_HUGETLB_PAGE |
526 | int nid; | 526 | int nid; |
527 | struct page *page; | 527 | struct page *page; |
528 | spinlock_t *ptl; | ||
528 | 529 | ||
529 | spin_lock(&vma->vm_mm->page_table_lock); | 530 | ptl = huge_pte_lock(hstate_vma(vma), vma->vm_mm, (pte_t *)pmd); |
530 | page = pte_page(huge_ptep_get((pte_t *)pmd)); | 531 | page = pte_page(huge_ptep_get((pte_t *)pmd)); |
531 | nid = page_to_nid(page); | 532 | nid = page_to_nid(page); |
532 | if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT)) | 533 | if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT)) |
@@ -536,7 +537,7 @@ static void queue_pages_hugetlb_pmd_range(struct vm_area_struct *vma, | |||
536 | (flags & MPOL_MF_MOVE && page_mapcount(page) == 1)) | 537 | (flags & MPOL_MF_MOVE && page_mapcount(page) == 1)) |
537 | isolate_huge_page(page, private); | 538 | isolate_huge_page(page, private); |
538 | unlock: | 539 | unlock: |
539 | spin_unlock(&vma->vm_mm->page_table_lock); | 540 | spin_unlock(ptl); |
540 | #else | 541 | #else |
541 | BUG(); | 542 | BUG(); |
542 | #endif | 543 | #endif |
@@ -1125,7 +1126,7 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, | |||
1125 | tmp = *from; | 1126 | tmp = *from; |
1126 | while (!nodes_empty(tmp)) { | 1127 | while (!nodes_empty(tmp)) { |
1127 | int s,d; | 1128 | int s,d; |
1128 | int source = -1; | 1129 | int source = NUMA_NO_NODE; |
1129 | int dest = 0; | 1130 | int dest = 0; |
1130 | 1131 | ||
1131 | for_each_node_mask(s, tmp) { | 1132 | for_each_node_mask(s, tmp) { |
@@ -1160,7 +1161,7 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, | |||
1160 | if (!node_isset(dest, tmp)) | 1161 | if (!node_isset(dest, tmp)) |
1161 | break; | 1162 | break; |
1162 | } | 1163 | } |
1163 | if (source == -1) | 1164 | if (source == NUMA_NO_NODE) |
1164 | break; | 1165 | break; |
1165 | 1166 | ||
1166 | node_clear(source, tmp); | 1167 | node_clear(source, tmp); |
@@ -1679,6 +1680,30 @@ struct mempolicy *get_vma_policy(struct task_struct *task, | |||
1679 | return pol; | 1680 | return pol; |
1680 | } | 1681 | } |
1681 | 1682 | ||
1683 | bool vma_policy_mof(struct task_struct *task, struct vm_area_struct *vma) | ||
1684 | { | ||
1685 | struct mempolicy *pol = get_task_policy(task); | ||
1686 | if (vma) { | ||
1687 | if (vma->vm_ops && vma->vm_ops->get_policy) { | ||
1688 | bool ret = false; | ||
1689 | |||
1690 | pol = vma->vm_ops->get_policy(vma, vma->vm_start); | ||
1691 | if (pol && (pol->flags & MPOL_F_MOF)) | ||
1692 | ret = true; | ||
1693 | mpol_cond_put(pol); | ||
1694 | |||
1695 | return ret; | ||
1696 | } else if (vma->vm_policy) { | ||
1697 | pol = vma->vm_policy; | ||
1698 | } | ||
1699 | } | ||
1700 | |||
1701 | if (!pol) | ||
1702 | return default_policy.flags & MPOL_F_MOF; | ||
1703 | |||
1704 | return pol->flags & MPOL_F_MOF; | ||
1705 | } | ||
1706 | |||
1682 | static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone) | 1707 | static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone) |
1683 | { | 1708 | { |
1684 | enum zone_type dynamic_policy_zone = policy_zone; | 1709 | enum zone_type dynamic_policy_zone = policy_zone; |
@@ -1811,7 +1836,7 @@ static unsigned offset_il_node(struct mempolicy *pol, | |||
1811 | unsigned nnodes = nodes_weight(pol->v.nodes); | 1836 | unsigned nnodes = nodes_weight(pol->v.nodes); |
1812 | unsigned target; | 1837 | unsigned target; |
1813 | int c; | 1838 | int c; |
1814 | int nid = -1; | 1839 | int nid = NUMA_NO_NODE; |
1815 | 1840 | ||
1816 | if (!nnodes) | 1841 | if (!nnodes) |
1817 | return numa_node_id(); | 1842 | return numa_node_id(); |
@@ -1848,11 +1873,11 @@ static inline unsigned interleave_nid(struct mempolicy *pol, | |||
1848 | 1873 | ||
1849 | /* | 1874 | /* |
1850 | * Return the bit number of a random bit set in the nodemask. | 1875 | * Return the bit number of a random bit set in the nodemask. |
1851 | * (returns -1 if nodemask is empty) | 1876 | * (returns NUMA_NO_NODE if nodemask is empty) |
1852 | */ | 1877 | */ |
1853 | int node_random(const nodemask_t *maskp) | 1878 | int node_random(const nodemask_t *maskp) |
1854 | { | 1879 | { |
1855 | int w, bit = -1; | 1880 | int w, bit = NUMA_NO_NODE; |
1856 | 1881 | ||
1857 | w = nodes_weight(*maskp); | 1882 | w = nodes_weight(*maskp); |
1858 | if (w) | 1883 | if (w) |
@@ -2277,6 +2302,35 @@ static void sp_free(struct sp_node *n) | |||
2277 | kmem_cache_free(sn_cache, n); | 2302 | kmem_cache_free(sn_cache, n); |
2278 | } | 2303 | } |
2279 | 2304 | ||
2305 | #ifdef CONFIG_NUMA_BALANCING | ||
2306 | static bool numa_migrate_deferred(struct task_struct *p, int last_cpupid) | ||
2307 | { | ||
2308 | /* Never defer a private fault */ | ||
2309 | if (cpupid_match_pid(p, last_cpupid)) | ||
2310 | return false; | ||
2311 | |||
2312 | if (p->numa_migrate_deferred) { | ||
2313 | p->numa_migrate_deferred--; | ||
2314 | return true; | ||
2315 | } | ||
2316 | return false; | ||
2317 | } | ||
2318 | |||
2319 | static inline void defer_numa_migrate(struct task_struct *p) | ||
2320 | { | ||
2321 | p->numa_migrate_deferred = sysctl_numa_balancing_migrate_deferred; | ||
2322 | } | ||
2323 | #else | ||
2324 | static inline bool numa_migrate_deferred(struct task_struct *p, int last_cpupid) | ||
2325 | { | ||
2326 | return false; | ||
2327 | } | ||
2328 | |||
2329 | static inline void defer_numa_migrate(struct task_struct *p) | ||
2330 | { | ||
2331 | } | ||
2332 | #endif /* CONFIG_NUMA_BALANCING */ | ||
2333 | |||
2280 | /** | 2334 | /** |
2281 | * mpol_misplaced - check whether current page node is valid in policy | 2335 | * mpol_misplaced - check whether current page node is valid in policy |
2282 | * | 2336 | * |
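numa_migrate_deferred()/defer_numa_migrate() above implement a per-task countdown: once a shared-page fault trips the cpupid filter, the next sysctl_numa_balancing_migrate_deferred misplaced-page decisions are skipped, while private faults are never deferred. A stand-alone model of that counter follows; field and sysctl names mirror the patch, but the pid comparison and the default value are simplifications assumed for the sketch.

#include <stdbool.h>
#include <stdio.h>

static int sysctl_numa_balancing_migrate_deferred = 16;  /* assumed value for the sketch */

struct toy_task {
	int pid;
	int numa_migrate_deferred;
};

static bool cpupid_match_pid(struct toy_task *p, int last_pid)
{
	return p->pid == last_pid;   /* "private" fault: same task touched it last */
}

static bool numa_migrate_deferred(struct toy_task *p, int last_pid)
{
	if (cpupid_match_pid(p, last_pid))
		return false;        /* never defer a private fault */
	if (p->numa_migrate_deferred) {
		p->numa_migrate_deferred--;
		return true;         /* still inside the deferral window */
	}
	return false;
}

static void defer_numa_migrate(struct toy_task *p)
{
	p->numa_migrate_deferred = sysctl_numa_balancing_migrate_deferred;
}

int main(void)
{
	struct toy_task t = { .pid = 100 };
	bool deferred;

	defer_numa_migrate(&t);                       /* shared fault seen earlier */
	deferred = numa_migrate_deferred(&t, 200);    /* a different task's fault  */
	printf("deferred? %d (counter=%d)\n", deferred, t.numa_migrate_deferred);
	return 0;
}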
@@ -2300,6 +2354,8 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long | |||
2300 | struct zone *zone; | 2354 | struct zone *zone; |
2301 | int curnid = page_to_nid(page); | 2355 | int curnid = page_to_nid(page); |
2302 | unsigned long pgoff; | 2356 | unsigned long pgoff; |
2357 | int thiscpu = raw_smp_processor_id(); | ||
2358 | int thisnid = cpu_to_node(thiscpu); | ||
2303 | int polnid = -1; | 2359 | int polnid = -1; |
2304 | int ret = -1; | 2360 | int ret = -1; |
2305 | 2361 | ||
@@ -2348,9 +2404,11 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long | |||
2348 | 2404 | ||
2349 | /* Migrate the page towards the node whose CPU is referencing it */ | 2405 | /* Migrate the page towards the node whose CPU is referencing it */ |
2350 | if (pol->flags & MPOL_F_MORON) { | 2406 | if (pol->flags & MPOL_F_MORON) { |
2351 | int last_nid; | 2407 | int last_cpupid; |
2408 | int this_cpupid; | ||
2352 | 2409 | ||
2353 | polnid = numa_node_id(); | 2410 | polnid = thisnid; |
2411 | this_cpupid = cpu_pid_to_cpupid(thiscpu, current->pid); | ||
2354 | 2412 | ||
2355 | /* | 2413 | /* |
2356 | * Multi-stage node selection is used in conjunction | 2414 | * Multi-stage node selection is used in conjunction |
@@ -2373,8 +2431,25 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long | |||
2373 | * it less likely we act on an unlikely task<->page | 2431 | * it less likely we act on an unlikely task<->page |
2374 | * relation. | 2432 | * relation. |
2375 | */ | 2433 | */ |
2376 | last_nid = page_nid_xchg_last(page, polnid); | 2434 | last_cpupid = page_cpupid_xchg_last(page, this_cpupid); |
2377 | if (last_nid != polnid) | 2435 | if (!cpupid_pid_unset(last_cpupid) && cpupid_to_nid(last_cpupid) != thisnid) { |
2436 | |||
2437 | /* See sysctl_numa_balancing_migrate_deferred comment */ | ||
2438 | if (!cpupid_match_pid(current, last_cpupid)) | ||
2439 | defer_numa_migrate(current); | ||
2440 | |||
2441 | goto out; | ||
2442 | } | ||
2443 | |||
2444 | /* | ||
2445 | * The quadratic filter above reduces extraneous migration | ||
2446 | * of shared pages somewhat. This code reduces it even more, | ||
2447 | * reducing the overhead of page migrations of shared pages. | ||
2448 | * This makes workloads with shared pages rely more on | ||
2449 | * "move task near its memory", and less on "move memory | ||
2450 | * towards its task", which is exactly what we want. | ||
2451 | */ | ||
2452 | if (numa_migrate_deferred(current, last_cpupid)) | ||
2378 | goto out; | 2453 | goto out; |
2379 | } | 2454 | } |
2380 | 2455 | ||
@@ -2840,62 +2915,45 @@ out: | |||
2840 | * @maxlen: length of @buffer | 2915 | * @maxlen: length of @buffer |
2841 | * @pol: pointer to mempolicy to be formatted | 2916 | * @pol: pointer to mempolicy to be formatted |
2842 | * | 2917 | * |
2843 | * Convert a mempolicy into a string. | 2918 | * Convert @pol into a string. If @buffer is too short, truncate the string. |
2844 | * Returns the number of characters in buffer (if positive) | 2919 | * Recommend a @maxlen of at least 32 for the longest mode, "interleave", the |
2845 | * or an error (negative) | 2920 | * longest flag, "relative", and to display at least a few node ids. |
2846 | */ | 2921 | */ |
2847 | int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) | 2922 | void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) |
2848 | { | 2923 | { |
2849 | char *p = buffer; | 2924 | char *p = buffer; |
2850 | int l; | 2925 | nodemask_t nodes = NODE_MASK_NONE; |
2851 | nodemask_t nodes; | 2926 | unsigned short mode = MPOL_DEFAULT; |
2852 | unsigned short mode; | 2927 | unsigned short flags = 0; |
2853 | unsigned short flags = pol ? pol->flags : 0; | ||
2854 | 2928 | ||
2855 | /* | 2929 | if (pol && pol != &default_policy) { |
2856 | * Sanity check: room for longest mode, flag and some nodes | ||
2857 | */ | ||
2858 | VM_BUG_ON(maxlen < strlen("interleave") + strlen("relative") + 16); | ||
2859 | |||
2860 | if (!pol || pol == &default_policy) | ||
2861 | mode = MPOL_DEFAULT; | ||
2862 | else | ||
2863 | mode = pol->mode; | 2930 | mode = pol->mode; |
2931 | flags = pol->flags; | ||
2932 | } | ||
2864 | 2933 | ||
2865 | switch (mode) { | 2934 | switch (mode) { |
2866 | case MPOL_DEFAULT: | 2935 | case MPOL_DEFAULT: |
2867 | nodes_clear(nodes); | ||
2868 | break; | 2936 | break; |
2869 | |||
2870 | case MPOL_PREFERRED: | 2937 | case MPOL_PREFERRED: |
2871 | nodes_clear(nodes); | ||
2872 | if (flags & MPOL_F_LOCAL) | 2938 | if (flags & MPOL_F_LOCAL) |
2873 | mode = MPOL_LOCAL; | 2939 | mode = MPOL_LOCAL; |
2874 | else | 2940 | else |
2875 | node_set(pol->v.preferred_node, nodes); | 2941 | node_set(pol->v.preferred_node, nodes); |
2876 | break; | 2942 | break; |
2877 | |||
2878 | case MPOL_BIND: | 2943 | case MPOL_BIND: |
2879 | /* Fall through */ | ||
2880 | case MPOL_INTERLEAVE: | 2944 | case MPOL_INTERLEAVE: |
2881 | nodes = pol->v.nodes; | 2945 | nodes = pol->v.nodes; |
2882 | break; | 2946 | break; |
2883 | |||
2884 | default: | 2947 | default: |
2885 | return -EINVAL; | 2948 | WARN_ON_ONCE(1); |
2949 | snprintf(p, maxlen, "unknown"); | ||
2950 | return; | ||
2886 | } | 2951 | } |
2887 | 2952 | ||
2888 | l = strlen(policy_modes[mode]); | 2953 | p += snprintf(p, maxlen, policy_modes[mode]); |
2889 | if (buffer + maxlen < p + l + 1) | ||
2890 | return -ENOSPC; | ||
2891 | |||
2892 | strcpy(p, policy_modes[mode]); | ||
2893 | p += l; | ||
2894 | 2954 | ||
2895 | if (flags & MPOL_MODE_FLAGS) { | 2955 | if (flags & MPOL_MODE_FLAGS) { |
2896 | if (buffer + maxlen < p + 2) | 2956 | p += snprintf(p, buffer + maxlen - p, "="); |
2897 | return -ENOSPC; | ||
2898 | *p++ = '='; | ||
2899 | 2957 | ||
2900 | /* | 2958 | /* |
2901 | * Currently, the only defined flags are mutually exclusive | 2959 | * Currently, the only defined flags are mutually exclusive |
@@ -2907,10 +2965,7 @@ int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) | |||
2907 | } | 2965 | } |
2908 | 2966 | ||
2909 | if (!nodes_empty(nodes)) { | 2967 | if (!nodes_empty(nodes)) { |
2910 | if (buffer + maxlen < p + 2) | 2968 | p += snprintf(p, buffer + maxlen - p, ":"); |
2911 | return -ENOSPC; | ||
2912 | *p++ = ':'; | ||
2913 | p += nodelist_scnprintf(p, buffer + maxlen - p, nodes); | 2969 | p += nodelist_scnprintf(p, buffer + maxlen - p, nodes); |
2914 | } | 2970 | } |
2915 | return p - buffer; | ||
2916 | } | 2971 | } |
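mpol_to_str() now truncates instead of returning -ENOSPC, chaining p += snprintf(p, buffer + maxlen - p, ...) so each piece writes into whatever room remains (the node list goes through nodelist_scnprintf()). The user-space sketch below shows that chained-formatting pattern; because plain snprintf() returns the would-be length, this version clamps the advance on truncation, which is an added safeguard rather than a copy of the kernel code.

#include <stdio.h>

/* Append formatted text without ever advancing past the end of the buffer. */
static char *append(char *p, char *end, const char *s)
{
	int n = snprintf(p, (size_t)(end - p), "%s", s);

	if (n < 0)
		return p;
	p += n;
	return p < end ? p : end - 1;    /* clamp so later calls still see >= 1 byte */
}

int main(void)
{
	char buf[32];
	char *p = buf, *end = buf + sizeof(buf);

	p = append(p, end, "interleave");   /* mode      */
	p = append(p, end, "=static");      /* flag      */
	p = append(p, end, ":0-3");         /* node list */
	printf("%s (used %zu of %zu bytes)\n", buf, (size_t)(p - buf), sizeof(buf));
	return 0;
}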
diff --git a/mm/migrate.c b/mm/migrate.c index 9c8d5f59d30b..316e720a2023 100644 --- a/mm/migrate.c +++ b/mm/migrate.c | |||
@@ -107,7 +107,7 @@ void putback_movable_pages(struct list_head *l) | |||
107 | list_del(&page->lru); | 107 | list_del(&page->lru); |
108 | dec_zone_page_state(page, NR_ISOLATED_ANON + | 108 | dec_zone_page_state(page, NR_ISOLATED_ANON + |
109 | page_is_file_cache(page)); | 109 | page_is_file_cache(page)); |
110 | if (unlikely(balloon_page_movable(page))) | 110 | if (unlikely(isolated_balloon_page(page))) |
111 | balloon_page_putback(page); | 111 | balloon_page_putback(page); |
112 | else | 112 | else |
113 | putback_lru_page(page); | 113 | putback_lru_page(page); |
@@ -130,7 +130,7 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma, | |||
130 | ptep = huge_pte_offset(mm, addr); | 130 | ptep = huge_pte_offset(mm, addr); |
131 | if (!ptep) | 131 | if (!ptep) |
132 | goto out; | 132 | goto out; |
133 | ptl = &mm->page_table_lock; | 133 | ptl = huge_pte_lockptr(hstate_vma(vma), mm, ptep); |
134 | } else { | 134 | } else { |
135 | pmd = mm_find_pmd(mm, addr); | 135 | pmd = mm_find_pmd(mm, addr); |
136 | if (!pmd) | 136 | if (!pmd) |
@@ -161,6 +161,8 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma, | |||
161 | 161 | ||
162 | get_page(new); | 162 | get_page(new); |
163 | pte = pte_mkold(mk_pte(new, vma->vm_page_prot)); | 163 | pte = pte_mkold(mk_pte(new, vma->vm_page_prot)); |
164 | if (pte_swp_soft_dirty(*ptep)) | ||
165 | pte = pte_mksoft_dirty(pte); | ||
164 | if (is_write_migration_entry(entry)) | 166 | if (is_write_migration_entry(entry)) |
165 | pte = pte_mkwrite(pte); | 167 | pte = pte_mkwrite(pte); |
166 | #ifdef CONFIG_HUGETLB_PAGE | 168 | #ifdef CONFIG_HUGETLB_PAGE |
@@ -247,9 +249,10 @@ void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, | |||
247 | __migration_entry_wait(mm, ptep, ptl); | 249 | __migration_entry_wait(mm, ptep, ptl); |
248 | } | 250 | } |
249 | 251 | ||
250 | void migration_entry_wait_huge(struct mm_struct *mm, pte_t *pte) | 252 | void migration_entry_wait_huge(struct vm_area_struct *vma, |
253 | struct mm_struct *mm, pte_t *pte) | ||
251 | { | 254 | { |
252 | spinlock_t *ptl = &(mm)->page_table_lock; | 255 | spinlock_t *ptl = huge_pte_lockptr(hstate_vma(vma), mm, pte); |
253 | __migration_entry_wait(mm, pte, ptl); | 256 | __migration_entry_wait(mm, pte, ptl); |
254 | } | 257 | } |
255 | 258 | ||
@@ -443,6 +446,8 @@ int migrate_huge_page_move_mapping(struct address_space *mapping, | |||
443 | */ | 446 | */ |
444 | void migrate_page_copy(struct page *newpage, struct page *page) | 447 | void migrate_page_copy(struct page *newpage, struct page *page) |
445 | { | 448 | { |
449 | int cpupid; | ||
450 | |||
446 | if (PageHuge(page) || PageTransHuge(page)) | 451 | if (PageHuge(page) || PageTransHuge(page)) |
447 | copy_huge_page(newpage, page); | 452 | copy_huge_page(newpage, page); |
448 | else | 453 | else |
@@ -479,6 +484,13 @@ void migrate_page_copy(struct page *newpage, struct page *page) | |||
479 | __set_page_dirty_nobuffers(newpage); | 484 | __set_page_dirty_nobuffers(newpage); |
480 | } | 485 | } |
481 | 486 | ||
487 | /* | ||
488 | * Copy NUMA information to the new page, to prevent over-eager | ||
489 | * future migrations of this same page. | ||
490 | */ | ||
491 | cpupid = page_cpupid_xchg_last(page, -1); | ||
492 | page_cpupid_xchg_last(newpage, cpupid); | ||
493 | |||
482 | mlock_migrate_page(newpage, page); | 494 | mlock_migrate_page(newpage, page); |
483 | ksm_migrate_page(newpage, page); | 495 | ksm_migrate_page(newpage, page); |
484 | /* | 496 | /* |
@@ -1498,7 +1510,7 @@ static struct page *alloc_misplaced_dst_page(struct page *page, | |||
1498 | __GFP_NOWARN) & | 1510 | __GFP_NOWARN) & |
1499 | ~GFP_IOFS, 0); | 1511 | ~GFP_IOFS, 0); |
1500 | if (newpage) | 1512 | if (newpage) |
1501 | page_nid_xchg_last(newpage, page_nid_last(page)); | 1513 | page_cpupid_xchg_last(newpage, page_cpupid_last(page)); |
1502 | 1514 | ||
1503 | return newpage; | 1515 | return newpage; |
1504 | } | 1516 | } |
@@ -1599,7 +1611,8 @@ int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page) | |||
1599 | * node. Caller is expected to have an elevated reference count on | 1611 | * node. Caller is expected to have an elevated reference count on |
1600 | * the page that will be dropped by this function before returning. | 1612 | * the page that will be dropped by this function before returning. |
1601 | */ | 1613 | */ |
1602 | int migrate_misplaced_page(struct page *page, int node) | 1614 | int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma, |
1615 | int node) | ||
1603 | { | 1616 | { |
1604 | pg_data_t *pgdat = NODE_DATA(node); | 1617 | pg_data_t *pgdat = NODE_DATA(node); |
1605 | int isolated; | 1618 | int isolated; |
@@ -1607,10 +1620,11 @@ int migrate_misplaced_page(struct page *page, int node) | |||
1607 | LIST_HEAD(migratepages); | 1620 | LIST_HEAD(migratepages); |
1608 | 1621 | ||
1609 | /* | 1622 | /* |
1610 | * Don't migrate pages that are mapped in multiple processes. | 1623 | * Don't migrate file pages that are mapped in multiple processes |
1611 | * TODO: Handle false sharing detection instead of this hammer | 1624 | * with execute permissions as they are probably shared libraries. |
1612 | */ | 1625 | */ |
1613 | if (page_mapcount(page) != 1) | 1626 | if (page_mapcount(page) != 1 && page_is_file_cache(page) && |
1627 | (vma->vm_flags & VM_EXEC)) | ||
1614 | goto out; | 1628 | goto out; |
1615 | 1629 | ||
1616 | /* | 1630 | /* |
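The new check in migrate_misplaced_page() narrows the old blanket mapcount test: only file-backed pages mapped executable by more than one process, which are almost certainly shared library text, are skipped. A one-function sketch of that predicate with made-up field names standing in for page_mapcount(), page_is_file_cache() and VM_EXEC.

#include <stdbool.h>
#include <stdio.h>

struct toy_page {
	int  mapcount;
	bool file_backed;    /* page_is_file_cache() */
};

/* Return true when NUMA migration should be skipped for this mapping. */
static bool skip_shared_library(const struct toy_page *p, bool vma_exec)
{
	return p->mapcount != 1 && p->file_backed && vma_exec;
}

int main(void)
{
	struct toy_page libc = { .mapcount = 12, .file_backed = true  };
	struct toy_page anon = { .mapcount = 2,  .file_backed = false };

	printf("library text: skip=%d\n", skip_shared_library(&libc, true));
	printf("anon page:    skip=%d\n", skip_shared_library(&anon, true));
	return 0;
}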
@@ -1653,6 +1667,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, | |||
1653 | unsigned long address, | 1667 | unsigned long address, |
1654 | struct page *page, int node) | 1668 | struct page *page, int node) |
1655 | { | 1669 | { |
1670 | spinlock_t *ptl; | ||
1656 | unsigned long haddr = address & HPAGE_PMD_MASK; | 1671 | unsigned long haddr = address & HPAGE_PMD_MASK; |
1657 | pg_data_t *pgdat = NODE_DATA(node); | 1672 | pg_data_t *pgdat = NODE_DATA(node); |
1658 | int isolated = 0; | 1673 | int isolated = 0; |
@@ -1661,13 +1676,6 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, | |||
1661 | int page_lru = page_is_file_cache(page); | 1676 | int page_lru = page_is_file_cache(page); |
1662 | 1677 | ||
1663 | /* | 1678 | /* |
1664 | * Don't migrate pages that are mapped in multiple processes. | ||
1665 | * TODO: Handle false sharing detection instead of this hammer | ||
1666 | */ | ||
1667 | if (page_mapcount(page) != 1) | ||
1668 | goto out_dropref; | ||
1669 | |||
1670 | /* | ||
1671 | * Rate-limit the amount of data that is being migrated to a node. | 1679 | * Rate-limit the amount of data that is being migrated to a node. |
1672 | * Optimal placement is no good if the memory bus is saturated and | 1680 | * Optimal placement is no good if the memory bus is saturated and |
1673 | * all the time is being spent migrating! | 1681 | * all the time is being spent migrating! |
@@ -1680,7 +1688,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, | |||
1680 | if (!new_page) | 1688 | if (!new_page) |
1681 | goto out_fail; | 1689 | goto out_fail; |
1682 | 1690 | ||
1683 | page_nid_xchg_last(new_page, page_nid_last(page)); | 1691 | page_cpupid_xchg_last(new_page, page_cpupid_last(page)); |
1684 | 1692 | ||
1685 | isolated = numamigrate_isolate_page(pgdat, page); | 1693 | isolated = numamigrate_isolate_page(pgdat, page); |
1686 | if (!isolated) { | 1694 | if (!isolated) { |
@@ -1699,9 +1707,9 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, | |||
1699 | WARN_ON(PageLRU(new_page)); | 1707 | WARN_ON(PageLRU(new_page)); |
1700 | 1708 | ||
1701 | /* Recheck the target PMD */ | 1709 | /* Recheck the target PMD */ |
1702 | spin_lock(&mm->page_table_lock); | 1710 | ptl = pmd_lock(mm, pmd); |
1703 | if (unlikely(!pmd_same(*pmd, entry))) { | 1711 | if (unlikely(!pmd_same(*pmd, entry))) { |
1704 | spin_unlock(&mm->page_table_lock); | 1712 | spin_unlock(ptl); |
1705 | 1713 | ||
1706 | /* Reverse changes made by migrate_page_copy() */ | 1714 | /* Reverse changes made by migrate_page_copy() */ |
1707 | if (TestClearPageActive(new_page)) | 1715 | if (TestClearPageActive(new_page)) |
@@ -1713,12 +1721,12 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, | |||
1713 | unlock_page(new_page); | 1721 | unlock_page(new_page); |
1714 | put_page(new_page); /* Free it */ | 1722 | put_page(new_page); /* Free it */ |
1715 | 1723 | ||
1716 | unlock_page(page); | 1724 | /* Retake the callers reference and putback on LRU */ |
1725 | get_page(page); | ||
1717 | putback_lru_page(page); | 1726 | putback_lru_page(page); |
1718 | 1727 | mod_zone_page_state(page_zone(page), | |
1719 | count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR); | 1728 | NR_ISOLATED_ANON + page_lru, -HPAGE_PMD_NR); |
1720 | isolated = 0; | 1729 | goto out_fail; |
1721 | goto out; | ||
1722 | } | 1730 | } |
1723 | 1731 | ||
1724 | /* | 1732 | /* |
@@ -1735,9 +1743,9 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, | |||
1735 | entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); | 1743 | entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); |
1736 | entry = pmd_mkhuge(entry); | 1744 | entry = pmd_mkhuge(entry); |
1737 | 1745 | ||
1738 | page_add_new_anon_rmap(new_page, vma, haddr); | 1746 | pmdp_clear_flush(vma, haddr, pmd); |
1739 | |||
1740 | set_pmd_at(mm, haddr, pmd, entry); | 1747 | set_pmd_at(mm, haddr, pmd, entry); |
1748 | page_add_new_anon_rmap(new_page, vma, haddr); | ||
1741 | update_mmu_cache_pmd(vma, address, &entry); | 1749 | update_mmu_cache_pmd(vma, address, &entry); |
1742 | page_remove_rmap(page); | 1750 | page_remove_rmap(page); |
1743 | /* | 1751 | /* |
@@ -1746,7 +1754,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, | |||
1746 | * before it's fully transferred to the new page. | 1754 | * before it's fully transferred to the new page. |
1747 | */ | 1755 | */ |
1748 | mem_cgroup_end_migration(memcg, page, new_page, true); | 1756 | mem_cgroup_end_migration(memcg, page, new_page, true); |
1749 | spin_unlock(&mm->page_table_lock); | 1757 | spin_unlock(ptl); |
1750 | 1758 | ||
1751 | unlock_page(new_page); | 1759 | unlock_page(new_page); |
1752 | unlock_page(page); | 1760 | unlock_page(page); |
@@ -1756,7 +1764,6 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, | |||
1756 | count_vm_events(PGMIGRATE_SUCCESS, HPAGE_PMD_NR); | 1764 | count_vm_events(PGMIGRATE_SUCCESS, HPAGE_PMD_NR); |
1757 | count_vm_numa_events(NUMA_PAGE_MIGRATE, HPAGE_PMD_NR); | 1765 | count_vm_numa_events(NUMA_PAGE_MIGRATE, HPAGE_PMD_NR); |
1758 | 1766 | ||
1759 | out: | ||
1760 | mod_zone_page_state(page_zone(page), | 1767 | mod_zone_page_state(page_zone(page), |
1761 | NR_ISOLATED_ANON + page_lru, | 1768 | NR_ISOLATED_ANON + page_lru, |
1762 | -HPAGE_PMD_NR); | 1769 | -HPAGE_PMD_NR); |
@@ -1765,6 +1772,10 @@ out: | |||
1765 | out_fail: | 1772 | out_fail: |
1766 | count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR); | 1773 | count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR); |
1767 | out_dropref: | 1774 | out_dropref: |
1775 | entry = pmd_mknonnuma(entry); | ||
1776 | set_pmd_at(mm, haddr, pmd, entry); | ||
1777 | update_mmu_cache_pmd(vma, address, &entry); | ||
1778 | |||
1768 | unlock_page(page); | 1779 | unlock_page(page); |
1769 | put_page(page); | 1780 | put_page(page); |
1770 | return 0; | 1781 | return 0; |
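
The mm/migrate.c hunk above narrows the old "mapcount != 1" bail-out: NUMA-hinting migration now only skips pages that are file-backed, executable and mapped by more than one process, i.e. almost certainly shared-library text. A minimal userspace sketch of that predicate, using invented stand-in types rather than the kernel's struct page and vm_area_struct:

#include <stdbool.h>

/* Illustrative stand-ins for the kernel state the check reads. */
struct fake_page { int mapcount; bool file_backed; };
struct fake_vma  { bool exec; };

/* Mirror of the new condition: leave shared, executable, file-backed
 * pages (probably library text) where they are. */
static bool skip_numa_migration(const struct fake_page *page,
                                const struct fake_vma *vma)
{
        return page->mapcount != 1 && page->file_backed && vma->exec;
}
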
diff --git a/mm/mlock.c b/mm/mlock.c index d63802663242..d480cd6fc475 100644 --- a/mm/mlock.c +++ b/mm/mlock.c | |||
@@ -379,10 +379,14 @@ static unsigned long __munlock_pagevec_fill(struct pagevec *pvec, | |||
379 | 379 | ||
380 | /* | 380 | /* |
381 | * Initialize pte walk starting at the already pinned page where we | 381 | * Initialize pte walk starting at the already pinned page where we |
382 | * are sure that there is a pte. | 382 | * are sure that there is a pte, as it was pinned under the same |
383 | * mmap_sem write op. | ||
383 | */ | 384 | */ |
384 | pte = get_locked_pte(vma->vm_mm, start, &ptl); | 385 | pte = get_locked_pte(vma->vm_mm, start, &ptl); |
385 | end = min(end, pmd_addr_end(start, end)); | 386 | /* Make sure we do not cross the page table boundary */ |
387 | end = pgd_addr_end(start, end); | ||
388 | end = pud_addr_end(start, end); | ||
389 | end = pmd_addr_end(start, end); | ||
386 | 390 | ||
387 | /* The page next to the pinned page is the first we will try to get */ | 391 | /* The page next to the pinned page is the first we will try to get */ |
388 | start += PAGE_SIZE; | 392 | start += PAGE_SIZE; |
@@ -736,6 +740,7 @@ static int do_mlockall(int flags) | |||
736 | 740 | ||
737 | /* Ignore errors */ | 741 | /* Ignore errors */ |
738 | mlock_fixup(vma, &prev, vma->vm_start, vma->vm_end, newflags); | 742 | mlock_fixup(vma, &prev, vma->vm_start, vma->vm_end, newflags); |
743 | cond_resched(); | ||
739 | } | 744 | } |
740 | out: | 745 | out: |
741 | return 0; | 746 | return 0; |
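
The __munlock_pagevec_fill() change above clamps the walk to the current pgd/pud/pmd so the pte pointer obtained from get_locked_pte() is never advanced past the page table it came from. The kernel's pgd/pud/pmd_addr_end() helpers all reduce to the same arithmetic: round the address up to the next table boundary and take whichever of that and the requested end comes first. A hedged userspace sketch of that arithmetic, assuming a 2MB boundary (a PMD with 4K pages on x86-64):

#include <stdio.h>

/* Assumed boundary size for illustration; the kernel macros use
 * PGDIR_SIZE / PUD_SIZE / PMD_SIZE respectively. */
#define TBL_SIZE (2UL << 20)
#define TBL_MASK (~(TBL_SIZE - 1))

static unsigned long tbl_addr_end(unsigned long addr, unsigned long end)
{
        unsigned long boundary = (addr + TBL_SIZE) & TBL_MASK;

        /* The "- 1" trick copes with boundary wrapping to 0, as in the
         * kernel macros. */
        return boundary - 1 < end - 1 ? boundary : end;
}

int main(void)
{
        unsigned long start = 0x1ff000, end = 0x400000;

        /* The walk must stop at 0x200000, the next 2MB boundary. */
        printf("clamped end: %#lx\n", tbl_addr_end(start, end));
        return 0;
}
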
diff --git a/mm/mm_init.c b/mm/mm_init.c index 633c08863fd8..68562e92d50c 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c | |||
@@ -71,26 +71,26 @@ void __init mminit_verify_pageflags_layout(void) | |||
71 | unsigned long or_mask, add_mask; | 71 | unsigned long or_mask, add_mask; |
72 | 72 | ||
73 | shift = 8 * sizeof(unsigned long); | 73 | shift = 8 * sizeof(unsigned long); |
74 | width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH - LAST_NID_SHIFT; | 74 | width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH - LAST_CPUPID_SHIFT; |
75 | mminit_dprintk(MMINIT_TRACE, "pageflags_layout_widths", | 75 | mminit_dprintk(MMINIT_TRACE, "pageflags_layout_widths", |
76 | "Section %d Node %d Zone %d Lastnid %d Flags %d\n", | 76 | "Section %d Node %d Zone %d Lastcpupid %d Flags %d\n", |
77 | SECTIONS_WIDTH, | 77 | SECTIONS_WIDTH, |
78 | NODES_WIDTH, | 78 | NODES_WIDTH, |
79 | ZONES_WIDTH, | 79 | ZONES_WIDTH, |
80 | LAST_NID_WIDTH, | 80 | LAST_CPUPID_WIDTH, |
81 | NR_PAGEFLAGS); | 81 | NR_PAGEFLAGS); |
82 | mminit_dprintk(MMINIT_TRACE, "pageflags_layout_shifts", | 82 | mminit_dprintk(MMINIT_TRACE, "pageflags_layout_shifts", |
83 | "Section %d Node %d Zone %d Lastnid %d\n", | 83 | "Section %d Node %d Zone %d Lastcpupid %d\n", |
84 | SECTIONS_SHIFT, | 84 | SECTIONS_SHIFT, |
85 | NODES_SHIFT, | 85 | NODES_SHIFT, |
86 | ZONES_SHIFT, | 86 | ZONES_SHIFT, |
87 | LAST_NID_SHIFT); | 87 | LAST_CPUPID_SHIFT); |
88 | mminit_dprintk(MMINIT_TRACE, "pageflags_layout_pgshifts", | 88 | mminit_dprintk(MMINIT_TRACE, "pageflags_layout_pgshifts", |
89 | "Section %lu Node %lu Zone %lu Lastnid %lu\n", | 89 | "Section %lu Node %lu Zone %lu Lastcpupid %lu\n", |
90 | (unsigned long)SECTIONS_PGSHIFT, | 90 | (unsigned long)SECTIONS_PGSHIFT, |
91 | (unsigned long)NODES_PGSHIFT, | 91 | (unsigned long)NODES_PGSHIFT, |
92 | (unsigned long)ZONES_PGSHIFT, | 92 | (unsigned long)ZONES_PGSHIFT, |
93 | (unsigned long)LAST_NID_PGSHIFT); | 93 | (unsigned long)LAST_CPUPID_PGSHIFT); |
94 | mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodezoneid", | 94 | mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodezoneid", |
95 | "Node/Zone ID: %lu -> %lu\n", | 95 | "Node/Zone ID: %lu -> %lu\n", |
96 | (unsigned long)(ZONEID_PGOFF + ZONEID_SHIFT), | 96 | (unsigned long)(ZONEID_PGOFF + ZONEID_SHIFT), |
@@ -102,9 +102,9 @@ void __init mminit_verify_pageflags_layout(void) | |||
102 | mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodeflags", | 102 | mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodeflags", |
103 | "Node not in page flags"); | 103 | "Node not in page flags"); |
104 | #endif | 104 | #endif |
105 | #ifdef LAST_NID_NOT_IN_PAGE_FLAGS | 105 | #ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS |
106 | mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodeflags", | 106 | mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodeflags", |
107 | "Last nid not in page flags"); | 107 | "Last cpupid not in page flags"); |
108 | #endif | 108 | #endif |
109 | 109 | ||
110 | if (SECTIONS_WIDTH) { | 110 | if (SECTIONS_WIDTH) { |
diff --git a/mm/mmap.c b/mm/mmap.c --- a/mm/mmap.c +++ b/mm/mmap.c | |||
@@ -179,14 +179,12 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) | |||
179 | goto error; | 179 | goto error; |
180 | } | 180 | } |
181 | 181 | ||
182 | allowed = (totalram_pages - hugetlb_total_pages()) | 182 | allowed = vm_commit_limit(); |
183 | * sysctl_overcommit_ratio / 100; | ||
184 | /* | 183 | /* |
185 | * Reserve some for root | 184 | * Reserve some for root |
186 | */ | 185 | */ |
187 | if (!cap_sys_admin) | 186 | if (!cap_sys_admin) |
188 | allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10); | 187 | allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10); |
189 | allowed += total_swap_pages; | ||
190 | 188 | ||
191 | /* | 189 | /* |
192 | * Don't let a single process grow so big a user can't recover | 190 | * Don't let a single process grow so big a user can't recover |
@@ -1299,7 +1297,7 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, | |||
1299 | vm_flags &= ~VM_MAYEXEC; | 1297 | vm_flags &= ~VM_MAYEXEC; |
1300 | } | 1298 | } |
1301 | 1299 | ||
1302 | if (!file->f_op || !file->f_op->mmap) | 1300 | if (!file->f_op->mmap) |
1303 | return -ENODEV; | 1301 | return -ENODEV; |
1304 | if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP)) | 1302 | if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP)) |
1305 | return -EINVAL; | 1303 | return -EINVAL; |
@@ -1856,7 +1854,7 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr, | |||
1856 | struct vm_area_struct *vma; | 1854 | struct vm_area_struct *vma; |
1857 | struct vm_unmapped_area_info info; | 1855 | struct vm_unmapped_area_info info; |
1858 | 1856 | ||
1859 | if (len > TASK_SIZE) | 1857 | if (len > TASK_SIZE - mmap_min_addr) |
1860 | return -ENOMEM; | 1858 | return -ENOMEM; |
1861 | 1859 | ||
1862 | if (flags & MAP_FIXED) | 1860 | if (flags & MAP_FIXED) |
@@ -1865,14 +1863,14 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr, | |||
1865 | if (addr) { | 1863 | if (addr) { |
1866 | addr = PAGE_ALIGN(addr); | 1864 | addr = PAGE_ALIGN(addr); |
1867 | vma = find_vma(mm, addr); | 1865 | vma = find_vma(mm, addr); |
1868 | if (TASK_SIZE - len >= addr && | 1866 | if (TASK_SIZE - len >= addr && addr >= mmap_min_addr && |
1869 | (!vma || addr + len <= vma->vm_start)) | 1867 | (!vma || addr + len <= vma->vm_start)) |
1870 | return addr; | 1868 | return addr; |
1871 | } | 1869 | } |
1872 | 1870 | ||
1873 | info.flags = 0; | 1871 | info.flags = 0; |
1874 | info.length = len; | 1872 | info.length = len; |
1875 | info.low_limit = TASK_UNMAPPED_BASE; | 1873 | info.low_limit = mm->mmap_base; |
1876 | info.high_limit = TASK_SIZE; | 1874 | info.high_limit = TASK_SIZE; |
1877 | info.align_mask = 0; | 1875 | info.align_mask = 0; |
1878 | return vm_unmapped_area(&info); | 1876 | return vm_unmapped_area(&info); |
@@ -1895,7 +1893,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, | |||
1895 | struct vm_unmapped_area_info info; | 1893 | struct vm_unmapped_area_info info; |
1896 | 1894 | ||
1897 | /* requested length too big for entire address space */ | 1895 | /* requested length too big for entire address space */ |
1898 | if (len > TASK_SIZE) | 1896 | if (len > TASK_SIZE - mmap_min_addr) |
1899 | return -ENOMEM; | 1897 | return -ENOMEM; |
1900 | 1898 | ||
1901 | if (flags & MAP_FIXED) | 1899 | if (flags & MAP_FIXED) |
@@ -1905,14 +1903,14 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, | |||
1905 | if (addr) { | 1903 | if (addr) { |
1906 | addr = PAGE_ALIGN(addr); | 1904 | addr = PAGE_ALIGN(addr); |
1907 | vma = find_vma(mm, addr); | 1905 | vma = find_vma(mm, addr); |
1908 | if (TASK_SIZE - len >= addr && | 1906 | if (TASK_SIZE - len >= addr && addr >= mmap_min_addr && |
1909 | (!vma || addr + len <= vma->vm_start)) | 1907 | (!vma || addr + len <= vma->vm_start)) |
1910 | return addr; | 1908 | return addr; |
1911 | } | 1909 | } |
1912 | 1910 | ||
1913 | info.flags = VM_UNMAPPED_AREA_TOPDOWN; | 1911 | info.flags = VM_UNMAPPED_AREA_TOPDOWN; |
1914 | info.length = len; | 1912 | info.length = len; |
1915 | info.low_limit = PAGE_SIZE; | 1913 | info.low_limit = max(PAGE_SIZE, mmap_min_addr); |
1916 | info.high_limit = mm->mmap_base; | 1914 | info.high_limit = mm->mmap_base; |
1917 | info.align_mask = 0; | 1915 | info.align_mask = 0; |
1918 | addr = vm_unmapped_area(&info); | 1916 | addr = vm_unmapped_area(&info); |
@@ -1951,7 +1949,7 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, | |||
1951 | return -ENOMEM; | 1949 | return -ENOMEM; |
1952 | 1950 | ||
1953 | get_area = current->mm->get_unmapped_area; | 1951 | get_area = current->mm->get_unmapped_area; |
1954 | if (file && file->f_op && file->f_op->get_unmapped_area) | 1952 | if (file && file->f_op->get_unmapped_area) |
1955 | get_area = file->f_op->get_unmapped_area; | 1953 | get_area = file->f_op->get_unmapped_area; |
1956 | addr = get_area(file, addr, len, pgoff, flags); | 1954 | addr = get_area(file, addr, len, pgoff, flags); |
1957 | if (IS_ERR_VALUE(addr)) | 1955 | if (IS_ERR_VALUE(addr)) |
@@ -2726,7 +2724,8 @@ void exit_mmap(struct mm_struct *mm) | |||
2726 | } | 2724 | } |
2727 | vm_unacct_memory(nr_accounted); | 2725 | vm_unacct_memory(nr_accounted); |
2728 | 2726 | ||
2729 | WARN_ON(mm->nr_ptes > (FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT); | 2727 | WARN_ON(atomic_long_read(&mm->nr_ptes) > |
2728 | (FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT); | ||
2730 | } | 2729 | } |
2731 | 2730 | ||
2732 | /* Insert vm structure into process list sorted by address | 2731 | /* Insert vm structure into process list sorted by address |
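
The arch_get_unmapped_area*() hunks above keep generic mmap() searches above mmap_min_addr: the length check becomes len > TASK_SIZE - mmap_min_addr, an explicit address hint must itself be at or above mmap_min_addr, and the bottom-up/top-down search limits are raised to match. A small sketch of the bounds test, with made-up constants standing in for the arch's TASK_SIZE and the vm.mmap_min_addr sysctl:

#include <stdbool.h>

/* Illustrative values only; the real limits come from the arch and
 * from the vm.mmap_min_addr sysctl. */
#define FAKE_TASK_SIZE     (1UL << 47)
#define FAKE_MMAP_MIN_ADDR (64UL << 10)

/* Can a hinted mapping [addr, addr + len) be honoured at all? */
static bool hint_in_bounds(unsigned long addr, unsigned long len)
{
        if (len > FAKE_TASK_SIZE - FAKE_MMAP_MIN_ADDR)
                return false;                  /* request too large */
        return addr >= FAKE_MMAP_MIN_ADDR &&
               FAKE_TASK_SIZE - len >= addr;   /* fits below TASK_SIZE */
}
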
diff --git a/mm/mmzone.c b/mm/mmzone.c index 2ac0afbd68f3..bf34fb8556db 100644 --- a/mm/mmzone.c +++ b/mm/mmzone.c | |||
@@ -97,20 +97,20 @@ void lruvec_init(struct lruvec *lruvec) | |||
97 | INIT_LIST_HEAD(&lruvec->lists[lru]); | 97 | INIT_LIST_HEAD(&lruvec->lists[lru]); |
98 | } | 98 | } |
99 | 99 | ||
100 | #if defined(CONFIG_NUMA_BALANCING) && !defined(LAST_NID_NOT_IN_PAGE_FLAGS) | 100 | #if defined(CONFIG_NUMA_BALANCING) && !defined(LAST_CPUPID_NOT_IN_PAGE_FLAGS) |
101 | int page_nid_xchg_last(struct page *page, int nid) | 101 | int page_cpupid_xchg_last(struct page *page, int cpupid) |
102 | { | 102 | { |
103 | unsigned long old_flags, flags; | 103 | unsigned long old_flags, flags; |
104 | int last_nid; | 104 | int last_cpupid; |
105 | 105 | ||
106 | do { | 106 | do { |
107 | old_flags = flags = page->flags; | 107 | old_flags = flags = page->flags; |
108 | last_nid = page_nid_last(page); | 108 | last_cpupid = page_cpupid_last(page); |
109 | 109 | ||
110 | flags &= ~(LAST_NID_MASK << LAST_NID_PGSHIFT); | 110 | flags &= ~(LAST_CPUPID_MASK << LAST_CPUPID_PGSHIFT); |
111 | flags |= (nid & LAST_NID_MASK) << LAST_NID_PGSHIFT; | 111 | flags |= (cpupid & LAST_CPUPID_MASK) << LAST_CPUPID_PGSHIFT; |
112 | } while (unlikely(cmpxchg(&page->flags, old_flags, flags) != old_flags)); | 112 | } while (unlikely(cmpxchg(&page->flags, old_flags, flags) != old_flags)); |
113 | 113 | ||
114 | return last_nid; | 114 | return last_cpupid; |
115 | } | 115 | } |
116 | #endif | 116 | #endif |
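
page_cpupid_xchg_last() (the renamed page_nid_xchg_last()) packs the last CPU+PID value into a bit-field of page->flags and swaps it in with a lock-free compare-and-exchange retry loop. A rough equivalent using C11 atomics, with assumed mask and shift values rather than the kernel's LAST_CPUPID_* constants:

#include <stdatomic.h>

/* Assumed layout for illustration: 12 bits of "cpupid" at bit 40. */
#define CPUPID_SHIFT 40
#define CPUPID_MASK  ((1UL << 12) - 1)

static int flags_cpupid_xchg_last(_Atomic unsigned long *flags, int cpupid)
{
        unsigned long old, new;

        old = atomic_load(flags);
        do {
                /* Rebuild the field from the freshest snapshot of flags. */
                new = old & ~(CPUPID_MASK << CPUPID_SHIFT);
                new |= ((unsigned long)cpupid & CPUPID_MASK) << CPUPID_SHIFT;
        } while (!atomic_compare_exchange_weak(flags, &old, new));

        /* Return the previous value, as the kernel helper does. */
        return (int)((old >> CPUPID_SHIFT) & CPUPID_MASK);
}
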
diff --git a/mm/mprotect.c b/mm/mprotect.c index 94722a4d6b43..26667971c824 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c | |||
@@ -37,14 +37,12 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot) | |||
37 | 37 | ||
38 | static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, | 38 | static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, |
39 | unsigned long addr, unsigned long end, pgprot_t newprot, | 39 | unsigned long addr, unsigned long end, pgprot_t newprot, |
40 | int dirty_accountable, int prot_numa, bool *ret_all_same_node) | 40 | int dirty_accountable, int prot_numa) |
41 | { | 41 | { |
42 | struct mm_struct *mm = vma->vm_mm; | 42 | struct mm_struct *mm = vma->vm_mm; |
43 | pte_t *pte, oldpte; | 43 | pte_t *pte, oldpte; |
44 | spinlock_t *ptl; | 44 | spinlock_t *ptl; |
45 | unsigned long pages = 0; | 45 | unsigned long pages = 0; |
46 | bool all_same_node = true; | ||
47 | int last_nid = -1; | ||
48 | 46 | ||
49 | pte = pte_offset_map_lock(mm, pmd, addr, &ptl); | 47 | pte = pte_offset_map_lock(mm, pmd, addr, &ptl); |
50 | arch_enter_lazy_mmu_mode(); | 48 | arch_enter_lazy_mmu_mode(); |
@@ -63,15 +61,7 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, | |||
63 | 61 | ||
64 | page = vm_normal_page(vma, addr, oldpte); | 62 | page = vm_normal_page(vma, addr, oldpte); |
65 | if (page) { | 63 | if (page) { |
66 | int this_nid = page_to_nid(page); | 64 | if (!pte_numa(oldpte)) { |
67 | if (last_nid == -1) | ||
68 | last_nid = this_nid; | ||
69 | if (last_nid != this_nid) | ||
70 | all_same_node = false; | ||
71 | |||
72 | /* only check non-shared pages */ | ||
73 | if (!pte_numa(oldpte) && | ||
74 | page_mapcount(page) == 1) { | ||
75 | ptent = pte_mknuma(ptent); | 65 | ptent = pte_mknuma(ptent); |
76 | updated = true; | 66 | updated = true; |
77 | } | 67 | } |
@@ -94,40 +84,27 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, | |||
94 | swp_entry_t entry = pte_to_swp_entry(oldpte); | 84 | swp_entry_t entry = pte_to_swp_entry(oldpte); |
95 | 85 | ||
96 | if (is_write_migration_entry(entry)) { | 86 | if (is_write_migration_entry(entry)) { |
87 | pte_t newpte; | ||
97 | /* | 88 | /* |
98 | * A protection check is difficult so | 89 | * A protection check is difficult so |
99 | * just be safe and disable write | 90 | * just be safe and disable write |
100 | */ | 91 | */ |
101 | make_migration_entry_read(&entry); | 92 | make_migration_entry_read(&entry); |
102 | set_pte_at(mm, addr, pte, | 93 | newpte = swp_entry_to_pte(entry); |
103 | swp_entry_to_pte(entry)); | 94 | if (pte_swp_soft_dirty(oldpte)) |
95 | newpte = pte_swp_mksoft_dirty(newpte); | ||
96 | set_pte_at(mm, addr, pte, newpte); | ||
97 | |||
98 | pages++; | ||
104 | } | 99 | } |
105 | pages++; | ||
106 | } | 100 | } |
107 | } while (pte++, addr += PAGE_SIZE, addr != end); | 101 | } while (pte++, addr += PAGE_SIZE, addr != end); |
108 | arch_leave_lazy_mmu_mode(); | 102 | arch_leave_lazy_mmu_mode(); |
109 | pte_unmap_unlock(pte - 1, ptl); | 103 | pte_unmap_unlock(pte - 1, ptl); |
110 | 104 | ||
111 | *ret_all_same_node = all_same_node; | ||
112 | return pages; | 105 | return pages; |
113 | } | 106 | } |
114 | 107 | ||
115 | #ifdef CONFIG_NUMA_BALANCING | ||
116 | static inline void change_pmd_protnuma(struct mm_struct *mm, unsigned long addr, | ||
117 | pmd_t *pmd) | ||
118 | { | ||
119 | spin_lock(&mm->page_table_lock); | ||
120 | set_pmd_at(mm, addr & PMD_MASK, pmd, pmd_mknuma(*pmd)); | ||
121 | spin_unlock(&mm->page_table_lock); | ||
122 | } | ||
123 | #else | ||
124 | static inline void change_pmd_protnuma(struct mm_struct *mm, unsigned long addr, | ||
125 | pmd_t *pmd) | ||
126 | { | ||
127 | BUG(); | ||
128 | } | ||
129 | #endif /* CONFIG_NUMA_BALANCING */ | ||
130 | |||
131 | static inline unsigned long change_pmd_range(struct vm_area_struct *vma, | 108 | static inline unsigned long change_pmd_range(struct vm_area_struct *vma, |
132 | pud_t *pud, unsigned long addr, unsigned long end, | 109 | pud_t *pud, unsigned long addr, unsigned long end, |
133 | pgprot_t newprot, int dirty_accountable, int prot_numa) | 110 | pgprot_t newprot, int dirty_accountable, int prot_numa) |
@@ -135,36 +112,39 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, | |||
135 | pmd_t *pmd; | 112 | pmd_t *pmd; |
136 | unsigned long next; | 113 | unsigned long next; |
137 | unsigned long pages = 0; | 114 | unsigned long pages = 0; |
138 | bool all_same_node; | 115 | unsigned long nr_huge_updates = 0; |
139 | 116 | ||
140 | pmd = pmd_offset(pud, addr); | 117 | pmd = pmd_offset(pud, addr); |
141 | do { | 118 | do { |
119 | unsigned long this_pages; | ||
120 | |||
142 | next = pmd_addr_end(addr, end); | 121 | next = pmd_addr_end(addr, end); |
143 | if (pmd_trans_huge(*pmd)) { | 122 | if (pmd_trans_huge(*pmd)) { |
144 | if (next - addr != HPAGE_PMD_SIZE) | 123 | if (next - addr != HPAGE_PMD_SIZE) |
145 | split_huge_page_pmd(vma, addr, pmd); | 124 | split_huge_page_pmd(vma, addr, pmd); |
146 | else if (change_huge_pmd(vma, pmd, addr, newprot, | 125 | else { |
147 | prot_numa)) { | 126 | int nr_ptes = change_huge_pmd(vma, pmd, addr, |
148 | pages += HPAGE_PMD_NR; | 127 | newprot, prot_numa); |
149 | continue; | 128 | |
129 | if (nr_ptes) { | ||
130 | if (nr_ptes == HPAGE_PMD_NR) { | ||
131 | pages += HPAGE_PMD_NR; | ||
132 | nr_huge_updates++; | ||
133 | } | ||
134 | continue; | ||
135 | } | ||
150 | } | 136 | } |
151 | /* fall through */ | 137 | /* fall through */ |
152 | } | 138 | } |
153 | if (pmd_none_or_clear_bad(pmd)) | 139 | if (pmd_none_or_clear_bad(pmd)) |
154 | continue; | 140 | continue; |
155 | pages += change_pte_range(vma, pmd, addr, next, newprot, | 141 | this_pages = change_pte_range(vma, pmd, addr, next, newprot, |
156 | dirty_accountable, prot_numa, &all_same_node); | 142 | dirty_accountable, prot_numa); |
157 | 143 | pages += this_pages; | |
158 | /* | ||
159 | * If we are changing protections for NUMA hinting faults then | ||
160 | * set pmd_numa if the examined pages were all on the same | ||
161 | * node. This allows a regular PMD to be handled as one fault | ||
162 | * and effectively batches the taking of the PTL | ||
163 | */ | ||
164 | if (prot_numa && all_same_node) | ||
165 | change_pmd_protnuma(vma->vm_mm, addr, pmd); | ||
166 | } while (pmd++, addr = next, addr != end); | 144 | } while (pmd++, addr = next, addr != end); |
167 | 145 | ||
146 | if (nr_huge_updates) | ||
147 | count_vm_numa_events(NUMA_HUGE_PTE_UPDATES, nr_huge_updates); | ||
168 | return pages; | 148 | return pages; |
169 | } | 149 | } |
170 | 150 | ||
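
One detail of the change_pte_range() hunk above: when a writable migration entry is downgraded to read-only during mprotect(), the new code rebuilds the swap pte and then explicitly carries the soft-dirty tracking bit over, instead of losing it. A toy bit-field sketch of that carry-over, in spirit only and with invented flag bits:

#include <stdbool.h>

/* Toy "swap pte": bit 0 = write-migration entry, bit 1 = soft-dirty. */
#define TOY_WRITE_MIG  (1u << 0)
#define TOY_SOFT_DIRTY (1u << 1)

static unsigned int make_readonly_migration_pte(unsigned int oldpte)
{
        /* Rebuild from scratch, as swp_entry_to_pte() effectively does. */
        unsigned int newpte = oldpte & ~(TOY_WRITE_MIG | TOY_SOFT_DIRTY);

        if (oldpte & TOY_SOFT_DIRTY)
                newpte |= TOY_SOFT_DIRTY;   /* preserve the tracking bit */
        return newpte;
}
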
diff --git a/mm/mremap.c b/mm/mremap.c index 91b13d6a16d4..0843feb66f3d 100644 --- a/mm/mremap.c +++ b/mm/mremap.c | |||
@@ -25,7 +25,6 @@ | |||
25 | #include <asm/uaccess.h> | 25 | #include <asm/uaccess.h> |
26 | #include <asm/cacheflush.h> | 26 | #include <asm/cacheflush.h> |
27 | #include <asm/tlbflush.h> | 27 | #include <asm/tlbflush.h> |
28 | #include <asm/pgalloc.h> | ||
29 | 28 | ||
30 | #include "internal.h" | 29 | #include "internal.h" |
31 | 30 | ||
@@ -63,10 +62,8 @@ static pmd_t *alloc_new_pmd(struct mm_struct *mm, struct vm_area_struct *vma, | |||
63 | return NULL; | 62 | return NULL; |
64 | 63 | ||
65 | pmd = pmd_alloc(mm, pud, addr); | 64 | pmd = pmd_alloc(mm, pud, addr); |
66 | if (!pmd) { | 65 | if (!pmd) |
67 | pud_free(mm, pud); | ||
68 | return NULL; | 66 | return NULL; |
69 | } | ||
70 | 67 | ||
71 | VM_BUG_ON(pmd_trans_huge(*pmd)); | 68 | VM_BUG_ON(pmd_trans_huge(*pmd)); |
72 | 69 | ||
diff --git a/mm/nobootmem.c b/mm/nobootmem.c index 61107cf55bb3..2c254d374655 100644 --- a/mm/nobootmem.c +++ b/mm/nobootmem.c | |||
@@ -82,27 +82,18 @@ void __init free_bootmem_late(unsigned long addr, unsigned long size) | |||
82 | 82 | ||
83 | static void __init __free_pages_memory(unsigned long start, unsigned long end) | 83 | static void __init __free_pages_memory(unsigned long start, unsigned long end) |
84 | { | 84 | { |
85 | unsigned long i, start_aligned, end_aligned; | 85 | int order; |
86 | int order = ilog2(BITS_PER_LONG); | ||
87 | 86 | ||
88 | start_aligned = (start + (BITS_PER_LONG - 1)) & ~(BITS_PER_LONG - 1); | 87 | while (start < end) { |
89 | end_aligned = end & ~(BITS_PER_LONG - 1); | 88 | order = min(MAX_ORDER - 1UL, __ffs(start)); |
90 | 89 | ||
91 | if (end_aligned <= start_aligned) { | 90 | while (start + (1UL << order) > end) |
92 | for (i = start; i < end; i++) | 91 | order--; |
93 | __free_pages_bootmem(pfn_to_page(i), 0); | ||
94 | 92 | ||
95 | return; | 93 | __free_pages_bootmem(pfn_to_page(start), order); |
96 | } | ||
97 | |||
98 | for (i = start; i < start_aligned; i++) | ||
99 | __free_pages_bootmem(pfn_to_page(i), 0); | ||
100 | 94 | ||
101 | for (i = start_aligned; i < end_aligned; i += BITS_PER_LONG) | 95 | start += (1UL << order); |
102 | __free_pages_bootmem(pfn_to_page(i), order); | 96 | } |
103 | |||
104 | for (i = end_aligned; i < end; i++) | ||
105 | __free_pages_bootmem(pfn_to_page(i), 0); | ||
106 | } | 97 | } |
107 | 98 | ||
108 | static unsigned long __init __free_memory_core(phys_addr_t start, | 99 | static unsigned long __init __free_memory_core(phys_addr_t start, |
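
The rewritten __free_pages_memory() above walks an arbitrary PFN range and, at each step, frees the largest buddy-order block that is both naturally aligned at the current start (order = min(MAX_ORDER - 1, __ffs(start))) and still fits before end. A standalone sketch of just that order-selection loop, printing the blocks it would free; the constants and the __builtin_ctzl() stand-in for __ffs() are illustrative:

#include <stdio.h>

#define FAKE_MAX_ORDER 11   /* illustrative, like the common kernel default */

static int lowest_set_bit(unsigned long x)   /* __ffs() equivalent, x != 0 */
{
        return __builtin_ctzl(x);
}

static void free_range(unsigned long start, unsigned long end)
{
        while (start < end) {
                int order = start ? lowest_set_bit(start) : FAKE_MAX_ORDER - 1;

                if (order > FAKE_MAX_ORDER - 1)
                        order = FAKE_MAX_ORDER - 1;
                /* Shrink until the block fits inside the range. */
                while (start + (1UL << order) > end)
                        order--;

                printf("free pfn %lu, order %d\n", start, order);
                start += 1UL << order;
        }
}

int main(void)
{
        /* Frees progressively larger aligned blocks, then smaller ones
         * near the end of the range. */
        free_range(5, 70);
        return 0;
}
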
diff --git a/mm/nommu.c b/mm/nommu.c index ecd1f158548e..fec093adad9c 100644 --- a/mm/nommu.c +++ b/mm/nommu.c | |||
@@ -937,7 +937,7 @@ static int validate_mmap_request(struct file *file, | |||
937 | struct address_space *mapping; | 937 | struct address_space *mapping; |
938 | 938 | ||
939 | /* files must support mmap */ | 939 | /* files must support mmap */ |
940 | if (!file->f_op || !file->f_op->mmap) | 940 | if (!file->f_op->mmap) |
941 | return -ENODEV; | 941 | return -ENODEV; |
942 | 942 | ||
943 | /* work out if what we've got could possibly be shared | 943 | /* work out if what we've got could possibly be shared |
@@ -1948,13 +1948,12 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) | |||
1948 | goto error; | 1948 | goto error; |
1949 | } | 1949 | } |
1950 | 1950 | ||
1951 | allowed = totalram_pages * sysctl_overcommit_ratio / 100; | 1951 | allowed = vm_commit_limit(); |
1952 | /* | 1952 | /* |
1953 | * Reserve some 3% for root | 1953 | * Reserve some 3% for root |
1954 | */ | 1954 | */ |
1955 | if (!cap_sys_admin) | 1955 | if (!cap_sys_admin) |
1956 | allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10); | 1956 | allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10); |
1957 | allowed += total_swap_pages; | ||
1958 | 1957 | ||
1959 | /* | 1958 | /* |
1960 | * Don't let a single process grow so big a user can't recover | 1959 | * Don't let a single process grow so big a user can't recover |
diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 314e9d274381..1e4a600a6163 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c | |||
@@ -161,7 +161,7 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, | |||
161 | * The baseline for the badness score is the proportion of RAM that each | 161 | * The baseline for the badness score is the proportion of RAM that each |
162 | * task's rss, pagetable and swap space use. | 162 | * task's rss, pagetable and swap space use. |
163 | */ | 163 | */ |
164 | points = get_mm_rss(p->mm) + p->mm->nr_ptes + | 164 | points = get_mm_rss(p->mm) + atomic_long_read(&p->mm->nr_ptes) + |
165 | get_mm_counter(p->mm, MM_SWAPENTS); | 165 | get_mm_counter(p->mm, MM_SWAPENTS); |
166 | task_unlock(p); | 166 | task_unlock(p); |
167 | 167 | ||
@@ -364,10 +364,10 @@ static void dump_tasks(const struct mem_cgroup *memcg, const nodemask_t *nodemas | |||
364 | continue; | 364 | continue; |
365 | } | 365 | } |
366 | 366 | ||
367 | pr_info("[%5d] %5d %5d %8lu %8lu %7lu %8lu %5hd %s\n", | 367 | pr_info("[%5d] %5d %5d %8lu %8lu %7ld %8lu %5hd %s\n", |
368 | task->pid, from_kuid(&init_user_ns, task_uid(task)), | 368 | task->pid, from_kuid(&init_user_ns, task_uid(task)), |
369 | task->tgid, task->mm->total_vm, get_mm_rss(task->mm), | 369 | task->tgid, task->mm->total_vm, get_mm_rss(task->mm), |
370 | task->mm->nr_ptes, | 370 | atomic_long_read(&task->mm->nr_ptes), |
371 | get_mm_counter(task->mm, MM_SWAPENTS), | 371 | get_mm_counter(task->mm, MM_SWAPENTS), |
372 | task->signal->oom_score_adj, task->comm); | 372 | task->signal->oom_score_adj, task->comm); |
373 | task_unlock(task); | 373 | task_unlock(task); |
@@ -680,7 +680,7 @@ void pagefault_out_of_memory(void) | |||
680 | { | 680 | { |
681 | struct zonelist *zonelist; | 681 | struct zonelist *zonelist; |
682 | 682 | ||
683 | if (mem_cgroup_oom_synchronize()) | 683 | if (mem_cgroup_oom_synchronize(true)) |
684 | return; | 684 | return; |
685 | 685 | ||
686 | zonelist = node_zonelist(first_online_node, GFP_KERNEL); | 686 | zonelist = node_zonelist(first_online_node, GFP_KERNEL); |
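
With the page-table lock split, mm->nr_ptes becomes an atomic_long_t, so the OOM killer above now samples it with atomic_long_read() when computing its badness baseline of rss + page-table pages + swap entries. A toy version of that accounting with C11 atomics and an invented mm stand-in:

#include <stdatomic.h>

/* Invented stand-in for the fields the badness baseline reads. */
struct fake_mm {
        unsigned long rss;        /* resident pages */
        atomic_long   nr_ptes;    /* page-table pages, updated lock-free */
        unsigned long swapents;   /* swap entries */
};

static unsigned long badness_baseline(struct fake_mm *mm)
{
        /* rss + page tables + swap, as in oom_badness() */
        return mm->rss + (unsigned long)atomic_load(&mm->nr_ptes)
                       + mm->swapents;
}
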
diff --git a/mm/page-writeback.c b/mm/page-writeback.c index f5236f804aa6..63807583d8e8 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c | |||
@@ -1210,11 +1210,11 @@ static unsigned long dirty_poll_interval(unsigned long dirty, | |||
1210 | return 1; | 1210 | return 1; |
1211 | } | 1211 | } |
1212 | 1212 | ||
1213 | static long bdi_max_pause(struct backing_dev_info *bdi, | 1213 | static unsigned long bdi_max_pause(struct backing_dev_info *bdi, |
1214 | unsigned long bdi_dirty) | 1214 | unsigned long bdi_dirty) |
1215 | { | 1215 | { |
1216 | long bw = bdi->avg_write_bandwidth; | 1216 | unsigned long bw = bdi->avg_write_bandwidth; |
1217 | long t; | 1217 | unsigned long t; |
1218 | 1218 | ||
1219 | /* | 1219 | /* |
1220 | * Limit pause time for small memory systems. If sleeping for too long | 1220 | * Limit pause time for small memory systems. If sleeping for too long |
@@ -1226,7 +1226,7 @@ static long bdi_max_pause(struct backing_dev_info *bdi, | |||
1226 | t = bdi_dirty / (1 + bw / roundup_pow_of_two(1 + HZ / 8)); | 1226 | t = bdi_dirty / (1 + bw / roundup_pow_of_two(1 + HZ / 8)); |
1227 | t++; | 1227 | t++; |
1228 | 1228 | ||
1229 | return min_t(long, t, MAX_PAUSE); | 1229 | return min_t(unsigned long, t, MAX_PAUSE); |
1230 | } | 1230 | } |
1231 | 1231 | ||
1232 | static long bdi_min_pause(struct backing_dev_info *bdi, | 1232 | static long bdi_min_pause(struct backing_dev_info *bdi, |
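
bdi_max_pause() above switches its local arithmetic to unsigned long: it derives an upper bound on one dirty-throttling sleep from the device's dirty page count and average write bandwidth, then caps it at MAX_PAUSE. A hedged userspace sketch of that computation, with illustrative HZ and cap values:

#include <stdio.h>

#define FAKE_HZ        100UL
#define FAKE_MAX_PAUSE (FAKE_HZ / 5)   /* ~200ms cap, as in the kernel */

static unsigned long round_up_pow2(unsigned long x)
{
        unsigned long r = 1;

        while (r < x)
                r <<= 1;
        return r;
}

/* Upper bound (in jiffies) on one pause: short enough that progress is
 * re-checked several times per second. */
static unsigned long max_pause(unsigned long avg_write_bw,
                               unsigned long bdi_dirty)
{
        unsigned long t;

        t = bdi_dirty / (1 + avg_write_bw / round_up_pow2(1 + FAKE_HZ / 8));
        t++;
        return t < FAKE_MAX_PAUSE ? t : FAKE_MAX_PAUSE;
}
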
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 0ee638f76ebe..580a5f075ed0 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
@@ -234,8 +234,8 @@ int page_group_by_mobility_disabled __read_mostly; | |||
234 | 234 | ||
235 | void set_pageblock_migratetype(struct page *page, int migratetype) | 235 | void set_pageblock_migratetype(struct page *page, int migratetype) |
236 | { | 236 | { |
237 | 237 | if (unlikely(page_group_by_mobility_disabled && | |
238 | if (unlikely(page_group_by_mobility_disabled)) | 238 | migratetype < MIGRATE_PCPTYPES)) |
239 | migratetype = MIGRATE_UNMOVABLE; | 239 | migratetype = MIGRATE_UNMOVABLE; |
240 | 240 | ||
241 | set_pageblock_flags_group(page, (unsigned long)migratetype, | 241 | set_pageblock_flags_group(page, (unsigned long)migratetype, |
@@ -626,7 +626,7 @@ static inline int free_pages_check(struct page *page) | |||
626 | bad_page(page); | 626 | bad_page(page); |
627 | return 1; | 627 | return 1; |
628 | } | 628 | } |
629 | page_nid_reset_last(page); | 629 | page_cpupid_reset_last(page); |
630 | if (page->flags & PAGE_FLAGS_CHECK_AT_PREP) | 630 | if (page->flags & PAGE_FLAGS_CHECK_AT_PREP) |
631 | page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; | 631 | page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; |
632 | return 0; | 632 | return 0; |
@@ -1027,6 +1027,10 @@ static int try_to_steal_freepages(struct zone *zone, struct page *page, | |||
1027 | { | 1027 | { |
1028 | int current_order = page_order(page); | 1028 | int current_order = page_order(page); |
1029 | 1029 | ||
1030 | /* | ||
1031 | * When borrowing from MIGRATE_CMA, we need to release the excess | ||
1032 | * buddy pages to CMA itself. | ||
1033 | */ | ||
1030 | if (is_migrate_cma(fallback_type)) | 1034 | if (is_migrate_cma(fallback_type)) |
1031 | return fallback_type; | 1035 | return fallback_type; |
1032 | 1036 | ||
@@ -1091,21 +1095,11 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype) | |||
1091 | list_del(&page->lru); | 1095 | list_del(&page->lru); |
1092 | rmv_page_order(page); | 1096 | rmv_page_order(page); |
1093 | 1097 | ||
1094 | /* | ||
1095 | * Borrow the excess buddy pages as well, irrespective | ||
1096 | * of whether we stole freepages, or took ownership of | ||
1097 | * the pageblock or not. | ||
1098 | * | ||
1099 | * Exception: When borrowing from MIGRATE_CMA, release | ||
1100 | * the excess buddy pages to CMA itself. | ||
1101 | */ | ||
1102 | expand(zone, page, order, current_order, area, | 1098 | expand(zone, page, order, current_order, area, |
1103 | is_migrate_cma(migratetype) | 1099 | new_type); |
1104 | ? migratetype : start_migratetype); | ||
1105 | 1100 | ||
1106 | trace_mm_page_alloc_extfrag(page, order, | 1101 | trace_mm_page_alloc_extfrag(page, order, current_order, |
1107 | current_order, start_migratetype, migratetype, | 1102 | start_migratetype, migratetype, new_type); |
1108 | new_type == start_migratetype); | ||
1109 | 1103 | ||
1110 | return page; | 1104 | return page; |
1111 | } | 1105 | } |
@@ -1711,7 +1705,7 @@ bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark, | |||
1711 | * comments in mmzone.h. Reduces cache footprint of zonelist scans | 1705 | * comments in mmzone.h. Reduces cache footprint of zonelist scans |
1712 | * that have to skip over a lot of full or unallowed zones. | 1706 | * that have to skip over a lot of full or unallowed zones. |
1713 | * | 1707 | * |
1714 | * If the zonelist cache is present in the passed in zonelist, then | 1708 | * If the zonelist cache is present in the passed zonelist, then |
1715 | * returns a pointer to the allowed node mask (either the current | 1709 | * returns a pointer to the allowed node mask (either the current |
1716 | * tasks mems_allowed, or node_states[N_MEMORY].) | 1710 | * tasks mems_allowed, or node_states[N_MEMORY].) |
1717 | * | 1711 | * |
@@ -2593,7 +2587,7 @@ rebalance: | |||
2593 | * running out of options and have to consider going OOM | 2587 | * running out of options and have to consider going OOM |
2594 | */ | 2588 | */ |
2595 | if (!did_some_progress) { | 2589 | if (!did_some_progress) { |
2596 | if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) { | 2590 | if (oom_gfp_allowed(gfp_mask)) { |
2597 | if (oom_killer_disabled) | 2591 | if (oom_killer_disabled) |
2598 | goto nopage; | 2592 | goto nopage; |
2599 | /* Coredumps can quickly deplete all memory reserves */ | 2593 | /* Coredumps can quickly deplete all memory reserves */ |
@@ -3881,8 +3875,6 @@ static inline unsigned long wait_table_bits(unsigned long size) | |||
3881 | return ffz(~size); | 3875 | return ffz(~size); |
3882 | } | 3876 | } |
3883 | 3877 | ||
3884 | #define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1)) | ||
3885 | |||
3886 | /* | 3878 | /* |
3887 | * Check if a pageblock contains reserved pages | 3879 | * Check if a pageblock contains reserved pages |
3888 | */ | 3880 | */ |
@@ -4015,7 +4007,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, | |||
4015 | mminit_verify_page_links(page, zone, nid, pfn); | 4007 | mminit_verify_page_links(page, zone, nid, pfn); |
4016 | init_page_count(page); | 4008 | init_page_count(page); |
4017 | page_mapcount_reset(page); | 4009 | page_mapcount_reset(page); |
4018 | page_nid_reset_last(page); | 4010 | page_cpupid_reset_last(page); |
4019 | SetPageReserved(page); | 4011 | SetPageReserved(page); |
4020 | /* | 4012 | /* |
4021 | * Mark the block movable so that blocks are reserved for | 4013 | * Mark the block movable so that blocks are reserved for |
@@ -4266,7 +4258,7 @@ static __meminit void zone_pcp_init(struct zone *zone) | |||
4266 | */ | 4258 | */ |
4267 | zone->pageset = &boot_pageset; | 4259 | zone->pageset = &boot_pageset; |
4268 | 4260 | ||
4269 | if (zone->present_pages) | 4261 | if (populated_zone(zone)) |
4270 | printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%u\n", | 4262 | printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%u\n", |
4271 | zone->name, zone->present_pages, | 4263 | zone->name, zone->present_pages, |
4272 | zone_batchsize(zone)); | 4264 | zone_batchsize(zone)); |
@@ -5160,7 +5152,7 @@ static void check_for_memory(pg_data_t *pgdat, int nid) | |||
5160 | 5152 | ||
5161 | for (zone_type = 0; zone_type <= ZONE_MOVABLE - 1; zone_type++) { | 5153 | for (zone_type = 0; zone_type <= ZONE_MOVABLE - 1; zone_type++) { |
5162 | struct zone *zone = &pgdat->node_zones[zone_type]; | 5154 | struct zone *zone = &pgdat->node_zones[zone_type]; |
5163 | if (zone->present_pages) { | 5155 | if (populated_zone(zone)) { |
5164 | node_set_state(nid, N_HIGH_MEMORY); | 5156 | node_set_state(nid, N_HIGH_MEMORY); |
5165 | if (N_NORMAL_MEMORY != N_HIGH_MEMORY && | 5157 | if (N_NORMAL_MEMORY != N_HIGH_MEMORY && |
5166 | zone_type <= ZONE_NORMAL) | 5158 | zone_type <= ZONE_NORMAL) |
@@ -6366,10 +6358,6 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) | |||
6366 | list_del(&page->lru); | 6358 | list_del(&page->lru); |
6367 | rmv_page_order(page); | 6359 | rmv_page_order(page); |
6368 | zone->free_area[order].nr_free--; | 6360 | zone->free_area[order].nr_free--; |
6369 | #ifdef CONFIG_HIGHMEM | ||
6370 | if (PageHighMem(page)) | ||
6371 | totalhigh_pages -= 1 << order; | ||
6372 | #endif | ||
6373 | for (i = 0; i < (1 << order); i++) | 6361 | for (i = 0; i < (1 << order); i++) |
6374 | SetPageReserved((page+i)); | 6362 | SetPageReserved((page+i)); |
6375 | pfn += (1 << order); | 6363 | pfn += (1 << order); |
diff --git a/mm/pagewalk.c b/mm/pagewalk.c index 5da2cbcfdbb5..2beeabf502c5 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c | |||
@@ -242,7 +242,7 @@ int walk_page_range(unsigned long addr, unsigned long end, | |||
242 | if (err) | 242 | if (err) |
243 | break; | 243 | break; |
244 | pgd++; | 244 | pgd++; |
245 | } while (addr = next, addr != end); | 245 | } while (addr = next, addr < end); |
246 | 246 | ||
247 | return err; | 247 | return err; |
248 | } | 248 | } |
diff --git a/mm/percpu.c b/mm/percpu.c index 8c8e08f3a692..0d10defe951e 100644 --- a/mm/percpu.c +++ b/mm/percpu.c | |||
@@ -1706,8 +1706,9 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size, | |||
1706 | 1706 | ||
1707 | out_free_areas: | 1707 | out_free_areas: |
1708 | for (group = 0; group < ai->nr_groups; group++) | 1708 | for (group = 0; group < ai->nr_groups; group++) |
1709 | free_fn(areas[group], | 1709 | if (areas[group]) |
1710 | ai->groups[group].nr_units * ai->unit_size); | 1710 | free_fn(areas[group], |
1711 | ai->groups[group].nr_units * ai->unit_size); | ||
1711 | out_free: | 1712 | out_free: |
1712 | pcpu_free_alloc_info(ai); | 1713 | pcpu_free_alloc_info(ai); |
1713 | if (areas) | 1714 | if (areas) |
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index 3929a40bd6c0..cbb38545d9d6 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c | |||
@@ -151,14 +151,14 @@ void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address, | |||
151 | void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, | 151 | void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, |
152 | pgtable_t pgtable) | 152 | pgtable_t pgtable) |
153 | { | 153 | { |
154 | assert_spin_locked(&mm->page_table_lock); | 154 | assert_spin_locked(pmd_lockptr(mm, pmdp)); |
155 | 155 | ||
156 | /* FIFO */ | 156 | /* FIFO */ |
157 | if (!mm->pmd_huge_pte) | 157 | if (!pmd_huge_pte(mm, pmdp)) |
158 | INIT_LIST_HEAD(&pgtable->lru); | 158 | INIT_LIST_HEAD(&pgtable->lru); |
159 | else | 159 | else |
160 | list_add(&pgtable->lru, &mm->pmd_huge_pte->lru); | 160 | list_add(&pgtable->lru, &pmd_huge_pte(mm, pmdp)->lru); |
161 | mm->pmd_huge_pte = pgtable; | 161 | pmd_huge_pte(mm, pmdp) = pgtable; |
162 | } | 162 | } |
163 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ | 163 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ |
164 | #endif | 164 | #endif |
@@ -170,14 +170,14 @@ pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp) | |||
170 | { | 170 | { |
171 | pgtable_t pgtable; | 171 | pgtable_t pgtable; |
172 | 172 | ||
173 | assert_spin_locked(&mm->page_table_lock); | 173 | assert_spin_locked(pmd_lockptr(mm, pmdp)); |
174 | 174 | ||
175 | /* FIFO */ | 175 | /* FIFO */ |
176 | pgtable = mm->pmd_huge_pte; | 176 | pgtable = pmd_huge_pte(mm, pmdp); |
177 | if (list_empty(&pgtable->lru)) | 177 | if (list_empty(&pgtable->lru)) |
178 | mm->pmd_huge_pte = NULL; | 178 | pmd_huge_pte(mm, pmdp) = NULL; |
179 | else { | 179 | else { |
180 | mm->pmd_huge_pte = list_entry(pgtable->lru.next, | 180 | pmd_huge_pte(mm, pmdp) = list_entry(pgtable->lru.next, |
181 | struct page, lru); | 181 | struct page, lru); |
182 | list_del(&pgtable->lru); | 182 | list_del(&pgtable->lru); |
183 | } | 183 | } |
diff --git a/mm/readahead.c b/mm/readahead.c index e4ed04149785..7cdbb44aa90b 100644 --- a/mm/readahead.c +++ b/mm/readahead.c | |||
@@ -401,6 +401,7 @@ ondemand_readahead(struct address_space *mapping, | |||
401 | unsigned long req_size) | 401 | unsigned long req_size) |
402 | { | 402 | { |
403 | unsigned long max = max_sane_readahead(ra->ra_pages); | 403 | unsigned long max = max_sane_readahead(ra->ra_pages); |
404 | pgoff_t prev_offset; | ||
404 | 405 | ||
405 | /* | 406 | /* |
406 | * start of file | 407 | * start of file |
@@ -452,8 +453,11 @@ ondemand_readahead(struct address_space *mapping, | |||
452 | 453 | ||
453 | /* | 454 | /* |
454 | * sequential cache miss | 455 | * sequential cache miss |
456 | * trivial case: (offset - prev_offset) == 1 | ||
457 | * unaligned reads: (offset - prev_offset) == 0 | ||
455 | */ | 458 | */ |
456 | if (offset - (ra->prev_pos >> PAGE_CACHE_SHIFT) <= 1UL) | 459 | prev_offset = (unsigned long long)ra->prev_pos >> PAGE_CACHE_SHIFT; |
460 | if (offset - prev_offset <= 1UL) | ||
457 | goto initial_readahead; | 461 | goto initial_readahead; |
458 | 462 | ||
459 | /* | 463 | /* |
@@ -569,7 +573,7 @@ static ssize_t | |||
569 | do_readahead(struct address_space *mapping, struct file *filp, | 573 | do_readahead(struct address_space *mapping, struct file *filp, |
570 | pgoff_t index, unsigned long nr) | 574 | pgoff_t index, unsigned long nr) |
571 | { | 575 | { |
572 | if (!mapping || !mapping->a_ops || !mapping->a_ops->readpage) | 576 | if (!mapping || !mapping->a_ops) |
573 | return -EINVAL; | 577 | return -EINVAL; |
574 | 578 | ||
575 | force_page_cache_readahead(mapping, filp, index, nr); | 579 | force_page_cache_readahead(mapping, filp, index, nr); |
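
The ondemand_readahead() hunk above computes prev_offset by shifting the 64-bit prev_pos before comparing, and the new comment spells out the two cases the test catches: strictly sequential reads (offset - prev_offset == 1) and unaligned reads that re-hit the same page (difference == 0). A compact restatement of the check, assuming 4K pages:

#include <stdbool.h>
#include <stdint.h>

#define FAKE_PAGE_SHIFT 12   /* assume 4K pages for the example */

/* Does this page-cache miss look like part of a sequential stream? */
static bool looks_sequential(unsigned long offset, uint64_t prev_pos)
{
        unsigned long prev_offset =
                (unsigned long)(prev_pos >> FAKE_PAGE_SHIFT);

        /* == 1: next page in order;  == 0: unaligned read, same page */
        return offset - prev_offset <= 1UL;
}
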
diff --git a/mm/rmap.c b/mm/rmap.c --- a/mm/rmap.c +++ b/mm/rmap.c | |||
@@ -601,7 +601,7 @@ pte_t *__page_check_address(struct page *page, struct mm_struct *mm, | |||
601 | 601 | ||
602 | if (unlikely(PageHuge(page))) { | 602 | if (unlikely(PageHuge(page))) { |
603 | pte = huge_pte_offset(mm, address); | 603 | pte = huge_pte_offset(mm, address); |
604 | ptl = &mm->page_table_lock; | 604 | ptl = huge_pte_lockptr(page_hstate(page), mm, pte); |
605 | goto check; | 605 | goto check; |
606 | } | 606 | } |
607 | 607 | ||
@@ -665,25 +665,23 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma, | |||
665 | unsigned long *vm_flags) | 665 | unsigned long *vm_flags) |
666 | { | 666 | { |
667 | struct mm_struct *mm = vma->vm_mm; | 667 | struct mm_struct *mm = vma->vm_mm; |
668 | spinlock_t *ptl; | ||
668 | int referenced = 0; | 669 | int referenced = 0; |
669 | 670 | ||
670 | if (unlikely(PageTransHuge(page))) { | 671 | if (unlikely(PageTransHuge(page))) { |
671 | pmd_t *pmd; | 672 | pmd_t *pmd; |
672 | 673 | ||
673 | spin_lock(&mm->page_table_lock); | ||
674 | /* | 674 | /* |
675 | * rmap might return false positives; we must filter | 675 | * rmap might return false positives; we must filter |
676 | * these out using page_check_address_pmd(). | 676 | * these out using page_check_address_pmd(). |
677 | */ | 677 | */ |
678 | pmd = page_check_address_pmd(page, mm, address, | 678 | pmd = page_check_address_pmd(page, mm, address, |
679 | PAGE_CHECK_ADDRESS_PMD_FLAG); | 679 | PAGE_CHECK_ADDRESS_PMD_FLAG, &ptl); |
680 | if (!pmd) { | 680 | if (!pmd) |
681 | spin_unlock(&mm->page_table_lock); | ||
682 | goto out; | 681 | goto out; |
683 | } | ||
684 | 682 | ||
685 | if (vma->vm_flags & VM_LOCKED) { | 683 | if (vma->vm_flags & VM_LOCKED) { |
686 | spin_unlock(&mm->page_table_lock); | 684 | spin_unlock(ptl); |
687 | *mapcount = 0; /* break early from loop */ | 685 | *mapcount = 0; /* break early from loop */ |
688 | *vm_flags |= VM_LOCKED; | 686 | *vm_flags |= VM_LOCKED; |
689 | goto out; | 687 | goto out; |
@@ -692,10 +690,9 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma, | |||
692 | /* go ahead even if the pmd is pmd_trans_splitting() */ | 690 | /* go ahead even if the pmd is pmd_trans_splitting() */ |
693 | if (pmdp_clear_flush_young_notify(vma, address, pmd)) | 691 | if (pmdp_clear_flush_young_notify(vma, address, pmd)) |
694 | referenced++; | 692 | referenced++; |
695 | spin_unlock(&mm->page_table_lock); | 693 | spin_unlock(ptl); |
696 | } else { | 694 | } else { |
697 | pte_t *pte; | 695 | pte_t *pte; |
698 | spinlock_t *ptl; | ||
699 | 696 | ||
700 | /* | 697 | /* |
701 | * rmap might return false positives; we must filter | 698 | * rmap might return false positives; we must filter |
diff --git a/mm/slab.c b/mm/slab.c --- a/mm/slab.c +++ b/mm/slab.c | |||
@@ -3982,7 +3982,7 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit, | |||
3982 | 3982 | ||
3983 | VM_BUG_ON(!mutex_is_locked(&slab_mutex)); | 3983 | VM_BUG_ON(!mutex_is_locked(&slab_mutex)); |
3984 | for_each_memcg_cache_index(i) { | 3984 | for_each_memcg_cache_index(i) { |
3985 | c = cache_from_memcg(cachep, i); | 3985 | c = cache_from_memcg_idx(cachep, i); |
3986 | if (c) | 3986 | if (c) |
3987 | /* return value determined by the parent cache only */ | 3987 | /* return value determined by the parent cache only */ |
3988 | __do_tune_cpucache(c, limit, batchcount, shared, gfp); | 3988 | __do_tune_cpucache(c, limit, batchcount, shared, gfp); |
diff --git a/mm/slab.h b/mm/slab.h --- a/mm/slab.h +++ b/mm/slab.h | |||
@@ -160,7 +160,8 @@ static inline const char *cache_name(struct kmem_cache *s) | |||
160 | return s->name; | 160 | return s->name; |
161 | } | 161 | } |
162 | 162 | ||
163 | static inline struct kmem_cache *cache_from_memcg(struct kmem_cache *s, int idx) | 163 | static inline struct kmem_cache * |
164 | cache_from_memcg_idx(struct kmem_cache *s, int idx) | ||
164 | { | 165 | { |
165 | if (!s->memcg_params) | 166 | if (!s->memcg_params) |
166 | return NULL; | 167 | return NULL; |
@@ -204,7 +205,8 @@ static inline const char *cache_name(struct kmem_cache *s) | |||
204 | return s->name; | 205 | return s->name; |
205 | } | 206 | } |
206 | 207 | ||
207 | static inline struct kmem_cache *cache_from_memcg(struct kmem_cache *s, int idx) | 208 | static inline struct kmem_cache * |
209 | cache_from_memcg_idx(struct kmem_cache *s, int idx) | ||
208 | { | 210 | { |
209 | return NULL; | 211 | return NULL; |
210 | } | 212 | } |
diff --git a/mm/slab_common.c b/mm/slab_common.c index a3443278ce3a..0b7bb399b0e4 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c | |||
@@ -56,6 +56,7 @@ static int kmem_cache_sanity_check(struct mem_cgroup *memcg, const char *name, | |||
56 | continue; | 56 | continue; |
57 | } | 57 | } |
58 | 58 | ||
59 | #if !defined(CONFIG_SLUB) || !defined(CONFIG_SLUB_DEBUG_ON) | ||
59 | /* | 60 | /* |
60 | * For simplicity, we won't check this in the list of memcg | 61 | * For simplicity, we won't check this in the list of memcg |
61 | * caches. We have control over memcg naming, and if there | 62 | * caches. We have control over memcg naming, and if there |
@@ -69,6 +70,7 @@ static int kmem_cache_sanity_check(struct mem_cgroup *memcg, const char *name, | |||
69 | s = NULL; | 70 | s = NULL; |
70 | return -EINVAL; | 71 | return -EINVAL; |
71 | } | 72 | } |
73 | #endif | ||
72 | } | 74 | } |
73 | 75 | ||
74 | WARN_ON(strchr(name, ' ')); /* It confuses parsers */ | 76 | WARN_ON(strchr(name, ' ')); /* It confuses parsers */ |
@@ -569,7 +571,7 @@ memcg_accumulate_slabinfo(struct kmem_cache *s, struct slabinfo *info) | |||
569 | return; | 571 | return; |
570 | 572 | ||
571 | for_each_memcg_cache_index(i) { | 573 | for_each_memcg_cache_index(i) { |
572 | c = cache_from_memcg(s, i); | 574 | c = cache_from_memcg_idx(s, i); |
573 | if (!c) | 575 | if (!c) |
574 | continue; | 576 | continue; |
575 | 577 | ||
diff --git a/mm/slub.c b/mm/slub.c --- a/mm/slub.c +++ b/mm/slub.c | |||
@@ -955,7 +955,7 @@ static inline void slab_free_hook(struct kmem_cache *s, void *x) | |||
955 | kmemleak_free_recursive(x, s->flags); | 955 | kmemleak_free_recursive(x, s->flags); |
956 | 956 | ||
957 | /* | 957 | /* |
958 | * Trouble is that we may no longer disable interupts in the fast path | 958 | * Trouble is that we may no longer disable interrupts in the fast path |
959 | * So in order to make the debug calls that expect irqs to be | 959 | * So in order to make the debug calls that expect irqs to be |
960 | * disabled we need to disable interrupts temporarily. | 960 | * disabled we need to disable interrupts temporarily. |
961 | */ | 961 | */ |
@@ -4983,7 +4983,7 @@ static ssize_t slab_attr_store(struct kobject *kobj, | |||
4983 | * through the descendants with best-effort propagation. | 4983 | * through the descendants with best-effort propagation. |
4984 | */ | 4984 | */ |
4985 | for_each_memcg_cache_index(i) { | 4985 | for_each_memcg_cache_index(i) { |
4986 | struct kmem_cache *c = cache_from_memcg(s, i); | 4986 | struct kmem_cache *c = cache_from_memcg_idx(s, i); |
4987 | if (c) | 4987 | if (c) |
4988 | attribute->store(c, buf, len); | 4988 | attribute->store(c, buf, len); |
4989 | } | 4989 | } |
diff --git a/mm/sparse.c b/mm/sparse.c index 4ac1d7ef548f..8cc7be0e9590 100644 --- a/mm/sparse.c +++ b/mm/sparse.c | |||
@@ -590,33 +590,32 @@ void __init sparse_init(void) | |||
590 | 590 | ||
591 | #ifdef CONFIG_MEMORY_HOTPLUG | 591 | #ifdef CONFIG_MEMORY_HOTPLUG |
592 | #ifdef CONFIG_SPARSEMEM_VMEMMAP | 592 | #ifdef CONFIG_SPARSEMEM_VMEMMAP |
593 | static inline struct page *kmalloc_section_memmap(unsigned long pnum, int nid, | 593 | static inline struct page *kmalloc_section_memmap(unsigned long pnum, int nid) |
594 | unsigned long nr_pages) | ||
595 | { | 594 | { |
596 | /* This will make the necessary allocations eventually. */ | 595 | /* This will make the necessary allocations eventually. */ |
597 | return sparse_mem_map_populate(pnum, nid); | 596 | return sparse_mem_map_populate(pnum, nid); |
598 | } | 597 | } |
599 | static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages) | 598 | static void __kfree_section_memmap(struct page *memmap) |
600 | { | 599 | { |
601 | unsigned long start = (unsigned long)memmap; | 600 | unsigned long start = (unsigned long)memmap; |
602 | unsigned long end = (unsigned long)(memmap + nr_pages); | 601 | unsigned long end = (unsigned long)(memmap + PAGES_PER_SECTION); |
603 | 602 | ||
604 | vmemmap_free(start, end); | 603 | vmemmap_free(start, end); |
605 | } | 604 | } |
606 | #ifdef CONFIG_MEMORY_HOTREMOVE | 605 | #ifdef CONFIG_MEMORY_HOTREMOVE |
607 | static void free_map_bootmem(struct page *memmap, unsigned long nr_pages) | 606 | static void free_map_bootmem(struct page *memmap) |
608 | { | 607 | { |
609 | unsigned long start = (unsigned long)memmap; | 608 | unsigned long start = (unsigned long)memmap; |
610 | unsigned long end = (unsigned long)(memmap + nr_pages); | 609 | unsigned long end = (unsigned long)(memmap + PAGES_PER_SECTION); |
611 | 610 | ||
612 | vmemmap_free(start, end); | 611 | vmemmap_free(start, end); |
613 | } | 612 | } |
614 | #endif /* CONFIG_MEMORY_HOTREMOVE */ | 613 | #endif /* CONFIG_MEMORY_HOTREMOVE */ |
615 | #else | 614 | #else |
616 | static struct page *__kmalloc_section_memmap(unsigned long nr_pages) | 615 | static struct page *__kmalloc_section_memmap(void) |
617 | { | 616 | { |
618 | struct page *page, *ret; | 617 | struct page *page, *ret; |
619 | unsigned long memmap_size = sizeof(struct page) * nr_pages; | 618 | unsigned long memmap_size = sizeof(struct page) * PAGES_PER_SECTION; |
620 | 619 | ||
621 | page = alloc_pages(GFP_KERNEL|__GFP_NOWARN, get_order(memmap_size)); | 620 | page = alloc_pages(GFP_KERNEL|__GFP_NOWARN, get_order(memmap_size)); |
622 | if (page) | 621 | if (page) |
@@ -634,28 +633,30 @@ got_map_ptr: | |||
634 | return ret; | 633 | return ret; |
635 | } | 634 | } |
636 | 635 | ||
637 | static inline struct page *kmalloc_section_memmap(unsigned long pnum, int nid, | 636 | static inline struct page *kmalloc_section_memmap(unsigned long pnum, int nid) |
638 | unsigned long nr_pages) | ||
639 | { | 637 | { |
640 | return __kmalloc_section_memmap(nr_pages); | 638 | return __kmalloc_section_memmap(); |
641 | } | 639 | } |
642 | 640 | ||
643 | static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages) | 641 | static void __kfree_section_memmap(struct page *memmap) |
644 | { | 642 | { |
645 | if (is_vmalloc_addr(memmap)) | 643 | if (is_vmalloc_addr(memmap)) |
646 | vfree(memmap); | 644 | vfree(memmap); |
647 | else | 645 | else |
648 | free_pages((unsigned long)memmap, | 646 | free_pages((unsigned long)memmap, |
649 | get_order(sizeof(struct page) * nr_pages)); | 647 | get_order(sizeof(struct page) * PAGES_PER_SECTION)); |
650 | } | 648 | } |
651 | 649 | ||
652 | #ifdef CONFIG_MEMORY_HOTREMOVE | 650 | #ifdef CONFIG_MEMORY_HOTREMOVE |
653 | static void free_map_bootmem(struct page *memmap, unsigned long nr_pages) | 651 | static void free_map_bootmem(struct page *memmap) |
654 | { | 652 | { |
655 | unsigned long maps_section_nr, removing_section_nr, i; | 653 | unsigned long maps_section_nr, removing_section_nr, i; |
656 | unsigned long magic; | 654 | unsigned long magic, nr_pages; |
657 | struct page *page = virt_to_page(memmap); | 655 | struct page *page = virt_to_page(memmap); |
658 | 656 | ||
657 | nr_pages = PAGE_ALIGN(PAGES_PER_SECTION * sizeof(struct page)) | ||
658 | >> PAGE_SHIFT; | ||
659 | |||
659 | for (i = 0; i < nr_pages; i++, page++) { | 660 | for (i = 0; i < nr_pages; i++, page++) { |
660 | magic = (unsigned long) page->lru.next; | 661 | magic = (unsigned long) page->lru.next; |
661 | 662 | ||
@@ -684,8 +685,7 @@ static void free_map_bootmem(struct page *memmap, unsigned long nr_pages) | |||
684 | * set. If this is <=0, then that means that the passed-in | 685 | * set. If this is <=0, then that means that the passed-in |
685 | * map was not consumed and must be freed. | 686 | * map was not consumed and must be freed. |
686 | */ | 687 | */ |
687 | int __meminit sparse_add_one_section(struct zone *zone, unsigned long start_pfn, | 688 | int __meminit sparse_add_one_section(struct zone *zone, unsigned long start_pfn) |
688 | int nr_pages) | ||
689 | { | 689 | { |
690 | unsigned long section_nr = pfn_to_section_nr(start_pfn); | 690 | unsigned long section_nr = pfn_to_section_nr(start_pfn); |
691 | struct pglist_data *pgdat = zone->zone_pgdat; | 691 | struct pglist_data *pgdat = zone->zone_pgdat; |
@@ -702,12 +702,12 @@ int __meminit sparse_add_one_section(struct zone *zone, unsigned long start_pfn, | |||
702 | ret = sparse_index_init(section_nr, pgdat->node_id); | 702 | ret = sparse_index_init(section_nr, pgdat->node_id); |
703 | if (ret < 0 && ret != -EEXIST) | 703 | if (ret < 0 && ret != -EEXIST) |
704 | return ret; | 704 | return ret; |
705 | memmap = kmalloc_section_memmap(section_nr, pgdat->node_id, nr_pages); | 705 | memmap = kmalloc_section_memmap(section_nr, pgdat->node_id); |
706 | if (!memmap) | 706 | if (!memmap) |
707 | return -ENOMEM; | 707 | return -ENOMEM; |
708 | usemap = __kmalloc_section_usemap(); | 708 | usemap = __kmalloc_section_usemap(); |
709 | if (!usemap) { | 709 | if (!usemap) { |
710 | __kfree_section_memmap(memmap, nr_pages); | 710 | __kfree_section_memmap(memmap); |
711 | return -ENOMEM; | 711 | return -ENOMEM; |
712 | } | 712 | } |
713 | 713 | ||
@@ -719,7 +719,7 @@ int __meminit sparse_add_one_section(struct zone *zone, unsigned long start_pfn, | |||
719 | goto out; | 719 | goto out; |
720 | } | 720 | } |
721 | 721 | ||
722 | memset(memmap, 0, sizeof(struct page) * nr_pages); | 722 | memset(memmap, 0, sizeof(struct page) * PAGES_PER_SECTION); |
723 | 723 | ||
724 | ms->section_mem_map |= SECTION_MARKED_PRESENT; | 724 | ms->section_mem_map |= SECTION_MARKED_PRESENT; |
725 | 725 | ||
@@ -729,7 +729,7 @@ out: | |||
729 | pgdat_resize_unlock(pgdat, &flags); | 729 | pgdat_resize_unlock(pgdat, &flags); |
730 | if (ret <= 0) { | 730 | if (ret <= 0) { |
731 | kfree(usemap); | 731 | kfree(usemap); |
732 | __kfree_section_memmap(memmap, nr_pages); | 732 | __kfree_section_memmap(memmap); |
733 | } | 733 | } |
734 | return ret; | 734 | return ret; |
735 | } | 735 | } |
@@ -759,7 +759,6 @@ static inline void clear_hwpoisoned_pages(struct page *memmap, int nr_pages) | |||
759 | static void free_section_usemap(struct page *memmap, unsigned long *usemap) | 759 | static void free_section_usemap(struct page *memmap, unsigned long *usemap) |
760 | { | 760 | { |
761 | struct page *usemap_page; | 761 | struct page *usemap_page; |
762 | unsigned long nr_pages; | ||
763 | 762 | ||
764 | if (!usemap) | 763 | if (!usemap) |
765 | return; | 764 | return; |
@@ -771,7 +770,7 @@ static void free_section_usemap(struct page *memmap, unsigned long *usemap) | |||
771 | if (PageSlab(usemap_page) || PageCompound(usemap_page)) { | 770 | if (PageSlab(usemap_page) || PageCompound(usemap_page)) { |
772 | kfree(usemap); | 771 | kfree(usemap); |
773 | if (memmap) | 772 | if (memmap) |
774 | __kfree_section_memmap(memmap, PAGES_PER_SECTION); | 773 | __kfree_section_memmap(memmap); |
775 | return; | 774 | return; |
776 | } | 775 | } |
777 | 776 | ||
@@ -780,12 +779,8 @@ static void free_section_usemap(struct page *memmap, unsigned long *usemap) | |||
780 | * on the section which has pgdat at boot time. Just keep it as is now. | 779 | * on the section which has pgdat at boot time. Just keep it as is now. |
781 | */ | 780 | */ |
782 | 781 | ||
783 | if (memmap) { | 782 | if (memmap) |
784 | nr_pages = PAGE_ALIGN(PAGES_PER_SECTION * sizeof(struct page)) | 783 | free_map_bootmem(memmap); |
785 | >> PAGE_SHIFT; | ||
786 | |||
787 | free_map_bootmem(memmap, nr_pages); | ||
788 | } | ||
789 | } | 784 | } |
790 | 785 | ||
791 | void sparse_remove_one_section(struct zone *zone, struct mem_section *ms) | 786 | void sparse_remove_one_section(struct zone *zone, struct mem_section *ms) |
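
With the nr_pages argument dropped throughout mm/sparse.c above, free_map_bootmem() recomputes internally how many backing pages one section's memmap occupies: PAGE_ALIGN(PAGES_PER_SECTION * sizeof(struct page)) >> PAGE_SHIFT. A quick standalone check of that arithmetic with assumed x86-64-ish values (32768 pages per 128MB section, 64-byte struct page, 4K pages):

#include <stdio.h>

#define FAKE_PAGE_SIZE         4096UL
#define FAKE_PAGES_PER_SECTION 32768UL   /* 128MB section / 4K pages */
#define FAKE_STRUCT_PAGE_SIZE  64UL      /* typical sizeof(struct page) */

#define FAKE_PAGE_ALIGN(x) \
        (((x) + FAKE_PAGE_SIZE - 1) & ~(FAKE_PAGE_SIZE - 1))

int main(void)
{
        unsigned long bytes = FAKE_PAGES_PER_SECTION * FAKE_STRUCT_PAGE_SIZE;
        unsigned long nr_pages = FAKE_PAGE_ALIGN(bytes) / FAKE_PAGE_SIZE;

        /* 32768 * 64 bytes = 2MB of struct page, i.e. 512 backing pages */
        printf("memmap pages per section: %lu\n", nr_pages);
        return 0;
}
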
diff --git a/mm/swap.c b/mm/swap.c --- a/mm/swap.c +++ b/mm/swap.c | |||
@@ -934,7 +934,8 @@ void __init swap_setup(void) | |||
934 | #ifdef CONFIG_SWAP | 934 | #ifdef CONFIG_SWAP |
935 | int i; | 935 | int i; |
936 | 936 | ||
937 | bdi_init(swapper_spaces[0].backing_dev_info); | 937 | if (bdi_init(swapper_spaces[0].backing_dev_info)) |
938 | panic("Failed to init swap bdi"); | ||
938 | for (i = 0; i < MAX_SWAPFILES; i++) { | 939 | for (i = 0; i < MAX_SWAPFILES; i++) { |
939 | spin_lock_init(&swapper_spaces[i].tree_lock); | 940 | spin_lock_init(&swapper_spaces[i].tree_lock); |
940 | INIT_LIST_HEAD(&swapper_spaces[i].i_mmap_nonlinear); | 941 | INIT_LIST_HEAD(&swapper_spaces[i].i_mmap_nonlinear); |
diff --git a/mm/swapfile.c b/mm/swapfile.c index 3963fc24fcc1..612a7c9795f6 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c | |||
@@ -707,7 +707,7 @@ noswap: | |||
707 | return (swp_entry_t) {0}; | 707 | return (swp_entry_t) {0}; |
708 | } | 708 | } |
709 | 709 | ||
710 | /* The only caller of this function is now susupend routine */ | 710 | /* The only caller of this function is now suspend routine */ |
711 | swp_entry_t get_swap_page_of_type(int type) | 711 | swp_entry_t get_swap_page_of_type(int type) |
712 | { | 712 | { |
713 | struct swap_info_struct *si; | 713 | struct swap_info_struct *si; |
@@ -845,7 +845,7 @@ static unsigned char swap_entry_free(struct swap_info_struct *p, | |||
845 | } | 845 | } |
846 | 846 | ||
847 | /* | 847 | /* |
848 | * Caller has made sure that the swapdevice corresponding to entry | 848 | * Caller has made sure that the swap device corresponding to entry |
849 | * is still around or has not been recycled. | 849 | * is still around or has not been recycled. |
850 | */ | 850 | */ |
851 | void swap_free(swp_entry_t entry) | 851 | void swap_free(swp_entry_t entry) |
@@ -947,7 +947,7 @@ int try_to_free_swap(struct page *page) | |||
947 | * original page might be freed under memory pressure, then | 947 | * original page might be freed under memory pressure, then |
948 | * later read back in from swap, now with the wrong data. | 948 | * later read back in from swap, now with the wrong data. |
949 | * | 949 | * |
950 | * Hibration suspends storage while it is writing the image | 950 | * Hibernation suspends storage while it is writing the image |
951 | * to disk so check that here. | 951 | * to disk so check that here. |
952 | */ | 952 | */ |
953 | if (pm_suspended_storage()) | 953 | if (pm_suspended_storage()) |
@@ -1179,7 +1179,7 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, | |||
1179 | * some architectures (e.g. x86_32 with PAE) we might catch a glimpse | 1179 | * some architectures (e.g. x86_32 with PAE) we might catch a glimpse |
1180 | * of unmatched parts which look like swp_pte, so unuse_pte must | 1180 | * of unmatched parts which look like swp_pte, so unuse_pte must |
1181 | * recheck under pte lock. Scanning without pte lock lets it be | 1181 | * recheck under pte lock. Scanning without pte lock lets it be |
1182 | * preemptible whenever CONFIG_PREEMPT but not CONFIG_HIGHPTE. | 1182 | * preemptable whenever CONFIG_PREEMPT but not CONFIG_HIGHPTE. |
1183 | */ | 1183 | */ |
1184 | pte = pte_offset_map(pmd, addr); | 1184 | pte = pte_offset_map(pmd, addr); |
1185 | do { | 1185 | do { |
@@ -1824,6 +1824,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) | |||
1824 | struct filename *pathname; | 1824 | struct filename *pathname; |
1825 | int i, type, prev; | 1825 | int i, type, prev; |
1826 | int err; | 1826 | int err; |
1827 | unsigned int old_block_size; | ||
1827 | 1828 | ||
1828 | if (!capable(CAP_SYS_ADMIN)) | 1829 | if (!capable(CAP_SYS_ADMIN)) |
1829 | return -EPERM; | 1830 | return -EPERM; |
@@ -1914,6 +1915,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) | |||
1914 | } | 1915 | } |
1915 | 1916 | ||
1916 | swap_file = p->swap_file; | 1917 | swap_file = p->swap_file; |
1918 | old_block_size = p->old_block_size; | ||
1917 | p->swap_file = NULL; | 1919 | p->swap_file = NULL; |
1918 | p->max = 0; | 1920 | p->max = 0; |
1919 | swap_map = p->swap_map; | 1921 | swap_map = p->swap_map; |
@@ -1922,23 +1924,23 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) | |||
1922 | p->cluster_info = NULL; | 1924 | p->cluster_info = NULL; |
1923 | p->flags = 0; | 1925 | p->flags = 0; |
1924 | frontswap_map = frontswap_map_get(p); | 1926 | frontswap_map = frontswap_map_get(p); |
1925 | frontswap_map_set(p, NULL); | ||
1926 | spin_unlock(&p->lock); | 1927 | spin_unlock(&p->lock); |
1927 | spin_unlock(&swap_lock); | 1928 | spin_unlock(&swap_lock); |
1928 | frontswap_invalidate_area(type); | 1929 | frontswap_invalidate_area(type); |
1930 | frontswap_map_set(p, NULL); | ||
1929 | mutex_unlock(&swapon_mutex); | 1931 | mutex_unlock(&swapon_mutex); |
1930 | free_percpu(p->percpu_cluster); | 1932 | free_percpu(p->percpu_cluster); |
1931 | p->percpu_cluster = NULL; | 1933 | p->percpu_cluster = NULL; |
1932 | vfree(swap_map); | 1934 | vfree(swap_map); |
1933 | vfree(cluster_info); | 1935 | vfree(cluster_info); |
1934 | vfree(frontswap_map); | 1936 | vfree(frontswap_map); |
1935 | /* Destroy swap account informatin */ | 1937 | /* Destroy swap account information */ |
1936 | swap_cgroup_swapoff(type); | 1938 | swap_cgroup_swapoff(type); |
1937 | 1939 | ||
1938 | inode = mapping->host; | 1940 | inode = mapping->host; |
1939 | if (S_ISBLK(inode->i_mode)) { | 1941 | if (S_ISBLK(inode->i_mode)) { |
1940 | struct block_device *bdev = I_BDEV(inode); | 1942 | struct block_device *bdev = I_BDEV(inode); |
1941 | set_blocksize(bdev, p->old_block_size); | 1943 | set_blocksize(bdev, old_block_size); |
1942 | blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); | 1944 | blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); |
1943 | } else { | 1945 | } else { |
1944 | mutex_lock(&inode->i_mutex); | 1946 | mutex_lock(&inode->i_mutex); |
@@ -2784,8 +2786,8 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask) | |||
2784 | 2786 | ||
2785 | /* | 2787 | /* |
2786 | * We are fortunate that although vmalloc_to_page uses pte_offset_map, | 2788 | * We are fortunate that although vmalloc_to_page uses pte_offset_map, |
2787 | * no architecture is using highmem pages for kernel pagetables: so it | 2789 | * no architecture is using highmem pages for kernel page tables: so it |
2788 | * will not corrupt the GFP_ATOMIC caller's atomic pagetable kmaps. | 2790 | * will not corrupt the GFP_ATOMIC caller's atomic page table kmaps. |
2789 | */ | 2791 | */ |
2790 | head = vmalloc_to_page(si->swap_map + offset); | 2792 | head = vmalloc_to_page(si->swap_map + offset); |
2791 | offset &= ~PAGE_MASK; | 2793 | offset &= ~PAGE_MASK; |
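The swapoff hunk above copies p->old_block_size into a local before the swap_info slot is cleared and the locks are released, so the later set_blocksize() call restores the value that belonged to this device even if the slot is reused in the meantime. Below is a standalone sketch of that snapshot-under-lock pattern; the swap_slot struct, teardown() and restore_blocksize() names are illustrative assumptions, not kernel code.

/* Snapshot the fields you still need before the structure can be reused. */
#include <pthread.h>
#include <stdio.h>

struct swap_slot {
    pthread_mutex_t lock;
    unsigned int old_block_size;
    int in_use;
};

static void restore_blocksize(unsigned int block_size)
{
    printf("restoring block size %u\n", block_size);
}

static void teardown(struct swap_slot *p)
{
    unsigned int old_block_size;

    pthread_mutex_lock(&p->lock);
    old_block_size = p->old_block_size;   /* snapshot before the slot is reused */
    p->old_block_size = 0;
    p->in_use = 0;
    pthread_mutex_unlock(&p->lock);

    /* After unlocking, another thread may reuse *p; the local copy still
     * holds the value that belonged to the device being torn down. */
    restore_blocksize(old_block_size);
}

int main(void)
{
    struct swap_slot s = { PTHREAD_MUTEX_INITIALIZER, 4096, 1 };
    teardown(&s);
    return 0;
}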
@@ -7,6 +7,9 @@ | |||
7 | #include <linux/security.h> | 7 | #include <linux/security.h> |
8 | #include <linux/swap.h> | 8 | #include <linux/swap.h> |
9 | #include <linux/swapops.h> | 9 | #include <linux/swapops.h> |
10 | #include <linux/mman.h> | ||
11 | #include <linux/hugetlb.h> | ||
12 | |||
10 | #include <asm/uaccess.h> | 13 | #include <asm/uaccess.h> |
11 | 14 | ||
12 | #include "internal.h" | 15 | #include "internal.h" |
@@ -398,6 +401,16 @@ struct address_space *page_mapping(struct page *page) | |||
398 | return mapping; | 401 | return mapping; |
399 | } | 402 | } |
400 | 403 | ||
404 | /* | ||
405 | * Committed memory limit enforced when OVERCOMMIT_NEVER policy is used | ||
406 | */ | ||
407 | unsigned long vm_commit_limit(void) | ||
408 | { | ||
409 | return ((totalram_pages - hugetlb_total_pages()) | ||
410 | * sysctl_overcommit_ratio / 100) + total_swap_pages; | ||
411 | } | ||
412 | |||
413 | |||
401 | /* Tracepoints definitions. */ | 414 | /* Tracepoints definitions. */ |
402 | EXPORT_TRACEPOINT_SYMBOL(kmalloc); | 415 | EXPORT_TRACEPOINT_SYMBOL(kmalloc); |
403 | EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc); | 416 | EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc); |
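The new vm_commit_limit() added to mm/util.c above computes the committed-memory ceiling used when OVERCOMMIT_NEVER is in effect: hugetlb pages are excluded before the overcommit ratio is applied, and swap is added in full. A standalone sketch of the same arithmetic, with sample numbers that are assumptions for illustration only:

#include <stdio.h>

static unsigned long commit_limit(unsigned long totalram_pages,
                                  unsigned long hugetlb_pages,
                                  unsigned long overcommit_ratio,
                                  unsigned long total_swap_pages)
{
    /* Hugetlb pages are reserved separately, so they do not count toward
     * the ratio-scaled portion; swap contributes in full. */
    return (totalram_pages - hugetlb_pages) * overcommit_ratio / 100
           + total_swap_pages;
}

int main(void)
{
    /* e.g. 4 GiB RAM, 512 MiB hugetlb, ratio 50, 2 GiB swap, 4 KiB pages */
    unsigned long limit = commit_limit(1048576, 131072, 50, 524288);
    printf("commit limit: %lu pages\n", limit);
    return 0;
}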
diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 107454312d5e..0fdf96803c5b 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c | |||
@@ -359,6 +359,12 @@ static struct vmap_area *alloc_vmap_area(unsigned long size, | |||
359 | if (unlikely(!va)) | 359 | if (unlikely(!va)) |
360 | return ERR_PTR(-ENOMEM); | 360 | return ERR_PTR(-ENOMEM); |
361 | 361 | ||
362 | /* | ||
363 | * Only scan the relevant parts containing pointers to other objects | ||
364 | * to avoid false negatives. | ||
365 | */ | ||
366 | kmemleak_scan_area(&va->rb_node, SIZE_MAX, gfp_mask & GFP_RECLAIM_MASK); | ||
367 | |||
362 | retry: | 368 | retry: |
363 | spin_lock(&vmap_area_lock); | 369 | spin_lock(&vmap_area_lock); |
364 | /* | 370 | /* |
@@ -1546,7 +1552,7 @@ static void *__vmalloc_node(unsigned long size, unsigned long align, | |||
1546 | gfp_t gfp_mask, pgprot_t prot, | 1552 | gfp_t gfp_mask, pgprot_t prot, |
1547 | int node, const void *caller); | 1553 | int node, const void *caller); |
1548 | static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, | 1554 | static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, |
1549 | pgprot_t prot, int node, const void *caller) | 1555 | pgprot_t prot, int node) |
1550 | { | 1556 | { |
1551 | const int order = 0; | 1557 | const int order = 0; |
1552 | struct page **pages; | 1558 | struct page **pages; |
@@ -1560,13 +1566,12 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, | |||
1560 | /* Please note that the recursion is strictly bounded. */ | 1566 | /* Please note that the recursion is strictly bounded. */ |
1561 | if (array_size > PAGE_SIZE) { | 1567 | if (array_size > PAGE_SIZE) { |
1562 | pages = __vmalloc_node(array_size, 1, nested_gfp|__GFP_HIGHMEM, | 1568 | pages = __vmalloc_node(array_size, 1, nested_gfp|__GFP_HIGHMEM, |
1563 | PAGE_KERNEL, node, caller); | 1569 | PAGE_KERNEL, node, area->caller); |
1564 | area->flags |= VM_VPAGES; | 1570 | area->flags |= VM_VPAGES; |
1565 | } else { | 1571 | } else { |
1566 | pages = kmalloc_node(array_size, nested_gfp, node); | 1572 | pages = kmalloc_node(array_size, nested_gfp, node); |
1567 | } | 1573 | } |
1568 | area->pages = pages; | 1574 | area->pages = pages; |
1569 | area->caller = caller; | ||
1570 | if (!area->pages) { | 1575 | if (!area->pages) { |
1571 | remove_vm_area(area->addr); | 1576 | remove_vm_area(area->addr); |
1572 | kfree(area); | 1577 | kfree(area); |
@@ -1577,7 +1582,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, | |||
1577 | struct page *page; | 1582 | struct page *page; |
1578 | gfp_t tmp_mask = gfp_mask | __GFP_NOWARN; | 1583 | gfp_t tmp_mask = gfp_mask | __GFP_NOWARN; |
1579 | 1584 | ||
1580 | if (node < 0) | 1585 | if (node == NUMA_NO_NODE) |
1581 | page = alloc_page(tmp_mask); | 1586 | page = alloc_page(tmp_mask); |
1582 | else | 1587 | else |
1583 | page = alloc_pages_node(node, tmp_mask, order); | 1588 | page = alloc_pages_node(node, tmp_mask, order); |
@@ -1634,9 +1639,9 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align, | |||
1634 | if (!area) | 1639 | if (!area) |
1635 | goto fail; | 1640 | goto fail; |
1636 | 1641 | ||
1637 | addr = __vmalloc_area_node(area, gfp_mask, prot, node, caller); | 1642 | addr = __vmalloc_area_node(area, gfp_mask, prot, node); |
1638 | if (!addr) | 1643 | if (!addr) |
1639 | goto fail; | 1644 | return NULL; |
1640 | 1645 | ||
1641 | /* | 1646 | /* |
1642 | * In this function, newly allocated vm_struct has VM_UNINITIALIZED | 1647 | * In this function, newly allocated vm_struct has VM_UNINITIALIZED |
@@ -1646,11 +1651,11 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align, | |||
1646 | clear_vm_uninitialized_flag(area); | 1651 | clear_vm_uninitialized_flag(area); |
1647 | 1652 | ||
1648 | /* | 1653 | /* |
1649 | * A ref_count = 3 is needed because the vm_struct and vmap_area | 1654 | * A ref_count = 2 is needed because vm_struct allocated in |
1650 | * structures allocated in the __get_vm_area_node() function contain | 1655 | * __get_vm_area_node() contains a reference to the virtual address of |
1651 | * references to the virtual address of the vmalloc'ed block. | 1656 | * the vmalloc'ed block. |
1652 | */ | 1657 | */ |
1653 | kmemleak_alloc(addr, real_size, 3, gfp_mask); | 1658 | kmemleak_alloc(addr, real_size, 2, gfp_mask); |
1654 | 1659 | ||
1655 | return addr; | 1660 | return addr; |
1656 | 1661 | ||
@@ -2563,6 +2568,11 @@ static void show_numa_info(struct seq_file *m, struct vm_struct *v) | |||
2563 | if (!counters) | 2568 | if (!counters) |
2564 | return; | 2569 | return; |
2565 | 2570 | ||
2571 | /* Pair with smp_wmb() in clear_vm_uninitialized_flag() */ | ||
2572 | smp_rmb(); | ||
2573 | if (v->flags & VM_UNINITIALIZED) | ||
2574 | return; | ||
2575 | |||
2566 | memset(counters, 0, nr_node_ids * sizeof(unsigned int)); | 2576 | memset(counters, 0, nr_node_ids * sizeof(unsigned int)); |
2567 | 2577 | ||
2568 | for (nr = 0; nr < v->nr_pages; nr++) | 2578 | for (nr = 0; nr < v->nr_pages; nr++) |
@@ -2579,23 +2589,15 @@ static int s_show(struct seq_file *m, void *p) | |||
2579 | struct vmap_area *va = p; | 2589 | struct vmap_area *va = p; |
2580 | struct vm_struct *v; | 2590 | struct vm_struct *v; |
2581 | 2591 | ||
2582 | if (va->flags & (VM_LAZY_FREE | VM_LAZY_FREEING)) | 2592 | /* |
2593 | * s_show can encounter a race with remove_vm_area(); !VM_VM_AREA | ||
2594 | * means the vmap area is being torn down or is a vm_map_ram allocation. | ||
2595 | */ | ||
2596 | if (!(va->flags & VM_VM_AREA)) | ||
2583 | return 0; | 2597 | return 0; |
2584 | 2598 | ||
2585 | if (!(va->flags & VM_VM_AREA)) { | ||
2586 | seq_printf(m, "0x%pK-0x%pK %7ld vm_map_ram\n", | ||
2587 | (void *)va->va_start, (void *)va->va_end, | ||
2588 | va->va_end - va->va_start); | ||
2589 | return 0; | ||
2590 | } | ||
2591 | |||
2592 | v = va->vm; | 2599 | v = va->vm; |
2593 | 2600 | ||
2594 | /* Pair with smp_wmb() in clear_vm_uninitialized_flag() */ | ||
2595 | smp_rmb(); | ||
2596 | if (v->flags & VM_UNINITIALIZED) | ||
2597 | return 0; | ||
2598 | |||
2599 | seq_printf(m, "0x%pK-0x%pK %7ld", | 2601 | seq_printf(m, "0x%pK-0x%pK %7ld", |
2600 | v->addr, v->addr + v->size, v->size); | 2602 | v->addr, v->addr + v->size, v->size); |
2601 | 2603 | ||
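The vmalloc.c hunks above move the VM_UNINITIALIZED check (and its smp_rmb(), pairing with the smp_wmb() in clear_vm_uninitialized_flag()) into show_numa_info(), so readers skip a vm_struct whose fields are not yet published. A standalone sketch of that publish/observe pattern, written with C11 acquire/release atomics instead of kernel barriers; the area struct is an illustrative assumption.

#include <stdatomic.h>
#include <stdio.h>

struct area {
    unsigned long nr_pages;        /* payload written by the publisher */
    atomic_int    uninitialized;   /* 1 until the payload is complete */
};

static void publish(struct area *a, unsigned long nr_pages)
{
    a->nr_pages = nr_pages;
    /* release: the payload store above is visible before the flag clears */
    atomic_store_explicit(&a->uninitialized, 0, memory_order_release);
}

static void show(const struct area *a)
{
    /* acquire: pairs with the release store in publish() */
    if (atomic_load_explicit(&a->uninitialized, memory_order_acquire))
        return;                    /* not fully set up yet, skip it */
    printf("nr_pages = %lu\n", a->nr_pages);
}

int main(void)
{
    struct area a = { .nr_pages = 0 };
    atomic_init(&a.uninitialized, 1);
    show(&a);                      /* skipped: still uninitialized */
    publish(&a, 42);
    show(&a);                      /* now prints the payload */
    return 0;
}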
diff --git a/mm/vmscan.c b/mm/vmscan.c index 8ed1b775bdc9..eea668d9cff6 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
@@ -48,6 +48,7 @@ | |||
48 | #include <asm/div64.h> | 48 | #include <asm/div64.h> |
49 | 49 | ||
50 | #include <linux/swapops.h> | 50 | #include <linux/swapops.h> |
51 | #include <linux/balloon_compaction.h> | ||
51 | 52 | ||
52 | #include "internal.h" | 53 | #include "internal.h" |
53 | 54 | ||
@@ -139,23 +140,11 @@ static bool global_reclaim(struct scan_control *sc) | |||
139 | { | 140 | { |
140 | return !sc->target_mem_cgroup; | 141 | return !sc->target_mem_cgroup; |
141 | } | 142 | } |
142 | |||
143 | static bool mem_cgroup_should_soft_reclaim(struct scan_control *sc) | ||
144 | { | ||
145 | struct mem_cgroup *root = sc->target_mem_cgroup; | ||
146 | return !mem_cgroup_disabled() && | ||
147 | mem_cgroup_soft_reclaim_eligible(root, root) != SKIP_TREE; | ||
148 | } | ||
149 | #else | 143 | #else |
150 | static bool global_reclaim(struct scan_control *sc) | 144 | static bool global_reclaim(struct scan_control *sc) |
151 | { | 145 | { |
152 | return true; | 146 | return true; |
153 | } | 147 | } |
154 | |||
155 | static bool mem_cgroup_should_soft_reclaim(struct scan_control *sc) | ||
156 | { | ||
157 | return false; | ||
158 | } | ||
159 | #endif | 148 | #endif |
160 | 149 | ||
161 | unsigned long zone_reclaimable_pages(struct zone *zone) | 150 | unsigned long zone_reclaimable_pages(struct zone *zone) |
@@ -222,6 +211,7 @@ void unregister_shrinker(struct shrinker *shrinker) | |||
222 | down_write(&shrinker_rwsem); | 211 | down_write(&shrinker_rwsem); |
223 | list_del(&shrinker->list); | 212 | list_del(&shrinker->list); |
224 | up_write(&shrinker_rwsem); | 213 | up_write(&shrinker_rwsem); |
214 | kfree(shrinker->nr_deferred); | ||
225 | } | 215 | } |
226 | EXPORT_SYMBOL(unregister_shrinker); | 216 | EXPORT_SYMBOL(unregister_shrinker); |
227 | 217 | ||
@@ -1125,7 +1115,8 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone, | |||
1125 | LIST_HEAD(clean_pages); | 1115 | LIST_HEAD(clean_pages); |
1126 | 1116 | ||
1127 | list_for_each_entry_safe(page, next, page_list, lru) { | 1117 | list_for_each_entry_safe(page, next, page_list, lru) { |
1128 | if (page_is_file_cache(page) && !PageDirty(page)) { | 1118 | if (page_is_file_cache(page) && !PageDirty(page) && |
1119 | !isolated_balloon_page(page)) { | ||
1129 | ClearPageActive(page); | 1120 | ClearPageActive(page); |
1130 | list_move(&page->lru, &clean_pages); | 1121 | list_move(&page->lru, &clean_pages); |
1131 | } | 1122 | } |
@@ -2176,11 +2167,9 @@ static inline bool should_continue_reclaim(struct zone *zone, | |||
2176 | } | 2167 | } |
2177 | } | 2168 | } |
2178 | 2169 | ||
2179 | static int | 2170 | static void shrink_zone(struct zone *zone, struct scan_control *sc) |
2180 | __shrink_zone(struct zone *zone, struct scan_control *sc, bool soft_reclaim) | ||
2181 | { | 2171 | { |
2182 | unsigned long nr_reclaimed, nr_scanned; | 2172 | unsigned long nr_reclaimed, nr_scanned; |
2183 | int groups_scanned = 0; | ||
2184 | 2173 | ||
2185 | do { | 2174 | do { |
2186 | struct mem_cgroup *root = sc->target_mem_cgroup; | 2175 | struct mem_cgroup *root = sc->target_mem_cgroup; |
@@ -2188,17 +2177,15 @@ __shrink_zone(struct zone *zone, struct scan_control *sc, bool soft_reclaim) | |||
2188 | .zone = zone, | 2177 | .zone = zone, |
2189 | .priority = sc->priority, | 2178 | .priority = sc->priority, |
2190 | }; | 2179 | }; |
2191 | struct mem_cgroup *memcg = NULL; | 2180 | struct mem_cgroup *memcg; |
2192 | mem_cgroup_iter_filter filter = (soft_reclaim) ? | ||
2193 | mem_cgroup_soft_reclaim_eligible : NULL; | ||
2194 | 2181 | ||
2195 | nr_reclaimed = sc->nr_reclaimed; | 2182 | nr_reclaimed = sc->nr_reclaimed; |
2196 | nr_scanned = sc->nr_scanned; | 2183 | nr_scanned = sc->nr_scanned; |
2197 | 2184 | ||
2198 | while ((memcg = mem_cgroup_iter_cond(root, memcg, &reclaim, filter))) { | 2185 | memcg = mem_cgroup_iter(root, NULL, &reclaim); |
2186 | do { | ||
2199 | struct lruvec *lruvec; | 2187 | struct lruvec *lruvec; |
2200 | 2188 | ||
2201 | groups_scanned++; | ||
2202 | lruvec = mem_cgroup_zone_lruvec(zone, memcg); | 2189 | lruvec = mem_cgroup_zone_lruvec(zone, memcg); |
2203 | 2190 | ||
2204 | shrink_lruvec(lruvec, sc); | 2191 | shrink_lruvec(lruvec, sc); |
@@ -2218,7 +2205,8 @@ __shrink_zone(struct zone *zone, struct scan_control *sc, bool soft_reclaim) | |||
2218 | mem_cgroup_iter_break(root, memcg); | 2205 | mem_cgroup_iter_break(root, memcg); |
2219 | break; | 2206 | break; |
2220 | } | 2207 | } |
2221 | } | 2208 | memcg = mem_cgroup_iter(root, memcg, &reclaim); |
2209 | } while (memcg); | ||
2222 | 2210 | ||
2223 | vmpressure(sc->gfp_mask, sc->target_mem_cgroup, | 2211 | vmpressure(sc->gfp_mask, sc->target_mem_cgroup, |
2224 | sc->nr_scanned - nr_scanned, | 2212 | sc->nr_scanned - nr_scanned, |
@@ -2226,37 +2214,6 @@ __shrink_zone(struct zone *zone, struct scan_control *sc, bool soft_reclaim) | |||
2226 | 2214 | ||
2227 | } while (should_continue_reclaim(zone, sc->nr_reclaimed - nr_reclaimed, | 2215 | } while (should_continue_reclaim(zone, sc->nr_reclaimed - nr_reclaimed, |
2228 | sc->nr_scanned - nr_scanned, sc)); | 2216 | sc->nr_scanned - nr_scanned, sc)); |
2229 | |||
2230 | return groups_scanned; | ||
2231 | } | ||
2232 | |||
2233 | |||
2234 | static void shrink_zone(struct zone *zone, struct scan_control *sc) | ||
2235 | { | ||
2236 | bool do_soft_reclaim = mem_cgroup_should_soft_reclaim(sc); | ||
2237 | unsigned long nr_scanned = sc->nr_scanned; | ||
2238 | int scanned_groups; | ||
2239 | |||
2240 | scanned_groups = __shrink_zone(zone, sc, do_soft_reclaim); | ||
2241 | /* | ||
2242 | * memcg iterator might race with other reclaimer or start from | ||
2243 | * a incomplete tree walk so the tree walk in __shrink_zone | ||
2244 | * might have missed groups that are above the soft limit. Try | ||
2245 | * another loop to catch up with others. Do it just once to | ||
2246 | * prevent from reclaim latencies when other reclaimers always | ||
2247 | * preempt this one. | ||
2248 | */ | ||
2249 | if (do_soft_reclaim && !scanned_groups) | ||
2250 | __shrink_zone(zone, sc, do_soft_reclaim); | ||
2251 | |||
2252 | /* | ||
2253 | * No group is over the soft limit or those that are do not have | ||
2254 | * pages in the zone we are reclaiming so we have to reclaim everybody | ||
2255 | */ | ||
2256 | if (do_soft_reclaim && (sc->nr_scanned == nr_scanned)) { | ||
2257 | __shrink_zone(zone, sc, false); | ||
2258 | return; | ||
2259 | } | ||
2260 | } | 2217 | } |
2261 | 2218 | ||
2262 | /* Returns true if compaction should go ahead for a high-order request */ | 2219 | /* Returns true if compaction should go ahead for a high-order request */ |
@@ -2320,6 +2277,8 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) | |||
2320 | { | 2277 | { |
2321 | struct zoneref *z; | 2278 | struct zoneref *z; |
2322 | struct zone *zone; | 2279 | struct zone *zone; |
2280 | unsigned long nr_soft_reclaimed; | ||
2281 | unsigned long nr_soft_scanned; | ||
2323 | bool aborted_reclaim = false; | 2282 | bool aborted_reclaim = false; |
2324 | 2283 | ||
2325 | /* | 2284 | /* |
@@ -2359,6 +2318,18 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) | |||
2359 | continue; | 2318 | continue; |
2360 | } | 2319 | } |
2361 | } | 2320 | } |
2321 | /* | ||
2322 | * This steals pages from memory cgroups over their soft limit | ||
2323 | * and returns the number of reclaimed pages and | ||
2324 | * scanned pages. This works for global memory pressure | ||
2325 | * and balancing, not for a memcg's limit. | ||
2326 | */ | ||
2327 | nr_soft_scanned = 0; | ||
2328 | nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone, | ||
2329 | sc->order, sc->gfp_mask, | ||
2330 | &nr_soft_scanned); | ||
2331 | sc->nr_reclaimed += nr_soft_reclaimed; | ||
2332 | sc->nr_scanned += nr_soft_scanned; | ||
2362 | /* need some check for avoid more shrink_zone() */ | 2333 | /* need some check for avoid more shrink_zone() */ |
2363 | } | 2334 | } |
2364 | 2335 | ||
@@ -2952,6 +2923,8 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, | |||
2952 | { | 2923 | { |
2953 | int i; | 2924 | int i; |
2954 | int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ | 2925 | int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ |
2926 | unsigned long nr_soft_reclaimed; | ||
2927 | unsigned long nr_soft_scanned; | ||
2955 | struct scan_control sc = { | 2928 | struct scan_control sc = { |
2956 | .gfp_mask = GFP_KERNEL, | 2929 | .gfp_mask = GFP_KERNEL, |
2957 | .priority = DEF_PRIORITY, | 2930 | .priority = DEF_PRIORITY, |
@@ -3066,6 +3039,15 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, | |||
3066 | 3039 | ||
3067 | sc.nr_scanned = 0; | 3040 | sc.nr_scanned = 0; |
3068 | 3041 | ||
3042 | nr_soft_scanned = 0; | ||
3043 | /* | ||
3044 | * Call soft limit reclaim before calling shrink_zone. | ||
3045 | */ | ||
3046 | nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone, | ||
3047 | order, sc.gfp_mask, | ||
3048 | &nr_soft_scanned); | ||
3049 | sc.nr_reclaimed += nr_soft_reclaimed; | ||
3050 | |||
3069 | /* | 3051 | /* |
3070 | * There should be no need to raise the scanning | 3052 | * There should be no need to raise the scanning |
3071 | * priority if enough pages are already being scanned | 3053 | * priority if enough pages are already being scanned |
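The vmscan.c hunks above drop the filtered soft-reclaim walk and return shrink_zone() to the plain mem_cgroup_iter() loop: start the hierarchy walk with a NULL position, keep asking the iterator for the next group, and stop when it returns NULL (soft-limit reclaim is instead invoked separately before shrink_zone()). A standalone sketch of that loop shape; the group array and reclaim stub are illustrative assumptions, not the memcg API.

#include <stddef.h>
#include <stdio.h>

struct group { const char *name; };

static struct group groups[] = { { "root" }, { "A" }, { "A/a1" }, { "B" } };
#define NGROUPS (sizeof(groups) / sizeof(groups[0]))

/* Returns the group after *pos in a fixed walk order, or NULL at the end.
 * Passing NULL starts the walk at the first group. */
static struct group *group_iter(struct group *pos)
{
    if (!pos)
        return &groups[0];
    size_t idx = (size_t)(pos - groups) + 1;
    return idx < NGROUPS ? &groups[idx] : NULL;
}

static unsigned long shrink_one(struct group *g)
{
    printf("reclaiming from %s\n", g->name);
    return 1;   /* pretend one page was reclaimed */
}

int main(void)
{
    unsigned long reclaimed = 0;
    struct group *g = group_iter(NULL);
    do {
        reclaimed += shrink_one(g);
        g = group_iter(g);          /* advance; NULL ends the walk */
    } while (g);
    printf("total reclaimed: %lu\n", reclaimed);
    return 0;
}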
diff --git a/mm/vmstat.c b/mm/vmstat.c index 9bb314577911..72496140ac08 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c | |||
@@ -812,6 +812,7 @@ const char * const vmstat_text[] = { | |||
812 | 812 | ||
813 | #ifdef CONFIG_NUMA_BALANCING | 813 | #ifdef CONFIG_NUMA_BALANCING |
814 | "numa_pte_updates", | 814 | "numa_pte_updates", |
815 | "numa_huge_pte_updates", | ||
815 | "numa_hint_faults", | 816 | "numa_hint_faults", |
816 | "numa_hint_faults_local", | 817 | "numa_hint_faults_local", |
817 | "numa_pages_migrated", | 818 | "numa_pages_migrated", |
@@ -1229,6 +1230,20 @@ static void start_cpu_timer(int cpu) | |||
1229 | schedule_delayed_work_on(cpu, work, __round_jiffies_relative(HZ, cpu)); | 1230 | schedule_delayed_work_on(cpu, work, __round_jiffies_relative(HZ, cpu)); |
1230 | } | 1231 | } |
1231 | 1232 | ||
1233 | static void vmstat_cpu_dead(int node) | ||
1234 | { | ||
1235 | int cpu; | ||
1236 | |||
1237 | get_online_cpus(); | ||
1238 | for_each_online_cpu(cpu) | ||
1239 | if (cpu_to_node(cpu) == node) | ||
1240 | goto end; | ||
1241 | |||
1242 | node_clear_state(node, N_CPU); | ||
1243 | end: | ||
1244 | put_online_cpus(); | ||
1245 | } | ||
1246 | |||
1232 | /* | 1247 | /* |
1233 | * Use the cpu notifier to insure that the thresholds are recalculated | 1248 | * Use the cpu notifier to insure that the thresholds are recalculated |
1234 | * when necessary. | 1249 | * when necessary. |
@@ -1258,6 +1273,7 @@ static int vmstat_cpuup_callback(struct notifier_block *nfb, | |||
1258 | case CPU_DEAD: | 1273 | case CPU_DEAD: |
1259 | case CPU_DEAD_FROZEN: | 1274 | case CPU_DEAD_FROZEN: |
1260 | refresh_zone_stat_thresholds(); | 1275 | refresh_zone_stat_thresholds(); |
1276 | vmstat_cpu_dead(cpu_to_node(cpu)); | ||
1261 | break; | 1277 | break; |
1262 | default: | 1278 | default: |
1263 | break; | 1279 | break; |
@@ -1276,8 +1292,12 @@ static int __init setup_vmstat(void) | |||
1276 | 1292 | ||
1277 | register_cpu_notifier(&vmstat_notifier); | 1293 | register_cpu_notifier(&vmstat_notifier); |
1278 | 1294 | ||
1279 | for_each_online_cpu(cpu) | 1295 | get_online_cpus(); |
1296 | for_each_online_cpu(cpu) { | ||
1280 | start_cpu_timer(cpu); | 1297 | start_cpu_timer(cpu); |
1298 | node_set_state(cpu_to_node(cpu), N_CPU); | ||
1299 | } | ||
1300 | put_online_cpus(); | ||
1281 | #endif | 1301 | #endif |
1282 | #ifdef CONFIG_PROC_FS | 1302 | #ifdef CONFIG_PROC_FS |
1283 | proc_create("buddyinfo", S_IRUGO, NULL, &fragmentation_file_operations); | 1303 | proc_create("buddyinfo", S_IRUGO, NULL, &fragmentation_file_operations); |
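The vmstat.c hunks above keep the per-node N_CPU state in sync with CPU hotplug: vmstat_cpu_dead() clears the node's state only when no online CPU still belongs to that node. A standalone sketch of that bookkeeping; the array sizes and boolean masks are illustrative assumptions.

#include <stdbool.h>
#include <stdio.h>

#define NR_CPUS  8
#define NR_NODES 2

static int  cpu_to_node[NR_CPUS]   = { 0, 0, 0, 0, 1, 1, 1, 1 };
static bool cpu_online[NR_CPUS]    = { true, true, true, true, true, false, false, false };
static bool node_has_cpu[NR_NODES] = { true, true };

static void cpu_dead(int node)
{
    for (int cpu = 0; cpu < NR_CPUS; cpu++)
        if (cpu_online[cpu] && cpu_to_node[cpu] == node)
            return;                 /* node still has an online CPU */
    node_has_cpu[node] = false;     /* last CPU of this node went away */
}

int main(void)
{
    cpu_online[4] = false;          /* the last online CPU of node 1 dies */
    cpu_dead(1);
    printf("node 1 has CPUs: %s\n", node_has_cpu[1] ? "yes" : "no");
    return 0;
}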
diff --git a/mm/zswap.c b/mm/zswap.c index 841e35f1db22..5a63f78a5601 100644 --- a/mm/zswap.c +++ b/mm/zswap.c | |||
@@ -217,6 +217,7 @@ static struct zswap_entry *zswap_entry_cache_alloc(gfp_t gfp) | |||
217 | if (!entry) | 217 | if (!entry) |
218 | return NULL; | 218 | return NULL; |
219 | entry->refcount = 1; | 219 | entry->refcount = 1; |
220 | RB_CLEAR_NODE(&entry->rbnode); | ||
220 | return entry; | 221 | return entry; |
221 | } | 222 | } |
222 | 223 | ||
@@ -225,19 +226,6 @@ static void zswap_entry_cache_free(struct zswap_entry *entry) | |||
225 | kmem_cache_free(zswap_entry_cache, entry); | 226 | kmem_cache_free(zswap_entry_cache, entry); |
226 | } | 227 | } |
227 | 228 | ||
228 | /* caller must hold the tree lock */ | ||
229 | static void zswap_entry_get(struct zswap_entry *entry) | ||
230 | { | ||
231 | entry->refcount++; | ||
232 | } | ||
233 | |||
234 | /* caller must hold the tree lock */ | ||
235 | static int zswap_entry_put(struct zswap_entry *entry) | ||
236 | { | ||
237 | entry->refcount--; | ||
238 | return entry->refcount; | ||
239 | } | ||
240 | |||
241 | /********************************* | 229 | /********************************* |
242 | * rbtree functions | 230 | * rbtree functions |
243 | **********************************/ | 231 | **********************************/ |
@@ -285,6 +273,61 @@ static int zswap_rb_insert(struct rb_root *root, struct zswap_entry *entry, | |||
285 | return 0; | 273 | return 0; |
286 | } | 274 | } |
287 | 275 | ||
276 | static void zswap_rb_erase(struct rb_root *root, struct zswap_entry *entry) | ||
277 | { | ||
278 | if (!RB_EMPTY_NODE(&entry->rbnode)) { | ||
279 | rb_erase(&entry->rbnode, root); | ||
280 | RB_CLEAR_NODE(&entry->rbnode); | ||
281 | } | ||
282 | } | ||
283 | |||
284 | /* | ||
285 | * Carries out the common pattern of freeing an entry's zsmalloc allocation, ||
286 | * freeing the entry itself, and decrementing the number of stored pages. | ||
287 | */ | ||
288 | static void zswap_free_entry(struct zswap_tree *tree, | ||
289 | struct zswap_entry *entry) | ||
290 | { | ||
291 | zbud_free(tree->pool, entry->handle); | ||
292 | zswap_entry_cache_free(entry); | ||
293 | atomic_dec(&zswap_stored_pages); | ||
294 | zswap_pool_pages = zbud_get_pool_size(tree->pool); | ||
295 | } | ||
296 | |||
297 | /* caller must hold the tree lock */ | ||
298 | static void zswap_entry_get(struct zswap_entry *entry) | ||
299 | { | ||
300 | entry->refcount++; | ||
301 | } | ||
302 | |||
303 | /* caller must hold the tree lock | ||
304 | * remove it from the tree and free it if nobody references the entry ||
305 | */ | ||
306 | static void zswap_entry_put(struct zswap_tree *tree, | ||
307 | struct zswap_entry *entry) | ||
308 | { | ||
309 | int refcount = --entry->refcount; | ||
310 | |||
311 | BUG_ON(refcount < 0); | ||
312 | if (refcount == 0) { | ||
313 | zswap_rb_erase(&tree->rbroot, entry); | ||
314 | zswap_free_entry(tree, entry); | ||
315 | } | ||
316 | } | ||
317 | |||
318 | /* caller must hold the tree lock */ | ||
319 | static struct zswap_entry *zswap_entry_find_get(struct rb_root *root, | ||
320 | pgoff_t offset) | ||
321 | { | ||
322 | struct zswap_entry *entry = NULL; | ||
323 | |||
324 | entry = zswap_rb_search(root, offset); | ||
325 | if (entry) | ||
326 | zswap_entry_get(entry); | ||
327 | |||
328 | return entry; | ||
329 | } | ||
330 | |||
288 | /********************************* | 331 | /********************************* |
289 | * per-cpu code | 332 | * per-cpu code |
290 | **********************************/ | 333 | **********************************/ |
@@ -368,18 +411,6 @@ static bool zswap_is_full(void) | |||
368 | zswap_pool_pages); | 411 | zswap_pool_pages); |
369 | } | 412 | } |
370 | 413 | ||
371 | /* | ||
372 | * Carries out the common pattern of freeing and entry's zsmalloc allocation, | ||
373 | * freeing the entry itself, and decrementing the number of stored pages. | ||
374 | */ | ||
375 | static void zswap_free_entry(struct zswap_tree *tree, struct zswap_entry *entry) | ||
376 | { | ||
377 | zbud_free(tree->pool, entry->handle); | ||
378 | zswap_entry_cache_free(entry); | ||
379 | atomic_dec(&zswap_stored_pages); | ||
380 | zswap_pool_pages = zbud_get_pool_size(tree->pool); | ||
381 | } | ||
382 | |||
383 | /********************************* | 414 | /********************************* |
384 | * writeback code | 415 | * writeback code |
385 | **********************************/ | 416 | **********************************/ |
@@ -387,7 +418,7 @@ static void zswap_free_entry(struct zswap_tree *tree, struct zswap_entry *entry) | |||
387 | enum zswap_get_swap_ret { | 418 | enum zswap_get_swap_ret { |
388 | ZSWAP_SWAPCACHE_NEW, | 419 | ZSWAP_SWAPCACHE_NEW, |
389 | ZSWAP_SWAPCACHE_EXIST, | 420 | ZSWAP_SWAPCACHE_EXIST, |
390 | ZSWAP_SWAPCACHE_NOMEM | 421 | ZSWAP_SWAPCACHE_FAIL, |
391 | }; | 422 | }; |
392 | 423 | ||
393 | /* | 424 | /* |
@@ -401,9 +432,10 @@ enum zswap_get_swap_ret { | |||
401 | * added to the swap cache, and returned in retpage. | 432 | * added to the swap cache, and returned in retpage. |
402 | * | 433 | * |
403 | * If success, the swap cache page is returned in retpage | 434 | * If success, the swap cache page is returned in retpage |
404 | * Returns 0 if page was already in the swap cache, page is not locked | 435 | * Returns ZSWAP_SWAPCACHE_EXIST if page was already in the swap cache |
405 | * Returns 1 if the new page needs to be populated, page is locked | 436 | * Returns ZSWAP_SWAPCACHE_NEW if the new page needs to be populated, |
406 | * Returns <0 on error | 437 | * the new page is added to swapcache and locked |
438 | * Returns ZSWAP_SWAPCACHE_FAIL on error | ||
407 | */ | 439 | */ |
408 | static int zswap_get_swap_cache_page(swp_entry_t entry, | 440 | static int zswap_get_swap_cache_page(swp_entry_t entry, |
409 | struct page **retpage) | 441 | struct page **retpage) |
@@ -475,7 +507,7 @@ static int zswap_get_swap_cache_page(swp_entry_t entry, | |||
475 | if (new_page) | 507 | if (new_page) |
476 | page_cache_release(new_page); | 508 | page_cache_release(new_page); |
477 | if (!found_page) | 509 | if (!found_page) |
478 | return ZSWAP_SWAPCACHE_NOMEM; | 510 | return ZSWAP_SWAPCACHE_FAIL; |
479 | *retpage = found_page; | 511 | *retpage = found_page; |
480 | return ZSWAP_SWAPCACHE_EXIST; | 512 | return ZSWAP_SWAPCACHE_EXIST; |
481 | } | 513 | } |
@@ -502,7 +534,7 @@ static int zswap_writeback_entry(struct zbud_pool *pool, unsigned long handle) | |||
502 | struct page *page; | 534 | struct page *page; |
503 | u8 *src, *dst; | 535 | u8 *src, *dst; |
504 | unsigned int dlen; | 536 | unsigned int dlen; |
505 | int ret, refcount; | 537 | int ret; |
506 | struct writeback_control wbc = { | 538 | struct writeback_control wbc = { |
507 | .sync_mode = WB_SYNC_NONE, | 539 | .sync_mode = WB_SYNC_NONE, |
508 | }; | 540 | }; |
@@ -517,23 +549,22 @@ static int zswap_writeback_entry(struct zbud_pool *pool, unsigned long handle) | |||
517 | 549 | ||
518 | /* find and ref zswap entry */ | 550 | /* find and ref zswap entry */ |
519 | spin_lock(&tree->lock); | 551 | spin_lock(&tree->lock); |
520 | entry = zswap_rb_search(&tree->rbroot, offset); | 552 | entry = zswap_entry_find_get(&tree->rbroot, offset); |
521 | if (!entry) { | 553 | if (!entry) { |
522 | /* entry was invalidated */ | 554 | /* entry was invalidated */ |
523 | spin_unlock(&tree->lock); | 555 | spin_unlock(&tree->lock); |
524 | return 0; | 556 | return 0; |
525 | } | 557 | } |
526 | zswap_entry_get(entry); | ||
527 | spin_unlock(&tree->lock); | 558 | spin_unlock(&tree->lock); |
528 | BUG_ON(offset != entry->offset); | 559 | BUG_ON(offset != entry->offset); |
529 | 560 | ||
530 | /* try to allocate swap cache page */ | 561 | /* try to allocate swap cache page */ |
531 | switch (zswap_get_swap_cache_page(swpentry, &page)) { | 562 | switch (zswap_get_swap_cache_page(swpentry, &page)) { |
532 | case ZSWAP_SWAPCACHE_NOMEM: /* no memory */ | 563 | case ZSWAP_SWAPCACHE_FAIL: /* no memory or invalidate happened */ |
533 | ret = -ENOMEM; | 564 | ret = -ENOMEM; |
534 | goto fail; | 565 | goto fail; |
535 | 566 | ||
536 | case ZSWAP_SWAPCACHE_EXIST: /* page is unlocked */ | 567 | case ZSWAP_SWAPCACHE_EXIST: |
537 | /* page is already in the swap cache, ignore for now */ | 568 | /* page is already in the swap cache, ignore for now */ |
538 | page_cache_release(page); | 569 | page_cache_release(page); |
539 | ret = -EEXIST; | 570 | ret = -EEXIST; |
@@ -556,43 +587,44 @@ static int zswap_writeback_entry(struct zbud_pool *pool, unsigned long handle) | |||
556 | SetPageUptodate(page); | 587 | SetPageUptodate(page); |
557 | } | 588 | } |
558 | 589 | ||
590 | /* move it to the tail of the inactive list after end_writeback */ | ||
591 | SetPageReclaim(page); | ||
592 | |||
559 | /* start writeback */ | 593 | /* start writeback */ |
560 | __swap_writepage(page, &wbc, end_swap_bio_write); | 594 | __swap_writepage(page, &wbc, end_swap_bio_write); |
561 | page_cache_release(page); | 595 | page_cache_release(page); |
562 | zswap_written_back_pages++; | 596 | zswap_written_back_pages++; |
563 | 597 | ||
564 | spin_lock(&tree->lock); | 598 | spin_lock(&tree->lock); |
565 | |||
566 | /* drop local reference */ | 599 | /* drop local reference */ |
567 | zswap_entry_put(entry); | 600 | zswap_entry_put(tree, entry); |
568 | /* drop the initial reference from entry creation */ | ||
569 | refcount = zswap_entry_put(entry); | ||
570 | 601 | ||
571 | /* | 602 | /* |
572 | * There are three possible values for refcount here: | 603 | * There are two possible situations for entry here: |
573 | * (1) refcount is 1, load is in progress, unlink from rbtree, | 604 | * (1) refcount is 1(normal case), entry is valid and on the tree |
574 | * load will free | 605 | * (2) refcount is 0, entry is freed and not on the tree |
575 | * (2) refcount is 0, (normal case) entry is valid, | 606 | * because invalidate happened during writeback |
576 | * remove from rbtree and free entry | 607 | * search the tree and free the entry if it is found |
577 | * (3) refcount is -1, invalidate happened during writeback, | 608 | */ |
578 | * free entry | 609 | if (entry == zswap_rb_search(&tree->rbroot, offset)) |
579 | */ | 610 | zswap_entry_put(tree, entry); |
580 | if (refcount >= 0) { | ||
581 | /* no invalidate yet, remove from rbtree */ | ||
582 | rb_erase(&entry->rbnode, &tree->rbroot); | ||
583 | } | ||
584 | spin_unlock(&tree->lock); | 611 | spin_unlock(&tree->lock); |
585 | if (refcount <= 0) { | ||
586 | /* free the entry */ | ||
587 | zswap_free_entry(tree, entry); | ||
588 | return 0; | ||
589 | } | ||
590 | return -EAGAIN; | ||
591 | 612 | ||
613 | goto end; | ||
614 | |||
615 | /* | ||
616 | * If we get here due to ZSWAP_SWAPCACHE_EXIST, | ||
617 | * a load may be happening concurrently; | ||
618 | * it is safe and okay not to free the entry here. | ||
619 | * Even if the following put frees the entry, | ||
620 | * it is still okay to return !0. | ||
621 | */ | ||
592 | fail: | 622 | fail: |
593 | spin_lock(&tree->lock); | 623 | spin_lock(&tree->lock); |
594 | zswap_entry_put(entry); | 624 | zswap_entry_put(tree, entry); |
595 | spin_unlock(&tree->lock); | 625 | spin_unlock(&tree->lock); |
626 | |||
627 | end: | ||
596 | return ret; | 628 | return ret; |
597 | } | 629 | } |
598 | 630 | ||
@@ -676,11 +708,8 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, | |||
676 | if (ret == -EEXIST) { | 708 | if (ret == -EEXIST) { |
677 | zswap_duplicate_entry++; | 709 | zswap_duplicate_entry++; |
678 | /* remove from rbtree */ | 710 | /* remove from rbtree */ |
679 | rb_erase(&dupentry->rbnode, &tree->rbroot); | 711 | zswap_rb_erase(&tree->rbroot, dupentry); |
680 | if (!zswap_entry_put(dupentry)) { | 712 | zswap_entry_put(tree, dupentry); |
681 | /* free */ | ||
682 | zswap_free_entry(tree, dupentry); | ||
683 | } | ||
684 | } | 713 | } |
685 | } while (ret == -EEXIST); | 714 | } while (ret == -EEXIST); |
686 | spin_unlock(&tree->lock); | 715 | spin_unlock(&tree->lock); |
@@ -709,17 +738,16 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset, | |||
709 | struct zswap_entry *entry; | 738 | struct zswap_entry *entry; |
710 | u8 *src, *dst; | 739 | u8 *src, *dst; |
711 | unsigned int dlen; | 740 | unsigned int dlen; |
712 | int refcount, ret; | 741 | int ret; |
713 | 742 | ||
714 | /* find */ | 743 | /* find */ |
715 | spin_lock(&tree->lock); | 744 | spin_lock(&tree->lock); |
716 | entry = zswap_rb_search(&tree->rbroot, offset); | 745 | entry = zswap_entry_find_get(&tree->rbroot, offset); |
717 | if (!entry) { | 746 | if (!entry) { |
718 | /* entry was written back */ | 747 | /* entry was written back */ |
719 | spin_unlock(&tree->lock); | 748 | spin_unlock(&tree->lock); |
720 | return -1; | 749 | return -1; |
721 | } | 750 | } |
722 | zswap_entry_get(entry); | ||
723 | spin_unlock(&tree->lock); | 751 | spin_unlock(&tree->lock); |
724 | 752 | ||
725 | /* decompress */ | 753 | /* decompress */ |
@@ -734,22 +762,9 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset, | |||
734 | BUG_ON(ret); | 762 | BUG_ON(ret); |
735 | 763 | ||
736 | spin_lock(&tree->lock); | 764 | spin_lock(&tree->lock); |
737 | refcount = zswap_entry_put(entry); | 765 | zswap_entry_put(tree, entry); |
738 | if (likely(refcount)) { | ||
739 | spin_unlock(&tree->lock); | ||
740 | return 0; | ||
741 | } | ||
742 | spin_unlock(&tree->lock); | 766 | spin_unlock(&tree->lock); |
743 | 767 | ||
744 | /* | ||
745 | * We don't have to unlink from the rbtree because | ||
746 | * zswap_writeback_entry() or zswap_frontswap_invalidate page() | ||
747 | * has already done this for us if we are the last reference. | ||
748 | */ | ||
749 | /* free */ | ||
750 | |||
751 | zswap_free_entry(tree, entry); | ||
752 | |||
753 | return 0; | 768 | return 0; |
754 | } | 769 | } |
755 | 770 | ||
@@ -758,7 +773,6 @@ static void zswap_frontswap_invalidate_page(unsigned type, pgoff_t offset) | |||
758 | { | 773 | { |
759 | struct zswap_tree *tree = zswap_trees[type]; | 774 | struct zswap_tree *tree = zswap_trees[type]; |
760 | struct zswap_entry *entry; | 775 | struct zswap_entry *entry; |
761 | int refcount; | ||
762 | 776 | ||
763 | /* find */ | 777 | /* find */ |
764 | spin_lock(&tree->lock); | 778 | spin_lock(&tree->lock); |
@@ -770,20 +784,12 @@ static void zswap_frontswap_invalidate_page(unsigned type, pgoff_t offset) | |||
770 | } | 784 | } |
771 | 785 | ||
772 | /* remove from rbtree */ | 786 | /* remove from rbtree */ |
773 | rb_erase(&entry->rbnode, &tree->rbroot); | 787 | zswap_rb_erase(&tree->rbroot, entry); |
774 | 788 | ||
775 | /* drop the initial reference from entry creation */ | 789 | /* drop the initial reference from entry creation */ |
776 | refcount = zswap_entry_put(entry); | 790 | zswap_entry_put(tree, entry); |
777 | 791 | ||
778 | spin_unlock(&tree->lock); | 792 | spin_unlock(&tree->lock); |
779 | |||
780 | if (refcount) { | ||
781 | /* writeback in progress, writeback will free */ | ||
782 | return; | ||
783 | } | ||
784 | |||
785 | /* free */ | ||
786 | zswap_free_entry(tree, entry); | ||
787 | } | 793 | } |
788 | 794 | ||
789 | /* frees all zswap entries for the given swap type */ | 795 | /* frees all zswap entries for the given swap type */ |
@@ -797,13 +803,14 @@ static void zswap_frontswap_invalidate_area(unsigned type) | |||
797 | 803 | ||
798 | /* walk the tree and free everything */ | 804 | /* walk the tree and free everything */ |
799 | spin_lock(&tree->lock); | 805 | spin_lock(&tree->lock); |
800 | rbtree_postorder_for_each_entry_safe(entry, n, &tree->rbroot, rbnode) { | 806 | rbtree_postorder_for_each_entry_safe(entry, n, &tree->rbroot, rbnode) |
801 | zbud_free(tree->pool, entry->handle); | 807 | zswap_free_entry(tree, entry); |
802 | zswap_entry_cache_free(entry); | ||
803 | atomic_dec(&zswap_stored_pages); | ||
804 | } | ||
805 | tree->rbroot = RB_ROOT; | 808 | tree->rbroot = RB_ROOT; |
806 | spin_unlock(&tree->lock); | 809 | spin_unlock(&tree->lock); |
810 | |||
811 | zbud_destroy_pool(tree->pool); | ||
812 | kfree(tree); | ||
813 | zswap_trees[type] = NULL; | ||
807 | } | 814 | } |
808 | 815 | ||
809 | static struct zbud_ops zswap_zbud_ops = { | 816 | static struct zbud_ops zswap_zbud_ops = { |
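The zswap hunks above converge on a single refcount discipline: zswap_entry_find_get() takes a reference under the tree lock, and zswap_entry_put() both unlinks and frees the entry when the last reference drops, so callers no longer open-code rb_erase()/zswap_free_entry(). Below is a standalone sketch of that pattern; the fixed-size slot table used in place of an rbtree, and the entry struct, are illustrative assumptions.

#include <assert.h>
#include <stdio.h>
#include <stdlib.h>

struct entry {
    unsigned long offset;
    int refcount;            /* starts at 1 for the tree's own reference */
};

#define NSLOTS 16
static struct entry *tree[NSLOTS];   /* stand-in for the per-type rbtree */

static struct entry *entry_find_get(unsigned long offset)
{
    struct entry *e = tree[offset % NSLOTS];
    if (e && e->offset == offset)
        e->refcount++;                /* caller now holds a reference */
    else
        e = NULL;
    return e;
}

static void entry_put(struct entry *e)
{
    assert(e->refcount > 0);
    if (--e->refcount == 0) {         /* last reference: unlink and free */
        tree[e->offset % NSLOTS] = NULL;
        free(e);
    }
}

int main(void)
{
    struct entry *e = malloc(sizeof(*e));
    if (!e)
        return 1;
    e->offset = 5;
    e->refcount = 1;                  /* initial reference from creation */
    tree[e->offset % NSLOTS] = e;

    struct entry *found = entry_find_get(5);   /* e.g. a load or writeback */
    entry_put(found);                 /* drop the lookup's reference */
    entry_put(e);                     /* drop the tree's reference: frees */
    printf("slot empty: %s\n", tree[5 % NSLOTS] == NULL ? "yes" : "no");
    return 0;
}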