Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig                 16
-rw-r--r--  mm/Makefile                 2
-rw-r--r--  mm/bounce.c                21
-rw-r--r--  mm/filemap.c               34
-rw-r--r--  mm/fremap.c                17
-rw-r--r--  mm/huge_memory.c           77
-rw-r--r--  mm/hugetlb.c               63
-rw-r--r--  mm/madvise.c               31
-rw-r--r--  mm/memblock.c              12
-rw-r--r--  mm/memcontrol.c           338
-rw-r--r--  mm/memory-failure.c         4
-rw-r--r--  mm/memory.c                61
-rw-r--r--  mm/memory_hotplug.c       101
-rw-r--r--  mm/migrate.c               24
-rw-r--r--  mm/mlock.c                 11
-rw-r--r--  mm/mmap.c                 202
-rw-r--r--  mm/nobootmem.c              6
-rw-r--r--  mm/nommu.c                 80
-rw-r--r--  mm/page-writeback.c         4
-rw-r--r--  mm/page_alloc.c            78
-rw-r--r--  mm/page_io.c               36
-rw-r--r--  mm/process_vm_access.c      8
-rw-r--r--  mm/rmap.c                   3
-rw-r--r--  mm/shmem.c                  5
-rw-r--r--  mm/slab.c                   8
-rw-r--r--  mm/slub.c                   9
-rw-r--r--  mm/sparse-vmemmap.c        27
-rw-r--r--  mm/sparse.c                82
-rw-r--r--  mm/swap.c                  11
-rw-r--r--  mm/swap_state.c             6
-rw-r--r--  mm/swapfile.c               2
-rw-r--r--  mm/vmalloc.c              218
-rw-r--r--  mm/vmpressure.c           374
-rw-r--r--  mm/vmscan.c                16
-rw-r--r--  mm/vmstat.c                 6
35 files changed, 1504 insertions, 489 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index ae55c1e04d10..e742d06285b7 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -263,8 +263,14 @@ config ZONE_DMA_FLAG
263 default "1" 263 default "1"
264 264
265config BOUNCE 265config BOUNCE
266 def_bool y 266 bool "Enable bounce buffers"
267 default y
267 depends on BLOCK && MMU && (ZONE_DMA || HIGHMEM) 268 depends on BLOCK && MMU && (ZONE_DMA || HIGHMEM)
269 help
270 Enable bounce buffers for devices that cannot access
271 the full range of memory available to the CPU. Enabled
272 by default when ZONE_DMA or HIGHMEM is selected, but you
273 may say n to override this.
268 274
269# On the 'tile' arch, USB OHCI needs the bounce pool since tilegx will often 275# On the 'tile' arch, USB OHCI needs the bounce pool since tilegx will often
270# have more than 4GB of memory, but we don't currently use the IOTLB to present 276# have more than 4GB of memory, but we don't currently use the IOTLB to present
@@ -286,8 +292,12 @@ config NR_QUICK
286 default "1" 292 default "1"
287 293
288config VIRT_TO_BUS 294config VIRT_TO_BUS
289 def_bool y 295 bool
290 depends on HAVE_VIRT_TO_BUS 296 help
297 An architecture should select this if it implements the
298 deprecated interface virt_to_bus(). All new architectures
299 should probably not select this.
300
291 301
292config MMU_NOTIFIER 302config MMU_NOTIFIER
293 bool 303 bool
diff --git a/mm/Makefile b/mm/Makefile
index 3a4628751f89..72c5acb9345f 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -50,7 +50,7 @@ obj-$(CONFIG_FS_XIP) += filemap_xip.o
50obj-$(CONFIG_MIGRATION) += migrate.o 50obj-$(CONFIG_MIGRATION) += migrate.o
51obj-$(CONFIG_QUICKLIST) += quicklist.o 51obj-$(CONFIG_QUICKLIST) += quicklist.o
52obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o 52obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o
53obj-$(CONFIG_MEMCG) += memcontrol.o page_cgroup.o 53obj-$(CONFIG_MEMCG) += memcontrol.o page_cgroup.o vmpressure.o
54obj-$(CONFIG_CGROUP_HUGETLB) += hugetlb_cgroup.o 54obj-$(CONFIG_CGROUP_HUGETLB) += hugetlb_cgroup.o
55obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o 55obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o
56obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o 56obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o
diff --git a/mm/bounce.c b/mm/bounce.c
index 5f8901768602..a5c2ec3589cb 100644
--- a/mm/bounce.c
+++ b/mm/bounce.c
@@ -181,32 +181,13 @@ static void bounce_end_io_read_isa(struct bio *bio, int err)
181#ifdef CONFIG_NEED_BOUNCE_POOL 181#ifdef CONFIG_NEED_BOUNCE_POOL
182static int must_snapshot_stable_pages(struct request_queue *q, struct bio *bio) 182static int must_snapshot_stable_pages(struct request_queue *q, struct bio *bio)
183{ 183{
184 struct page *page;
185 struct backing_dev_info *bdi;
186 struct address_space *mapping;
187 struct bio_vec *from;
188 int i;
189
190 if (bio_data_dir(bio) != WRITE) 184 if (bio_data_dir(bio) != WRITE)
191 return 0; 185 return 0;
192 186
193 if (!bdi_cap_stable_pages_required(&q->backing_dev_info)) 187 if (!bdi_cap_stable_pages_required(&q->backing_dev_info))
194 return 0; 188 return 0;
195 189
196 /* 190 return test_bit(BIO_SNAP_STABLE, &bio->bi_flags);
197 * Based on the first page that has a valid mapping, decide whether or
198 * not we have to employ bounce buffering to guarantee stable pages.
199 */
200 bio_for_each_segment(from, bio, i) {
201 page = from->bv_page;
202 mapping = page_mapping(page);
203 if (!mapping)
204 continue;
205 bdi = mapping->backing_dev_info;
206 return mapping->host->i_sb->s_flags & MS_SNAP_STABLE;
207 }
208
209 return 0;
210} 191}
211#else 192#else
212static int must_snapshot_stable_pages(struct request_queue *q, struct bio *bio) 193static int must_snapshot_stable_pages(struct request_queue *q, struct bio *bio)
diff --git a/mm/filemap.c b/mm/filemap.c
index e1979fdca805..e989fb1eaa72 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -35,6 +35,9 @@
35#include <linux/cleancache.h> 35#include <linux/cleancache.h>
36#include "internal.h" 36#include "internal.h"
37 37
38#define CREATE_TRACE_POINTS
39#include <trace/events/filemap.h>
40
38/* 41/*
39 * FIXME: remove all knowledge of the buffer layer from the core VM 42 * FIXME: remove all knowledge of the buffer layer from the core VM
40 */ 43 */
@@ -113,6 +116,7 @@ void __delete_from_page_cache(struct page *page)
113{ 116{
114 struct address_space *mapping = page->mapping; 117 struct address_space *mapping = page->mapping;
115 118
119 trace_mm_filemap_delete_from_page_cache(page);
116 /* 120 /*
117 * if we're uptodate, flush out into the cleancache, otherwise 121 * if we're uptodate, flush out into the cleancache, otherwise
118 * invalidate any existing cleancache entries. We can't leave 122 * invalidate any existing cleancache entries. We can't leave
@@ -184,6 +188,17 @@ static int sleep_on_page_killable(void *word)
184 return fatal_signal_pending(current) ? -EINTR : 0; 188 return fatal_signal_pending(current) ? -EINTR : 0;
185} 189}
186 190
191static int filemap_check_errors(struct address_space *mapping)
192{
193 int ret = 0;
194 /* Check for outstanding write errors */
195 if (test_and_clear_bit(AS_ENOSPC, &mapping->flags))
196 ret = -ENOSPC;
197 if (test_and_clear_bit(AS_EIO, &mapping->flags))
198 ret = -EIO;
199 return ret;
200}
201
187/** 202/**
188 * __filemap_fdatawrite_range - start writeback on mapping dirty pages in range 203 * __filemap_fdatawrite_range - start writeback on mapping dirty pages in range
189 * @mapping: address space structure to write 204 * @mapping: address space structure to write
@@ -265,10 +280,10 @@ int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte,
265 pgoff_t end = end_byte >> PAGE_CACHE_SHIFT; 280 pgoff_t end = end_byte >> PAGE_CACHE_SHIFT;
266 struct pagevec pvec; 281 struct pagevec pvec;
267 int nr_pages; 282 int nr_pages;
268 int ret = 0; 283 int ret2, ret = 0;
269 284
270 if (end_byte < start_byte) 285 if (end_byte < start_byte)
271 return 0; 286 goto out;
272 287
273 pagevec_init(&pvec, 0); 288 pagevec_init(&pvec, 0);
274 while ((index <= end) && 289 while ((index <= end) &&
@@ -291,12 +306,10 @@ int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte,
291 pagevec_release(&pvec); 306 pagevec_release(&pvec);
292 cond_resched(); 307 cond_resched();
293 } 308 }
294 309out:
295 /* Check for outstanding write errors */ 310 ret2 = filemap_check_errors(mapping);
296 if (test_and_clear_bit(AS_ENOSPC, &mapping->flags)) 311 if (!ret)
297 ret = -ENOSPC; 312 ret = ret2;
298 if (test_and_clear_bit(AS_EIO, &mapping->flags))
299 ret = -EIO;
300 313
301 return ret; 314 return ret;
302} 315}
@@ -337,6 +350,8 @@ int filemap_write_and_wait(struct address_space *mapping)
337 if (!err) 350 if (!err)
338 err = err2; 351 err = err2;
339 } 352 }
353 } else {
354 err = filemap_check_errors(mapping);
340 } 355 }
341 return err; 356 return err;
342} 357}
@@ -368,6 +383,8 @@ int filemap_write_and_wait_range(struct address_space *mapping,
368 if (!err) 383 if (!err)
369 err = err2; 384 err = err2;
370 } 385 }
386 } else {
387 err = filemap_check_errors(mapping);
371 } 388 }
372 return err; 389 return err;
373} 390}
@@ -464,6 +481,7 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
464 mapping->nrpages++; 481 mapping->nrpages++;
465 __inc_zone_page_state(page, NR_FILE_PAGES); 482 __inc_zone_page_state(page, NR_FILE_PAGES);
466 spin_unlock_irq(&mapping->tree_lock); 483 spin_unlock_irq(&mapping->tree_lock);
484 trace_mm_filemap_add_to_page_cache(page);
467 } else { 485 } else {
468 page->mapping = NULL; 486 page->mapping = NULL;
469 /* Leave page->index set: truncation relies upon it */ 487 /* Leave page->index set: truncation relies upon it */
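
A note on the filemap.c change above: the AS_EIO/AS_ENOSPC tests are consolidated into filemap_check_errors(), and the new else branches mean filemap_write_and_wait() and filemap_write_and_wait_range() now report a previously recorded error even when there is nothing left to write back. A minimal sketch of a caller that benefits; example_fsync_range() is a hypothetical helper, not part of the patch:

/*
 * Hypothetical caller, not part of the patch.  With the change above,
 * an error recorded earlier in mapping->flags (AS_EIO/AS_ENOSPC) is
 * returned even when mapping->nrpages is zero, instead of being lost.
 */
static int example_fsync_range(struct file *file, loff_t start, loff_t end)
{
	struct address_space *mapping = file->f_mapping;

	/* Writes back dirty pages, waits, and picks up stored error bits. */
	return filemap_write_and_wait_range(mapping, start, end);
}
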
diff --git a/mm/fremap.c b/mm/fremap.c
index 0cd4c11488ed..87da3590c61e 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -129,7 +129,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
129 struct vm_area_struct *vma; 129 struct vm_area_struct *vma;
130 int err = -EINVAL; 130 int err = -EINVAL;
131 int has_write_lock = 0; 131 int has_write_lock = 0;
132 vm_flags_t vm_flags; 132 vm_flags_t vm_flags = 0;
133 133
134 if (prot) 134 if (prot)
135 return err; 135 return err;
@@ -204,10 +204,8 @@ get_write_lock:
204 unsigned long addr; 204 unsigned long addr;
205 struct file *file = get_file(vma->vm_file); 205 struct file *file = get_file(vma->vm_file);
206 206
207 vm_flags = vma->vm_flags; 207 addr = mmap_region(file, start, size,
208 if (!(flags & MAP_NONBLOCK)) 208 vma->vm_flags, pgoff);
209 vm_flags |= VM_POPULATE;
210 addr = mmap_region(file, start, size, vm_flags, pgoff);
211 fput(file); 209 fput(file);
212 if (IS_ERR_VALUE(addr)) { 210 if (IS_ERR_VALUE(addr)) {
213 err = addr; 211 err = addr;
@@ -226,12 +224,6 @@ get_write_lock:
226 mutex_unlock(&mapping->i_mmap_mutex); 224 mutex_unlock(&mapping->i_mmap_mutex);
227 } 225 }
228 226
229 if (!(flags & MAP_NONBLOCK) && !(vma->vm_flags & VM_POPULATE)) {
230 if (!has_write_lock)
231 goto get_write_lock;
232 vma->vm_flags |= VM_POPULATE;
233 }
234
235 if (vma->vm_flags & VM_LOCKED) { 227 if (vma->vm_flags & VM_LOCKED) {
236 /* 228 /*
237 * drop PG_Mlocked flag for over-mapped range 229 * drop PG_Mlocked flag for over-mapped range
@@ -254,7 +246,8 @@ get_write_lock:
254 */ 246 */
255 247
256out: 248out:
257 vm_flags = vma->vm_flags; 249 if (vma)
250 vm_flags = vma->vm_flags;
258 if (likely(!has_write_lock)) 251 if (likely(!has_write_lock))
259 up_read(&mm->mmap_sem); 252 up_read(&mm->mmap_sem);
260 else 253 else
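
For context on the remap_file_pages() path touched above, here is a small userspace sketch of the syscall it serves; "data.bin" and the two-page size are assumptions for illustration only:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	long psz = sysconf(_SC_PAGESIZE);
	int fd = open("data.bin", O_RDONLY);	/* hypothetical file, >= 2 pages */
	char *map;

	if (fd < 0) { perror("open"); return 1; }

	map = mmap(NULL, 2 * psz, PROT_READ, MAP_SHARED, fd, 0);
	if (map == MAP_FAILED) { perror("mmap"); return 1; }

	/* Rearrange the second page of the mapping to show file page 0
	 * (pgoff 0); prot and flags must be 0 for this syscall. */
	if (remap_file_pages(map + psz, psz, 0, 0, 0)) {
		perror("remap_file_pages");
		return 1;
	}
	printf("first file byte via remapped window: 0x%02x\n",
	       (unsigned char)map[psz]);
	return 0;
}
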
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index e2f7f5aaaafb..03a89a2f464b 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -163,35 +163,34 @@ static int start_khugepaged(void)
163} 163}
164 164
165static atomic_t huge_zero_refcount; 165static atomic_t huge_zero_refcount;
166static unsigned long huge_zero_pfn __read_mostly; 166static struct page *huge_zero_page __read_mostly;
167 167
168static inline bool is_huge_zero_pfn(unsigned long pfn) 168static inline bool is_huge_zero_page(struct page *page)
169{ 169{
170 unsigned long zero_pfn = ACCESS_ONCE(huge_zero_pfn); 170 return ACCESS_ONCE(huge_zero_page) == page;
171 return zero_pfn && pfn == zero_pfn;
172} 171}
173 172
174static inline bool is_huge_zero_pmd(pmd_t pmd) 173static inline bool is_huge_zero_pmd(pmd_t pmd)
175{ 174{
176 return is_huge_zero_pfn(pmd_pfn(pmd)); 175 return is_huge_zero_page(pmd_page(pmd));
177} 176}
178 177
179static unsigned long get_huge_zero_page(void) 178static struct page *get_huge_zero_page(void)
180{ 179{
181 struct page *zero_page; 180 struct page *zero_page;
182retry: 181retry:
183 if (likely(atomic_inc_not_zero(&huge_zero_refcount))) 182 if (likely(atomic_inc_not_zero(&huge_zero_refcount)))
184 return ACCESS_ONCE(huge_zero_pfn); 183 return ACCESS_ONCE(huge_zero_page);
185 184
186 zero_page = alloc_pages((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE, 185 zero_page = alloc_pages((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE,
187 HPAGE_PMD_ORDER); 186 HPAGE_PMD_ORDER);
188 if (!zero_page) { 187 if (!zero_page) {
189 count_vm_event(THP_ZERO_PAGE_ALLOC_FAILED); 188 count_vm_event(THP_ZERO_PAGE_ALLOC_FAILED);
190 return 0; 189 return NULL;
191 } 190 }
192 count_vm_event(THP_ZERO_PAGE_ALLOC); 191 count_vm_event(THP_ZERO_PAGE_ALLOC);
193 preempt_disable(); 192 preempt_disable();
194 if (cmpxchg(&huge_zero_pfn, 0, page_to_pfn(zero_page))) { 193 if (cmpxchg(&huge_zero_page, NULL, zero_page)) {
195 preempt_enable(); 194 preempt_enable();
196 __free_page(zero_page); 195 __free_page(zero_page);
197 goto retry; 196 goto retry;
@@ -200,7 +199,7 @@ retry:
200 /* We take additional reference here. It will be put back by shrinker */ 199 /* We take additional reference here. It will be put back by shrinker */
201 atomic_set(&huge_zero_refcount, 2); 200 atomic_set(&huge_zero_refcount, 2);
202 preempt_enable(); 201 preempt_enable();
203 return ACCESS_ONCE(huge_zero_pfn); 202 return ACCESS_ONCE(huge_zero_page);
204} 203}
205 204
206static void put_huge_zero_page(void) 205static void put_huge_zero_page(void)
@@ -220,9 +219,9 @@ static int shrink_huge_zero_page(struct shrinker *shrink,
220 return atomic_read(&huge_zero_refcount) == 1 ? HPAGE_PMD_NR : 0; 219 return atomic_read(&huge_zero_refcount) == 1 ? HPAGE_PMD_NR : 0;
221 220
222 if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) { 221 if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) {
223 unsigned long zero_pfn = xchg(&huge_zero_pfn, 0); 222 struct page *zero_page = xchg(&huge_zero_page, NULL);
224 BUG_ON(zero_pfn == 0); 223 BUG_ON(zero_page == NULL);
225 __free_page(__pfn_to_page(zero_pfn)); 224 __free_page(zero_page);
226 } 225 }
227 226
228 return 0; 227 return 0;
@@ -713,6 +712,11 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
713 return VM_FAULT_OOM; 712 return VM_FAULT_OOM;
714 713
715 clear_huge_page(page, haddr, HPAGE_PMD_NR); 714 clear_huge_page(page, haddr, HPAGE_PMD_NR);
715 /*
716 * The memory barrier inside __SetPageUptodate makes sure that
717 * clear_huge_page writes become visible before the set_pmd_at()
718 * write.
719 */
716 __SetPageUptodate(page); 720 __SetPageUptodate(page);
717 721
718 spin_lock(&mm->page_table_lock); 722 spin_lock(&mm->page_table_lock);
@@ -724,12 +728,6 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
724 } else { 728 } else {
725 pmd_t entry; 729 pmd_t entry;
726 entry = mk_huge_pmd(page, vma); 730 entry = mk_huge_pmd(page, vma);
727 /*
728 * The spinlocking to take the lru_lock inside
729 * page_add_new_anon_rmap() acts as a full memory
730 * barrier to be sure clear_huge_page writes become
731 * visible after the set_pmd_at() write.
732 */
733 page_add_new_anon_rmap(page, vma, haddr); 731 page_add_new_anon_rmap(page, vma, haddr);
734 set_pmd_at(mm, haddr, pmd, entry); 732 set_pmd_at(mm, haddr, pmd, entry);
735 pgtable_trans_huge_deposit(mm, pgtable); 733 pgtable_trans_huge_deposit(mm, pgtable);
@@ -765,12 +763,12 @@ static inline struct page *alloc_hugepage(int defrag)
765 763
766static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm, 764static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
767 struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd, 765 struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd,
768 unsigned long zero_pfn) 766 struct page *zero_page)
769{ 767{
770 pmd_t entry; 768 pmd_t entry;
771 if (!pmd_none(*pmd)) 769 if (!pmd_none(*pmd))
772 return false; 770 return false;
773 entry = pfn_pmd(zero_pfn, vma->vm_page_prot); 771 entry = mk_pmd(zero_page, vma->vm_page_prot);
774 entry = pmd_wrprotect(entry); 772 entry = pmd_wrprotect(entry);
775 entry = pmd_mkhuge(entry); 773 entry = pmd_mkhuge(entry);
776 set_pmd_at(mm, haddr, pmd, entry); 774 set_pmd_at(mm, haddr, pmd, entry);
@@ -795,20 +793,20 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
795 if (!(flags & FAULT_FLAG_WRITE) && 793 if (!(flags & FAULT_FLAG_WRITE) &&
796 transparent_hugepage_use_zero_page()) { 794 transparent_hugepage_use_zero_page()) {
797 pgtable_t pgtable; 795 pgtable_t pgtable;
798 unsigned long zero_pfn; 796 struct page *zero_page;
799 bool set; 797 bool set;
800 pgtable = pte_alloc_one(mm, haddr); 798 pgtable = pte_alloc_one(mm, haddr);
801 if (unlikely(!pgtable)) 799 if (unlikely(!pgtable))
802 return VM_FAULT_OOM; 800 return VM_FAULT_OOM;
803 zero_pfn = get_huge_zero_page(); 801 zero_page = get_huge_zero_page();
804 if (unlikely(!zero_pfn)) { 802 if (unlikely(!zero_page)) {
805 pte_free(mm, pgtable); 803 pte_free(mm, pgtable);
806 count_vm_event(THP_FAULT_FALLBACK); 804 count_vm_event(THP_FAULT_FALLBACK);
807 goto out; 805 goto out;
808 } 806 }
809 spin_lock(&mm->page_table_lock); 807 spin_lock(&mm->page_table_lock);
810 set = set_huge_zero_page(pgtable, mm, vma, haddr, pmd, 808 set = set_huge_zero_page(pgtable, mm, vma, haddr, pmd,
811 zero_pfn); 809 zero_page);
812 spin_unlock(&mm->page_table_lock); 810 spin_unlock(&mm->page_table_lock);
813 if (!set) { 811 if (!set) {
814 pte_free(mm, pgtable); 812 pte_free(mm, pgtable);
@@ -887,16 +885,16 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
887 * a page table. 885 * a page table.
888 */ 886 */
889 if (is_huge_zero_pmd(pmd)) { 887 if (is_huge_zero_pmd(pmd)) {
890 unsigned long zero_pfn; 888 struct page *zero_page;
891 bool set; 889 bool set;
892 /* 890 /*
893 * get_huge_zero_page() will never allocate a new page here, 891 * get_huge_zero_page() will never allocate a new page here,
894 * since we already have a zero page to copy. It just takes a 892 * since we already have a zero page to copy. It just takes a
895 * reference. 893 * reference.
896 */ 894 */
897 zero_pfn = get_huge_zero_page(); 895 zero_page = get_huge_zero_page();
898 set = set_huge_zero_page(pgtable, dst_mm, vma, addr, dst_pmd, 896 set = set_huge_zero_page(pgtable, dst_mm, vma, addr, dst_pmd,
899 zero_pfn); 897 zero_page);
900 BUG_ON(!set); /* unexpected !pmd_none(dst_pmd) */ 898 BUG_ON(!set); /* unexpected !pmd_none(dst_pmd) */
901 ret = 0; 899 ret = 0;
902 goto out_unlock; 900 goto out_unlock;
@@ -1560,7 +1558,8 @@ static int __split_huge_page_splitting(struct page *page,
1560 return ret; 1558 return ret;
1561} 1559}
1562 1560
1563static void __split_huge_page_refcount(struct page *page) 1561static void __split_huge_page_refcount(struct page *page,
1562 struct list_head *list)
1564{ 1563{
1565 int i; 1564 int i;
1566 struct zone *zone = page_zone(page); 1565 struct zone *zone = page_zone(page);
@@ -1646,7 +1645,7 @@ static void __split_huge_page_refcount(struct page *page)
1646 BUG_ON(!PageDirty(page_tail)); 1645 BUG_ON(!PageDirty(page_tail));
1647 BUG_ON(!PageSwapBacked(page_tail)); 1646 BUG_ON(!PageSwapBacked(page_tail));
1648 1647
1649 lru_add_page_tail(page, page_tail, lruvec); 1648 lru_add_page_tail(page, page_tail, lruvec, list);
1650 } 1649 }
1651 atomic_sub(tail_count, &page->_count); 1650 atomic_sub(tail_count, &page->_count);
1652 BUG_ON(atomic_read(&page->_count) <= 0); 1651 BUG_ON(atomic_read(&page->_count) <= 0);
@@ -1753,7 +1752,8 @@ static int __split_huge_page_map(struct page *page,
1753 1752
1754/* must be called with anon_vma->root->rwsem held */ 1753/* must be called with anon_vma->root->rwsem held */
1755static void __split_huge_page(struct page *page, 1754static void __split_huge_page(struct page *page,
1756 struct anon_vma *anon_vma) 1755 struct anon_vma *anon_vma,
1756 struct list_head *list)
1757{ 1757{
1758 int mapcount, mapcount2; 1758 int mapcount, mapcount2;
1759 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); 1759 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
@@ -1784,7 +1784,7 @@ static void __split_huge_page(struct page *page,
1784 mapcount, page_mapcount(page)); 1784 mapcount, page_mapcount(page));
1785 BUG_ON(mapcount != page_mapcount(page)); 1785 BUG_ON(mapcount != page_mapcount(page));
1786 1786
1787 __split_huge_page_refcount(page); 1787 __split_huge_page_refcount(page, list);
1788 1788
1789 mapcount2 = 0; 1789 mapcount2 = 0;
1790 anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { 1790 anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
@@ -1799,12 +1799,19 @@ static void __split_huge_page(struct page *page,
1799 BUG_ON(mapcount != mapcount2); 1799 BUG_ON(mapcount != mapcount2);
1800} 1800}
1801 1801
1802int split_huge_page(struct page *page) 1802/*
1803 * Split a hugepage into normal pages. This doesn't change the position of head
1804 * page. If @list is null, tail pages will be added to LRU list, otherwise, to
1805 * @list. Both head page and tail pages will inherit mapping, flags, and so on
1806 * from the hugepage.
1807 * Return 0 if the hugepage is split successfully otherwise return 1.
1808 */
1809int split_huge_page_to_list(struct page *page, struct list_head *list)
1803{ 1810{
1804 struct anon_vma *anon_vma; 1811 struct anon_vma *anon_vma;
1805 int ret = 1; 1812 int ret = 1;
1806 1813
1807 BUG_ON(is_huge_zero_pfn(page_to_pfn(page))); 1814 BUG_ON(is_huge_zero_page(page));
1808 BUG_ON(!PageAnon(page)); 1815 BUG_ON(!PageAnon(page));
1809 1816
1810 /* 1817 /*
@@ -1824,7 +1831,7 @@ int split_huge_page(struct page *page)
1824 goto out_unlock; 1831 goto out_unlock;
1825 1832
1826 BUG_ON(!PageSwapBacked(page)); 1833 BUG_ON(!PageSwapBacked(page));
1827 __split_huge_page(page, anon_vma); 1834 __split_huge_page(page, anon_vma, list);
1828 count_vm_event(THP_SPLIT); 1835 count_vm_event(THP_SPLIT);
1829 1836
1830 BUG_ON(PageCompound(page)); 1837 BUG_ON(PageCompound(page));
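
The new split_huge_page_to_list() above takes an explicit list for the tail pages; existing callers presumably keep working through a thin header-side wrapper (not part of this hunk), along these lines:

/*
 * Expected compatibility wrapper (assumption -- the include/linux change
 * is not shown here): callers that don't care where the tail pages go
 * keep using split_huge_page(), which passes a NULL list so the tails
 * land on the LRU as before.
 */
static inline int split_huge_page(struct page *page)
{
	return split_huge_page_to_list(page, NULL);
}
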
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 0a0be33bb199..f8feeeca6686 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1761,7 +1761,7 @@ static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp)
1761 * Unregister hstate attributes from a single node device. 1761 * Unregister hstate attributes from a single node device.
1762 * No-op if no hstate attributes attached. 1762 * No-op if no hstate attributes attached.
1763 */ 1763 */
1764void hugetlb_unregister_node(struct node *node) 1764static void hugetlb_unregister_node(struct node *node)
1765{ 1765{
1766 struct hstate *h; 1766 struct hstate *h;
1767 struct node_hstate *nhs = &node_hstates[node->dev.id]; 1767 struct node_hstate *nhs = &node_hstates[node->dev.id];
@@ -1805,7 +1805,7 @@ static void hugetlb_unregister_all_nodes(void)
1805 * Register hstate attributes for a single node device. 1805 * Register hstate attributes for a single node device.
1806 * No-op if attributes already registered. 1806 * No-op if attributes already registered.
1807 */ 1807 */
1808void hugetlb_register_node(struct node *node) 1808static void hugetlb_register_node(struct node *node)
1809{ 1809{
1810 struct hstate *h; 1810 struct hstate *h;
1811 struct node_hstate *nhs = &node_hstates[node->dev.id]; 1811 struct node_hstate *nhs = &node_hstates[node->dev.id];
@@ -2121,11 +2121,30 @@ int hugetlb_report_node_meminfo(int nid, char *buf)
2121 nid, h->surplus_huge_pages_node[nid]); 2121 nid, h->surplus_huge_pages_node[nid]);
2122} 2122}
2123 2123
2124void hugetlb_show_meminfo(void)
2125{
2126 struct hstate *h;
2127 int nid;
2128
2129 for_each_node_state(nid, N_MEMORY)
2130 for_each_hstate(h)
2131 pr_info("Node %d hugepages_total=%u hugepages_free=%u hugepages_surp=%u hugepages_size=%lukB\n",
2132 nid,
2133 h->nr_huge_pages_node[nid],
2134 h->free_huge_pages_node[nid],
2135 h->surplus_huge_pages_node[nid],
2136 1UL << (huge_page_order(h) + PAGE_SHIFT - 10));
2137}
2138
2124/* Return the number pages of memory we physically have, in PAGE_SIZE units. */ 2139/* Return the number pages of memory we physically have, in PAGE_SIZE units. */
2125unsigned long hugetlb_total_pages(void) 2140unsigned long hugetlb_total_pages(void)
2126{ 2141{
2127 struct hstate *h = &default_hstate; 2142 struct hstate *h;
2128 return h->nr_huge_pages * pages_per_huge_page(h); 2143 unsigned long nr_total_pages = 0;
2144
2145 for_each_hstate(h)
2146 nr_total_pages += h->nr_huge_pages * pages_per_huge_page(h);
2147 return nr_total_pages;
2129} 2148}
2130 2149
2131static int hugetlb_acct_memory(struct hstate *h, long delta) 2150static int hugetlb_acct_memory(struct hstate *h, long delta)
@@ -2243,10 +2262,11 @@ static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
2243 pte_t entry; 2262 pte_t entry;
2244 2263
2245 if (writable) { 2264 if (writable) {
2246 entry = 2265 entry = huge_pte_mkwrite(huge_pte_mkdirty(mk_huge_pte(page,
2247 pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot))); 2266 vma->vm_page_prot)));
2248 } else { 2267 } else {
2249 entry = huge_pte_wrprotect(mk_pte(page, vma->vm_page_prot)); 2268 entry = huge_pte_wrprotect(mk_huge_pte(page,
2269 vma->vm_page_prot));
2250 } 2270 }
2251 entry = pte_mkyoung(entry); 2271 entry = pte_mkyoung(entry);
2252 entry = pte_mkhuge(entry); 2272 entry = pte_mkhuge(entry);
@@ -2260,7 +2280,7 @@ static void set_huge_ptep_writable(struct vm_area_struct *vma,
2260{ 2280{
2261 pte_t entry; 2281 pte_t entry;
2262 2282
2263 entry = pte_mkwrite(pte_mkdirty(huge_ptep_get(ptep))); 2283 entry = huge_pte_mkwrite(huge_pte_mkdirty(huge_ptep_get(ptep)));
2264 if (huge_ptep_set_access_flags(vma, address, ptep, entry, 1)) 2284 if (huge_ptep_set_access_flags(vma, address, ptep, entry, 1))
2265 update_mmu_cache(vma, address, ptep); 2285 update_mmu_cache(vma, address, ptep);
2266} 2286}
@@ -2375,7 +2395,7 @@ again:
2375 * HWPoisoned hugepage is already unmapped and dropped reference 2395 * HWPoisoned hugepage is already unmapped and dropped reference
2376 */ 2396 */
2377 if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) { 2397 if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) {
2378 pte_clear(mm, address, ptep); 2398 huge_pte_clear(mm, address, ptep);
2379 continue; 2399 continue;
2380 } 2400 }
2381 2401
@@ -2399,7 +2419,7 @@ again:
2399 2419
2400 pte = huge_ptep_get_and_clear(mm, address, ptep); 2420 pte = huge_ptep_get_and_clear(mm, address, ptep);
2401 tlb_remove_tlb_entry(tlb, ptep, address); 2421 tlb_remove_tlb_entry(tlb, ptep, address);
2402 if (pte_dirty(pte)) 2422 if (huge_pte_dirty(pte))
2403 set_page_dirty(page); 2423 set_page_dirty(page);
2404 2424
2405 page_remove_rmap(page); 2425 page_remove_rmap(page);
@@ -2852,7 +2872,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2852 * page now as it is used to determine if a reservation has been 2872 * page now as it is used to determine if a reservation has been
2853 * consumed. 2873 * consumed.
2854 */ 2874 */
2855 if ((flags & FAULT_FLAG_WRITE) && !pte_write(entry)) { 2875 if ((flags & FAULT_FLAG_WRITE) && !huge_pte_write(entry)) {
2856 if (vma_needs_reservation(h, vma, address) < 0) { 2876 if (vma_needs_reservation(h, vma, address) < 0) {
2857 ret = VM_FAULT_OOM; 2877 ret = VM_FAULT_OOM;
2858 goto out_mutex; 2878 goto out_mutex;
@@ -2882,12 +2902,12 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2882 2902
2883 2903
2884 if (flags & FAULT_FLAG_WRITE) { 2904 if (flags & FAULT_FLAG_WRITE) {
2885 if (!pte_write(entry)) { 2905 if (!huge_pte_write(entry)) {
2886 ret = hugetlb_cow(mm, vma, address, ptep, entry, 2906 ret = hugetlb_cow(mm, vma, address, ptep, entry,
2887 pagecache_page); 2907 pagecache_page);
2888 goto out_page_table_lock; 2908 goto out_page_table_lock;
2889 } 2909 }
2890 entry = pte_mkdirty(entry); 2910 entry = huge_pte_mkdirty(entry);
2891 } 2911 }
2892 entry = pte_mkyoung(entry); 2912 entry = pte_mkyoung(entry);
2893 if (huge_ptep_set_access_flags(vma, address, ptep, entry, 2913 if (huge_ptep_set_access_flags(vma, address, ptep, entry,
@@ -2957,8 +2977,19 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
2957 break; 2977 break;
2958 } 2978 }
2959 2979
2960 if (absent || 2980 /*
2961 ((flags & FOLL_WRITE) && !pte_write(huge_ptep_get(pte)))) { 2981 * We need call hugetlb_fault for both hugepages under migration
2982 * (in which case hugetlb_fault waits for the migration,) and
2983 * hwpoisoned hugepages (in which case we need to prevent the
2984 * caller from accessing to them.) In order to do this, we use
2985 * here is_swap_pte instead of is_hugetlb_entry_migration and
2986 * is_hugetlb_entry_hwpoisoned. This is because it simply covers
2987 * both cases, and because we can't follow correct pages
2988 * directly from any kind of swap entries.
2989 */
2990 if (absent || is_swap_pte(huge_ptep_get(pte)) ||
2991 ((flags & FOLL_WRITE) &&
2992 !huge_pte_write(huge_ptep_get(pte)))) {
2962 int ret; 2993 int ret;
2963 2994
2964 spin_unlock(&mm->page_table_lock); 2995 spin_unlock(&mm->page_table_lock);
@@ -3028,7 +3059,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
3028 } 3059 }
3029 if (!huge_pte_none(huge_ptep_get(ptep))) { 3060 if (!huge_pte_none(huge_ptep_get(ptep))) {
3030 pte = huge_ptep_get_and_clear(mm, address, ptep); 3061 pte = huge_ptep_get_and_clear(mm, address, ptep);
3031 pte = pte_mkhuge(pte_modify(pte, newprot)); 3062 pte = pte_mkhuge(huge_pte_modify(pte, newprot));
3032 pte = arch_make_huge_pte(pte, vma, NULL, 0); 3063 pte = arch_make_huge_pte(pte, vma, NULL, 0);
3033 set_huge_pte_at(mm, address, ptep, pte); 3064 set_huge_pte_at(mm, address, ptep, pte);
3034 pages++; 3065 pages++;
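
The hugetlb.c hunks above switch from the bare pte_* helpers to huge_pte_* ones. On architectures whose huge PTEs look like ordinary PTEs these are presumably just pass-through fallbacks defined in a generic header outside this diff, roughly:

/*
 * Sketch of the kind of generic fallbacks the huge_pte_* calls above
 * rely on (assumption: the real definitions live in an asm-generic or
 * per-arch header not included in this hunk).  Architectures with an
 * unusual huge-PTE format provide their own versions instead.
 */
static inline pte_t huge_pte_mkwrite(pte_t pte)
{
	return pte_mkwrite(pte);
}

static inline pte_t huge_pte_mkdirty(pte_t pte)
{
	return pte_mkdirty(pte);
}

static inline int huge_pte_dirty(pte_t pte)
{
	return pte_dirty(pte);
}

static inline void huge_pte_clear(struct mm_struct *mm,
				  unsigned long addr, pte_t *ptep)
{
	pte_clear(mm, addr, ptep);
}
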
diff --git a/mm/madvise.c b/mm/madvise.c
index c58c94b56c3d..7055883e6e25 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -473,27 +473,27 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
473 if (!madvise_behavior_valid(behavior)) 473 if (!madvise_behavior_valid(behavior))
474 return error; 474 return error;
475 475
476 write = madvise_need_mmap_write(behavior);
477 if (write)
478 down_write(&current->mm->mmap_sem);
479 else
480 down_read(&current->mm->mmap_sem);
481
482 if (start & ~PAGE_MASK) 476 if (start & ~PAGE_MASK)
483 goto out; 477 return error;
484 len = (len_in + ~PAGE_MASK) & PAGE_MASK; 478 len = (len_in + ~PAGE_MASK) & PAGE_MASK;
485 479
486 /* Check to see whether len was rounded up from small -ve to zero */ 480 /* Check to see whether len was rounded up from small -ve to zero */
487 if (len_in && !len) 481 if (len_in && !len)
488 goto out; 482 return error;
489 483
490 end = start + len; 484 end = start + len;
491 if (end < start) 485 if (end < start)
492 goto out; 486 return error;
493 487
494 error = 0; 488 error = 0;
495 if (end == start) 489 if (end == start)
496 goto out; 490 return error;
491
492 write = madvise_need_mmap_write(behavior);
493 if (write)
494 down_write(&current->mm->mmap_sem);
495 else
496 down_read(&current->mm->mmap_sem);
497 497
498 /* 498 /*
499 * If the interval [start,end) covers some unmapped address 499 * If the interval [start,end) covers some unmapped address
@@ -509,14 +509,14 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
509 /* Still start < end. */ 509 /* Still start < end. */
510 error = -ENOMEM; 510 error = -ENOMEM;
511 if (!vma) 511 if (!vma)
512 goto out_plug; 512 goto out;
513 513
514 /* Here start < (end|vma->vm_end). */ 514 /* Here start < (end|vma->vm_end). */
515 if (start < vma->vm_start) { 515 if (start < vma->vm_start) {
516 unmapped_error = -ENOMEM; 516 unmapped_error = -ENOMEM;
517 start = vma->vm_start; 517 start = vma->vm_start;
518 if (start >= end) 518 if (start >= end)
519 goto out_plug; 519 goto out;
520 } 520 }
521 521
522 /* Here vma->vm_start <= start < (end|vma->vm_end) */ 522 /* Here vma->vm_start <= start < (end|vma->vm_end) */
@@ -527,21 +527,20 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
527 /* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */ 527 /* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */
528 error = madvise_vma(vma, &prev, start, tmp, behavior); 528 error = madvise_vma(vma, &prev, start, tmp, behavior);
529 if (error) 529 if (error)
530 goto out_plug; 530 goto out;
531 start = tmp; 531 start = tmp;
532 if (prev && start < prev->vm_end) 532 if (prev && start < prev->vm_end)
533 start = prev->vm_end; 533 start = prev->vm_end;
534 error = unmapped_error; 534 error = unmapped_error;
535 if (start >= end) 535 if (start >= end)
536 goto out_plug; 536 goto out;
537 if (prev) 537 if (prev)
538 vma = prev->vm_next; 538 vma = prev->vm_next;
539 else /* madvise_remove dropped mmap_sem */ 539 else /* madvise_remove dropped mmap_sem */
540 vma = find_vma(current->mm, start); 540 vma = find_vma(current->mm, start);
541 } 541 }
542out_plug:
543 blk_finish_plug(&plug);
544out: 542out:
543 blk_finish_plug(&plug);
545 if (write) 544 if (write)
546 up_write(&current->mm->mmap_sem); 545 up_write(&current->mm->mmap_sem);
547 else 546 else
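
With the madvise() reordering above, the start/length sanity checks run before mmap_sem is taken and before the block plug is started; the userspace-visible results are unchanged. A small test sketch:

#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	long psz = sysconf(_SC_PAGESIZE);

	/* Unaligned start: rejected up front with EINVAL, no lock taken. */
	if (madvise((void *)1, psz, MADV_DONTNEED) == -1)
		printf("unaligned start: %s\n", strerror(errno));

	/* Zero length: end == start, so the call succeeds without doing anything. */
	printf("zero length: %d\n", madvise(NULL, 0, MADV_DONTNEED));
	return 0;
}
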
diff --git a/mm/memblock.c b/mm/memblock.c
index b8d9147e5c08..c5fad932fa51 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -322,10 +322,11 @@ static void __init_memblock memblock_merge_regions(struct memblock_type *type)
322 322
323/** 323/**
324 * memblock_insert_region - insert new memblock region 324 * memblock_insert_region - insert new memblock region
325 * @type: memblock type to insert into 325 * @type: memblock type to insert into
326 * @idx: index for the insertion point 326 * @idx: index for the insertion point
327 * @base: base address of the new region 327 * @base: base address of the new region
328 * @size: size of the new region 328 * @size: size of the new region
329 * @nid: node id of the new region
329 * 330 *
330 * Insert new memblock region [@base,@base+@size) into @type at @idx. 331 * Insert new memblock region [@base,@base+@size) into @type at @idx.
331 * @type must already have extra room to accomodate the new region. 332 * @type must already have extra room to accomodate the new region.
@@ -771,6 +772,9 @@ static phys_addr_t __init memblock_alloc_base_nid(phys_addr_t size,
771{ 772{
772 phys_addr_t found; 773 phys_addr_t found;
773 774
775 if (WARN_ON(!align))
776 align = __alignof__(long long);
777
774 /* align @size to avoid excessive fragmentation on reserved array */ 778 /* align @size to avoid excessive fragmentation on reserved array */
775 size = round_up(size, align); 779 size = round_up(size, align);
776 780
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 2b552224f5cf..0f1d92163f30 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -49,6 +49,7 @@
49#include <linux/fs.h> 49#include <linux/fs.h>
50#include <linux/seq_file.h> 50#include <linux/seq_file.h>
51#include <linux/vmalloc.h> 51#include <linux/vmalloc.h>
52#include <linux/vmpressure.h>
52#include <linux/mm_inline.h> 53#include <linux/mm_inline.h>
53#include <linux/page_cgroup.h> 54#include <linux/page_cgroup.h>
54#include <linux/cpu.h> 55#include <linux/cpu.h>
@@ -152,8 +153,13 @@ struct mem_cgroup_stat_cpu {
152}; 153};
153 154
154struct mem_cgroup_reclaim_iter { 155struct mem_cgroup_reclaim_iter {
155 /* css_id of the last scanned hierarchy member */ 156 /*
156 int position; 157 * last scanned hierarchy member. Valid only if last_dead_count
158 * matches memcg->dead_count of the hierarchy root group.
159 */
160 struct mem_cgroup *last_visited;
161 unsigned long last_dead_count;
162
157 /* scan generation, increased every round-trip */ 163 /* scan generation, increased every round-trip */
158 unsigned int generation; 164 unsigned int generation;
159}; 165};
@@ -256,6 +262,9 @@ struct mem_cgroup {
256 */ 262 */
257 struct res_counter res; 263 struct res_counter res;
258 264
265 /* vmpressure notifications */
266 struct vmpressure vmpressure;
267
259 union { 268 union {
260 /* 269 /*
261 * the counter to account for mem+swap usage. 270 * the counter to account for mem+swap usage.
@@ -335,6 +344,7 @@ struct mem_cgroup {
335 struct mem_cgroup_stat_cpu nocpu_base; 344 struct mem_cgroup_stat_cpu nocpu_base;
336 spinlock_t pcp_counter_lock; 345 spinlock_t pcp_counter_lock;
337 346
347 atomic_t dead_count;
338#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET) 348#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET)
339 struct tcp_memcontrol tcp_mem; 349 struct tcp_memcontrol tcp_mem;
340#endif 350#endif
@@ -353,6 +363,7 @@ struct mem_cgroup {
353 atomic_t numainfo_events; 363 atomic_t numainfo_events;
354 atomic_t numainfo_updating; 364 atomic_t numainfo_updating;
355#endif 365#endif
366
356 /* 367 /*
357 * Per cgroup active and inactive list, similar to the 368 * Per cgroup active and inactive list, similar to the
358 * per zone LRU lists. 369 * per zone LRU lists.
@@ -504,6 +515,24 @@ struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *s)
504 return container_of(s, struct mem_cgroup, css); 515 return container_of(s, struct mem_cgroup, css);
505} 516}
506 517
518/* Some nice accessors for the vmpressure. */
519struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg)
520{
521 if (!memcg)
522 memcg = root_mem_cgroup;
523 return &memcg->vmpressure;
524}
525
526struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr)
527{
528 return &container_of(vmpr, struct mem_cgroup, vmpressure)->css;
529}
530
531struct vmpressure *css_to_vmpressure(struct cgroup_subsys_state *css)
532{
533 return &mem_cgroup_from_css(css)->vmpressure;
534}
535
507static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) 536static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
508{ 537{
509 return (memcg == root_mem_cgroup); 538 return (memcg == root_mem_cgroup);
@@ -1067,6 +1096,51 @@ struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
1067 return memcg; 1096 return memcg;
1068} 1097}
1069 1098
1099/*
1100 * Returns a next (in a pre-order walk) alive memcg (with elevated css
1101 * ref. count) or NULL if the whole root's subtree has been visited.
1102 *
1103 * helper function to be used by mem_cgroup_iter
1104 */
1105static struct mem_cgroup *__mem_cgroup_iter_next(struct mem_cgroup *root,
1106 struct mem_cgroup *last_visited)
1107{
1108 struct cgroup *prev_cgroup, *next_cgroup;
1109
1110 /*
1111 * Root is not visited by cgroup iterators so it needs an
1112 * explicit visit.
1113 */
1114 if (!last_visited)
1115 return root;
1116
1117 prev_cgroup = (last_visited == root) ? NULL
1118 : last_visited->css.cgroup;
1119skip_node:
1120 next_cgroup = cgroup_next_descendant_pre(
1121 prev_cgroup, root->css.cgroup);
1122
1123 /*
1124 * Even if we found a group we have to make sure it is
1125 * alive. css && !memcg means that the groups should be
1126 * skipped and we should continue the tree walk.
1127 * last_visited css is safe to use because it is
1128 * protected by css_get and the tree walk is rcu safe.
1129 */
1130 if (next_cgroup) {
1131 struct mem_cgroup *mem = mem_cgroup_from_cont(
1132 next_cgroup);
1133 if (css_tryget(&mem->css))
1134 return mem;
1135 else {
1136 prev_cgroup = next_cgroup;
1137 goto skip_node;
1138 }
1139 }
1140
1141 return NULL;
1142}
1143
1070/** 1144/**
1071 * mem_cgroup_iter - iterate over memory cgroup hierarchy 1145 * mem_cgroup_iter - iterate over memory cgroup hierarchy
1072 * @root: hierarchy root 1146 * @root: hierarchy root
@@ -1089,7 +1163,8 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
1089 struct mem_cgroup_reclaim_cookie *reclaim) 1163 struct mem_cgroup_reclaim_cookie *reclaim)
1090{ 1164{
1091 struct mem_cgroup *memcg = NULL; 1165 struct mem_cgroup *memcg = NULL;
1092 int id = 0; 1166 struct mem_cgroup *last_visited = NULL;
1167 unsigned long uninitialized_var(dead_count);
1093 1168
1094 if (mem_cgroup_disabled()) 1169 if (mem_cgroup_disabled())
1095 return NULL; 1170 return NULL;
@@ -1098,20 +1173,17 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
1098 root = root_mem_cgroup; 1173 root = root_mem_cgroup;
1099 1174
1100 if (prev && !reclaim) 1175 if (prev && !reclaim)
1101 id = css_id(&prev->css); 1176 last_visited = prev;
1102
1103 if (prev && prev != root)
1104 css_put(&prev->css);
1105 1177
1106 if (!root->use_hierarchy && root != root_mem_cgroup) { 1178 if (!root->use_hierarchy && root != root_mem_cgroup) {
1107 if (prev) 1179 if (prev)
1108 return NULL; 1180 goto out_css_put;
1109 return root; 1181 return root;
1110 } 1182 }
1111 1183
1184 rcu_read_lock();
1112 while (!memcg) { 1185 while (!memcg) {
1113 struct mem_cgroup_reclaim_iter *uninitialized_var(iter); 1186 struct mem_cgroup_reclaim_iter *uninitialized_var(iter);
1114 struct cgroup_subsys_state *css;
1115 1187
1116 if (reclaim) { 1188 if (reclaim) {
1117 int nid = zone_to_nid(reclaim->zone); 1189 int nid = zone_to_nid(reclaim->zone);
@@ -1120,31 +1192,60 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
1120 1192
1121 mz = mem_cgroup_zoneinfo(root, nid, zid); 1193 mz = mem_cgroup_zoneinfo(root, nid, zid);
1122 iter = &mz->reclaim_iter[reclaim->priority]; 1194 iter = &mz->reclaim_iter[reclaim->priority];
1123 if (prev && reclaim->generation != iter->generation) 1195 last_visited = iter->last_visited;
1124 return NULL; 1196 if (prev && reclaim->generation != iter->generation) {
1125 id = iter->position; 1197 iter->last_visited = NULL;
1198 goto out_unlock;
1199 }
1200
1201 /*
1202 * If the dead_count mismatches, a destruction
1203 * has happened or is happening concurrently.
1204 * If the dead_count matches, a destruction
1205 * might still happen concurrently, but since
1206 * we checked under RCU, that destruction
1207 * won't free the object until we release the
1208 * RCU reader lock. Thus, the dead_count
1209 * check verifies the pointer is still valid,
1210 * css_tryget() verifies the cgroup pointed to
1211 * is alive.
1212 */
1213 dead_count = atomic_read(&root->dead_count);
1214 smp_rmb();
1215 last_visited = iter->last_visited;
1216 if (last_visited) {
1217 if ((dead_count != iter->last_dead_count) ||
1218 !css_tryget(&last_visited->css)) {
1219 last_visited = NULL;
1220 }
1221 }
1126 } 1222 }
1127 1223
1128 rcu_read_lock(); 1224 memcg = __mem_cgroup_iter_next(root, last_visited);
1129 css = css_get_next(&mem_cgroup_subsys, id + 1, &root->css, &id);
1130 if (css) {
1131 if (css == &root->css || css_tryget(css))
1132 memcg = mem_cgroup_from_css(css);
1133 } else
1134 id = 0;
1135 rcu_read_unlock();
1136 1225
1137 if (reclaim) { 1226 if (reclaim) {
1138 iter->position = id; 1227 if (last_visited)
1139 if (!css) 1228 css_put(&last_visited->css);
1229
1230 iter->last_visited = memcg;
1231 smp_wmb();
1232 iter->last_dead_count = dead_count;
1233
1234 if (!memcg)
1140 iter->generation++; 1235 iter->generation++;
1141 else if (!prev && memcg) 1236 else if (!prev && memcg)
1142 reclaim->generation = iter->generation; 1237 reclaim->generation = iter->generation;
1143 } 1238 }
1144 1239
1145 if (prev && !css) 1240 if (prev && !memcg)
1146 return NULL; 1241 goto out_unlock;
1147 } 1242 }
1243out_unlock:
1244 rcu_read_unlock();
1245out_css_put:
1246 if (prev && prev != root)
1247 css_put(&prev->css);
1248
1148 return memcg; 1249 return memcg;
1149} 1250}
1150 1251
@@ -1686,11 +1787,11 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
1686 struct task_struct *chosen = NULL; 1787 struct task_struct *chosen = NULL;
1687 1788
1688 /* 1789 /*
1689 * If current has a pending SIGKILL, then automatically select it. The 1790 * If current has a pending SIGKILL or is exiting, then automatically
1690 * goal is to allow it to allocate so that it may quickly exit and free 1791 * select it. The goal is to allow it to allocate so that it may
1691 * its memory. 1792 * quickly exit and free its memory.
1692 */ 1793 */
1693 if (fatal_signal_pending(current)) { 1794 if (fatal_signal_pending(current) || current->flags & PF_EXITING) {
1694 set_thread_flag(TIF_MEMDIE); 1795 set_thread_flag(TIF_MEMDIE);
1695 return; 1796 return;
1696 } 1797 }
@@ -3114,12 +3215,12 @@ void memcg_release_cache(struct kmem_cache *s)
3114 3215
3115 root = s->memcg_params->root_cache; 3216 root = s->memcg_params->root_cache;
3116 root->memcg_params->memcg_caches[id] = NULL; 3217 root->memcg_params->memcg_caches[id] = NULL;
3117 mem_cgroup_put(memcg);
3118 3218
3119 mutex_lock(&memcg->slab_caches_mutex); 3219 mutex_lock(&memcg->slab_caches_mutex);
3120 list_del(&s->memcg_params->list); 3220 list_del(&s->memcg_params->list);
3121 mutex_unlock(&memcg->slab_caches_mutex); 3221 mutex_unlock(&memcg->slab_caches_mutex);
3122 3222
3223 mem_cgroup_put(memcg);
3123out: 3224out:
3124 kfree(s->memcg_params); 3225 kfree(s->memcg_params);
3125} 3226}
@@ -3220,52 +3321,53 @@ void mem_cgroup_destroy_cache(struct kmem_cache *cachep)
3220 schedule_work(&cachep->memcg_params->destroy); 3321 schedule_work(&cachep->memcg_params->destroy);
3221} 3322}
3222 3323
3223static char *memcg_cache_name(struct mem_cgroup *memcg, struct kmem_cache *s) 3324/*
3224{ 3325 * This lock protects updaters, not readers. We want readers to be as fast as
3225 char *name; 3326 * they can, and they will either see NULL or a valid cache value. Our model
3226 struct dentry *dentry; 3327 * allow them to see NULL, in which case the root memcg will be selected.
3227 3328 *
3228 rcu_read_lock(); 3329 * We need this lock because multiple allocations to the same cache from a non
3229 dentry = rcu_dereference(memcg->css.cgroup->dentry); 3330 * will span more than one worker. Only one of them can create the cache.
3230 rcu_read_unlock(); 3331 */
3231 3332static DEFINE_MUTEX(memcg_cache_mutex);
3232 BUG_ON(dentry == NULL);
3233
3234 name = kasprintf(GFP_KERNEL, "%s(%d:%s)", s->name,
3235 memcg_cache_id(memcg), dentry->d_name.name);
3236
3237 return name;
3238}
3239 3333
3334/*
3335 * Called with memcg_cache_mutex held
3336 */
3240static struct kmem_cache *kmem_cache_dup(struct mem_cgroup *memcg, 3337static struct kmem_cache *kmem_cache_dup(struct mem_cgroup *memcg,
3241 struct kmem_cache *s) 3338 struct kmem_cache *s)
3242{ 3339{
3243 char *name;
3244 struct kmem_cache *new; 3340 struct kmem_cache *new;
3341 static char *tmp_name = NULL;
3245 3342
3246 name = memcg_cache_name(memcg, s); 3343 lockdep_assert_held(&memcg_cache_mutex);
3247 if (!name) 3344
3248 return NULL; 3345 /*
3346 * kmem_cache_create_memcg duplicates the given name and
3347 * cgroup_name for this name requires RCU context.
3348 * This static temporary buffer is used to prevent from
3349 * pointless shortliving allocation.
3350 */
3351 if (!tmp_name) {
3352 tmp_name = kmalloc(PATH_MAX, GFP_KERNEL);
3353 if (!tmp_name)
3354 return NULL;
3355 }
3356
3357 rcu_read_lock();
3358 snprintf(tmp_name, PATH_MAX, "%s(%d:%s)", s->name,
3359 memcg_cache_id(memcg), cgroup_name(memcg->css.cgroup));
3360 rcu_read_unlock();
3249 3361
3250 new = kmem_cache_create_memcg(memcg, name, s->object_size, s->align, 3362 new = kmem_cache_create_memcg(memcg, tmp_name, s->object_size, s->align,
3251 (s->flags & ~SLAB_PANIC), s->ctor, s); 3363 (s->flags & ~SLAB_PANIC), s->ctor, s);
3252 3364
3253 if (new) 3365 if (new)
3254 new->allocflags |= __GFP_KMEMCG; 3366 new->allocflags |= __GFP_KMEMCG;
3255 3367
3256 kfree(name);
3257 return new; 3368 return new;
3258} 3369}
3259 3370
3260/*
3261 * This lock protects updaters, not readers. We want readers to be as fast as
3262 * they can, and they will either see NULL or a valid cache value. Our model
3263 * allow them to see NULL, in which case the root memcg will be selected.
3264 *
3265 * We need this lock because multiple allocations to the same cache from a non
3266 * will span more than one worker. Only one of them can create the cache.
3267 */
3268static DEFINE_MUTEX(memcg_cache_mutex);
3269static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg, 3371static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg,
3270 struct kmem_cache *cachep) 3372 struct kmem_cache *cachep)
3271{ 3373{
@@ -3382,7 +3484,6 @@ static void memcg_create_cache_work_func(struct work_struct *w)
3382 3484
3383/* 3485/*
3384 * Enqueue the creation of a per-memcg kmem_cache. 3486 * Enqueue the creation of a per-memcg kmem_cache.
3385 * Called with rcu_read_lock.
3386 */ 3487 */
3387static void __memcg_create_cache_enqueue(struct mem_cgroup *memcg, 3488static void __memcg_create_cache_enqueue(struct mem_cgroup *memcg,
3388 struct kmem_cache *cachep) 3489 struct kmem_cache *cachep)
@@ -3390,12 +3491,8 @@ static void __memcg_create_cache_enqueue(struct mem_cgroup *memcg,
3390 struct create_work *cw; 3491 struct create_work *cw;
3391 3492
3392 cw = kmalloc(sizeof(struct create_work), GFP_NOWAIT); 3493 cw = kmalloc(sizeof(struct create_work), GFP_NOWAIT);
3393 if (cw == NULL) 3494 if (cw == NULL) {
3394 return; 3495 css_put(&memcg->css);
3395
3396 /* The corresponding put will be done in the workqueue. */
3397 if (!css_tryget(&memcg->css)) {
3398 kfree(cw);
3399 return; 3496 return;
3400 } 3497 }
3401 3498
@@ -3451,10 +3548,9 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep,
3451 3548
3452 rcu_read_lock(); 3549 rcu_read_lock();
3453 memcg = mem_cgroup_from_task(rcu_dereference(current->mm->owner)); 3550 memcg = mem_cgroup_from_task(rcu_dereference(current->mm->owner));
3454 rcu_read_unlock();
3455 3551
3456 if (!memcg_can_account_kmem(memcg)) 3552 if (!memcg_can_account_kmem(memcg))
3457 return cachep; 3553 goto out;
3458 3554
3459 idx = memcg_cache_id(memcg); 3555 idx = memcg_cache_id(memcg);
3460 3556
@@ -3463,29 +3559,38 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep,
3463 * code updating memcg_caches will issue a write barrier to match this. 3559 * code updating memcg_caches will issue a write barrier to match this.
3464 */ 3560 */
3465 read_barrier_depends(); 3561 read_barrier_depends();
3466 if (unlikely(cachep->memcg_params->memcg_caches[idx] == NULL)) { 3562 if (likely(cachep->memcg_params->memcg_caches[idx])) {
3467 /* 3563 cachep = cachep->memcg_params->memcg_caches[idx];
3468 * If we are in a safe context (can wait, and not in interrupt 3564 goto out;
3469 * context), we could be be predictable and return right away.
3470 * This would guarantee that the allocation being performed
3471 * already belongs in the new cache.
3472 *
3473 * However, there are some clashes that can arrive from locking.
3474 * For instance, because we acquire the slab_mutex while doing
3475 * kmem_cache_dup, this means no further allocation could happen
3476 * with the slab_mutex held.
3477 *
3478 * Also, because cache creation issue get_online_cpus(), this
3479 * creates a lock chain: memcg_slab_mutex -> cpu_hotplug_mutex,
3480 * that ends up reversed during cpu hotplug. (cpuset allocates
3481 * a bunch of GFP_KERNEL memory during cpuup). Due to all that,
3482 * better to defer everything.
3483 */
3484 memcg_create_cache_enqueue(memcg, cachep);
3485 return cachep;
3486 } 3565 }
3487 3566
3488 return cachep->memcg_params->memcg_caches[idx]; 3567 /* The corresponding put will be done in the workqueue. */
3568 if (!css_tryget(&memcg->css))
3569 goto out;
3570 rcu_read_unlock();
3571
3572 /*
3573 * If we are in a safe context (can wait, and not in interrupt
3574 * context), we could be be predictable and return right away.
3575 * This would guarantee that the allocation being performed
3576 * already belongs in the new cache.
3577 *
3578 * However, there are some clashes that can arrive from locking.
3579 * For instance, because we acquire the slab_mutex while doing
3580 * kmem_cache_dup, this means no further allocation could happen
3581 * with the slab_mutex held.
3582 *
3583 * Also, because cache creation issue get_online_cpus(), this
3584 * creates a lock chain: memcg_slab_mutex -> cpu_hotplug_mutex,
3585 * that ends up reversed during cpu hotplug. (cpuset allocates
3586 * a bunch of GFP_KERNEL memory during cpuup). Due to all that,
3587 * better to defer everything.
3588 */
3589 memcg_create_cache_enqueue(memcg, cachep);
3590 return cachep;
3591out:
3592 rcu_read_unlock();
3593 return cachep;
3489} 3594}
3490EXPORT_SYMBOL(__memcg_kmem_get_cache); 3595EXPORT_SYMBOL(__memcg_kmem_get_cache);
3491 3596
@@ -4947,9 +5052,6 @@ static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft,
4947 type = MEMFILE_TYPE(cft->private); 5052 type = MEMFILE_TYPE(cft->private);
4948 name = MEMFILE_ATTR(cft->private); 5053 name = MEMFILE_ATTR(cft->private);
4949 5054
4950 if (!do_swap_account && type == _MEMSWAP)
4951 return -EOPNOTSUPP;
4952
4953 switch (type) { 5055 switch (type) {
4954 case _MEM: 5056 case _MEM:
4955 if (name == RES_USAGE) 5057 if (name == RES_USAGE)
@@ -5084,9 +5186,6 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
5084 type = MEMFILE_TYPE(cft->private); 5186 type = MEMFILE_TYPE(cft->private);
5085 name = MEMFILE_ATTR(cft->private); 5187 name = MEMFILE_ATTR(cft->private);
5086 5188
5087 if (!do_swap_account && type == _MEMSWAP)
5088 return -EOPNOTSUPP;
5089
5090 switch (name) { 5189 switch (name) {
5091 case RES_LIMIT: 5190 case RES_LIMIT:
5092 if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */ 5191 if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */
@@ -5163,9 +5262,6 @@ static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
5163 type = MEMFILE_TYPE(event); 5262 type = MEMFILE_TYPE(event);
5164 name = MEMFILE_ATTR(event); 5263 name = MEMFILE_ATTR(event);
5165 5264
5166 if (!do_swap_account && type == _MEMSWAP)
5167 return -EOPNOTSUPP;
5168
5169 switch (name) { 5265 switch (name) {
5170 case RES_MAX_USAGE: 5266 case RES_MAX_USAGE:
5171 if (type == _MEM) 5267 if (type == _MEM)
@@ -5744,7 +5840,7 @@ static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
5744 return ret; 5840 return ret;
5745 5841
5746 return mem_cgroup_sockets_init(memcg, ss); 5842 return mem_cgroup_sockets_init(memcg, ss);
5747}; 5843}
5748 5844
5749static void kmem_cgroup_destroy(struct mem_cgroup *memcg) 5845static void kmem_cgroup_destroy(struct mem_cgroup *memcg)
5750{ 5846{
@@ -5817,6 +5913,7 @@ static struct cftype mem_cgroup_files[] = {
5817 }, 5913 },
5818 { 5914 {
5819 .name = "use_hierarchy", 5915 .name = "use_hierarchy",
5916 .flags = CFTYPE_INSANE,
5820 .write_u64 = mem_cgroup_hierarchy_write, 5917 .write_u64 = mem_cgroup_hierarchy_write,
5821 .read_u64 = mem_cgroup_hierarchy_read, 5918 .read_u64 = mem_cgroup_hierarchy_read,
5822 }, 5919 },
@@ -5838,6 +5935,11 @@ static struct cftype mem_cgroup_files[] = {
5838 .unregister_event = mem_cgroup_oom_unregister_event, 5935 .unregister_event = mem_cgroup_oom_unregister_event,
5839 .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL), 5936 .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL),
5840 }, 5937 },
5938 {
5939 .name = "pressure_level",
5940 .register_event = vmpressure_register_event,
5941 .unregister_event = vmpressure_unregister_event,
5942 },
5841#ifdef CONFIG_NUMA 5943#ifdef CONFIG_NUMA
5842 { 5944 {
5843 .name = "numa_stat", 5945 .name = "numa_stat",
@@ -6119,6 +6221,7 @@ mem_cgroup_css_alloc(struct cgroup *cont)
6119 memcg->move_charge_at_immigrate = 0; 6221 memcg->move_charge_at_immigrate = 0;
6120 mutex_init(&memcg->thresholds_lock); 6222 mutex_init(&memcg->thresholds_lock);
6121 spin_lock_init(&memcg->move_lock); 6223 spin_lock_init(&memcg->move_lock);
6224 vmpressure_init(&memcg->vmpressure);
6122 6225
6123 return &memcg->css; 6226 return &memcg->css;
6124 6227
@@ -6184,10 +6287,29 @@ mem_cgroup_css_online(struct cgroup *cont)
6184 return error; 6287 return error;
6185} 6288}
6186 6289
6290/*
6291 * Announce all parents that a group from their hierarchy is gone.
6292 */
6293static void mem_cgroup_invalidate_reclaim_iterators(struct mem_cgroup *memcg)
6294{
6295 struct mem_cgroup *parent = memcg;
6296
6297 while ((parent = parent_mem_cgroup(parent)))
6298 atomic_inc(&parent->dead_count);
6299
6300 /*
6301 * if the root memcg is not hierarchical we have to check it
6302 * explicitely.
6303 */
6304 if (!root_mem_cgroup->use_hierarchy)
6305 atomic_inc(&root_mem_cgroup->dead_count);
6306}
6307
6187static void mem_cgroup_css_offline(struct cgroup *cont) 6308static void mem_cgroup_css_offline(struct cgroup *cont)
6188{ 6309{
6189 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 6310 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
6190 6311
6312 mem_cgroup_invalidate_reclaim_iterators(memcg);
6191 mem_cgroup_reparent_charges(memcg); 6313 mem_cgroup_reparent_charges(memcg);
6192 mem_cgroup_destroy_all_caches(memcg); 6314 mem_cgroup_destroy_all_caches(memcg);
6193} 6315}
@@ -6787,6 +6909,21 @@ static void mem_cgroup_move_task(struct cgroup *cont,
6787} 6909}
6788#endif 6910#endif
6789 6911
6912/*
6913 * Cgroup retains root cgroups across [un]mount cycles making it necessary
6914 * to verify sane_behavior flag on each mount attempt.
6915 */
6916static void mem_cgroup_bind(struct cgroup *root)
6917{
6918 /*
6919 * use_hierarchy is forced with sane_behavior. cgroup core
6920 * guarantees that @root doesn't have any children, so turning it
6921 * on for the root memcg is enough.
6922 */
6923 if (cgroup_sane_behavior(root))
6924 mem_cgroup_from_cont(root)->use_hierarchy = true;
6925}
6926
6790struct cgroup_subsys mem_cgroup_subsys = { 6927struct cgroup_subsys mem_cgroup_subsys = {
6791 .name = "memory", 6928 .name = "memory",
6792 .subsys_id = mem_cgroup_subsys_id, 6929 .subsys_id = mem_cgroup_subsys_id,
@@ -6797,6 +6934,7 @@ struct cgroup_subsys mem_cgroup_subsys = {
6797 .can_attach = mem_cgroup_can_attach, 6934 .can_attach = mem_cgroup_can_attach,
6798 .cancel_attach = mem_cgroup_cancel_attach, 6935 .cancel_attach = mem_cgroup_cancel_attach,
6799 .attach = mem_cgroup_move_task, 6936 .attach = mem_cgroup_move_task,
6937 .bind = mem_cgroup_bind,
6800 .base_cftypes = mem_cgroup_files, 6938 .base_cftypes = mem_cgroup_files,
6801 .early_init = 0, 6939 .early_init = 0,
6802 .use_id = 1, 6940 .use_id = 1,
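
The new memory.pressure_level file hooks into the existing cgroup event machinery, so userspace subscribes to vmpressure notifications with an eventfd armed through cgroup.event_control. A minimal userspace sketch, assuming a v1 memory controller mounted at /sys/fs/cgroup/memory and a hypothetical group named mygroup; the level written may be "low", "medium" or "critical":

#include <sys/eventfd.h>
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
        const char *dir = "/sys/fs/cgroup/memory/mygroup";  /* assumed mount point */
        char path[256], line[64];
        uint64_t count;
        int efd, pfd, cfd;

        efd = eventfd(0, 0);
        snprintf(path, sizeof(path), "%s/memory.pressure_level", dir);
        pfd = open(path, O_RDONLY);
        snprintf(path, sizeof(path), "%s/cgroup.event_control", dir);
        cfd = open(path, O_WRONLY);
        if (efd < 0 || pfd < 0 || cfd < 0)
                return 1;

        /* "<eventfd> <pressure_level fd> <level>" arms the notification */
        snprintf(line, sizeof(line), "%d %d medium", efd, pfd);
        if (write(cfd, line, strlen(line)) < 0)
                return 1;

        read(efd, &count, sizeof(count));  /* blocks until pressure is reported */
        printf("memory pressure event, count=%llu\n", (unsigned long long)count);
        return 0;
}
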
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index df0694c6adef..ceb0c7f1932f 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -785,10 +785,10 @@ static struct page_state {
785 { sc|dirty, sc, "clean swapcache", me_swapcache_clean }, 785 { sc|dirty, sc, "clean swapcache", me_swapcache_clean },
786 786
787 { mlock|dirty, mlock|dirty, "dirty mlocked LRU", me_pagecache_dirty }, 787 { mlock|dirty, mlock|dirty, "dirty mlocked LRU", me_pagecache_dirty },
788 { mlock, mlock, "clean mlocked LRU", me_pagecache_clean }, 788 { mlock|dirty, mlock, "clean mlocked LRU", me_pagecache_clean },
789 789
790 { unevict|dirty, unevict|dirty, "dirty unevictable LRU", me_pagecache_dirty }, 790 { unevict|dirty, unevict|dirty, "dirty unevictable LRU", me_pagecache_dirty },
791 { unevict, unevict, "clean unevictable LRU", me_pagecache_clean }, 791 { unevict|dirty, unevict, "clean unevictable LRU", me_pagecache_clean },
792 792
793 { lru|dirty, lru|dirty, "dirty LRU", me_pagecache_dirty }, 793 { lru|dirty, lru|dirty, "dirty LRU", me_pagecache_dirty },
794 { lru|dirty, lru, "clean LRU", me_pagecache_clean }, 794 { lru|dirty, lru, "clean LRU", me_pagecache_clean },
diff --git a/mm/memory.c b/mm/memory.c
index 494526ae024a..6dc1882fbd72 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -216,6 +216,7 @@ void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, bool fullmm)
216 tlb->mm = mm; 216 tlb->mm = mm;
217 217
218 tlb->fullmm = fullmm; 218 tlb->fullmm = fullmm;
219 tlb->need_flush_all = 0;
219 tlb->start = -1UL; 220 tlb->start = -1UL;
220 tlb->end = 0; 221 tlb->end = 0;
221 tlb->need_flush = 0; 222 tlb->need_flush = 0;
@@ -714,11 +715,11 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
714 * Choose text because data symbols depend on CONFIG_KALLSYMS_ALL=y 715 * Choose text because data symbols depend on CONFIG_KALLSYMS_ALL=y
715 */ 716 */
716 if (vma->vm_ops) 717 if (vma->vm_ops)
717 print_symbol(KERN_ALERT "vma->vm_ops->fault: %s\n", 718 printk(KERN_ALERT "vma->vm_ops->fault: %pSR\n",
718 (unsigned long)vma->vm_ops->fault); 719 vma->vm_ops->fault);
719 if (vma->vm_file && vma->vm_file->f_op) 720 if (vma->vm_file && vma->vm_file->f_op)
720 print_symbol(KERN_ALERT "vma->vm_file->f_op->mmap: %s\n", 721 printk(KERN_ALERT "vma->vm_file->f_op->mmap: %pSR\n",
721 (unsigned long)vma->vm_file->f_op->mmap); 722 vma->vm_file->f_op->mmap);
722 dump_stack(); 723 dump_stack();
723 add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); 724 add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
724} 725}
@@ -2392,6 +2393,53 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
2392} 2393}
2393EXPORT_SYMBOL(remap_pfn_range); 2394EXPORT_SYMBOL(remap_pfn_range);
2394 2395
2396/**
2397 * vm_iomap_memory - remap memory to userspace
2398 * @vma: user vma to map to
2399 * @start: start of area
2400 * @len: size of area
2401 *
2402 * This is a simplified io_remap_pfn_range() for common driver use. The
2403 * driver just needs to give us the physical memory range to be mapped,
2404 * we'll figure out the rest from the vma information.
2405 *
2406 * NOTE! Some drivers might want to tweak vma->vm_page_prot first to get
2407 * whatever write-combining details or similar.
2408 */
2409int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len)
2410{
2411 unsigned long vm_len, pfn, pages;
2412
2413 /* Check that the physical memory area passed in looks valid */
2414 if (start + len < start)
2415 return -EINVAL;
2416 /*
2417 * You *really* shouldn't map things that aren't page-aligned,
2418 * but we've historically allowed it because IO memory might
2419 * just have smaller alignment.
2420 */
2421 len += start & ~PAGE_MASK;
2422 pfn = start >> PAGE_SHIFT;
2423 pages = (len + ~PAGE_MASK) >> PAGE_SHIFT;
2424 if (pfn + pages < pfn)
2425 return -EINVAL;
2426
2427 /* We start the mapping 'vm_pgoff' pages into the area */
2428 if (vma->vm_pgoff > pages)
2429 return -EINVAL;
2430 pfn += vma->vm_pgoff;
2431 pages -= vma->vm_pgoff;
2432
2433 /* Can we fit all of the mapping? */
2434 vm_len = vma->vm_end - vma->vm_start;
2435 if (vm_len >> PAGE_SHIFT > pages)
2436 return -EINVAL;
2437
2438 /* Ok, let it rip */
2439 return io_remap_pfn_range(vma, vma->vm_start, pfn, vm_len, vma->vm_page_prot);
2440}
2441EXPORT_SYMBOL(vm_iomap_memory);
2442
2395static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd, 2443static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
2396 unsigned long addr, unsigned long end, 2444 unsigned long addr, unsigned long end,
2397 pte_fn_t fn, void *data) 2445 pte_fn_t fn, void *data)
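
vm_iomap_memory() exists to replace the bounds, alignment and vm_pgoff checks that drivers used to open-code around io_remap_pfn_range(). A hedged sketch of a driver mmap handler built on it; struct mydev and its fields are illustration-only assumptions, not part of this patch:

struct mydev {                          /* hypothetical device state */
        phys_addr_t phys_base;          /* start of the register window */
        unsigned long region_len;       /* its length in bytes */
};

static int mydev_mmap(struct file *file, struct vm_area_struct *vma)
{
        struct mydev *dev = file->private_data;

        /* tweak caching before mapping, as the NOTE above suggests */
        vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);

        /* alignment, overflow and vm_pgoff handling are done by the helper */
        return vm_iomap_memory(vma, dev->phys_base, dev->region_len);
}
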
@@ -3196,6 +3244,11 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
3196 page = alloc_zeroed_user_highpage_movable(vma, address); 3244 page = alloc_zeroed_user_highpage_movable(vma, address);
3197 if (!page) 3245 if (!page)
3198 goto oom; 3246 goto oom;
3247 /*
3248 * The memory barrier inside __SetPageUptodate makes sure that
3249 * preceding stores to the page contents become visible before
3250 * the set_pte_at() write.
3251 */
3199 __SetPageUptodate(page); 3252 __SetPageUptodate(page);
3200 3253
3201 if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL)) 3254 if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index b81a367b9f39..a221fac1f47d 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -436,6 +436,40 @@ static int __meminit __add_section(int nid, struct zone *zone,
436 return register_new_memory(nid, __pfn_to_section(phys_start_pfn)); 436 return register_new_memory(nid, __pfn_to_section(phys_start_pfn));
437} 437}
438 438
439/*
440 * Reasonably generic function for adding memory. It is
441 * expected that archs that support memory hotplug will
442 * call this function after deciding the zone to which to
443 * add the new pages.
444 */
445int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn,
446 unsigned long nr_pages)
447{
448 unsigned long i;
449 int err = 0;
450 int start_sec, end_sec;
451 /* during initialize mem_map, align hot-added range to section */
452 start_sec = pfn_to_section_nr(phys_start_pfn);
453 end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1);
454
455 for (i = start_sec; i <= end_sec; i++) {
456 err = __add_section(nid, zone, i << PFN_SECTION_SHIFT);
457
458 /*
459 * EEXIST is finally dealt with by ioresource collision
460 * check. see add_memory() => register_memory_resource()
461 * Warning will be printed if there is collision.
462 */
463 if (err && (err != -EEXIST))
464 break;
465 err = 0;
466 }
467
468 return err;
469}
470EXPORT_SYMBOL_GPL(__add_pages);
471
472#ifdef CONFIG_MEMORY_HOTREMOVE
439/* find the smallest valid pfn in the range [start_pfn, end_pfn) */ 473/* find the smallest valid pfn in the range [start_pfn, end_pfn) */
440static int find_smallest_section_pfn(int nid, struct zone *zone, 474static int find_smallest_section_pfn(int nid, struct zone *zone,
441 unsigned long start_pfn, 475 unsigned long start_pfn,
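
Hoisting __add_pages() above the new CONFIG_MEMORY_HOTREMOVE block keeps hot-add available even when hot-remove is compiled out; its caller remains the architecture's arch_add_memory(). A rough sketch of that calling shape for this kernel generation, with the zone choice and the page-table setup simplified to assumptions that vary per architecture:

int __ref arch_add_memory(int nid, u64 start, u64 size)
{
        struct pglist_data *pgdat = NODE_DATA(nid);
        struct zone *zone = pgdat->node_zones + ZONE_NORMAL;   /* simplified */
        unsigned long start_pfn = start >> PAGE_SHIFT;
        unsigned long nr_pages = size >> PAGE_SHIFT;

        /* arch-specific: create kernel mappings for the new range first */

        return __add_pages(nid, zone, start_pfn, nr_pages);
}
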
@@ -658,39 +692,6 @@ static int __remove_section(struct zone *zone, struct mem_section *ms)
658 return 0; 692 return 0;
659} 693}
660 694
661/*
662 * Reasonably generic function for adding memory. It is
663 * expected that archs that support memory hotplug will
664 * call this function after deciding the zone to which to
665 * add the new pages.
666 */
667int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn,
668 unsigned long nr_pages)
669{
670 unsigned long i;
671 int err = 0;
672 int start_sec, end_sec;
673 /* during initialize mem_map, align hot-added range to section */
674 start_sec = pfn_to_section_nr(phys_start_pfn);
675 end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1);
676
677 for (i = start_sec; i <= end_sec; i++) {
678 err = __add_section(nid, zone, i << PFN_SECTION_SHIFT);
679
680 /*
681 * EEXIST is finally dealt with by ioresource collision
682 * check. see add_memory() => register_memory_resource()
683 * Warning will be printed if there is collision.
684 */
685 if (err && (err != -EEXIST))
686 break;
687 err = 0;
688 }
689
690 return err;
691}
692EXPORT_SYMBOL_GPL(__add_pages);
693
694/** 695/**
695 * __remove_pages() - remove sections of pages from a zone 696 * __remove_pages() - remove sections of pages from a zone
696 * @zone: zone from which pages need to be removed 697 * @zone: zone from which pages need to be removed
@@ -705,8 +706,10 @@ EXPORT_SYMBOL_GPL(__add_pages);
705int __remove_pages(struct zone *zone, unsigned long phys_start_pfn, 706int __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
706 unsigned long nr_pages) 707 unsigned long nr_pages)
707{ 708{
708 unsigned long i, ret = 0; 709 unsigned long i;
709 int sections_to_remove; 710 int sections_to_remove;
711 resource_size_t start, size;
712 int ret = 0;
710 713
711 /* 714 /*
712 * We can only remove entire sections 715 * We can only remove entire sections
@@ -714,7 +717,12 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
714 BUG_ON(phys_start_pfn & ~PAGE_SECTION_MASK); 717 BUG_ON(phys_start_pfn & ~PAGE_SECTION_MASK);
715 BUG_ON(nr_pages % PAGES_PER_SECTION); 718 BUG_ON(nr_pages % PAGES_PER_SECTION);
716 719
717 release_mem_region(phys_start_pfn << PAGE_SHIFT, nr_pages * PAGE_SIZE); 720 start = phys_start_pfn << PAGE_SHIFT;
721 size = nr_pages * PAGE_SIZE;
722 ret = release_mem_region_adjustable(&iomem_resource, start, size);
723 if (ret)
724 pr_warn("Unable to release resource <%016llx-%016llx> (%d)\n",
725 start, start + size - 1, ret);
718 726
719 sections_to_remove = nr_pages / PAGES_PER_SECTION; 727 sections_to_remove = nr_pages / PAGES_PER_SECTION;
720 for (i = 0; i < sections_to_remove; i++) { 728 for (i = 0; i < sections_to_remove; i++) {
@@ -726,6 +734,7 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
726 return ret; 734 return ret;
727} 735}
728EXPORT_SYMBOL_GPL(__remove_pages); 736EXPORT_SYMBOL_GPL(__remove_pages);
737#endif /* CONFIG_MEMORY_HOTREMOVE */
729 738
730int set_online_page_callback(online_page_callback_t callback) 739int set_online_page_callback(online_page_callback_t callback)
731{ 740{
@@ -1613,7 +1622,7 @@ int offline_pages(unsigned long start_pfn, unsigned long nr_pages)
1613/** 1622/**
1614 * walk_memory_range - walks through all mem sections in [start_pfn, end_pfn) 1623 * walk_memory_range - walks through all mem sections in [start_pfn, end_pfn)
1615 * @start_pfn: start pfn of the memory range 1624 * @start_pfn: start pfn of the memory range
1616 * @end_pfn: end pft of the memory range 1625 * @end_pfn: end pfn of the memory range
1617 * @arg: argument passed to func 1626 * @arg: argument passed to func
1618 * @func: callback for each memory section walked 1627 * @func: callback for each memory section walked
1619 * 1628 *
@@ -1681,11 +1690,15 @@ static int is_memblock_offlined_cb(struct memory_block *mem, void *arg)
1681{ 1690{
1682 int ret = !is_memblock_offlined(mem); 1691 int ret = !is_memblock_offlined(mem);
1683 1692
1684 if (unlikely(ret)) 1693 if (unlikely(ret)) {
1694 phys_addr_t beginpa, endpa;
1695
1696 beginpa = PFN_PHYS(section_nr_to_pfn(mem->start_section_nr));
1697 endpa = PFN_PHYS(section_nr_to_pfn(mem->end_section_nr + 1))-1;
1685 pr_warn("removing memory fails, because memory " 1698 pr_warn("removing memory fails, because memory "
1686 "[%#010llx-%#010llx] is onlined\n", 1699 "[%pa-%pa] is onlined\n",
1687 PFN_PHYS(section_nr_to_pfn(mem->start_section_nr)), 1700 &beginpa, &endpa);
1688 PFN_PHYS(section_nr_to_pfn(mem->end_section_nr + 1))-1); 1701 }
1689 1702
1690 return ret; 1703 return ret;
1691} 1704}
@@ -1779,7 +1792,11 @@ void try_offline_node(int nid)
1779 for (i = 0; i < MAX_NR_ZONES; i++) { 1792 for (i = 0; i < MAX_NR_ZONES; i++) {
1780 struct zone *zone = pgdat->node_zones + i; 1793 struct zone *zone = pgdat->node_zones + i;
1781 1794
1782 if (zone->wait_table) 1795 /*
1796 * wait_table may be allocated from boot memory,
1797 * here only free if it's allocated by vmalloc.
1798 */
1799 if (is_vmalloc_addr(zone->wait_table))
1783 vfree(zone->wait_table); 1800 vfree(zone->wait_table);
1784 } 1801 }
1785 1802
@@ -1801,7 +1818,7 @@ int __ref remove_memory(int nid, u64 start, u64 size)
1801 int retry = 1; 1818 int retry = 1;
1802 1819
1803 start_pfn = PFN_DOWN(start); 1820 start_pfn = PFN_DOWN(start);
1804 end_pfn = start_pfn + PFN_DOWN(size); 1821 end_pfn = PFN_UP(start + size - 1);
1805 1822
1806 /* 1823 /*
1807 * When CONFIG_MEMCG is on, one memory block may be used by other 1824 * When CONFIG_MEMCG is on, one memory block may be used by other
diff --git a/mm/migrate.c b/mm/migrate.c
index 3bbaf5d230b0..27ed22579fd9 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -736,7 +736,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
736 736
737 if (PageWriteback(page)) { 737 if (PageWriteback(page)) {
738 /* 738 /*
739 * Only in the case of a full syncronous migration is it 739 * Only in the case of a full synchronous migration is it
740 * necessary to wait for PageWriteback. In the async case, 740 * necessary to wait for PageWriteback. In the async case,
741 * the retry loop is too short and in the sync-light case, 741 * the retry loop is too short and in the sync-light case,
742 * the overhead of stalling is too much 742 * the overhead of stalling is too much
@@ -973,19 +973,23 @@ out:
973} 973}
974 974
975/* 975/*
976 * migrate_pages 976 * migrate_pages - migrate the pages specified in a list, to the free pages
977 * supplied as the target for the page migration
977 * 978 *
978 * The function takes one list of pages to migrate and a function 979 * @from: The list of pages to be migrated.
979 * that determines from the page to be migrated and the private data 980 * @get_new_page: The function used to allocate free pages to be used
980 * the target of the move and allocates the page. 981 * as the target of the page migration.
982 * @private: Private data to be passed on to get_new_page()
983 * @mode: The migration mode that specifies the constraints for
984 * page migration, if any.
985 * @reason: The reason for page migration.
981 * 986 *
982 * The function returns after 10 attempts or if no pages 987 * The function returns after 10 attempts or if no pages are movable any more
983 * are movable anymore because to has become empty 988 * because the list has become empty or no retryable pages exist any more.
984 * or no retryable pages exist anymore. 989 * The caller should call putback_lru_pages() to return pages to the LRU
985 * Caller should call putback_lru_pages to return pages to the LRU
986 * or free list only if ret != 0. 990 * or free list only if ret != 0.
987 * 991 *
988 * Return: Number of pages not migrated or error code. 992 * Returns the number of pages that were not migrated, or an error code.
989 */ 993 */
990int migrate_pages(struct list_head *from, new_page_t get_new_page, 994int migrate_pages(struct list_head *from, new_page_t get_new_page,
991 unsigned long private, enum migrate_mode mode, int reason) 995 unsigned long private, enum migrate_mode mode, int reason)
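
The rewritten kernel-doc makes the callback-driven contract explicit: the caller supplies the page list plus an allocator for target pages and must put back anything that did not move. A minimal sketch of such a caller, assuming the new_page_t signature of this era; alloc_target_page() and move_list_to_node() are hypothetical names:

/* Allocate a target page on the requested node (illustrative only). */
static struct page *alloc_target_page(struct page *page, unsigned long node,
                                      int **result)
{
        return alloc_pages_exact_node((int)node,
                                      GFP_HIGHUSER_MOVABLE | __GFP_NORETRY, 0);
}

static int move_list_to_node(struct list_head *pagelist, int target_node)
{
        int ret;

        ret = migrate_pages(pagelist, alloc_target_page,
                            (unsigned long)target_node,
                            MIGRATE_SYNC, MR_SYSCALL);
        if (ret)
                /* not everything moved: hand the leftovers back to the LRU */
                putback_lru_pages(pagelist);
        return ret;
}
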
diff --git a/mm/mlock.c b/mm/mlock.c
index 1c5e33fce639..79b7cf7d1bca 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -358,7 +358,7 @@ static int do_mlock(unsigned long start, size_t len, int on)
358 358
359 newflags = vma->vm_flags & ~VM_LOCKED; 359 newflags = vma->vm_flags & ~VM_LOCKED;
360 if (on) 360 if (on)
361 newflags |= VM_LOCKED | VM_POPULATE; 361 newflags |= VM_LOCKED;
362 362
363 tmp = vma->vm_end; 363 tmp = vma->vm_end;
364 if (tmp > end) 364 if (tmp > end)
@@ -418,8 +418,7 @@ int __mm_populate(unsigned long start, unsigned long len, int ignore_errors)
418 * range with the first VMA. Also, skip undesirable VMA types. 418 * range with the first VMA. Also, skip undesirable VMA types.
419 */ 419 */
420 nend = min(end, vma->vm_end); 420 nend = min(end, vma->vm_end);
421 if ((vma->vm_flags & (VM_IO | VM_PFNMAP | VM_POPULATE)) != 421 if (vma->vm_flags & (VM_IO | VM_PFNMAP))
422 VM_POPULATE)
423 continue; 422 continue;
424 if (nstart < vma->vm_start) 423 if (nstart < vma->vm_start)
425 nstart = vma->vm_start; 424 nstart = vma->vm_start;
@@ -492,9 +491,9 @@ static int do_mlockall(int flags)
492 struct vm_area_struct * vma, * prev = NULL; 491 struct vm_area_struct * vma, * prev = NULL;
493 492
494 if (flags & MCL_FUTURE) 493 if (flags & MCL_FUTURE)
495 current->mm->def_flags |= VM_LOCKED | VM_POPULATE; 494 current->mm->def_flags |= VM_LOCKED;
496 else 495 else
497 current->mm->def_flags &= ~(VM_LOCKED | VM_POPULATE); 496 current->mm->def_flags &= ~VM_LOCKED;
498 if (flags == MCL_FUTURE) 497 if (flags == MCL_FUTURE)
499 goto out; 498 goto out;
500 499
@@ -503,7 +502,7 @@ static int do_mlockall(int flags)
503 502
504 newflags = vma->vm_flags & ~VM_LOCKED; 503 newflags = vma->vm_flags & ~VM_LOCKED;
505 if (flags & MCL_CURRENT) 504 if (flags & MCL_CURRENT)
506 newflags |= VM_LOCKED | VM_POPULATE; 505 newflags |= VM_LOCKED;
507 506
508 /* Ignore errors */ 507 /* Ignore errors */
509 mlock_fixup(vma, &prev, vma->vm_start, vma->vm_end, newflags); 508 mlock_fixup(vma, &prev, vma->vm_start, vma->vm_end, newflags);
diff --git a/mm/mmap.c b/mm/mmap.c
index 2664a47cec93..da3e9c04bf37 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -6,6 +6,7 @@
6 * Address space accounting code <alan@lxorguk.ukuu.org.uk> 6 * Address space accounting code <alan@lxorguk.ukuu.org.uk>
7 */ 7 */
8 8
9#include <linux/kernel.h>
9#include <linux/slab.h> 10#include <linux/slab.h>
10#include <linux/backing-dev.h> 11#include <linux/backing-dev.h>
11#include <linux/mm.h> 12#include <linux/mm.h>
@@ -33,6 +34,8 @@
33#include <linux/uprobes.h> 34#include <linux/uprobes.h>
34#include <linux/rbtree_augmented.h> 35#include <linux/rbtree_augmented.h>
35#include <linux/sched/sysctl.h> 36#include <linux/sched/sysctl.h>
37#include <linux/notifier.h>
38#include <linux/memory.h>
36 39
37#include <asm/uaccess.h> 40#include <asm/uaccess.h>
38#include <asm/cacheflush.h> 41#include <asm/cacheflush.h>
@@ -84,6 +87,8 @@ EXPORT_SYMBOL(vm_get_page_prot);
84int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS; /* heuristic overcommit */ 87int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS; /* heuristic overcommit */
85int sysctl_overcommit_ratio __read_mostly = 50; /* default is 50% */ 88int sysctl_overcommit_ratio __read_mostly = 50; /* default is 50% */
86int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; 89int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT;
90unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */
91unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */
87/* 92/*
88 * Make sure vm_committed_as in one cacheline and not cacheline shared with 93 * Make sure vm_committed_as in one cacheline and not cacheline shared with
89 * other variables. It can be updated by several CPUs frequently. 94 * other variables. It can be updated by several CPUs frequently.
@@ -122,7 +127,7 @@ EXPORT_SYMBOL_GPL(vm_memory_committed);
122 */ 127 */
123int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) 128int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
124{ 129{
125 unsigned long free, allowed; 130 unsigned long free, allowed, reserve;
126 131
127 vm_acct_memory(pages); 132 vm_acct_memory(pages);
128 133
@@ -163,10 +168,10 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
163 free -= totalreserve_pages; 168 free -= totalreserve_pages;
164 169
165 /* 170 /*
166 * Leave the last 3% for root 171 * Reserve some for root
167 */ 172 */
168 if (!cap_sys_admin) 173 if (!cap_sys_admin)
169 free -= free / 32; 174 free -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10);
170 175
171 if (free > pages) 176 if (free > pages)
172 return 0; 177 return 0;
@@ -177,16 +182,19 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
177 allowed = (totalram_pages - hugetlb_total_pages()) 182 allowed = (totalram_pages - hugetlb_total_pages())
178 * sysctl_overcommit_ratio / 100; 183 * sysctl_overcommit_ratio / 100;
179 /* 184 /*
180 * Leave the last 3% for root 185 * Reserve some for root
181 */ 186 */
182 if (!cap_sys_admin) 187 if (!cap_sys_admin)
183 allowed -= allowed / 32; 188 allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10);
184 allowed += total_swap_pages; 189 allowed += total_swap_pages;
185 190
186 /* Don't let a single process grow too big: 191 /*
187 leave 3% of the size of this process for other processes */ 192 * Don't let a single process grow so big a user can't recover
188 if (mm) 193 */
189 allowed -= mm->total_vm / 32; 194 if (mm) {
195 reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10);
196 allowed -= min(mm->total_vm / 32, reserve);
197 }
190 198
191 if (percpu_counter_read_positive(&vm_committed_as) < allowed) 199 if (percpu_counter_read_positive(&vm_committed_as) < allowed)
192 return 0; 200 return 0;
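
The shift converts the sysctls from kilobytes to pages: with 4 KB pages, PAGE_SHIFT - 10 is 2, so the default admin reserve of 1UL << 13 = 8192 kB becomes 2048 pages (8 MB) and the default user reserve of 1UL << 17 = 131072 kB becomes 32768 pages (128 MB). A process is then charged the smaller of mm->total_vm / 32 (roughly 3% of its own size) and that 128 MB cap, so a single huge process no longer inflates the reserve without bound.
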
@@ -543,6 +551,34 @@ static int find_vma_links(struct mm_struct *mm, unsigned long addr,
543 return 0; 551 return 0;
544} 552}
545 553
554static unsigned long count_vma_pages_range(struct mm_struct *mm,
555 unsigned long addr, unsigned long end)
556{
557 unsigned long nr_pages = 0;
558 struct vm_area_struct *vma;
559
560 /* Find first overlapping mapping */
561 vma = find_vma_intersection(mm, addr, end);
562 if (!vma)
563 return 0;
564
565 nr_pages = (min(end, vma->vm_end) -
566 max(addr, vma->vm_start)) >> PAGE_SHIFT;
567
568 /* Iterate over the rest of the overlaps */
569 for (vma = vma->vm_next; vma; vma = vma->vm_next) {
570 unsigned long overlap_len;
571
572 if (vma->vm_start > end)
573 break;
574
575 overlap_len = min(end, vma->vm_end) - vma->vm_start;
576 nr_pages += overlap_len >> PAGE_SHIFT;
577 }
578
579 return nr_pages;
580}
581
546void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma, 582void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma,
547 struct rb_node **rb_link, struct rb_node *rb_parent) 583 struct rb_node **rb_link, struct rb_node *rb_parent)
548{ 584{
@@ -829,7 +865,7 @@ again: remove_next = 1 + (end > next->vm_end);
829 if (next->anon_vma) 865 if (next->anon_vma)
830 anon_vma_merge(vma, next); 866 anon_vma_merge(vma, next);
831 mm->map_count--; 867 mm->map_count--;
832 mpol_put(vma_policy(next)); 868 vma_set_policy(vma, vma_policy(next));
833 kmem_cache_free(vm_area_cachep, next); 869 kmem_cache_free(vm_area_cachep, next);
834 /* 870 /*
835 * In mprotect's case 6 (see comments on vma_merge), 871 * In mprotect's case 6 (see comments on vma_merge),
@@ -1306,7 +1342,9 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
1306 } 1342 }
1307 1343
1308 addr = mmap_region(file, addr, len, vm_flags, pgoff); 1344 addr = mmap_region(file, addr, len, vm_flags, pgoff);
1309 if (!IS_ERR_VALUE(addr) && (vm_flags & VM_POPULATE)) 1345 if (!IS_ERR_VALUE(addr) &&
1346 ((vm_flags & VM_LOCKED) ||
1347 (flags & (MAP_POPULATE | MAP_NONBLOCK)) == MAP_POPULATE))
1310 *populate = len; 1348 *populate = len;
1311 return addr; 1349 return addr;
1312} 1350}
@@ -1433,6 +1471,23 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
1433 unsigned long charged = 0; 1471 unsigned long charged = 0;
1434 struct inode *inode = file ? file_inode(file) : NULL; 1472 struct inode *inode = file ? file_inode(file) : NULL;
1435 1473
1474 /* Check against address space limit. */
1475 if (!may_expand_vm(mm, len >> PAGE_SHIFT)) {
1476 unsigned long nr_pages;
1477
1478 /*
1479 * MAP_FIXED may remove pages of mappings that intersects with
1480 * requested mapping. Account for the pages it would unmap.
1481 */
1482 if (!(vm_flags & MAP_FIXED))
1483 return -ENOMEM;
1484
1485 nr_pages = count_vma_pages_range(mm, addr, addr + len);
1486
1487 if (!may_expand_vm(mm, (len >> PAGE_SHIFT) - nr_pages))
1488 return -ENOMEM;
1489 }
1490
1436 /* Clear old maps */ 1491 /* Clear old maps */
1437 error = -ENOMEM; 1492 error = -ENOMEM;
1438munmap_back: 1493munmap_back:
@@ -1442,10 +1497,6 @@ munmap_back:
1442 goto munmap_back; 1497 goto munmap_back;
1443 } 1498 }
1444 1499
1445 /* Check against address space limit. */
1446 if (!may_expand_vm(mm, len >> PAGE_SHIFT))
1447 return -ENOMEM;
1448
1449 /* 1500 /*
1450 * Private writable mapping: check memory availability 1501 * Private writable mapping: check memory availability
1451 */ 1502 */
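
Hoisting the limit check ahead of the "Clear old maps" step means a request that would exceed the address-space limit now fails before any overlapped mappings are torn down, rather than unmapping them first and only then returning -ENOMEM. Subtracting count_vma_pages_range() keeps the accounting net of what the fixed mapping replaces: for example, a 1 MB MAP_FIXED mapping placed exactly over an existing 1 MB mapping adds no pages and still passes at the limit.
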
@@ -1933,12 +1984,9 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
1933{ 1984{
1934 struct vm_area_struct *vma = NULL; 1985 struct vm_area_struct *vma = NULL;
1935 1986
1936 if (WARN_ON_ONCE(!mm)) /* Remove this in linux-3.6 */
1937 return NULL;
1938
1939 /* Check the cache first. */ 1987 /* Check the cache first. */
1940 /* (Cache hit rate is typically around 35%.) */ 1988 /* (Cache hit rate is typically around 35%.) */
1941 vma = mm->mmap_cache; 1989 vma = ACCESS_ONCE(mm->mmap_cache);
1942 if (!(vma && vma->vm_end > addr && vma->vm_start <= addr)) { 1990 if (!(vma && vma->vm_end > addr && vma->vm_start <= addr)) {
1943 struct rb_node *rb_node; 1991 struct rb_node *rb_node;
1944 1992
@@ -2303,7 +2351,7 @@ static void unmap_region(struct mm_struct *mm,
2303 update_hiwater_rss(mm); 2351 update_hiwater_rss(mm);
2304 unmap_vmas(&tlb, vma, start, end); 2352 unmap_vmas(&tlb, vma, start, end);
2305 free_pgtables(&tlb, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS, 2353 free_pgtables(&tlb, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS,
2306 next ? next->vm_start : 0); 2354 next ? next->vm_start : USER_PGTABLES_CEILING);
2307 tlb_finish_mmu(&tlb, start, end); 2355 tlb_finish_mmu(&tlb, start, end);
2308} 2356}
2309 2357
@@ -2683,7 +2731,7 @@ void exit_mmap(struct mm_struct *mm)
2683 /* Use -1 here to ensure all VMAs in the mm are unmapped */ 2731 /* Use -1 here to ensure all VMAs in the mm are unmapped */
2684 unmap_vmas(&tlb, vma, 0, -1); 2732 unmap_vmas(&tlb, vma, 0, -1);
2685 2733
2686 free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, 0); 2734 free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, USER_PGTABLES_CEILING);
2687 tlb_finish_mmu(&tlb, 0, -1); 2735 tlb_finish_mmu(&tlb, 0, -1);
2688 2736
2689 /* 2737 /*
@@ -3095,3 +3143,115 @@ void __init mmap_init(void)
3095 ret = percpu_counter_init(&vm_committed_as, 0); 3143 ret = percpu_counter_init(&vm_committed_as, 0);
3096 VM_BUG_ON(ret); 3144 VM_BUG_ON(ret);
3097} 3145}
3146
3147/*
3148 * Initialise sysctl_user_reserve_kbytes.
3149 *
3150 * This is intended to prevent a user from starting a single memory hogging
3151 * process, such that they cannot recover (kill the hog) in OVERCOMMIT_NEVER
3152 * mode.
3153 *
3154 * The default value is min(3% of free memory, 128MB)
3155 * 128MB is enough to recover with sshd/login, bash, and top/kill.
3156 */
3157static int init_user_reserve(void)
3158{
3159 unsigned long free_kbytes;
3160
3161 free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10);
3162
3163 sysctl_user_reserve_kbytes = min(free_kbytes / 32, 1UL << 17);
3164 return 0;
3165}
3166module_init(init_user_reserve)
3167
3168/*
3169 * Initialise sysctl_admin_reserve_kbytes.
3170 *
3171 * The purpose of sysctl_admin_reserve_kbytes is to allow the sys admin
3172 * to log in and kill a memory hogging process.
3173 *
3174 * Systems with more than 256MB will reserve 8MB, enough to recover
3175 * with sshd, bash, and top in OVERCOMMIT_GUESS. Smaller systems will
3176 * only reserve 3% of free pages by default.
3177 */
3178static int init_admin_reserve(void)
3179{
3180 unsigned long free_kbytes;
3181
3182 free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10);
3183
3184 sysctl_admin_reserve_kbytes = min(free_kbytes / 32, 1UL << 13);
3185 return 0;
3186}
3187module_init(init_admin_reserve)
3188
3189/*
3190 * Reinitialise user and admin reserves if memory is added or removed.
3191 *
3192 * The default user reserve max is 128MB, and the default max for the
3193 * admin reserve is 8MB. These are usually, but not always, enough to
3194 * enable recovery from a memory hogging process using login/sshd, a shell,
3195 * and tools like top. It may make sense to increase or even disable the
3196 * reserve depending on the existence of swap or variations in the recovery
3197 * tools. So, the admin may have changed them.
3198 *
3199 * If memory is added and the reserves have been eliminated or increased above
3200 * the default max, then we'll trust the admin.
3201 *
3202 * If memory is removed and there isn't enough free memory, then we
3203 * need to reset the reserves.
3204 *
3205 * Otherwise keep the reserve set by the admin.
3206 */
3207static int reserve_mem_notifier(struct notifier_block *nb,
3208 unsigned long action, void *data)
3209{
3210 unsigned long tmp, free_kbytes;
3211
3212 switch (action) {
3213 case MEM_ONLINE:
3214 /* Default max is 128MB. Leave alone if modified by operator. */
3215 tmp = sysctl_user_reserve_kbytes;
3216 if (0 < tmp && tmp < (1UL << 17))
3217 init_user_reserve();
3218
3219 /* Default max is 8MB. Leave alone if modified by operator. */
3220 tmp = sysctl_admin_reserve_kbytes;
3221 if (0 < tmp && tmp < (1UL << 13))
3222 init_admin_reserve();
3223
3224 break;
3225 case MEM_OFFLINE:
3226 free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10);
3227
3228 if (sysctl_user_reserve_kbytes > free_kbytes) {
3229 init_user_reserve();
3230 pr_info("vm.user_reserve_kbytes reset to %lu\n",
3231 sysctl_user_reserve_kbytes);
3232 }
3233
3234 if (sysctl_admin_reserve_kbytes > free_kbytes) {
3235 init_admin_reserve();
3236 pr_info("vm.admin_reserve_kbytes reset to %lu\n",
3237 sysctl_admin_reserve_kbytes);
3238 }
3239 break;
3240 default:
3241 break;
3242 }
3243 return NOTIFY_OK;
3244}
3245
3246static struct notifier_block reserve_mem_nb = {
3247 .notifier_call = reserve_mem_notifier,
3248};
3249
3250static int __meminit init_reserve_notifier(void)
3251{
3252 if (register_hotmemory_notifier(&reserve_mem_nb))
3253 printk("Failed registering memory add/remove notifier for admin reserve");
3254
3255 return 0;
3256}
3257module_init(init_reserve_notifier)
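
Working through the defaults: init_user_reserve() and init_admin_reserve() take min(free/32, cap) at boot, so a machine with roughly 2 GB free gets a 64 MB user reserve and the full 8 MB admin reserve, while anything above about 4 GB free hits the 128 MB user cap (and above 256 MB free, the 8 MB admin cap). The notifier then recomputes the values on hot-add only while they are still nonzero and below their default caps, that is, not disabled and not raised by the operator, and on hot-remove only when the current reserve no longer fits in free memory, matching the policy spelled out in the comment block above.
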
diff --git a/mm/nobootmem.c b/mm/nobootmem.c
index 5e07d36e381e..bdd3fa2fc73b 100644
--- a/mm/nobootmem.c
+++ b/mm/nobootmem.c
@@ -45,9 +45,9 @@ static void * __init __alloc_memory_core_early(int nid, u64 size, u64 align,
45 if (!addr) 45 if (!addr)
46 return NULL; 46 return NULL;
47 47
48 memblock_reserve(addr, size);
48 ptr = phys_to_virt(addr); 49 ptr = phys_to_virt(addr);
49 memset(ptr, 0, size); 50 memset(ptr, 0, size);
50 memblock_reserve(addr, size);
51 /* 51 /*
52 * The min_count is set to 0 so that bootmem allocated blocks 52 * The min_count is set to 0 so that bootmem allocated blocks
53 * are never reported as leaks. 53 * are never reported as leaks.
@@ -120,7 +120,7 @@ static unsigned long __init __free_memory_core(phys_addr_t start,
120 return end_pfn - start_pfn; 120 return end_pfn - start_pfn;
121} 121}
122 122
123unsigned long __init free_low_memory_core_early(int nodeid) 123static unsigned long __init free_low_memory_core_early(void)
124{ 124{
125 unsigned long count = 0; 125 unsigned long count = 0;
126 phys_addr_t start, end, size; 126 phys_addr_t start, end, size;
@@ -170,7 +170,7 @@ unsigned long __init free_all_bootmem(void)
170 * because in some case like Node0 doesn't have RAM installed 170 * because in some case like Node0 doesn't have RAM installed
171 * low ram will be on Node1 171 * low ram will be on Node1
172 */ 172 */
173 return free_low_memory_core_early(MAX_NUMNODES); 173 return free_low_memory_core_early();
174} 174}
175 175
176/** 176/**
diff --git a/mm/nommu.c b/mm/nommu.c
index e19328087534..fbe3e2f317eb 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -63,6 +63,8 @@ int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */
63int sysctl_overcommit_ratio = 50; /* default is 50% */ 63int sysctl_overcommit_ratio = 50; /* default is 50% */
64int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT; 64int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT;
65int sysctl_nr_trim_pages = CONFIG_NOMMU_INITIAL_TRIM_EXCESS; 65int sysctl_nr_trim_pages = CONFIG_NOMMU_INITIAL_TRIM_EXCESS;
66unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */
67unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */
66int heap_stack_gap = 0; 68int heap_stack_gap = 0;
67 69
68atomic_long_t mmap_pages_allocated; 70atomic_long_t mmap_pages_allocated;
@@ -228,8 +230,7 @@ int follow_pfn(struct vm_area_struct *vma, unsigned long address,
228} 230}
229EXPORT_SYMBOL(follow_pfn); 231EXPORT_SYMBOL(follow_pfn);
230 232
231DEFINE_RWLOCK(vmlist_lock); 233LIST_HEAD(vmap_area_list);
232struct vm_struct *vmlist;
233 234
234void vfree(const void *addr) 235void vfree(const void *addr)
235{ 236{
@@ -821,7 +822,7 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
821 struct vm_area_struct *vma; 822 struct vm_area_struct *vma;
822 823
823 /* check the cache first */ 824 /* check the cache first */
824 vma = mm->mmap_cache; 825 vma = ACCESS_ONCE(mm->mmap_cache);
825 if (vma && vma->vm_start <= addr && vma->vm_end > addr) 826 if (vma && vma->vm_start <= addr && vma->vm_end > addr)
826 return vma; 827 return vma;
827 828
@@ -1838,6 +1839,16 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
1838} 1839}
1839EXPORT_SYMBOL(remap_pfn_range); 1840EXPORT_SYMBOL(remap_pfn_range);
1840 1841
1842int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len)
1843{
1844 unsigned long pfn = start >> PAGE_SHIFT;
1845 unsigned long vm_len = vma->vm_end - vma->vm_start;
1846
1847 pfn += vma->vm_pgoff;
1848 return io_remap_pfn_range(vma, vma->vm_start, pfn, vm_len, vma->vm_page_prot);
1849}
1850EXPORT_SYMBOL(vm_iomap_memory);
1851
1841int remap_vmalloc_range(struct vm_area_struct *vma, void *addr, 1852int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
1842 unsigned long pgoff) 1853 unsigned long pgoff)
1843{ 1854{
@@ -1888,7 +1899,7 @@ EXPORT_SYMBOL(unmap_mapping_range);
1888 */ 1899 */
1889int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) 1900int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
1890{ 1901{
1891 unsigned long free, allowed; 1902 unsigned long free, allowed, reserve;
1892 1903
1893 vm_acct_memory(pages); 1904 vm_acct_memory(pages);
1894 1905
@@ -1929,10 +1940,10 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
1929 free -= totalreserve_pages; 1940 free -= totalreserve_pages;
1930 1941
1931 /* 1942 /*
1932 * Leave the last 3% for root 1943 * Reserve some for root
1933 */ 1944 */
1934 if (!cap_sys_admin) 1945 if (!cap_sys_admin)
1935 free -= free / 32; 1946 free -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10);
1936 1947
1937 if (free > pages) 1948 if (free > pages)
1938 return 0; 1949 return 0;
@@ -1942,16 +1953,19 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
1942 1953
1943 allowed = totalram_pages * sysctl_overcommit_ratio / 100; 1954 allowed = totalram_pages * sysctl_overcommit_ratio / 100;
1944 /* 1955 /*
1945 * Leave the last 3% for root 1956 * Reserve some for root
1946 */ 1957 */
1947 if (!cap_sys_admin) 1958 if (!cap_sys_admin)
1948 allowed -= allowed / 32; 1959 allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10);
1949 allowed += total_swap_pages; 1960 allowed += total_swap_pages;
1950 1961
1951 /* Don't let a single process grow too big: 1962 /*
1952 leave 3% of the size of this process for other processes */ 1963 * Don't let a single process grow so big a user can't recover
1953 if (mm) 1964 */
1954 allowed -= mm->total_vm / 32; 1965 if (mm) {
1966 reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10);
1967 allowed -= min(mm->total_vm / 32, reserve);
1968 }
1955 1969
1956 if (percpu_counter_read_positive(&vm_committed_as) < allowed) 1970 if (percpu_counter_read_positive(&vm_committed_as) < allowed)
1957 return 0; 1971 return 0;
@@ -2113,3 +2127,45 @@ int nommu_shrink_inode_mappings(struct inode *inode, size_t size,
2113 up_write(&nommu_region_sem); 2127 up_write(&nommu_region_sem);
2114 return 0; 2128 return 0;
2115} 2129}
2130
2131/*
2132 * Initialise sysctl_user_reserve_kbytes.
2133 *
2134 * This is intended to prevent a user from starting a single memory hogging
2135 * process, such that they cannot recover (kill the hog) in OVERCOMMIT_NEVER
2136 * mode.
2137 *
2138 * The default value is min(3% of free memory, 128MB)
2139 * 128MB is enough to recover with sshd/login, bash, and top/kill.
2140 */
2141static int __meminit init_user_reserve(void)
2142{
2143 unsigned long free_kbytes;
2144
2145 free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10);
2146
2147 sysctl_user_reserve_kbytes = min(free_kbytes / 32, 1UL << 17);
2148 return 0;
2149}
2150module_init(init_user_reserve)
2151
2152/*
2153 * Initialise sysctl_admin_reserve_kbytes.
2154 *
2155 * The purpose of sysctl_admin_reserve_kbytes is to allow the sys admin
2156 * to log in and kill a memory hogging process.
2157 *
2158 * Systems with more than 256MB will reserve 8MB, enough to recover
2159 * with sshd, bash, and top in OVERCOMMIT_GUESS. Smaller systems will
2160 * only reserve 3% of free pages by default.
2161 */
2162static int __meminit init_admin_reserve(void)
2163{
2164 unsigned long free_kbytes;
2165
2166 free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10);
2167
2168 sysctl_admin_reserve_kbytes = min(free_kbytes / 32, 1UL << 13);
2169 return 0;
2170}
2171module_init(init_admin_reserve)
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index efe68148f621..4514ad7415c3 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -2311,10 +2311,6 @@ void wait_for_stable_page(struct page *page)
2311 2311
2312 if (!bdi_cap_stable_pages_required(bdi)) 2312 if (!bdi_cap_stable_pages_required(bdi))
2313 return; 2313 return;
2314#ifdef CONFIG_NEED_BOUNCE_POOL
2315 if (mapping->host->i_sb->s_flags & MS_SNAP_STABLE)
2316 return;
2317#endif /* CONFIG_NEED_BOUNCE_POOL */
2318 2314
2319 wait_on_page_writeback(page); 2315 wait_on_page_writeback(page);
2320} 2316}
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 8fcced7823fa..98cbdf6e5532 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -58,6 +58,7 @@
58#include <linux/prefetch.h> 58#include <linux/prefetch.h>
59#include <linux/migrate.h> 59#include <linux/migrate.h>
60#include <linux/page-debug-flags.h> 60#include <linux/page-debug-flags.h>
61#include <linux/hugetlb.h>
61#include <linux/sched/rt.h> 62#include <linux/sched/rt.h>
62 63
63#include <asm/tlbflush.h> 64#include <asm/tlbflush.h>
@@ -1397,6 +1398,7 @@ void split_page(struct page *page, unsigned int order)
1397 for (i = 1; i < (1 << order); i++) 1398 for (i = 1; i < (1 << order); i++)
1398 set_page_refcounted(page + i); 1399 set_page_refcounted(page + i);
1399} 1400}
1401EXPORT_SYMBOL_GPL(split_page);
1400 1402
1401static int __isolate_free_page(struct page *page, unsigned int order) 1403static int __isolate_free_page(struct page *page, unsigned int order)
1402{ 1404{
@@ -1940,9 +1942,24 @@ zonelist_scan:
1940 continue; 1942 continue;
1941 default: 1943 default:
1942 /* did we reclaim enough */ 1944 /* did we reclaim enough */
1943 if (!zone_watermark_ok(zone, order, mark, 1945 if (zone_watermark_ok(zone, order, mark,
1944 classzone_idx, alloc_flags)) 1946 classzone_idx, alloc_flags))
1947 goto try_this_zone;
1948
1949 /*
1950 * Failed to reclaim enough to meet watermark.
1951 * Only mark the zone full if checking the min
1952 * watermark or if we failed to reclaim just
1953 * 1<<order pages or else the page allocator
1954 * fastpath will prematurely mark zones full
1955 * when the watermark is between the low and
1956 * min watermarks.
1957 */
1958 if (((alloc_flags & ALLOC_WMARK_MASK) == ALLOC_WMARK_MIN) ||
1959 ret == ZONE_RECLAIM_SOME)
1945 goto this_zone_full; 1960 goto this_zone_full;
1961
1962 continue;
1946 } 1963 }
1947 } 1964 }
1948 1965
@@ -2002,6 +2019,13 @@ void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...)
2002 return; 2019 return;
2003 2020
2004 /* 2021 /*
2022 * Walking all memory to count page types is very expensive and should
2023 * be inhibited in non-blockable contexts.
2024 */
2025 if (!(gfp_mask & __GFP_WAIT))
2026 filter |= SHOW_MEM_FILTER_PAGE_COUNT;
2027
2028 /*
2005 * This documents exceptions given to allocations in certain 2029 * This documents exceptions given to allocations in certain
2006 * contexts that are allowed to allocate outside current's set 2030 * contexts that are allowed to allocate outside current's set
2007 * of allowed nodes. 2031 * of allowed nodes.
@@ -3105,6 +3129,8 @@ void show_free_areas(unsigned int filter)
3105 printk("= %lukB\n", K(total)); 3129 printk("= %lukB\n", K(total));
3106 } 3130 }
3107 3131
3132 hugetlb_show_meminfo();
3133
3108 printk("%ld total pagecache pages\n", global_page_state(NR_FILE_PAGES)); 3134 printk("%ld total pagecache pages\n", global_page_state(NR_FILE_PAGES));
3109 3135
3110 show_swap_cache_info(); 3136 show_swap_cache_info();
@@ -4161,10 +4187,23 @@ int __meminit __early_pfn_to_nid(unsigned long pfn)
4161{ 4187{
4162 unsigned long start_pfn, end_pfn; 4188 unsigned long start_pfn, end_pfn;
4163 int i, nid; 4189 int i, nid;
4190 /*
4191 * NOTE: The following SMP-unsafe globals are only used early in boot
4192 * when the kernel is running single-threaded.
4193 */
4194 static unsigned long __meminitdata last_start_pfn, last_end_pfn;
4195 static int __meminitdata last_nid;
4196
4197 if (last_start_pfn <= pfn && pfn < last_end_pfn)
4198 return last_nid;
4164 4199
4165 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) 4200 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid)
4166 if (start_pfn <= pfn && pfn < end_pfn) 4201 if (start_pfn <= pfn && pfn < end_pfn) {
4202 last_start_pfn = start_pfn;
4203 last_end_pfn = end_pfn;
4204 last_nid = nid;
4167 return nid; 4205 return nid;
4206 }
4168 /* This is a memory hole */ 4207 /* This is a memory hole */
4169 return -1; 4208 return -1;
4170} 4209}
@@ -4710,7 +4749,7 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
4710/* 4749/*
4711 * Figure out the number of possible node ids. 4750 * Figure out the number of possible node ids.
4712 */ 4751 */
4713static void __init setup_nr_node_ids(void) 4752void __init setup_nr_node_ids(void)
4714{ 4753{
4715 unsigned int node; 4754 unsigned int node;
4716 unsigned int highest = 0; 4755 unsigned int highest = 0;
@@ -4719,10 +4758,6 @@ static void __init setup_nr_node_ids(void)
4719 highest = node; 4758 highest = node;
4720 nr_node_ids = highest + 1; 4759 nr_node_ids = highest + 1;
4721} 4760}
4722#else
4723static inline void setup_nr_node_ids(void)
4724{
4725}
4726#endif 4761#endif
4727 4762
4728/** 4763/**
@@ -5113,6 +5148,35 @@ early_param("movablecore", cmdline_parse_movablecore);
5113 5148
5114#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 5149#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
5115 5150
5151unsigned long free_reserved_area(unsigned long start, unsigned long end,
5152 int poison, char *s)
5153{
5154 unsigned long pages, pos;
5155
5156 pos = start = PAGE_ALIGN(start);
5157 end &= PAGE_MASK;
5158 for (pages = 0; pos < end; pos += PAGE_SIZE, pages++) {
5159 if (poison)
5160 memset((void *)pos, poison, PAGE_SIZE);
5161 free_reserved_page(virt_to_page(pos));
5162 }
5163
5164 if (pages && s)
5165 pr_info("Freeing %s memory: %ldK (%lx - %lx)\n",
5166 s, pages << (PAGE_SHIFT - 10), start, end);
5167
5168 return pages;
5169}
5170
5171#ifdef CONFIG_HIGHMEM
5172void free_highmem_page(struct page *page)
5173{
5174 __free_reserved_page(page);
5175 totalram_pages++;
5176 totalhigh_pages++;
5177}
5178#endif
5179
5116/** 5180/**
5117 * set_dma_reserve - set the specified number of pages reserved in the first zone 5181 * set_dma_reserve - set the specified number of pages reserved in the first zone
5118 * @new_dma_reserve: The number of pages to mark reserved 5182 * @new_dma_reserve: The number of pages to mark reserved
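
free_reserved_area() and free_highmem_page() centralise the loops that architectures open-code when handing initmem or highmem back to the buddy allocator. A hedged sketch of the typical call site, assuming the usual __init_begin/__init_end linker symbols and no poisoning:

void free_initmem(void)
{
        /* return the .init sections to the page allocator; the helper
         * prints the "Freeing ... memory" line itself */
        free_reserved_area((unsigned long)&__init_begin,
                           (unsigned long)&__init_end,
                           0, "unused kernel");
}
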
diff --git a/mm/page_io.c b/mm/page_io.c
index 78eee32ee486..bb5d75274686 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -42,7 +42,7 @@ static struct bio *get_swap_bio(gfp_t gfp_flags,
42 return bio; 42 return bio;
43} 43}
44 44
45static void end_swap_bio_write(struct bio *bio, int err) 45void end_swap_bio_write(struct bio *bio, int err)
46{ 46{
47 const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 47 const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
48 struct page *page = bio->bi_io_vec[0].bv_page; 48 struct page *page = bio->bi_io_vec[0].bv_page;
@@ -185,9 +185,7 @@ bad_bmap:
185 */ 185 */
186int swap_writepage(struct page *page, struct writeback_control *wbc) 186int swap_writepage(struct page *page, struct writeback_control *wbc)
187{ 187{
188 struct bio *bio; 188 int ret = 0;
189 int ret = 0, rw = WRITE;
190 struct swap_info_struct *sis = page_swap_info(page);
191 189
192 if (try_to_free_swap(page)) { 190 if (try_to_free_swap(page)) {
193 unlock_page(page); 191 unlock_page(page);
@@ -199,6 +197,17 @@ int swap_writepage(struct page *page, struct writeback_control *wbc)
199 end_page_writeback(page); 197 end_page_writeback(page);
200 goto out; 198 goto out;
201 } 199 }
200 ret = __swap_writepage(page, wbc, end_swap_bio_write);
201out:
202 return ret;
203}
204
205int __swap_writepage(struct page *page, struct writeback_control *wbc,
206 void (*end_write_func)(struct bio *, int))
207{
208 struct bio *bio;
209 int ret = 0, rw = WRITE;
210 struct swap_info_struct *sis = page_swap_info(page);
202 211
203 if (sis->flags & SWP_FILE) { 212 if (sis->flags & SWP_FILE) {
204 struct kiocb kiocb; 213 struct kiocb kiocb;
@@ -214,6 +223,7 @@ int swap_writepage(struct page *page, struct writeback_control *wbc)
214 kiocb.ki_left = PAGE_SIZE; 223 kiocb.ki_left = PAGE_SIZE;
215 kiocb.ki_nbytes = PAGE_SIZE; 224 kiocb.ki_nbytes = PAGE_SIZE;
216 225
226 set_page_writeback(page);
217 unlock_page(page); 227 unlock_page(page);
218 ret = mapping->a_ops->direct_IO(KERNEL_WRITE, 228 ret = mapping->a_ops->direct_IO(KERNEL_WRITE,
219 &kiocb, &iov, 229 &kiocb, &iov,
@@ -222,11 +232,27 @@ int swap_writepage(struct page *page, struct writeback_control *wbc)
222 if (ret == PAGE_SIZE) { 232 if (ret == PAGE_SIZE) {
223 count_vm_event(PSWPOUT); 233 count_vm_event(PSWPOUT);
224 ret = 0; 234 ret = 0;
235 } else {
236 /*
237 * In the case of swap-over-nfs, this can be a
238 * temporary failure if the system has limited
239 * memory for allocating transmit buffers.
240 * Mark the page dirty and avoid
241 * rotate_reclaimable_page but rate-limit the
242 * messages but do not flag PageError like
243 * the normal direct-to-bio case as it could
244 * be temporary.
245 */
246 set_page_dirty(page);
247 ClearPageReclaim(page);
248 pr_err_ratelimited("Write error on dio swapfile (%Lu)\n",
249 page_file_offset(page));
225 } 250 }
251 end_page_writeback(page);
226 return ret; 252 return ret;
227 } 253 }
228 254
229 bio = get_swap_bio(GFP_NOIO, page, end_swap_bio_write); 255 bio = get_swap_bio(GFP_NOIO, page, end_write_func);
230 if (bio == NULL) { 256 if (bio == NULL) {
231 set_page_dirty(page); 257 set_page_dirty(page);
232 unlock_page(page); 258 unlock_page(page);
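
Exporting end_swap_bio_write() and splitting __swap_writepage() out of swap_writepage() gives other swap writers, for instance a compressed swap cache writing an entry back to the real swap device, a way to submit the page with a completion handler of their own. A sketch of that usage under stated assumptions; my_end_write() and its caller are hypothetical, and the page is expected to arrive locked since __swap_writepage() sets writeback and unlocks it:

static void my_end_write(struct bio *bio, int err)
{
        /* drop any writer-private state here, then let the stock
         * completion handle the writeback and error bits */
        end_swap_bio_write(bio, err);
}

static int my_writeback_page(struct page *page)         /* page comes in locked */
{
        struct writeback_control wbc = {
                .sync_mode = WB_SYNC_NONE,
        };

        return __swap_writepage(page, &wbc, my_end_write);
}
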
diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c
index 926b46649749..fd26d0433509 100644
--- a/mm/process_vm_access.c
+++ b/mm/process_vm_access.c
@@ -429,12 +429,6 @@ compat_process_vm_rw(compat_pid_t pid,
429 if (flags != 0) 429 if (flags != 0)
430 return -EINVAL; 430 return -EINVAL;
431 431
432 if (!access_ok(VERIFY_READ, lvec, liovcnt * sizeof(*lvec)))
433 goto out;
434
435 if (!access_ok(VERIFY_READ, rvec, riovcnt * sizeof(*rvec)))
436 goto out;
437
438 if (vm_write) 432 if (vm_write)
439 rc = compat_rw_copy_check_uvector(WRITE, lvec, liovcnt, 433 rc = compat_rw_copy_check_uvector(WRITE, lvec, liovcnt,
440 UIO_FASTIOV, iovstack_l, 434 UIO_FASTIOV, iovstack_l,
@@ -459,8 +453,6 @@ free_iovecs:
459 kfree(iov_r); 453 kfree(iov_r);
460 if (iov_l != iovstack_l) 454 if (iov_l != iovstack_l)
461 kfree(iov_l); 455 kfree(iov_l);
462
463out:
464 return rc; 456 return rc;
465} 457}
466 458
diff --git a/mm/rmap.c b/mm/rmap.c
index 807c96bf0dc6..6280da86b5d6 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1513,6 +1513,9 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
1513 unsigned long max_nl_size = 0; 1513 unsigned long max_nl_size = 0;
1514 unsigned int mapcount; 1514 unsigned int mapcount;
1515 1515
1516 if (PageHuge(page))
1517 pgoff = page->index << compound_order(page);
1518
1516 mutex_lock(&mapping->i_mmap_mutex); 1519 mutex_lock(&mapping->i_mmap_mutex);
1517 vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { 1520 vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
1518 unsigned long address = vma_address(page, vma); 1521 unsigned long address = vma_address(page, vma);
diff --git a/mm/shmem.c b/mm/shmem.c
index 1c44af71fcf5..39b2a0b86fe8 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -25,6 +25,7 @@
25#include <linux/init.h> 25#include <linux/init.h>
26#include <linux/vfs.h> 26#include <linux/vfs.h>
27#include <linux/mount.h> 27#include <linux/mount.h>
28#include <linux/ramfs.h>
28#include <linux/pagemap.h> 29#include <linux/pagemap.h>
29#include <linux/file.h> 30#include <linux/file.h>
30#include <linux/mm.h> 31#include <linux/mm.h>
@@ -2830,8 +2831,6 @@ out4:
2830 * effectively equivalent, but much lighter weight. 2831 * effectively equivalent, but much lighter weight.
2831 */ 2832 */
2832 2833
2833#include <linux/ramfs.h>
2834
2835static struct file_system_type shmem_fs_type = { 2834static struct file_system_type shmem_fs_type = {
2836 .name = "tmpfs", 2835 .name = "tmpfs",
2837 .mount = ramfs_mount, 2836 .mount = ramfs_mount,
@@ -2931,11 +2930,9 @@ struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags
2931 d_instantiate(path.dentry, inode); 2930 d_instantiate(path.dentry, inode);
2932 inode->i_size = size; 2931 inode->i_size = size;
2933 clear_nlink(inode); /* It is unlinked */ 2932 clear_nlink(inode); /* It is unlinked */
2934#ifndef CONFIG_MMU
2935 res = ERR_PTR(ramfs_nommu_expand_for_mapping(inode, size)); 2933 res = ERR_PTR(ramfs_nommu_expand_for_mapping(inode, size));
2936 if (IS_ERR(res)) 2934 if (IS_ERR(res))
2937 goto put_dentry; 2935 goto put_dentry;
2938#endif
2939 2936
2940 res = alloc_file(&path, FMODE_WRITE | FMODE_READ, 2937 res = alloc_file(&path, FMODE_WRITE | FMODE_READ,
2941 &shmem_file_operations); 2938 &shmem_file_operations);
diff --git a/mm/slab.c b/mm/slab.c
index 856e4a192d25..96079244c860 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -2040,11 +2040,9 @@ static void print_objinfo(struct kmem_cache *cachep, void *objp, int lines)
2040 } 2040 }
2041 2041
2042 if (cachep->flags & SLAB_STORE_USER) { 2042 if (cachep->flags & SLAB_STORE_USER) {
2043 printk(KERN_ERR "Last user: [<%p>]", 2043 printk(KERN_ERR "Last user: [<%p>](%pSR)\n",
2044 *dbg_userword(cachep, objp)); 2044 *dbg_userword(cachep, objp),
2045 print_symbol("(%s)", 2045 *dbg_userword(cachep, objp));
2046 (unsigned long)*dbg_userword(cachep, objp));
2047 printk("\n");
2048 } 2046 }
2049 realobj = (char *)objp + obj_offset(cachep); 2047 realobj = (char *)objp + obj_offset(cachep);
2050 size = cachep->object_size; 2048 size = cachep->object_size;
diff --git a/mm/slub.c b/mm/slub.c
index 4aec53705e4f..a0206df88aba 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -18,6 +18,7 @@
18#include <linux/slab.h> 18#include <linux/slab.h>
19#include "slab.h" 19#include "slab.h"
20#include <linux/proc_fs.h> 20#include <linux/proc_fs.h>
21#include <linux/notifier.h>
21#include <linux/seq_file.h> 22#include <linux/seq_file.h>
22#include <linux/kmemcheck.h> 23#include <linux/kmemcheck.h>
23#include <linux/cpu.h> 24#include <linux/cpu.h>
@@ -3483,7 +3484,6 @@ int kmem_cache_shrink(struct kmem_cache *s)
3483} 3484}
3484EXPORT_SYMBOL(kmem_cache_shrink); 3485EXPORT_SYMBOL(kmem_cache_shrink);
3485 3486
3486#if defined(CONFIG_MEMORY_HOTPLUG)
3487static int slab_mem_going_offline_callback(void *arg) 3487static int slab_mem_going_offline_callback(void *arg)
3488{ 3488{
3489 struct kmem_cache *s; 3489 struct kmem_cache *s;
@@ -3598,7 +3598,10 @@ static int slab_memory_callback(struct notifier_block *self,
3598 return ret; 3598 return ret;
3599} 3599}
3600 3600
3601#endif /* CONFIG_MEMORY_HOTPLUG */ 3601static struct notifier_block slab_memory_callback_nb = {
3602 .notifier_call = slab_memory_callback,
3603 .priority = SLAB_CALLBACK_PRI,
3604};
3602 3605
3603/******************************************************************** 3606/********************************************************************
3604 * Basic setup of slabs 3607 * Basic setup of slabs
@@ -3651,7 +3654,7 @@ void __init kmem_cache_init(void)
3651 create_boot_cache(kmem_cache_node, "kmem_cache_node", 3654 create_boot_cache(kmem_cache_node, "kmem_cache_node",
3652 sizeof(struct kmem_cache_node), SLAB_HWCACHE_ALIGN); 3655 sizeof(struct kmem_cache_node), SLAB_HWCACHE_ALIGN);
3653 3656
3654 hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI); 3657 register_hotmemory_notifier(&slab_memory_callback_nb);
3655 3658
3656 /* Able to allocate the per node structures */ 3659 /* Able to allocate the per node structures */
3657 slab_state = PARTIAL; 3660 slab_state = PARTIAL;
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index 1b7e22ab9b09..27eeab3be757 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -53,10 +53,12 @@ void * __meminit vmemmap_alloc_block(unsigned long size, int node)
53 struct page *page; 53 struct page *page;
54 54
55 if (node_state(node, N_HIGH_MEMORY)) 55 if (node_state(node, N_HIGH_MEMORY))
56 page = alloc_pages_node(node, 56 page = alloc_pages_node(
57 GFP_KERNEL | __GFP_ZERO, get_order(size)); 57 node, GFP_KERNEL | __GFP_ZERO | __GFP_REPEAT,
58 get_order(size));
58 else 59 else
59 page = alloc_pages(GFP_KERNEL | __GFP_ZERO, 60 page = alloc_pages(
61 GFP_KERNEL | __GFP_ZERO | __GFP_REPEAT,
60 get_order(size)); 62 get_order(size));
61 if (page) 63 if (page)
62 return page_address(page); 64 return page_address(page);
@@ -145,11 +147,10 @@ pgd_t * __meminit vmemmap_pgd_populate(unsigned long addr, int node)
145 return pgd; 147 return pgd;
146} 148}
147 149
148int __meminit vmemmap_populate_basepages(struct page *start_page, 150int __meminit vmemmap_populate_basepages(unsigned long start,
149 unsigned long size, int node) 151 unsigned long end, int node)
150{ 152{
151 unsigned long addr = (unsigned long)start_page; 153 unsigned long addr = start;
152 unsigned long end = (unsigned long)(start_page + size);
153 pgd_t *pgd; 154 pgd_t *pgd;
154 pud_t *pud; 155 pud_t *pud;
155 pmd_t *pmd; 156 pmd_t *pmd;
@@ -176,9 +177,15 @@ int __meminit vmemmap_populate_basepages(struct page *start_page,
176 177
177struct page * __meminit sparse_mem_map_populate(unsigned long pnum, int nid) 178struct page * __meminit sparse_mem_map_populate(unsigned long pnum, int nid)
178{ 179{
179 struct page *map = pfn_to_page(pnum * PAGES_PER_SECTION); 180 unsigned long start;
180 int error = vmemmap_populate(map, PAGES_PER_SECTION, nid); 181 unsigned long end;
181 if (error) 182 struct page *map;
183
184 map = pfn_to_page(pnum * PAGES_PER_SECTION);
185 start = (unsigned long)map;
186 end = (unsigned long)(map + PAGES_PER_SECTION);
187
188 if (vmemmap_populate(start, end, nid))
182 return NULL; 189 return NULL;
183 190
184 return map; 191 return map;
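
With the interface switched from (start_page, nr_pages) to a virtual address range, an architecture that maps its vmemmap with base pages can implement vmemmap_populate() as a thin wrapper. A minimal sketch of that arrangement, assuming no huge-page optimisation:

int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node)
{
        /* populate pgd/pud/pmd/pte levels page by page for [start, end) */
        return vmemmap_populate_basepages(start, end, node);
}
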
diff --git a/mm/sparse.c b/mm/sparse.c
index 7ca6dc847947..1c91f0d3f6ab 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -615,12 +615,20 @@ static inline struct page *kmalloc_section_memmap(unsigned long pnum, int nid,
615} 615}
616static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages) 616static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages)
617{ 617{
618 vmemmap_free(memmap, nr_pages); 618 unsigned long start = (unsigned long)memmap;
619 unsigned long end = (unsigned long)(memmap + nr_pages);
620
621 vmemmap_free(start, end);
619} 622}
623#ifdef CONFIG_MEMORY_HOTREMOVE
620static void free_map_bootmem(struct page *memmap, unsigned long nr_pages) 624static void free_map_bootmem(struct page *memmap, unsigned long nr_pages)
621{ 625{
622 vmemmap_free(memmap, nr_pages); 626 unsigned long start = (unsigned long)memmap;
627 unsigned long end = (unsigned long)(memmap + nr_pages);
628
629 vmemmap_free(start, end);
623} 630}
631#endif /* CONFIG_MEMORY_HOTREMOVE */
624#else 632#else
625static struct page *__kmalloc_section_memmap(unsigned long nr_pages) 633static struct page *__kmalloc_section_memmap(unsigned long nr_pages)
626{ 634{
@@ -658,6 +666,7 @@ static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages)
658 get_order(sizeof(struct page) * nr_pages)); 666 get_order(sizeof(struct page) * nr_pages));
659} 667}
660 668
669#ifdef CONFIG_MEMORY_HOTREMOVE
661static void free_map_bootmem(struct page *memmap, unsigned long nr_pages) 670static void free_map_bootmem(struct page *memmap, unsigned long nr_pages)
662{ 671{
663 unsigned long maps_section_nr, removing_section_nr, i; 672 unsigned long maps_section_nr, removing_section_nr, i;
@@ -684,40 +693,9 @@ static void free_map_bootmem(struct page *memmap, unsigned long nr_pages)
684 put_page_bootmem(page); 693 put_page_bootmem(page);
685 } 694 }
686} 695}
696#endif /* CONFIG_MEMORY_HOTREMOVE */
687#endif /* CONFIG_SPARSEMEM_VMEMMAP */ 697#endif /* CONFIG_SPARSEMEM_VMEMMAP */
688 698
689static void free_section_usemap(struct page *memmap, unsigned long *usemap)
690{
691 struct page *usemap_page;
692 unsigned long nr_pages;
693
694 if (!usemap)
695 return;
696
697 usemap_page = virt_to_page(usemap);
698 /*
699 * Check to see if allocation came from hot-plug-add
700 */
701 if (PageSlab(usemap_page) || PageCompound(usemap_page)) {
702 kfree(usemap);
703 if (memmap)
704 __kfree_section_memmap(memmap, PAGES_PER_SECTION);
705 return;
706 }
707
708 /*
709 * The usemap came from bootmem. This is packed with other usemaps
710 * on the section which has pgdat at boot time. Just keep it as is now.
711 */
712
713 if (memmap) {
714 nr_pages = PAGE_ALIGN(PAGES_PER_SECTION * sizeof(struct page))
715 >> PAGE_SHIFT;
716
717 free_map_bootmem(memmap, nr_pages);
718 }
719}
720
721/* 699/*
722 * returns the number of sections whose mem_maps were properly 700 * returns the number of sections whose mem_maps were properly
723 * set. If this is <=0, then that means that the passed-in 701 * set. If this is <=0, then that means that the passed-in
@@ -794,6 +772,39 @@ static inline void clear_hwpoisoned_pages(struct page *memmap, int nr_pages)
794} 772}
795#endif 773#endif
796 774
775#ifdef CONFIG_MEMORY_HOTREMOVE
776static void free_section_usemap(struct page *memmap, unsigned long *usemap)
777{
778 struct page *usemap_page;
779 unsigned long nr_pages;
780
781 if (!usemap)
782 return;
783
784 usemap_page = virt_to_page(usemap);
785 /*
786 * Check to see if allocation came from hot-plug-add
787 */
788 if (PageSlab(usemap_page) || PageCompound(usemap_page)) {
789 kfree(usemap);
790 if (memmap)
791 __kfree_section_memmap(memmap, PAGES_PER_SECTION);
792 return;
793 }
794
795 /*
796 * The usemap came from bootmem. This is packed with other usemaps
797 * on the section which has pgdat at boot time. Just keep it as is now.
798 */
799
800 if (memmap) {
801 nr_pages = PAGE_ALIGN(PAGES_PER_SECTION * sizeof(struct page))
802 >> PAGE_SHIFT;
803
804 free_map_bootmem(memmap, nr_pages);
805 }
806}
807
797void sparse_remove_one_section(struct zone *zone, struct mem_section *ms) 808void sparse_remove_one_section(struct zone *zone, struct mem_section *ms)
798{ 809{
799 struct page *memmap = NULL; 810 struct page *memmap = NULL;
@@ -813,4 +824,5 @@ void sparse_remove_one_section(struct zone *zone, struct mem_section *ms)
813 clear_hwpoisoned_pages(memmap, PAGES_PER_SECTION); 824 clear_hwpoisoned_pages(memmap, PAGES_PER_SECTION);
814 free_section_usemap(memmap, usemap); 825 free_section_usemap(memmap, usemap);
815} 826}
816#endif 827#endif /* CONFIG_MEMORY_HOTREMOVE */
828#endif /* CONFIG_MEMORY_HOTPLUG */
diff --git a/mm/swap.c b/mm/swap.c
index 8a529a01e8fc..acd40bfffa82 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -737,7 +737,7 @@ EXPORT_SYMBOL(__pagevec_release);
737#ifdef CONFIG_TRANSPARENT_HUGEPAGE 737#ifdef CONFIG_TRANSPARENT_HUGEPAGE
738/* used by __split_huge_page_refcount() */ 738/* used by __split_huge_page_refcount() */
739void lru_add_page_tail(struct page *page, struct page *page_tail, 739void lru_add_page_tail(struct page *page, struct page *page_tail,
740 struct lruvec *lruvec) 740 struct lruvec *lruvec, struct list_head *list)
741{ 741{
742 int uninitialized_var(active); 742 int uninitialized_var(active);
743 enum lru_list lru; 743 enum lru_list lru;
@@ -749,7 +749,8 @@ void lru_add_page_tail(struct page *page, struct page *page_tail,
749 VM_BUG_ON(NR_CPUS != 1 && 749 VM_BUG_ON(NR_CPUS != 1 &&
750 !spin_is_locked(&lruvec_zone(lruvec)->lru_lock)); 750 !spin_is_locked(&lruvec_zone(lruvec)->lru_lock));
751 751
752 SetPageLRU(page_tail); 752 if (!list)
753 SetPageLRU(page_tail);
753 754
754 if (page_evictable(page_tail)) { 755 if (page_evictable(page_tail)) {
755 if (PageActive(page)) { 756 if (PageActive(page)) {
@@ -767,7 +768,11 @@ void lru_add_page_tail(struct page *page, struct page *page_tail,
767 768
768 if (likely(PageLRU(page))) 769 if (likely(PageLRU(page)))
769 list_add_tail(&page_tail->lru, &page->lru); 770 list_add_tail(&page_tail->lru, &page->lru);
770 else { 771 else if (list) {
772 /* page reclaim is reclaiming a huge page */
773 get_page(page_tail);
774 list_add_tail(&page_tail->lru, list);
775 } else {
771 struct list_head *list_head; 776 struct list_head *list_head;
772 /* 777 /*
773 * Head page has not yet been counted, as an hpage, 778 * Head page has not yet been counted, as an hpage,
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 7efcf1525921..b3d40dcf3624 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -78,7 +78,7 @@ void show_swap_cache_info(void)
78 * __add_to_swap_cache resembles add_to_page_cache_locked on swapper_space, 78 * __add_to_swap_cache resembles add_to_page_cache_locked on swapper_space,
79 * but sets SwapCache flag and private instead of mapping and index. 79 * but sets SwapCache flag and private instead of mapping and index.
80 */ 80 */
81static int __add_to_swap_cache(struct page *page, swp_entry_t entry) 81int __add_to_swap_cache(struct page *page, swp_entry_t entry)
82{ 82{
83 int error; 83 int error;
84 struct address_space *address_space; 84 struct address_space *address_space;
@@ -160,7 +160,7 @@ void __delete_from_swap_cache(struct page *page)
160 * Allocate swap space for the page and add the page to the 160 * Allocate swap space for the page and add the page to the
161 * swap cache. Caller needs to hold the page lock. 161 * swap cache. Caller needs to hold the page lock.
162 */ 162 */
163int add_to_swap(struct page *page) 163int add_to_swap(struct page *page, struct list_head *list)
164{ 164{
165 swp_entry_t entry; 165 swp_entry_t entry;
166 int err; 166 int err;
@@ -173,7 +173,7 @@ int add_to_swap(struct page *page)
173 return 0; 173 return 0;
174 174
175 if (unlikely(PageTransHuge(page))) 175 if (unlikely(PageTransHuge(page)))
176 if (unlikely(split_huge_page(page))) { 176 if (unlikely(split_huge_page_to_list(page, list))) {
177 swapcache_free(entry, NULL); 177 swapcache_free(entry, NULL);
178 return 0; 178 return 0;
179 } 179 }
diff --git a/mm/swapfile.c b/mm/swapfile.c
index a1f7772a01fc..d417efddfe74 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -2120,7 +2120,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
2120 if (p->bdev) { 2120 if (p->bdev) {
2121 if (blk_queue_nonrot(bdev_get_queue(p->bdev))) { 2121 if (blk_queue_nonrot(bdev_get_queue(p->bdev))) {
2122 p->flags |= SWP_SOLIDSTATE; 2122 p->flags |= SWP_SOLIDSTATE;
2123 p->cluster_next = 1 + (random32() % p->highest_bit); 2123 p->cluster_next = 1 + (prandom_u32() % p->highest_bit);
2124 } 2124 }
2125 if ((swap_flags & SWAP_FLAG_DISCARD) && discard_swap(p) == 0) 2125 if ((swap_flags & SWAP_FLAG_DISCARD) && discard_swap(p) == 0)
2126 p->flags |= SWP_DISCARDABLE; 2126 p->flags |= SWP_DISCARDABLE;
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 0f751f2068c3..72043d6c88c0 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -249,19 +249,9 @@ EXPORT_SYMBOL(vmalloc_to_pfn);
249#define VM_LAZY_FREEING 0x02 249#define VM_LAZY_FREEING 0x02
250#define VM_VM_AREA 0x04 250#define VM_VM_AREA 0x04
251 251
252struct vmap_area {
253 unsigned long va_start;
254 unsigned long va_end;
255 unsigned long flags;
256 struct rb_node rb_node; /* address sorted rbtree */
257 struct list_head list; /* address sorted list */
258 struct list_head purge_list; /* "lazy purge" list */
259 struct vm_struct *vm;
260 struct rcu_head rcu_head;
261};
262
263static DEFINE_SPINLOCK(vmap_area_lock); 252static DEFINE_SPINLOCK(vmap_area_lock);
264static LIST_HEAD(vmap_area_list); 253/* Export for kexec only */
254LIST_HEAD(vmap_area_list);
265static struct rb_root vmap_area_root = RB_ROOT; 255static struct rb_root vmap_area_root = RB_ROOT;
266 256
267/* The vmap cache globals are protected by vmap_area_lock */ 257/* The vmap cache globals are protected by vmap_area_lock */
@@ -313,7 +303,7 @@ static void __insert_vmap_area(struct vmap_area *va)
313 rb_link_node(&va->rb_node, parent, p); 303 rb_link_node(&va->rb_node, parent, p);
314 rb_insert_color(&va->rb_node, &vmap_area_root); 304 rb_insert_color(&va->rb_node, &vmap_area_root);
315 305
316 /* address-sort this list so it is usable like the vmlist */ 306 /* address-sort this list */
317 tmp = rb_prev(&va->rb_node); 307 tmp = rb_prev(&va->rb_node);
318 if (tmp) { 308 if (tmp) {
319 struct vmap_area *prev; 309 struct vmap_area *prev;
@@ -1125,6 +1115,7 @@ void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t pro
1125} 1115}
1126EXPORT_SYMBOL(vm_map_ram); 1116EXPORT_SYMBOL(vm_map_ram);
1127 1117
1118static struct vm_struct *vmlist __initdata;
1128/** 1119/**
1129 * vm_area_add_early - add vmap area early during boot 1120 * vm_area_add_early - add vmap area early during boot
1130 * @vm: vm_struct to add 1121 * @vm: vm_struct to add
@@ -1283,41 +1274,35 @@ int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages)
1283} 1274}
1284EXPORT_SYMBOL_GPL(map_vm_area); 1275EXPORT_SYMBOL_GPL(map_vm_area);
1285 1276
1286/*** Old vmalloc interfaces ***/
1287DEFINE_RWLOCK(vmlist_lock);
1288struct vm_struct *vmlist;
1289
1290static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va, 1277static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
1291 unsigned long flags, const void *caller) 1278 unsigned long flags, const void *caller)
1292{ 1279{
1280 spin_lock(&vmap_area_lock);
1293 vm->flags = flags; 1281 vm->flags = flags;
1294 vm->addr = (void *)va->va_start; 1282 vm->addr = (void *)va->va_start;
1295 vm->size = va->va_end - va->va_start; 1283 vm->size = va->va_end - va->va_start;
1296 vm->caller = caller; 1284 vm->caller = caller;
1297 va->vm = vm; 1285 va->vm = vm;
1298 va->flags |= VM_VM_AREA; 1286 va->flags |= VM_VM_AREA;
1287 spin_unlock(&vmap_area_lock);
1299} 1288}
1300 1289
1301static void insert_vmalloc_vmlist(struct vm_struct *vm) 1290static void clear_vm_unlist(struct vm_struct *vm)
1302{ 1291{
1303 struct vm_struct *tmp, **p; 1292 /*
1304 1293 * Before removing VM_UNLIST,
1294 * we should make sure that vm has proper values.
1295 * Pair with smp_rmb() in show_numa_info().
1296 */
1297 smp_wmb();
1305 vm->flags &= ~VM_UNLIST; 1298 vm->flags &= ~VM_UNLIST;
1306 write_lock(&vmlist_lock);
1307 for (p = &vmlist; (tmp = *p) != NULL; p = &tmp->next) {
1308 if (tmp->addr >= vm->addr)
1309 break;
1310 }
1311 vm->next = *p;
1312 *p = vm;
1313 write_unlock(&vmlist_lock);
1314} 1299}
1315 1300
1316static void insert_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va, 1301static void insert_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
1317 unsigned long flags, const void *caller) 1302 unsigned long flags, const void *caller)
1318{ 1303{
1319 setup_vmalloc_vm(vm, va, flags, caller); 1304 setup_vmalloc_vm(vm, va, flags, caller);
1320 insert_vmalloc_vmlist(vm); 1305 clear_vm_unlist(vm);
1321} 1306}
1322 1307
1323static struct vm_struct *__get_vm_area_node(unsigned long size, 1308static struct vm_struct *__get_vm_area_node(unsigned long size,
@@ -1360,10 +1345,9 @@ static struct vm_struct *__get_vm_area_node(unsigned long size,
1360 1345
1361 /* 1346 /*
1362 * When this function is called from __vmalloc_node_range, 1347 * When this function is called from __vmalloc_node_range,
1363 * we do not add vm_struct to vmlist here to avoid 1348 * we add VM_UNLIST flag to avoid accessing uninitialized
1364 * accessing uninitialized members of vm_struct such as 1349 * members of vm_struct such as pages and nr_pages fields.
1365 * pages and nr_pages fields. They will be set later. 1350 * They will be set later.
1366 * To distinguish it from others, we use a VM_UNLIST flag.
1367 */ 1351 */
1368 if (flags & VM_UNLIST) 1352 if (flags & VM_UNLIST)
1369 setup_vmalloc_vm(area, va, flags, caller); 1353 setup_vmalloc_vm(area, va, flags, caller);
@@ -1447,19 +1431,10 @@ struct vm_struct *remove_vm_area(const void *addr)
1447 if (va && va->flags & VM_VM_AREA) { 1431 if (va && va->flags & VM_VM_AREA) {
1448 struct vm_struct *vm = va->vm; 1432 struct vm_struct *vm = va->vm;
1449 1433
1450 if (!(vm->flags & VM_UNLIST)) { 1434 spin_lock(&vmap_area_lock);
1451 struct vm_struct *tmp, **p; 1435 va->vm = NULL;
1452 /* 1436 va->flags &= ~VM_VM_AREA;
1453 * remove from list and disallow access to 1437 spin_unlock(&vmap_area_lock);
1454 * this vm_struct before unmap. (address range
1455 * confliction is maintained by vmap.)
1456 */
1457 write_lock(&vmlist_lock);
1458 for (p = &vmlist; (tmp = *p) != vm; p = &tmp->next)
1459 ;
1460 *p = tmp->next;
1461 write_unlock(&vmlist_lock);
1462 }
1463 1438
1464 vmap_debug_free_range(va->va_start, va->va_end); 1439 vmap_debug_free_range(va->va_start, va->va_end);
1465 free_unmap_vmap_area(va); 1440 free_unmap_vmap_area(va);
@@ -1680,10 +1655,11 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align,
1680 return NULL; 1655 return NULL;
1681 1656
1682 /* 1657 /*
1683 * In this function, newly allocated vm_struct is not added 1658 * In this function, newly allocated vm_struct has VM_UNLIST flag.
1684 * to vmlist at __get_vm_area_node(). so, it is added here. 1659 * It means that vm_struct is not fully initialized.
1660 * Now, it is fully initialized, so remove this flag here.
1685 */ 1661 */
1686 insert_vmalloc_vmlist(area); 1662 clear_vm_unlist(area);
1687 1663
1688 /* 1664 /*
1689 * A ref_count = 3 is needed because the vm_struct and vmap_area 1665 * A ref_count = 3 is needed because the vm_struct and vmap_area
@@ -2005,7 +1981,8 @@ static int aligned_vwrite(char *buf, char *addr, unsigned long count)
2005 1981
2006long vread(char *buf, char *addr, unsigned long count) 1982long vread(char *buf, char *addr, unsigned long count)
2007{ 1983{
2008 struct vm_struct *tmp; 1984 struct vmap_area *va;
1985 struct vm_struct *vm;
2009 char *vaddr, *buf_start = buf; 1986 char *vaddr, *buf_start = buf;
2010 unsigned long buflen = count; 1987 unsigned long buflen = count;
2011 unsigned long n; 1988 unsigned long n;
@@ -2014,10 +1991,17 @@ long vread(char *buf, char *addr, unsigned long count)
2014 if ((unsigned long) addr + count < count) 1991 if ((unsigned long) addr + count < count)
2015 count = -(unsigned long) addr; 1992 count = -(unsigned long) addr;
2016 1993
2017 read_lock(&vmlist_lock); 1994 spin_lock(&vmap_area_lock);
2018 for (tmp = vmlist; count && tmp; tmp = tmp->next) { 1995 list_for_each_entry(va, &vmap_area_list, list) {
2019 vaddr = (char *) tmp->addr; 1996 if (!count)
2020 if (addr >= vaddr + tmp->size - PAGE_SIZE) 1997 break;
1998
1999 if (!(va->flags & VM_VM_AREA))
2000 continue;
2001
2002 vm = va->vm;
2003 vaddr = (char *) vm->addr;
2004 if (addr >= vaddr + vm->size - PAGE_SIZE)
2021 continue; 2005 continue;
2022 while (addr < vaddr) { 2006 while (addr < vaddr) {
2023 if (count == 0) 2007 if (count == 0)
@@ -2027,10 +2011,10 @@ long vread(char *buf, char *addr, unsigned long count)
2027 addr++; 2011 addr++;
2028 count--; 2012 count--;
2029 } 2013 }
2030 n = vaddr + tmp->size - PAGE_SIZE - addr; 2014 n = vaddr + vm->size - PAGE_SIZE - addr;
2031 if (n > count) 2015 if (n > count)
2032 n = count; 2016 n = count;
2033 if (!(tmp->flags & VM_IOREMAP)) 2017 if (!(vm->flags & VM_IOREMAP))
2034 aligned_vread(buf, addr, n); 2018 aligned_vread(buf, addr, n);
2035 else /* IOREMAP area is treated as memory hole */ 2019 else /* IOREMAP area is treated as memory hole */
2036 memset(buf, 0, n); 2020 memset(buf, 0, n);
@@ -2039,7 +2023,7 @@ long vread(char *buf, char *addr, unsigned long count)
2039 count -= n; 2023 count -= n;
2040 } 2024 }
2041finished: 2025finished:
2042 read_unlock(&vmlist_lock); 2026 spin_unlock(&vmap_area_lock);
2043 2027
2044 if (buf == buf_start) 2028 if (buf == buf_start)
2045 return 0; 2029 return 0;
@@ -2078,7 +2062,8 @@ finished:
2078 2062
2079long vwrite(char *buf, char *addr, unsigned long count) 2063long vwrite(char *buf, char *addr, unsigned long count)
2080{ 2064{
2081 struct vm_struct *tmp; 2065 struct vmap_area *va;
2066 struct vm_struct *vm;
2082 char *vaddr; 2067 char *vaddr;
2083 unsigned long n, buflen; 2068 unsigned long n, buflen;
2084 int copied = 0; 2069 int copied = 0;
@@ -2088,10 +2073,17 @@ long vwrite(char *buf, char *addr, unsigned long count)
2088 count = -(unsigned long) addr; 2073 count = -(unsigned long) addr;
2089 buflen = count; 2074 buflen = count;
2090 2075
2091 read_lock(&vmlist_lock); 2076 spin_lock(&vmap_area_lock);
2092 for (tmp = vmlist; count && tmp; tmp = tmp->next) { 2077 list_for_each_entry(va, &vmap_area_list, list) {
2093 vaddr = (char *) tmp->addr; 2078 if (!count)
2094 if (addr >= vaddr + tmp->size - PAGE_SIZE) 2079 break;
2080
2081 if (!(va->flags & VM_VM_AREA))
2082 continue;
2083
2084 vm = va->vm;
2085 vaddr = (char *) vm->addr;
2086 if (addr >= vaddr + vm->size - PAGE_SIZE)
2095 continue; 2087 continue;
2096 while (addr < vaddr) { 2088 while (addr < vaddr) {
2097 if (count == 0) 2089 if (count == 0)
@@ -2100,10 +2092,10 @@ long vwrite(char *buf, char *addr, unsigned long count)
2100 addr++; 2092 addr++;
2101 count--; 2093 count--;
2102 } 2094 }
2103 n = vaddr + tmp->size - PAGE_SIZE - addr; 2095 n = vaddr + vm->size - PAGE_SIZE - addr;
2104 if (n > count) 2096 if (n > count)
2105 n = count; 2097 n = count;
2106 if (!(tmp->flags & VM_IOREMAP)) { 2098 if (!(vm->flags & VM_IOREMAP)) {
2107 aligned_vwrite(buf, addr, n); 2099 aligned_vwrite(buf, addr, n);
2108 copied++; 2100 copied++;
2109 } 2101 }
@@ -2112,7 +2104,7 @@ long vwrite(char *buf, char *addr, unsigned long count)
2112 count -= n; 2104 count -= n;
2113 } 2105 }
2114finished: 2106finished:
2115 read_unlock(&vmlist_lock); 2107 spin_unlock(&vmap_area_lock);
2116 if (!copied) 2108 if (!copied)
2117 return 0; 2109 return 0;
2118 return buflen; 2110 return buflen;
@@ -2519,19 +2511,19 @@ void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms)
2519 2511
2520#ifdef CONFIG_PROC_FS 2512#ifdef CONFIG_PROC_FS
2521static void *s_start(struct seq_file *m, loff_t *pos) 2513static void *s_start(struct seq_file *m, loff_t *pos)
2522 __acquires(&vmlist_lock) 2514 __acquires(&vmap_area_lock)
2523{ 2515{
2524 loff_t n = *pos; 2516 loff_t n = *pos;
2525 struct vm_struct *v; 2517 struct vmap_area *va;
2526 2518
2527 read_lock(&vmlist_lock); 2519 spin_lock(&vmap_area_lock);
2528 v = vmlist; 2520 va = list_entry((&vmap_area_list)->next, typeof(*va), list);
2529 while (n > 0 && v) { 2521 while (n > 0 && &va->list != &vmap_area_list) {
2530 n--; 2522 n--;
2531 v = v->next; 2523 va = list_entry(va->list.next, typeof(*va), list);
2532 } 2524 }
2533 if (!n) 2525 if (!n && &va->list != &vmap_area_list)
2534 return v; 2526 return va;
2535 2527
2536 return NULL; 2528 return NULL;
2537 2529
@@ -2539,16 +2531,20 @@ static void *s_start(struct seq_file *m, loff_t *pos)
2539 2531
2540static void *s_next(struct seq_file *m, void *p, loff_t *pos) 2532static void *s_next(struct seq_file *m, void *p, loff_t *pos)
2541{ 2533{
2542 struct vm_struct *v = p; 2534 struct vmap_area *va = p, *next;
2543 2535
2544 ++*pos; 2536 ++*pos;
2545 return v->next; 2537 next = list_entry(va->list.next, typeof(*va), list);
2538 if (&next->list != &vmap_area_list)
2539 return next;
2540
2541 return NULL;
2546} 2542}
2547 2543
2548static void s_stop(struct seq_file *m, void *p) 2544static void s_stop(struct seq_file *m, void *p)
2549 __releases(&vmlist_lock) 2545 __releases(&vmap_area_lock)
2550{ 2546{
2551 read_unlock(&vmlist_lock); 2547 spin_unlock(&vmap_area_lock);
2552} 2548}
2553 2549
2554static void show_numa_info(struct seq_file *m, struct vm_struct *v) 2550static void show_numa_info(struct seq_file *m, struct vm_struct *v)
@@ -2559,6 +2555,11 @@ static void show_numa_info(struct seq_file *m, struct vm_struct *v)
2559 if (!counters) 2555 if (!counters)
2560 return; 2556 return;
2561 2557
2558 /* Pair with smp_wmb() in clear_vm_unlist() */
2559 smp_rmb();
2560 if (v->flags & VM_UNLIST)
2561 return;
2562
2562 memset(counters, 0, nr_node_ids * sizeof(unsigned int)); 2563 memset(counters, 0, nr_node_ids * sizeof(unsigned int));
2563 2564
2564 for (nr = 0; nr < v->nr_pages; nr++) 2565 for (nr = 0; nr < v->nr_pages; nr++)
@@ -2572,7 +2573,20 @@ static void show_numa_info(struct seq_file *m, struct vm_struct *v)
2572 2573
2573static int s_show(struct seq_file *m, void *p) 2574static int s_show(struct seq_file *m, void *p)
2574{ 2575{
2575 struct vm_struct *v = p; 2576 struct vmap_area *va = p;
2577 struct vm_struct *v;
2578
2579 if (va->flags & (VM_LAZY_FREE | VM_LAZY_FREEING))
2580 return 0;
2581
2582 if (!(va->flags & VM_VM_AREA)) {
2583 seq_printf(m, "0x%pK-0x%pK %7ld vm_map_ram\n",
2584 (void *)va->va_start, (void *)va->va_end,
2585 va->va_end - va->va_start);
2586 return 0;
2587 }
2588
2589 v = va->vm;
2576 2590
2577 seq_printf(m, "0x%pK-0x%pK %7ld", 2591 seq_printf(m, "0x%pK-0x%pK %7ld",
2578 v->addr, v->addr + v->size, v->size); 2592 v->addr, v->addr + v->size, v->size);
@@ -2645,5 +2659,53 @@ static int __init proc_vmalloc_init(void)
2645 return 0; 2659 return 0;
2646} 2660}
2647module_init(proc_vmalloc_init); 2661module_init(proc_vmalloc_init);
2662
2663void get_vmalloc_info(struct vmalloc_info *vmi)
2664{
2665 struct vmap_area *va;
2666 unsigned long free_area_size;
2667 unsigned long prev_end;
2668
2669 vmi->used = 0;
2670 vmi->largest_chunk = 0;
2671
2672 prev_end = VMALLOC_START;
2673
2674 spin_lock(&vmap_area_lock);
2675
2676 if (list_empty(&vmap_area_list)) {
2677 vmi->largest_chunk = VMALLOC_TOTAL;
2678 goto out;
2679 }
2680
2681 list_for_each_entry(va, &vmap_area_list, list) {
2682 unsigned long addr = va->va_start;
2683
2684 /*
2685 * Some archs keep another range for modules in vmalloc space
2686 */
2687 if (addr < VMALLOC_START)
2688 continue;
2689 if (addr >= VMALLOC_END)
2690 break;
2691
2692 if (va->flags & (VM_LAZY_FREE | VM_LAZY_FREEING))
2693 continue;
2694
2695 vmi->used += (va->va_end - va->va_start);
2696
2697 free_area_size = addr - prev_end;
2698 if (vmi->largest_chunk < free_area_size)
2699 vmi->largest_chunk = free_area_size;
2700
2701 prev_end = va->va_end;
2702 }
2703
2704 if (VMALLOC_END - prev_end > vmi->largest_chunk)
2705 vmi->largest_chunk = VMALLOC_END - prev_end;
2706
2707out:
2708 spin_unlock(&vmap_area_lock);
2709}
2648#endif 2710#endif
2649 2711
diff --git a/mm/vmpressure.c b/mm/vmpressure.c
new file mode 100644
index 000000000000..736a6011c2c8
--- /dev/null
+++ b/mm/vmpressure.c
@@ -0,0 +1,374 @@
1/*
2 * Linux VM pressure
3 *
4 * Copyright 2012 Linaro Ltd.
5 * Anton Vorontsov <anton.vorontsov@linaro.org>
6 *
7 * Based on ideas from Andrew Morton, David Rientjes, KOSAKI Motohiro,
8 * Leonid Moiseichuk, Mel Gorman, Minchan Kim and Pekka Enberg.
9 *
10 * This program is free software; you can redistribute it and/or modify it
11 * under the terms of the GNU General Public License version 2 as published
12 * by the Free Software Foundation.
13 */
14
15#include <linux/cgroup.h>
16#include <linux/fs.h>
17#include <linux/log2.h>
18#include <linux/sched.h>
19#include <linux/mm.h>
20#include <linux/vmstat.h>
21#include <linux/eventfd.h>
22#include <linux/swap.h>
23#include <linux/printk.h>
24#include <linux/vmpressure.h>
25
26/*
27 * The window size (vmpressure_win) is the number of scanned pages before
 28 * we try to analyze the scanned/reclaimed ratio. So the window is used
 29 * as a rate-limit tunable for the "low" level notification, and also for
 30 * averaging the ratio for medium/critical levels. Using a small window
 31 * size can cause a lot of false positives, but too big a window size
 32 * will delay the notifications.
33 *
34 * As the vmscan reclaimer logic works with chunks which are multiple of
35 * SWAP_CLUSTER_MAX, it makes sense to use it for the window size as well.
36 *
37 * TODO: Make the window size depend on machine size, as we do for vmstat
38 * thresholds. Currently we set it to 512 pages (2MB for 4KB pages).
39 */
40static const unsigned long vmpressure_win = SWAP_CLUSTER_MAX * 16;
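A quick sanity check of the default, assuming the usual SWAP_CLUSTER_MAX of 32 pages: 32 * 16 = 512 pages per window, and with 4KB pages that is 512 * 4KB = 2MB, matching the figure in the TODO above.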
41
42/*
43 * These thresholds are used when we account memory pressure through
 44 * the scanned/reclaimed ratio. The current values were chosen empirically.
 45 * In essence, they are percents: the higher the value, the more
 46 * unsuccessful reclaims there were.
47 */
48static const unsigned int vmpressure_level_med = 60;
49static const unsigned int vmpressure_level_critical = 95;
50
51/*
 52 * When there are too few pages left to scan, vmpressure() may miss the
 53 * critical pressure as the number of pages will be less than "window size".
 54 * However, in that case the vmscan priority will rise fast as the
55 * reclaimer will try to scan LRUs more deeply.
56 *
57 * The vmscan logic considers these special priorities:
58 *
59 * prio == DEF_PRIORITY (12): reclaimer starts with that value
60 * prio <= DEF_PRIORITY - 2 : kswapd becomes somewhat overwhelmed
61 * prio == 0 : close to OOM, kernel scans every page in an lru
62 *
63 * Any value in this range is acceptable for this tunable (i.e. from 12 to
 64 * 0). The current value of vmpressure_level_critical_prio was chosen
 65 * empirically, but the number, in essence, means that we consider the
 66 * critical level when scanning depth is ~10% of the lru size (vmscan
 67 * scans 'lru_size >> prio' pages, so it is actually 12.5%, or one
 68 * eighth).
69 */
70static const unsigned int vmpressure_level_critical_prio = ilog2(100 / 10);
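Working through the constant: 100 / 10 = 10 and ilog2(10) = 3, so only prio <= 3 is treated as critical. At prio == 3 vmscan scans lru_size >> 3 pages per pass, i.e. one eighth (12.5%) of the LRU, which is the "roughly 10%" scanning depth the comment above refers to.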
71
72static struct vmpressure *work_to_vmpressure(struct work_struct *work)
73{
74 return container_of(work, struct vmpressure, work);
75}
76
77static struct vmpressure *cg_to_vmpressure(struct cgroup *cg)
78{
79 return css_to_vmpressure(cgroup_subsys_state(cg, mem_cgroup_subsys_id));
80}
81
82static struct vmpressure *vmpressure_parent(struct vmpressure *vmpr)
83{
84 struct cgroup *cg = vmpressure_to_css(vmpr)->cgroup;
85 struct mem_cgroup *memcg = mem_cgroup_from_cont(cg);
86
87 memcg = parent_mem_cgroup(memcg);
88 if (!memcg)
89 return NULL;
90 return memcg_to_vmpressure(memcg);
91}
92
93enum vmpressure_levels {
94 VMPRESSURE_LOW = 0,
95 VMPRESSURE_MEDIUM,
96 VMPRESSURE_CRITICAL,
97 VMPRESSURE_NUM_LEVELS,
98};
99
100static const char * const vmpressure_str_levels[] = {
101 [VMPRESSURE_LOW] = "low",
102 [VMPRESSURE_MEDIUM] = "medium",
103 [VMPRESSURE_CRITICAL] = "critical",
104};
105
106static enum vmpressure_levels vmpressure_level(unsigned long pressure)
107{
108 if (pressure >= vmpressure_level_critical)
109 return VMPRESSURE_CRITICAL;
110 else if (pressure >= vmpressure_level_med)
111 return VMPRESSURE_MEDIUM;
112 return VMPRESSURE_LOW;
113}
114
115static enum vmpressure_levels vmpressure_calc_level(unsigned long scanned,
116 unsigned long reclaimed)
117{
118 unsigned long scale = scanned + reclaimed;
119 unsigned long pressure;
120
121 /*
122 * We calculate the ratio (in percents) of how many pages were
123 * scanned vs. reclaimed in a given time frame (window). Note that
124 * time is in the VM reclaimer's "ticks", i.e. the number of pages
125 * scanned. This makes it possible to set the desired reaction time
126 * and serves as a ratelimit.
127 */
128 pressure = scale - (reclaimed * scale / scanned);
129 pressure = pressure * 100 / scale;
130
131 pr_debug("%s: %3lu (s: %lu r: %lu)\n", __func__, pressure,
132 scanned, reclaimed);
133
134 return vmpressure_level(pressure);
135}
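A worked example with made-up numbers: for scanned = 512 and reclaimed = 128, scale = 640, so pressure = 640 - (128 * 640 / 512) = 640 - 160 = 480, and 480 * 100 / 640 = 75. That is above vmpressure_level_med (60) but below vmpressure_level_critical (95), so the window is reported as "medium".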
136
137struct vmpressure_event {
138 struct eventfd_ctx *efd;
139 enum vmpressure_levels level;
140 struct list_head node;
141};
142
143static bool vmpressure_event(struct vmpressure *vmpr,
144 unsigned long scanned, unsigned long reclaimed)
145{
146 struct vmpressure_event *ev;
147 enum vmpressure_levels level;
148 bool signalled = false;
149
150 level = vmpressure_calc_level(scanned, reclaimed);
151
152 mutex_lock(&vmpr->events_lock);
153
154 list_for_each_entry(ev, &vmpr->events, node) {
155 if (level >= ev->level) {
156 eventfd_signal(ev->efd, 1);
157 signalled = true;
158 }
159 }
160
161 mutex_unlock(&vmpr->events_lock);
162
163 return signalled;
164}
165
166static void vmpressure_work_fn(struct work_struct *work)
167{
168 struct vmpressure *vmpr = work_to_vmpressure(work);
169 unsigned long scanned;
170 unsigned long reclaimed;
171
172 /*
173 * Several contexts might be calling vmpressure(), so it is
174 * possible that the work was rescheduled again before the old
175 * work context cleared the counters. In that case we will run
176 * just after the old work returns, but then scanned might be zero
177 * here. No need for any locks here since we don't care if
178 * vmpr->reclaimed is in sync.
179 */
180 if (!vmpr->scanned)
181 return;
182
183 mutex_lock(&vmpr->sr_lock);
184 scanned = vmpr->scanned;
185 reclaimed = vmpr->reclaimed;
186 vmpr->scanned = 0;
187 vmpr->reclaimed = 0;
188 mutex_unlock(&vmpr->sr_lock);
189
190 do {
191 if (vmpressure_event(vmpr, scanned, reclaimed))
192 break;
193 /*
194 * If not handled, propagate the event upward into the
195 * hierarchy.
196 */
197 } while ((vmpr = vmpressure_parent(vmpr)));
198}
199
200/**
201 * vmpressure() - Account memory pressure through scanned/reclaimed ratio
202 * @gfp: reclaimer's gfp mask
203 * @memcg: cgroup memory controller handle
204 * @scanned: number of pages scanned
205 * @reclaimed: number of pages reclaimed
206 *
207 * This function should be called from the vmscan reclaim path to account
208 * "instantaneous" memory pressure (scanned/reclaimed ratio). The raw
209 * pressure index is then further refined and averaged over time.
210 *
211 * This function does not return any value.
212 */
213void vmpressure(gfp_t gfp, struct mem_cgroup *memcg,
214 unsigned long scanned, unsigned long reclaimed)
215{
216 struct vmpressure *vmpr = memcg_to_vmpressure(memcg);
217
218 /*
219 * Here we only want to account pressure that userland is able to
220 * help us with. For example, suppose that DMA zone is under
221 * pressure; if we notify userland about that kind of pressure,
222 * then it will be mostly a waste as it will trigger unnecessary
223 * freeing of memory by userland (since userland is more likely to
224 * have HIGHMEM/MOVABLE pages instead of the DMA fallback). That
225 * is why we include only movable, highmem and FS/IO pages.
226 * Indirect reclaim (kswapd) sets sc->gfp_mask to GFP_KERNEL, so
227 * we account it too.
228 */
229 if (!(gfp & (__GFP_HIGHMEM | __GFP_MOVABLE | __GFP_IO | __GFP_FS)))
230 return;
231
232 /*
233 * If we got here with no pages scanned, then that is an indicator
234 * that reclaimer was unable to find any shrinkable LRUs at the
235 * current scanning depth. But it does not mean that we should
236 * report the critical pressure, yet. If the scanning priority
237 * (scanning depth) goes too high (deep), we will be notified
238 * through vmpressure_prio(). But so far, keep calm.
239 */
240 if (!scanned)
241 return;
242
243 mutex_lock(&vmpr->sr_lock);
244 vmpr->scanned += scanned;
245 vmpr->reclaimed += reclaimed;
246 scanned = vmpr->scanned;
247 mutex_unlock(&vmpr->sr_lock);
248
249 if (scanned < vmpressure_win || work_pending(&vmpr->work))
250 return;
251 schedule_work(&vmpr->work);
252}
253
254/**
255 * vmpressure_prio() - Account memory pressure through reclaimer priority level
256 * @gfp: reclaimer's gfp mask
257 * @memcg: cgroup memory controller handle
258 * @prio: reclaimer's priority
259 *
260 * This function should be called from the reclaim path every time
261 * the vmscan reclaim priority (scanning depth) changes.
262 *
263 * This function does not return any value.
264 */
265void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio)
266{
267 /*
268 * We only use prio for accounting critical level. For more info
269 * see comment for vmpressure_level_critical_prio variable above.
270 */
271 if (prio > vmpressure_level_critical_prio)
272 return;
273
274 /*
275 * OK, the prio is below the threshold; update the vmpressure
276 * information before the shrinker dives into a long, deep
277 * vmscan pass. Passing scanned = vmpressure_win, reclaimed = 0
278 * to vmpressure() basically means that we signal the 'critical'
279 * level.
280 */
281 vmpressure(gfp, memcg, vmpressure_win, 0);
282}
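Roughly speaking, if nothing else has accumulated in the current window, vmpressure_calc_level() then sees reclaimed = 0, so pressure = scale - 0 = scale and scale * 100 / scale = 100, which is above vmpressure_level_critical (95) regardless of the window size; hence the 'critical' signal the comment mentions.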
283
284/**
285 * vmpressure_register_event() - Bind vmpressure notifications to an eventfd
286 * @cg: cgroup that is interested in vmpressure notifications
287 * @cft: cgroup control files handle
288 * @eventfd: eventfd context to link notifications with
289 * @args: event arguments (used to set up a pressure level threshold)
290 *
291 * This function associates an eventfd context with the vmpressure
292 * infrastructure, so that the notifications will be delivered to the
293 * @eventfd. The @args parameter is a string that denotes the pressure
294 * level threshold (one of vmpressure_str_levels, i.e. "low", "medium",
295 * or "critical").
296 *
297 * This function should not be used directly, just pass it to (struct
298 * cftype).register_event, and then cgroup core will handle everything by
299 * itself.
300 */
301int vmpressure_register_event(struct cgroup *cg, struct cftype *cft,
302 struct eventfd_ctx *eventfd, const char *args)
303{
304 struct vmpressure *vmpr = cg_to_vmpressure(cg);
305 struct vmpressure_event *ev;
306 int level;
307
308 for (level = 0; level < VMPRESSURE_NUM_LEVELS; level++) {
309 if (!strcmp(vmpressure_str_levels[level], args))
310 break;
311 }
312
313 if (level >= VMPRESSURE_NUM_LEVELS)
314 return -EINVAL;
315
316 ev = kzalloc(sizeof(*ev), GFP_KERNEL);
317 if (!ev)
318 return -ENOMEM;
319
320 ev->efd = eventfd;
321 ev->level = level;
322
323 mutex_lock(&vmpr->events_lock);
324 list_add(&ev->node, &vmpr->events);
325 mutex_unlock(&vmpr->events_lock);
326
327 return 0;
328}
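To make the registration flow above concrete, here is a minimal userspace sketch of a listener. It assumes the memcg side of this series exposes the level through a memory.pressure_level control file and uses the existing cgroup-v1 cgroup.event_control registration scheme; the file names and mount point are assumptions, not taken from the hunks in this patch.

#include <sys/eventfd.h>
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	/* Paths are assumptions about the memcg mount and control file names. */
	int efd = eventfd(0, 0);
	int lfd = open("/sys/fs/cgroup/memory/memory.pressure_level", O_RDONLY);
	int cfd = open("/sys/fs/cgroup/memory/cgroup.event_control", O_WRONLY);
	char line[64];
	uint64_t count;

	if (efd < 0 || lfd < 0 || cfd < 0)
		return 1;

	/* "<eventfd fd> <pressure_level fd> <level>" registers the listener */
	snprintf(line, sizeof(line), "%d %d medium", efd, lfd);
	if (write(cfd, line, strlen(line)) < 0)
		return 1;

	/*
	 * vmpressure_event() signals every registered eventfd whose level is
	 * at or below the computed one, so this read() returns for "medium"
	 * and "critical" windows.
	 */
	while (read(efd, &count, sizeof(count)) == sizeof(count))
		printf("memory pressure event, count=%llu\n",
		       (unsigned long long)count);

	return 0;
}

The eventfd_signal(ev->efd, 1) call in vmpressure_event() above is what completes each read() in this sketch.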
329
330/**
331 * vmpressure_unregister_event() - Unbind eventfd from vmpressure
332 * @cg: cgroup handle
333 * @cft: cgroup control files handle
334 * @eventfd: eventfd context that was used to link vmpressure with the @cg
335 *
336 * This function does internal manipulations to detach the @eventfd from
337 * the vmpressure notifications, and then frees internal resources
338 * associated with the @eventfd (but the @eventfd itself is not freed).
339 *
340 * This function should not be used directly, just pass it to (struct
341 * cftype).unregister_event, and then cgroup core will handle everything
342 * by itself.
343 */
344void vmpressure_unregister_event(struct cgroup *cg, struct cftype *cft,
345 struct eventfd_ctx *eventfd)
346{
347 struct vmpressure *vmpr = cg_to_vmpressure(cg);
348 struct vmpressure_event *ev;
349
350 mutex_lock(&vmpr->events_lock);
351 list_for_each_entry(ev, &vmpr->events, node) {
352 if (ev->efd != eventfd)
353 continue;
354 list_del(&ev->node);
355 kfree(ev);
356 break;
357 }
358 mutex_unlock(&vmpr->events_lock);
359}
360
361/**
362 * vmpressure_init() - Initialize vmpressure control structure
363 * @vmpr: Structure to be initialized
364 *
365 * This function should be called on every allocated vmpressure structure
366 * before any usage.
367 */
368void vmpressure_init(struct vmpressure *vmpr)
369{
370 mutex_init(&vmpr->sr_lock);
371 mutex_init(&vmpr->events_lock);
372 INIT_LIST_HEAD(&vmpr->events);
373 INIT_WORK(&vmpr->work, vmpressure_work_fn);
374}
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 88c5fed8b9a4..fa6a85378ee4 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -19,6 +19,7 @@
19#include <linux/pagemap.h> 19#include <linux/pagemap.h>
20#include <linux/init.h> 20#include <linux/init.h>
21#include <linux/highmem.h> 21#include <linux/highmem.h>
22#include <linux/vmpressure.h>
22#include <linux/vmstat.h> 23#include <linux/vmstat.h>
23#include <linux/file.h> 24#include <linux/file.h>
24#include <linux/writeback.h> 25#include <linux/writeback.h>
@@ -780,7 +781,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
780 if (PageAnon(page) && !PageSwapCache(page)) { 781 if (PageAnon(page) && !PageSwapCache(page)) {
781 if (!(sc->gfp_mask & __GFP_IO)) 782 if (!(sc->gfp_mask & __GFP_IO))
782 goto keep_locked; 783 goto keep_locked;
783 if (!add_to_swap(page)) 784 if (!add_to_swap(page, page_list))
784 goto activate_locked; 785 goto activate_locked;
785 may_enter_fs = 1; 786 may_enter_fs = 1;
786 } 787 }
@@ -1982,6 +1983,11 @@ static void shrink_zone(struct zone *zone, struct scan_control *sc)
1982 } 1983 }
1983 memcg = mem_cgroup_iter(root, memcg, &reclaim); 1984 memcg = mem_cgroup_iter(root, memcg, &reclaim);
1984 } while (memcg); 1985 } while (memcg);
1986
1987 vmpressure(sc->gfp_mask, sc->target_mem_cgroup,
1988 sc->nr_scanned - nr_scanned,
1989 sc->nr_reclaimed - nr_reclaimed);
1990
1985 } while (should_continue_reclaim(zone, sc->nr_reclaimed - nr_reclaimed, 1991 } while (should_continue_reclaim(zone, sc->nr_reclaimed - nr_reclaimed,
1986 sc->nr_scanned - nr_scanned, sc)); 1992 sc->nr_scanned - nr_scanned, sc));
1987} 1993}
@@ -2167,6 +2173,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
2167 count_vm_event(ALLOCSTALL); 2173 count_vm_event(ALLOCSTALL);
2168 2174
2169 do { 2175 do {
2176 vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup,
2177 sc->priority);
2170 sc->nr_scanned = 0; 2178 sc->nr_scanned = 0;
2171 aborted_reclaim = shrink_zones(zonelist, sc); 2179 aborted_reclaim = shrink_zones(zonelist, sc);
2172 2180
@@ -2619,7 +2627,6 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
2619 bool pgdat_is_balanced = false; 2627 bool pgdat_is_balanced = false;
2620 int i; 2628 int i;
2621 int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ 2629 int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */
2622 unsigned long total_scanned;
2623 struct reclaim_state *reclaim_state = current->reclaim_state; 2630 struct reclaim_state *reclaim_state = current->reclaim_state;
2624 unsigned long nr_soft_reclaimed; 2631 unsigned long nr_soft_reclaimed;
2625 unsigned long nr_soft_scanned; 2632 unsigned long nr_soft_scanned;
@@ -2639,7 +2646,6 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
2639 .gfp_mask = sc.gfp_mask, 2646 .gfp_mask = sc.gfp_mask,
2640 }; 2647 };
2641loop_again: 2648loop_again:
2642 total_scanned = 0;
2643 sc.priority = DEF_PRIORITY; 2649 sc.priority = DEF_PRIORITY;
2644 sc.nr_reclaimed = 0; 2650 sc.nr_reclaimed = 0;
2645 sc.may_writepage = !laptop_mode; 2651 sc.may_writepage = !laptop_mode;
@@ -2730,7 +2736,6 @@ loop_again:
2730 order, sc.gfp_mask, 2736 order, sc.gfp_mask,
2731 &nr_soft_scanned); 2737 &nr_soft_scanned);
2732 sc.nr_reclaimed += nr_soft_reclaimed; 2738 sc.nr_reclaimed += nr_soft_reclaimed;
2733 total_scanned += nr_soft_scanned;
2734 2739
2735 /* 2740 /*
2736 * We put equal pressure on every zone, unless 2741 * We put equal pressure on every zone, unless
@@ -2765,7 +2770,6 @@ loop_again:
2765 reclaim_state->reclaimed_slab = 0; 2770 reclaim_state->reclaimed_slab = 0;
2766 nr_slab = shrink_slab(&shrink, sc.nr_scanned, lru_pages); 2771 nr_slab = shrink_slab(&shrink, sc.nr_scanned, lru_pages);
2767 sc.nr_reclaimed += reclaim_state->reclaimed_slab; 2772 sc.nr_reclaimed += reclaim_state->reclaimed_slab;
2768 total_scanned += sc.nr_scanned;
2769 2773
2770 if (nr_slab == 0 && !zone_reclaimable(zone)) 2774 if (nr_slab == 0 && !zone_reclaimable(zone))
2771 zone->all_unreclaimable = 1; 2775 zone->all_unreclaimable = 1;
@@ -3188,9 +3192,9 @@ int kswapd_run(int nid)
3188 if (IS_ERR(pgdat->kswapd)) { 3192 if (IS_ERR(pgdat->kswapd)) {
3189 /* failure at boot is fatal */ 3193 /* failure at boot is fatal */
3190 BUG_ON(system_state == SYSTEM_BOOTING); 3194 BUG_ON(system_state == SYSTEM_BOOTING);
3191 pgdat->kswapd = NULL;
3192 pr_err("Failed to start kswapd on node %d\n", nid); 3195 pr_err("Failed to start kswapd on node %d\n", nid);
3193 ret = PTR_ERR(pgdat->kswapd); 3196 ret = PTR_ERR(pgdat->kswapd);
3197 pgdat->kswapd = NULL;
3194 } 3198 }
3195 return ret; 3199 return ret;
3196} 3200}
diff --git a/mm/vmstat.c b/mm/vmstat.c
index e1d8ed172c42..f42745e65780 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -52,7 +52,6 @@ void all_vm_events(unsigned long *ret)
52} 52}
53EXPORT_SYMBOL_GPL(all_vm_events); 53EXPORT_SYMBOL_GPL(all_vm_events);
54 54
55#ifdef CONFIG_HOTPLUG
56/* 55/*
57 * Fold the foreign cpu events into our own. 56 * Fold the foreign cpu events into our own.
58 * 57 *
@@ -69,7 +68,6 @@ void vm_events_fold_cpu(int cpu)
69 fold_state->event[i] = 0; 68 fold_state->event[i] = 0;
70 } 69 }
71} 70}
72#endif /* CONFIG_HOTPLUG */
73 71
74#endif /* CONFIG_VM_EVENT_COUNTERS */ 72#endif /* CONFIG_VM_EVENT_COUNTERS */
75 73
@@ -495,6 +493,10 @@ void refresh_cpu_vm_stats(int cpu)
495 atomic_long_add(global_diff[i], &vm_stat[i]); 493 atomic_long_add(global_diff[i], &vm_stat[i]);
496} 494}
497 495
496/*
497 * This is only called if !populated_zone(zone), which implies that no other
498 * users of pset->vm_stat_diff[] exist.
499 */
498void drain_zonestat(struct zone *zone, struct per_cpu_pageset *pset) 500void drain_zonestat(struct zone *zone, struct per_cpu_pageset *pset)
499{ 501{
500 int i; 502 int i;