Diffstat (limited to 'mm')
-rw-r--r-- | mm/Kconfig             |  16
-rw-r--r-- | mm/Makefile            |   2
-rw-r--r-- | mm/bounce.c            |  21
-rw-r--r-- | mm/filemap.c           |  34
-rw-r--r-- | mm/fremap.c            |  17
-rw-r--r-- | mm/huge_memory.c       |  77
-rw-r--r-- | mm/hugetlb.c           |  63
-rw-r--r-- | mm/madvise.c           |  31
-rw-r--r-- | mm/memblock.c          |  12
-rw-r--r-- | mm/memcontrol.c        | 338
-rw-r--r-- | mm/memory-failure.c    |   4
-rw-r--r-- | mm/memory.c            |  61
-rw-r--r-- | mm/memory_hotplug.c    | 101
-rw-r--r-- | mm/migrate.c           |  24
-rw-r--r-- | mm/mlock.c             |  11
-rw-r--r-- | mm/mmap.c              | 202
-rw-r--r-- | mm/nobootmem.c         |   6
-rw-r--r-- | mm/nommu.c             |  80
-rw-r--r-- | mm/page-writeback.c    |   4
-rw-r--r-- | mm/page_alloc.c        |  78
-rw-r--r-- | mm/page_io.c           |  36
-rw-r--r-- | mm/process_vm_access.c |   8
-rw-r--r-- | mm/rmap.c              |   3
-rw-r--r-- | mm/shmem.c             |   5
-rw-r--r-- | mm/slab.c              |   8
-rw-r--r-- | mm/slub.c              |   9
-rw-r--r-- | mm/sparse-vmemmap.c    |  27
-rw-r--r-- | mm/sparse.c            |  82
-rw-r--r-- | mm/swap.c              |  11
-rw-r--r-- | mm/swap_state.c        |   6
-rw-r--r-- | mm/swapfile.c          |   2
-rw-r--r-- | mm/vmalloc.c           | 218
-rw-r--r-- | mm/vmpressure.c        | 374
-rw-r--r-- | mm/vmscan.c            |  16
-rw-r--r-- | mm/vmstat.c            |   6
35 files changed, 1504 insertions, 489 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index ae55c1e04d10..e742d06285b7 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -263,8 +263,14 @@ config ZONE_DMA_FLAG
 	default "1"
 
 config BOUNCE
-	def_bool y
+	bool "Enable bounce buffers"
+	default y
 	depends on BLOCK && MMU && (ZONE_DMA || HIGHMEM)
+	help
+	  Enable bounce buffers for devices that cannot access
+	  the full range of memory available to the CPU. Enabled
+	  by default when ZONE_DMA or HIGHMEM is selected, but you
+	  may say n to override this.
 
 # On the 'tile' arch, USB OHCI needs the bounce pool since tilegx will often
 # have more than 4GB of memory, but we don't currently use the IOTLB to present
@@ -286,8 +292,12 @@ config NR_QUICK
 	default "1"
 
 config VIRT_TO_BUS
-	def_bool y
-	depends on HAVE_VIRT_TO_BUS
+	bool
+	help
+	  An architecture should select this if it implements the
+	  deprecated interface virt_to_bus(). All new architectures
+	  should probably not select this.
+
 
 config MMU_NOTIFIER
 	bool
diff --git a/mm/Makefile b/mm/Makefile
index 3a4628751f89..72c5acb9345f 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -50,7 +50,7 @@ obj-$(CONFIG_FS_XIP) += filemap_xip.o
 obj-$(CONFIG_MIGRATION) += migrate.o
 obj-$(CONFIG_QUICKLIST) += quicklist.o
 obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o
-obj-$(CONFIG_MEMCG) += memcontrol.o page_cgroup.o
+obj-$(CONFIG_MEMCG) += memcontrol.o page_cgroup.o vmpressure.o
 obj-$(CONFIG_CGROUP_HUGETLB) += hugetlb_cgroup.o
 obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o
 obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o
diff --git a/mm/bounce.c b/mm/bounce.c
index 5f8901768602..a5c2ec3589cb 100644
--- a/mm/bounce.c
+++ b/mm/bounce.c
@@ -181,32 +181,13 @@ static void bounce_end_io_read_isa(struct bio *bio, int err)
 #ifdef CONFIG_NEED_BOUNCE_POOL
 static int must_snapshot_stable_pages(struct request_queue *q, struct bio *bio)
 {
-	struct page *page;
-	struct backing_dev_info *bdi;
-	struct address_space *mapping;
-	struct bio_vec *from;
-	int i;
-
 	if (bio_data_dir(bio) != WRITE)
 		return 0;
 
 	if (!bdi_cap_stable_pages_required(&q->backing_dev_info))
 		return 0;
 
-	/*
-	 * Based on the first page that has a valid mapping, decide whether or
-	 * not we have to employ bounce buffering to guarantee stable pages.
-	 */
-	bio_for_each_segment(from, bio, i) {
-		page = from->bv_page;
-		mapping = page_mapping(page);
-		if (!mapping)
-			continue;
-		bdi = mapping->backing_dev_info;
-		return mapping->host->i_sb->s_flags & MS_SNAP_STABLE;
-	}
-
-	return 0;
+	return test_bit(BIO_SNAP_STABLE, &bio->bi_flags);
 }
 #else
 static int must_snapshot_stable_pages(struct request_queue *q, struct bio *bio)
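
Note: with this change the per-page scan is gone and the snapshot decision travels on the bio itself. The submitter side is not part of this diff; as a hypothetical illustration only (the helper name below is made up), a filesystem whose superblock requests stable-page snapshots would mark each write bio, and must_snapshot_stable_pages() above then reduces to a flag test:

/* Hypothetical illustration, not from this patch. */
static void mark_bio_snap_stable(struct super_block *sb, struct bio *bio)
{
	if (sb->s_flags & MS_SNAP_STABLE)
		set_bit(BIO_SNAP_STABLE, &bio->bi_flags);
}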
diff --git a/mm/filemap.c b/mm/filemap.c
index e1979fdca805..e989fb1eaa72 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -35,6 +35,9 @@
 #include <linux/cleancache.h>
 #include "internal.h"
 
+#define CREATE_TRACE_POINTS
+#include <trace/events/filemap.h>
+
 /*
  * FIXME: remove all knowledge of the buffer layer from the core VM
  */
@@ -113,6 +116,7 @@ void __delete_from_page_cache(struct page *page)
 {
 	struct address_space *mapping = page->mapping;
 
+	trace_mm_filemap_delete_from_page_cache(page);
 	/*
 	 * if we're uptodate, flush out into the cleancache, otherwise
 	 * invalidate any existing cleancache entries.  We can't leave
@@ -184,6 +188,17 @@ static int sleep_on_page_killable(void *word)
 	return fatal_signal_pending(current) ? -EINTR : 0;
 }
 
+static int filemap_check_errors(struct address_space *mapping)
+{
+	int ret = 0;
+	/* Check for outstanding write errors */
+	if (test_and_clear_bit(AS_ENOSPC, &mapping->flags))
+		ret = -ENOSPC;
+	if (test_and_clear_bit(AS_EIO, &mapping->flags))
+		ret = -EIO;
+	return ret;
+}
+
 /**
  * __filemap_fdatawrite_range - start writeback on mapping dirty pages in range
  * @mapping:	address space structure to write
@@ -265,10 +280,10 @@ int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte,
 	pgoff_t end = end_byte >> PAGE_CACHE_SHIFT;
 	struct pagevec pvec;
 	int nr_pages;
-	int ret = 0;
+	int ret2, ret = 0;
 
 	if (end_byte < start_byte)
-		return 0;
+		goto out;
 
 	pagevec_init(&pvec, 0);
 	while ((index <= end) &&
@@ -291,12 +306,10 @@ int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte,
 		pagevec_release(&pvec);
 		cond_resched();
 	}
-
-	/* Check for outstanding write errors */
-	if (test_and_clear_bit(AS_ENOSPC, &mapping->flags))
-		ret = -ENOSPC;
-	if (test_and_clear_bit(AS_EIO, &mapping->flags))
-		ret = -EIO;
+out:
+	ret2 = filemap_check_errors(mapping);
+	if (!ret)
+		ret = ret2;
 
 	return ret;
 }
@@ -337,6 +350,8 @@ int filemap_write_and_wait(struct address_space *mapping)
 			if (!err)
 				err = err2;
 		}
+	} else {
+		err = filemap_check_errors(mapping);
 	}
 	return err;
 }
@@ -368,6 +383,8 @@ int filemap_write_and_wait_range(struct address_space *mapping,
 			if (!err)
 				err = err2;
 		}
+	} else {
+		err = filemap_check_errors(mapping);
 	}
 	return err;
 }
@@ -464,6 +481,7 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
 		mapping->nrpages++;
 		__inc_zone_page_state(page, NR_FILE_PAGES);
 		spin_unlock_irq(&mapping->tree_lock);
+		trace_mm_filemap_add_to_page_cache(page);
 	} else {
 		page->mapping = NULL;
 		/* Leave page->index set: truncation relies upon it */
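
Note: the error-flag handling that used to live inline in filemap_fdatawait_range() is now shared by three callers, and filemap_write_and_wait{,_range}() report it even when the mapping has no pages. For convenience, a consolidated sketch of the helper as it reads after this patch (reconstructed from the hunks above, not copied from the final tree):

/* Reports and clears any write error recorded on the mapping; -EIO takes
 * precedence over -ENOSPC because it is tested last.
 */
static int filemap_check_errors(struct address_space *mapping)
{
	int ret = 0;

	if (test_and_clear_bit(AS_ENOSPC, &mapping->flags))
		ret = -ENOSPC;
	if (test_and_clear_bit(AS_EIO, &mapping->flags))
		ret = -EIO;
	return ret;
}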
diff --git a/mm/fremap.c b/mm/fremap.c
index 0cd4c11488ed..87da3590c61e 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -129,7 +129,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
 	struct vm_area_struct *vma;
 	int err = -EINVAL;
 	int has_write_lock = 0;
-	vm_flags_t vm_flags;
+	vm_flags_t vm_flags = 0;
 
 	if (prot)
 		return err;
@@ -204,10 +204,8 @@ get_write_lock:
 		unsigned long addr;
 		struct file *file = get_file(vma->vm_file);
 
-		vm_flags = vma->vm_flags;
-		if (!(flags & MAP_NONBLOCK))
-			vm_flags |= VM_POPULATE;
-		addr = mmap_region(file, start, size, vm_flags, pgoff);
+		addr = mmap_region(file, start, size,
+				vma->vm_flags, pgoff);
 		fput(file);
 		if (IS_ERR_VALUE(addr)) {
 			err = addr;
@@ -226,12 +224,6 @@ get_write_lock:
 		mutex_unlock(&mapping->i_mmap_mutex);
 	}
 
-	if (!(flags & MAP_NONBLOCK) && !(vma->vm_flags & VM_POPULATE)) {
-		if (!has_write_lock)
-			goto get_write_lock;
-		vma->vm_flags |= VM_POPULATE;
-	}
-
 	if (vma->vm_flags & VM_LOCKED) {
 		/*
 		 * drop PG_Mlocked flag for over-mapped range
@@ -254,7 +246,8 @@ get_write_lock:
 	 */
 
 out:
-	vm_flags = vma->vm_flags;
+	if (vma)
+		vm_flags = vma->vm_flags;
 	if (likely(!has_write_lock))
 		up_read(&mm->mmap_sem);
 	else
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index e2f7f5aaaafb..03a89a2f464b 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -163,35 +163,34 @@ static int start_khugepaged(void)
 }
 
 static atomic_t huge_zero_refcount;
-static unsigned long huge_zero_pfn __read_mostly;
+static struct page *huge_zero_page __read_mostly;
 
-static inline bool is_huge_zero_pfn(unsigned long pfn)
+static inline bool is_huge_zero_page(struct page *page)
 {
-	unsigned long zero_pfn = ACCESS_ONCE(huge_zero_pfn);
-	return zero_pfn && pfn == zero_pfn;
+	return ACCESS_ONCE(huge_zero_page) == page;
 }
 
 static inline bool is_huge_zero_pmd(pmd_t pmd)
 {
-	return is_huge_zero_pfn(pmd_pfn(pmd));
+	return is_huge_zero_page(pmd_page(pmd));
 }
 
-static unsigned long get_huge_zero_page(void)
+static struct page *get_huge_zero_page(void)
 {
 	struct page *zero_page;
 retry:
 	if (likely(atomic_inc_not_zero(&huge_zero_refcount)))
-		return ACCESS_ONCE(huge_zero_pfn);
+		return ACCESS_ONCE(huge_zero_page);
 
 	zero_page = alloc_pages((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE,
 			HPAGE_PMD_ORDER);
 	if (!zero_page) {
 		count_vm_event(THP_ZERO_PAGE_ALLOC_FAILED);
-		return 0;
+		return NULL;
 	}
 	count_vm_event(THP_ZERO_PAGE_ALLOC);
 	preempt_disable();
-	if (cmpxchg(&huge_zero_pfn, 0, page_to_pfn(zero_page))) {
+	if (cmpxchg(&huge_zero_page, NULL, zero_page)) {
 		preempt_enable();
 		__free_page(zero_page);
 		goto retry;
@@ -200,7 +199,7 @@ retry:
 	/* We take additional reference here. It will be put back by shrinker */
 	atomic_set(&huge_zero_refcount, 2);
 	preempt_enable();
-	return ACCESS_ONCE(huge_zero_pfn);
+	return ACCESS_ONCE(huge_zero_page);
 }
 
 static void put_huge_zero_page(void)
@@ -220,9 +219,9 @@ static int shrink_huge_zero_page(struct shrinker *shrink,
 		return atomic_read(&huge_zero_refcount) == 1 ? HPAGE_PMD_NR : 0;
 
 	if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) {
-		unsigned long zero_pfn = xchg(&huge_zero_pfn, 0);
-		BUG_ON(zero_pfn == 0);
-		__free_page(__pfn_to_page(zero_pfn));
+		struct page *zero_page = xchg(&huge_zero_page, NULL);
+		BUG_ON(zero_page == NULL);
+		__free_page(zero_page);
 	}
 
 	return 0;
@@ -713,6 +712,11 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
 		return VM_FAULT_OOM;
 
 	clear_huge_page(page, haddr, HPAGE_PMD_NR);
+	/*
+	 * The memory barrier inside __SetPageUptodate makes sure that
+	 * clear_huge_page writes become visible before the set_pmd_at()
+	 * write.
+	 */
 	__SetPageUptodate(page);
 
 	spin_lock(&mm->page_table_lock);
@@ -724,12 +728,6 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
 	} else {
 		pmd_t entry;
 		entry = mk_huge_pmd(page, vma);
-		/*
-		 * The spinlocking to take the lru_lock inside
-		 * page_add_new_anon_rmap() acts as a full memory
-		 * barrier to be sure clear_huge_page writes become
-		 * visible after the set_pmd_at() write.
-		 */
 		page_add_new_anon_rmap(page, vma, haddr);
 		set_pmd_at(mm, haddr, pmd, entry);
 		pgtable_trans_huge_deposit(mm, pgtable);
@@ -765,12 +763,12 @@ static inline struct page *alloc_hugepage(int defrag)
 
 static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
 		struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd,
-		unsigned long zero_pfn)
+		struct page *zero_page)
 {
 	pmd_t entry;
 	if (!pmd_none(*pmd))
 		return false;
-	entry = pfn_pmd(zero_pfn, vma->vm_page_prot);
+	entry = mk_pmd(zero_page, vma->vm_page_prot);
 	entry = pmd_wrprotect(entry);
 	entry = pmd_mkhuge(entry);
 	set_pmd_at(mm, haddr, pmd, entry);
@@ -795,20 +793,20 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	if (!(flags & FAULT_FLAG_WRITE) &&
 			transparent_hugepage_use_zero_page()) {
 		pgtable_t pgtable;
-		unsigned long zero_pfn;
+		struct page *zero_page;
 		bool set;
 		pgtable = pte_alloc_one(mm, haddr);
 		if (unlikely(!pgtable))
 			return VM_FAULT_OOM;
-		zero_pfn = get_huge_zero_page();
-		if (unlikely(!zero_pfn)) {
+		zero_page = get_huge_zero_page();
+		if (unlikely(!zero_page)) {
 			pte_free(mm, pgtable);
 			count_vm_event(THP_FAULT_FALLBACK);
 			goto out;
 		}
 		spin_lock(&mm->page_table_lock);
 		set = set_huge_zero_page(pgtable, mm, vma, haddr, pmd,
-				zero_pfn);
+				zero_page);
 		spin_unlock(&mm->page_table_lock);
 		if (!set) {
 			pte_free(mm, pgtable);
@@ -887,16 +885,16 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 	 * a page table.
 	 */
 	if (is_huge_zero_pmd(pmd)) {
-		unsigned long zero_pfn;
+		struct page *zero_page;
 		bool set;
 		/*
 		 * get_huge_zero_page() will never allocate a new page here,
 		 * since we already have a zero page to copy. It just takes a
 		 * reference.
 		 */
-		zero_pfn = get_huge_zero_page();
+		zero_page = get_huge_zero_page();
 		set = set_huge_zero_page(pgtable, dst_mm, vma, addr, dst_pmd,
-				zero_pfn);
+				zero_page);
 		BUG_ON(!set); /* unexpected !pmd_none(dst_pmd) */
 		ret = 0;
 		goto out_unlock;
@@ -1560,7 +1558,8 @@ static int __split_huge_page_splitting(struct page *page,
 	return ret;
 }
 
-static void __split_huge_page_refcount(struct page *page)
+static void __split_huge_page_refcount(struct page *page,
+				       struct list_head *list)
 {
 	int i;
 	struct zone *zone = page_zone(page);
@@ -1646,7 +1645,7 @@ static void __split_huge_page_refcount(struct page *page)
 		BUG_ON(!PageDirty(page_tail));
 		BUG_ON(!PageSwapBacked(page_tail));
 
-		lru_add_page_tail(page, page_tail, lruvec);
+		lru_add_page_tail(page, page_tail, lruvec, list);
 	}
 	atomic_sub(tail_count, &page->_count);
 	BUG_ON(atomic_read(&page->_count) <= 0);
@@ -1753,7 +1752,8 @@ static int __split_huge_page_map(struct page *page,
 
 /* must be called with anon_vma->root->rwsem held */
 static void __split_huge_page(struct page *page,
-			      struct anon_vma *anon_vma)
+			      struct anon_vma *anon_vma,
+			      struct list_head *list)
 {
 	int mapcount, mapcount2;
 	pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
@@ -1784,7 +1784,7 @@ static void __split_huge_page(struct page *page,
 		       mapcount, page_mapcount(page));
 	BUG_ON(mapcount != page_mapcount(page));
 
-	__split_huge_page_refcount(page);
+	__split_huge_page_refcount(page, list);
 
 	mapcount2 = 0;
 	anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
@@ -1799,12 +1799,19 @@ static void __split_huge_page(struct page *page,
 	BUG_ON(mapcount != mapcount2);
 }
 
-int split_huge_page(struct page *page)
+/*
+ * Split a hugepage into normal pages. This doesn't change the position of head
+ * page. If @list is null, tail pages will be added to LRU list, otherwise, to
+ * @list. Both head page and tail pages will inherit mapping, flags, and so on
+ * from the hugepage.
+ * Return 0 if the hugepage is split successfully otherwise return 1.
+ */
+int split_huge_page_to_list(struct page *page, struct list_head *list)
 {
 	struct anon_vma *anon_vma;
 	int ret = 1;
 
-	BUG_ON(is_huge_zero_pfn(page_to_pfn(page)));
+	BUG_ON(is_huge_zero_page(page));
 	BUG_ON(!PageAnon(page));
 
 	/*
@@ -1824,7 +1831,7 @@ int split_huge_page(struct page *page)
 		goto out_unlock;
 
 	BUG_ON(!PageSwapBacked(page));
-	__split_huge_page(page, anon_vma);
+	__split_huge_page(page, anon_vma, list);
 	count_vm_event(THP_SPLIT);
 
 	BUG_ON(PageCompound(page));
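
Note: split_huge_page() itself does not appear in this diff (which is limited to mm/); presumably the old entry point survives in a header as a thin wrapper around the new split_huge_page_to_list(), so existing callers keep the "tail pages go to the LRU" behaviour documented above. A sketch of that assumed wrapper:

/* Assumed wrapper, not taken from this diff: passing a NULL list preserves
 * the old split_huge_page() semantics.
 */
static inline int split_huge_page(struct page *page)
{
	return split_huge_page_to_list(page, NULL);
}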
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 0a0be33bb199..f8feeeca6686 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1761,7 +1761,7 @@ static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp)
  * Unregister hstate attributes from a single node device.
  * No-op if no hstate attributes attached.
  */
-void hugetlb_unregister_node(struct node *node)
+static void hugetlb_unregister_node(struct node *node)
 {
 	struct hstate *h;
 	struct node_hstate *nhs = &node_hstates[node->dev.id];
@@ -1805,7 +1805,7 @@ static void hugetlb_unregister_all_nodes(void)
  * Register hstate attributes for a single node device.
  * No-op if attributes already registered.
  */
-void hugetlb_register_node(struct node *node)
+static void hugetlb_register_node(struct node *node)
 {
 	struct hstate *h;
 	struct node_hstate *nhs = &node_hstates[node->dev.id];
@@ -2121,11 +2121,30 @@ int hugetlb_report_node_meminfo(int nid, char *buf)
 		nid, h->surplus_huge_pages_node[nid]);
 }
 
+void hugetlb_show_meminfo(void)
+{
+	struct hstate *h;
+	int nid;
+
+	for_each_node_state(nid, N_MEMORY)
+		for_each_hstate(h)
+			pr_info("Node %d hugepages_total=%u hugepages_free=%u hugepages_surp=%u hugepages_size=%lukB\n",
+				nid,
+				h->nr_huge_pages_node[nid],
+				h->free_huge_pages_node[nid],
+				h->surplus_huge_pages_node[nid],
+				1UL << (huge_page_order(h) + PAGE_SHIFT - 10));
+}
+
 /* Return the number pages of memory we physically have, in PAGE_SIZE units. */
 unsigned long hugetlb_total_pages(void)
 {
-	struct hstate *h = &default_hstate;
-	return h->nr_huge_pages * pages_per_huge_page(h);
+	struct hstate *h;
+	unsigned long nr_total_pages = 0;
+
+	for_each_hstate(h)
+		nr_total_pages += h->nr_huge_pages * pages_per_huge_page(h);
+	return nr_total_pages;
 }
 
 static int hugetlb_acct_memory(struct hstate *h, long delta)
@@ -2243,10 +2262,11 @@ static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
 	pte_t entry;
 
 	if (writable) {
-		entry =
-		    pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
+		entry = huge_pte_mkwrite(huge_pte_mkdirty(mk_huge_pte(page,
+					 vma->vm_page_prot)));
 	} else {
-		entry = huge_pte_wrprotect(mk_pte(page, vma->vm_page_prot));
+		entry = huge_pte_wrprotect(mk_huge_pte(page,
+					   vma->vm_page_prot));
 	}
 	entry = pte_mkyoung(entry);
 	entry = pte_mkhuge(entry);
@@ -2260,7 +2280,7 @@ static void set_huge_ptep_writable(struct vm_area_struct *vma,
 {
 	pte_t entry;
 
-	entry = pte_mkwrite(pte_mkdirty(huge_ptep_get(ptep)));
+	entry = huge_pte_mkwrite(huge_pte_mkdirty(huge_ptep_get(ptep)));
 	if (huge_ptep_set_access_flags(vma, address, ptep, entry, 1))
 		update_mmu_cache(vma, address, ptep);
 }
@@ -2375,7 +2395,7 @@ again:
 		 * HWPoisoned hugepage is already unmapped and dropped reference
 		 */
 		if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) {
-			pte_clear(mm, address, ptep);
+			huge_pte_clear(mm, address, ptep);
 			continue;
 		}
 
@@ -2399,7 +2419,7 @@ again:
 
 		pte = huge_ptep_get_and_clear(mm, address, ptep);
 		tlb_remove_tlb_entry(tlb, ptep, address);
-		if (pte_dirty(pte))
+		if (huge_pte_dirty(pte))
 			set_page_dirty(page);
 
 		page_remove_rmap(page);
@@ -2852,7 +2872,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	 * page now as it is used to determine if a reservation has been
 	 * consumed.
 	 */
-	if ((flags & FAULT_FLAG_WRITE) && !pte_write(entry)) {
+	if ((flags & FAULT_FLAG_WRITE) && !huge_pte_write(entry)) {
 		if (vma_needs_reservation(h, vma, address) < 0) {
 			ret = VM_FAULT_OOM;
 			goto out_mutex;
@@ -2882,12 +2902,12 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 
 
 	if (flags & FAULT_FLAG_WRITE) {
-		if (!pte_write(entry)) {
+		if (!huge_pte_write(entry)) {
 			ret = hugetlb_cow(mm, vma, address, ptep, entry,
 							pagecache_page);
 			goto out_page_table_lock;
 		}
-		entry = pte_mkdirty(entry);
+		entry = huge_pte_mkdirty(entry);
 	}
 	entry = pte_mkyoung(entry);
 	if (huge_ptep_set_access_flags(vma, address, ptep, entry,
@@ -2957,8 +2977,19 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 			break;
 		}
 
-		if (absent ||
-		    ((flags & FOLL_WRITE) && !pte_write(huge_ptep_get(pte)))) {
+		/*
+		 * We need call hugetlb_fault for both hugepages under migration
+		 * (in which case hugetlb_fault waits for the migration,) and
+		 * hwpoisoned hugepages (in which case we need to prevent the
+		 * caller from accessing to them.) In order to do this, we use
+		 * here is_swap_pte instead of is_hugetlb_entry_migration and
+		 * is_hugetlb_entry_hwpoisoned. This is because it simply covers
+		 * both cases, and because we can't follow correct pages
+		 * directly from any kind of swap entries.
+		 */
+		if (absent || is_swap_pte(huge_ptep_get(pte)) ||
+		    ((flags & FOLL_WRITE) &&
+		      !huge_pte_write(huge_ptep_get(pte)))) {
 			int ret;
 
 			spin_unlock(&mm->page_table_lock);
@@ -3028,7 +3059,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
 		}
 		if (!huge_pte_none(huge_ptep_get(ptep))) {
 			pte = huge_ptep_get_and_clear(mm, address, ptep);
-			pte = pte_mkhuge(pte_modify(pte, newprot));
+			pte = pte_mkhuge(huge_pte_modify(pte, newprot));
 			pte = arch_make_huge_pte(pte, vma, NULL, 0);
 			set_huge_pte_at(mm, address, ptep, pte);
 			pages++;
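
Note: this file now manipulates huge PTEs only through huge_pte_* accessors rather than the bare pte_* helpers, which lets architectures whose huge PTEs differ from regular PTEs (s390, for example) supply their own versions. On architectures where a huge PTE shares the normal PTE layout, the accessors can simply forward to the existing helpers; an illustrative generic fallback (assumed for illustration, not part of this diff) would read:

/* Illustrative fallbacks, assuming huge PTEs use the normal PTE layout. */
static inline pte_t huge_pte_mkwrite(pte_t pte)
{
	return pte_mkwrite(pte);
}

static inline pte_t huge_pte_mkdirty(pte_t pte)
{
	return pte_mkdirty(pte);
}

static inline int huge_pte_write(pte_t pte)
{
	return pte_write(pte);
}

static inline void huge_pte_clear(struct mm_struct *mm, unsigned long addr,
				  pte_t *ptep)
{
	pte_clear(mm, addr, ptep);
}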
diff --git a/mm/madvise.c b/mm/madvise.c
index c58c94b56c3d..7055883e6e25 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -473,27 +473,27 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
 	if (!madvise_behavior_valid(behavior))
 		return error;
 
-	write = madvise_need_mmap_write(behavior);
-	if (write)
-		down_write(&current->mm->mmap_sem);
-	else
-		down_read(&current->mm->mmap_sem);
-
 	if (start & ~PAGE_MASK)
-		goto out;
+		return error;
 	len = (len_in + ~PAGE_MASK) & PAGE_MASK;
 
 	/* Check to see whether len was rounded up from small -ve to zero */
 	if (len_in && !len)
-		goto out;
+		return error;
 
 	end = start + len;
 	if (end < start)
-		goto out;
+		return error;
 
 	error = 0;
 	if (end == start)
-		goto out;
+		return error;
+
+	write = madvise_need_mmap_write(behavior);
+	if (write)
+		down_write(&current->mm->mmap_sem);
+	else
+		down_read(&current->mm->mmap_sem);
 
 	/*
 	 * If the interval [start,end) covers some unmapped address
@@ -509,14 +509,14 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
 		/* Still start < end. */
 		error = -ENOMEM;
 		if (!vma)
-			goto out_plug;
+			goto out;
 
 		/* Here start < (end|vma->vm_end). */
 		if (start < vma->vm_start) {
 			unmapped_error = -ENOMEM;
 			start = vma->vm_start;
 			if (start >= end)
-				goto out_plug;
+				goto out;
 		}
 
 		/* Here vma->vm_start <= start < (end|vma->vm_end) */
@@ -527,21 +527,20 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
 		/* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */
 		error = madvise_vma(vma, &prev, start, tmp, behavior);
 		if (error)
-			goto out_plug;
+			goto out;
 		start = tmp;
 		if (prev && start < prev->vm_end)
 			start = prev->vm_end;
 		error = unmapped_error;
 		if (start >= end)
-			goto out_plug;
+			goto out;
 		if (prev)
 			vma = prev->vm_next;
 		else	/* madvise_remove dropped mmap_sem */
 			vma = find_vma(current->mm, start);
 	}
-out_plug:
-	blk_finish_plug(&plug);
 out:
+	blk_finish_plug(&plug);
 	if (write)
 		up_write(&current->mm->mmap_sem);
 	else
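
Note: after this reordering, every argument sanity check runs before mmap_sem is taken, so an invalid call returns without touching the lock, and the single out: label unplugs and drops the lock on every exit path. Purely for illustration (this helper does not exist in the patch), the pre-lock checks amount to roughly:

/* Hypothetical helper, illustration only: the range validation madvise()
 * now performs before taking mmap_sem; returns 0 when the VMA walk may
 * proceed.
 */
static int madvise_validate_range(unsigned long start, size_t len_in,
				  unsigned long *endp)
{
	size_t len;

	if (start & ~PAGE_MASK)
		return -EINVAL;
	len = (len_in + ~PAGE_MASK) & PAGE_MASK;
	/* catch a small negative len_in that rounded up to zero */
	if (len_in && !len)
		return -EINVAL;
	if (start + len < start)
		return -EINVAL;
	*endp = start + len;
	return 0;
}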
diff --git a/mm/memblock.c b/mm/memblock.c
index b8d9147e5c08..c5fad932fa51 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -322,10 +322,11 @@ static void __init_memblock memblock_merge_regions(struct memblock_type *type)
 
 /**
  * memblock_insert_region - insert new memblock region
  * @type:	memblock type to insert into
  * @idx:	index for the insertion point
  * @base:	base address of the new region
  * @size:	size of the new region
+ * @nid:	node id of the new region
  *
  * Insert new memblock region [@base,@base+@size) into @type at @idx.
  * @type must already have extra room to accomodate the new region.
@@ -771,6 +772,9 @@ static phys_addr_t __init memblock_alloc_base_nid(phys_addr_t size,
 {
 	phys_addr_t found;
 
+	if (WARN_ON(!align))
+		align = __alignof__(long long);
+
 	/* align @size to avoid excessive fragmentation on reserved array */
 	size = round_up(size, align);
 
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 2b552224f5cf..0f1d92163f30 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c | |||
@@ -49,6 +49,7 @@ | |||
49 | #include <linux/fs.h> | 49 | #include <linux/fs.h> |
50 | #include <linux/seq_file.h> | 50 | #include <linux/seq_file.h> |
51 | #include <linux/vmalloc.h> | 51 | #include <linux/vmalloc.h> |
52 | #include <linux/vmpressure.h> | ||
52 | #include <linux/mm_inline.h> | 53 | #include <linux/mm_inline.h> |
53 | #include <linux/page_cgroup.h> | 54 | #include <linux/page_cgroup.h> |
54 | #include <linux/cpu.h> | 55 | #include <linux/cpu.h> |
@@ -152,8 +153,13 @@ struct mem_cgroup_stat_cpu { | |||
152 | }; | 153 | }; |
153 | 154 | ||
154 | struct mem_cgroup_reclaim_iter { | 155 | struct mem_cgroup_reclaim_iter { |
155 | /* css_id of the last scanned hierarchy member */ | 156 | /* |
156 | int position; | 157 | * last scanned hierarchy member. Valid only if last_dead_count |
158 | * matches memcg->dead_count of the hierarchy root group. | ||
159 | */ | ||
160 | struct mem_cgroup *last_visited; | ||
161 | unsigned long last_dead_count; | ||
162 | |||
157 | /* scan generation, increased every round-trip */ | 163 | /* scan generation, increased every round-trip */ |
158 | unsigned int generation; | 164 | unsigned int generation; |
159 | }; | 165 | }; |
@@ -256,6 +262,9 @@ struct mem_cgroup { | |||
256 | */ | 262 | */ |
257 | struct res_counter res; | 263 | struct res_counter res; |
258 | 264 | ||
265 | /* vmpressure notifications */ | ||
266 | struct vmpressure vmpressure; | ||
267 | |||
259 | union { | 268 | union { |
260 | /* | 269 | /* |
261 | * the counter to account for mem+swap usage. | 270 | * the counter to account for mem+swap usage. |
@@ -335,6 +344,7 @@ struct mem_cgroup { | |||
335 | struct mem_cgroup_stat_cpu nocpu_base; | 344 | struct mem_cgroup_stat_cpu nocpu_base; |
336 | spinlock_t pcp_counter_lock; | 345 | spinlock_t pcp_counter_lock; |
337 | 346 | ||
347 | atomic_t dead_count; | ||
338 | #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET) | 348 | #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET) |
339 | struct tcp_memcontrol tcp_mem; | 349 | struct tcp_memcontrol tcp_mem; |
340 | #endif | 350 | #endif |
@@ -353,6 +363,7 @@ struct mem_cgroup { | |||
353 | atomic_t numainfo_events; | 363 | atomic_t numainfo_events; |
354 | atomic_t numainfo_updating; | 364 | atomic_t numainfo_updating; |
355 | #endif | 365 | #endif |
366 | |||
356 | /* | 367 | /* |
357 | * Per cgroup active and inactive list, similar to the | 368 | * Per cgroup active and inactive list, similar to the |
358 | * per zone LRU lists. | 369 | * per zone LRU lists. |
@@ -504,6 +515,24 @@ struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *s) | |||
504 | return container_of(s, struct mem_cgroup, css); | 515 | return container_of(s, struct mem_cgroup, css); |
505 | } | 516 | } |
506 | 517 | ||
518 | /* Some nice accessors for the vmpressure. */ | ||
519 | struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg) | ||
520 | { | ||
521 | if (!memcg) | ||
522 | memcg = root_mem_cgroup; | ||
523 | return &memcg->vmpressure; | ||
524 | } | ||
525 | |||
526 | struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr) | ||
527 | { | ||
528 | return &container_of(vmpr, struct mem_cgroup, vmpressure)->css; | ||
529 | } | ||
530 | |||
531 | struct vmpressure *css_to_vmpressure(struct cgroup_subsys_state *css) | ||
532 | { | ||
533 | return &mem_cgroup_from_css(css)->vmpressure; | ||
534 | } | ||
535 | |||
507 | static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) | 536 | static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) |
508 | { | 537 | { |
509 | return (memcg == root_mem_cgroup); | 538 | return (memcg == root_mem_cgroup); |
@@ -1067,6 +1096,51 @@ struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) | |||
1067 | return memcg; | 1096 | return memcg; |
1068 | } | 1097 | } |
1069 | 1098 | ||
1099 | /* | ||
1100 | * Returns a next (in a pre-order walk) alive memcg (with elevated css | ||
1101 | * ref. count) or NULL if the whole root's subtree has been visited. | ||
1102 | * | ||
1103 | * helper function to be used by mem_cgroup_iter | ||
1104 | */ | ||
1105 | static struct mem_cgroup *__mem_cgroup_iter_next(struct mem_cgroup *root, | ||
1106 | struct mem_cgroup *last_visited) | ||
1107 | { | ||
1108 | struct cgroup *prev_cgroup, *next_cgroup; | ||
1109 | |||
1110 | /* | ||
1111 | * Root is not visited by cgroup iterators so it needs an | ||
1112 | * explicit visit. | ||
1113 | */ | ||
1114 | if (!last_visited) | ||
1115 | return root; | ||
1116 | |||
1117 | prev_cgroup = (last_visited == root) ? NULL | ||
1118 | : last_visited->css.cgroup; | ||
1119 | skip_node: | ||
1120 | next_cgroup = cgroup_next_descendant_pre( | ||
1121 | prev_cgroup, root->css.cgroup); | ||
1122 | |||
1123 | /* | ||
1124 | * Even if we found a group we have to make sure it is | ||
1125 | * alive. css && !memcg means that the groups should be | ||
1126 | * skipped and we should continue the tree walk. | ||
1127 | * last_visited css is safe to use because it is | ||
1128 | * protected by css_get and the tree walk is rcu safe. | ||
1129 | */ | ||
1130 | if (next_cgroup) { | ||
1131 | struct mem_cgroup *mem = mem_cgroup_from_cont( | ||
1132 | next_cgroup); | ||
1133 | if (css_tryget(&mem->css)) | ||
1134 | return mem; | ||
1135 | else { | ||
1136 | prev_cgroup = next_cgroup; | ||
1137 | goto skip_node; | ||
1138 | } | ||
1139 | } | ||
1140 | |||
1141 | return NULL; | ||
1142 | } | ||
1143 | |||
1070 | /** | 1144 | /** |
1071 | * mem_cgroup_iter - iterate over memory cgroup hierarchy | 1145 | * mem_cgroup_iter - iterate over memory cgroup hierarchy |
1072 | * @root: hierarchy root | 1146 | * @root: hierarchy root |
@@ -1089,7 +1163,8 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, | |||
1089 | struct mem_cgroup_reclaim_cookie *reclaim) | 1163 | struct mem_cgroup_reclaim_cookie *reclaim) |
1090 | { | 1164 | { |
1091 | struct mem_cgroup *memcg = NULL; | 1165 | struct mem_cgroup *memcg = NULL; |
1092 | int id = 0; | 1166 | struct mem_cgroup *last_visited = NULL; |
1167 | unsigned long uninitialized_var(dead_count); | ||
1093 | 1168 | ||
1094 | if (mem_cgroup_disabled()) | 1169 | if (mem_cgroup_disabled()) |
1095 | return NULL; | 1170 | return NULL; |
@@ -1098,20 +1173,17 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, | |||
1098 | root = root_mem_cgroup; | 1173 | root = root_mem_cgroup; |
1099 | 1174 | ||
1100 | if (prev && !reclaim) | 1175 | if (prev && !reclaim) |
1101 | id = css_id(&prev->css); | 1176 | last_visited = prev; |
1102 | |||
1103 | if (prev && prev != root) | ||
1104 | css_put(&prev->css); | ||
1105 | 1177 | ||
1106 | if (!root->use_hierarchy && root != root_mem_cgroup) { | 1178 | if (!root->use_hierarchy && root != root_mem_cgroup) { |
1107 | if (prev) | 1179 | if (prev) |
1108 | return NULL; | 1180 | goto out_css_put; |
1109 | return root; | 1181 | return root; |
1110 | } | 1182 | } |
1111 | 1183 | ||
1184 | rcu_read_lock(); | ||
1112 | while (!memcg) { | 1185 | while (!memcg) { |
1113 | struct mem_cgroup_reclaim_iter *uninitialized_var(iter); | 1186 | struct mem_cgroup_reclaim_iter *uninitialized_var(iter); |
1114 | struct cgroup_subsys_state *css; | ||
1115 | 1187 | ||
1116 | if (reclaim) { | 1188 | if (reclaim) { |
1117 | int nid = zone_to_nid(reclaim->zone); | 1189 | int nid = zone_to_nid(reclaim->zone); |
@@ -1120,31 +1192,60 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, | |||
1120 | 1192 | ||
1121 | mz = mem_cgroup_zoneinfo(root, nid, zid); | 1193 | mz = mem_cgroup_zoneinfo(root, nid, zid); |
1122 | iter = &mz->reclaim_iter[reclaim->priority]; | 1194 | iter = &mz->reclaim_iter[reclaim->priority]; |
1123 | if (prev && reclaim->generation != iter->generation) | 1195 | last_visited = iter->last_visited; |
1124 | return NULL; | 1196 | if (prev && reclaim->generation != iter->generation) { |
1125 | id = iter->position; | 1197 | iter->last_visited = NULL; |
1198 | goto out_unlock; | ||
1199 | } | ||
1200 | |||
1201 | /* | ||
1202 | * If the dead_count mismatches, a destruction | ||
1203 | * has happened or is happening concurrently. | ||
1204 | * If the dead_count matches, a destruction | ||
1205 | * might still happen concurrently, but since | ||
1206 | * we checked under RCU, that destruction | ||
1207 | * won't free the object until we release the | ||
1208 | * RCU reader lock. Thus, the dead_count | ||
1209 | * check verifies the pointer is still valid, | ||
1210 | * css_tryget() verifies the cgroup pointed to | ||
1211 | * is alive. | ||
1212 | */ | ||
1213 | dead_count = atomic_read(&root->dead_count); | ||
1214 | smp_rmb(); | ||
1215 | last_visited = iter->last_visited; | ||
1216 | if (last_visited) { | ||
1217 | if ((dead_count != iter->last_dead_count) || | ||
1218 | !css_tryget(&last_visited->css)) { | ||
1219 | last_visited = NULL; | ||
1220 | } | ||
1221 | } | ||
1126 | } | 1222 | } |
1127 | 1223 | ||
1128 | rcu_read_lock(); | 1224 | memcg = __mem_cgroup_iter_next(root, last_visited); |
1129 | css = css_get_next(&mem_cgroup_subsys, id + 1, &root->css, &id); | ||
1130 | if (css) { | ||
1131 | if (css == &root->css || css_tryget(css)) | ||
1132 | memcg = mem_cgroup_from_css(css); | ||
1133 | } else | ||
1134 | id = 0; | ||
1135 | rcu_read_unlock(); | ||
1136 | 1225 | ||
1137 | if (reclaim) { | 1226 | if (reclaim) { |
1138 | iter->position = id; | 1227 | if (last_visited) |
1139 | if (!css) | 1228 | css_put(&last_visited->css); |
1229 | |||
1230 | iter->last_visited = memcg; | ||
1231 | smp_wmb(); | ||
1232 | iter->last_dead_count = dead_count; | ||
1233 | |||
1234 | if (!memcg) | ||
1140 | iter->generation++; | 1235 | iter->generation++; |
1141 | else if (!prev && memcg) | 1236 | else if (!prev && memcg) |
1142 | reclaim->generation = iter->generation; | 1237 | reclaim->generation = iter->generation; |
1143 | } | 1238 | } |
1144 | 1239 | ||
1145 | if (prev && !css) | 1240 | if (prev && !memcg) |
1146 | return NULL; | 1241 | goto out_unlock; |
1147 | } | 1242 | } |
1243 | out_unlock: | ||
1244 | rcu_read_unlock(); | ||
1245 | out_css_put: | ||
1246 | if (prev && prev != root) | ||
1247 | css_put(&prev->css); | ||
1248 | |||
1148 | return memcg; | 1249 | return memcg; |
1149 | } | 1250 | } |
1150 | 1251 | ||
@@ -1686,11 +1787,11 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, | |||
1686 | struct task_struct *chosen = NULL; | 1787 | struct task_struct *chosen = NULL; |
1687 | 1788 | ||
1688 | /* | 1789 | /* |
1689 | * If current has a pending SIGKILL, then automatically select it. The | 1790 | * If current has a pending SIGKILL or is exiting, then automatically |
1690 | * goal is to allow it to allocate so that it may quickly exit and free | 1791 | * select it. The goal is to allow it to allocate so that it may |
1691 | * its memory. | 1792 | * quickly exit and free its memory. |
1692 | */ | 1793 | */ |
1693 | if (fatal_signal_pending(current)) { | 1794 | if (fatal_signal_pending(current) || current->flags & PF_EXITING) { |
1694 | set_thread_flag(TIF_MEMDIE); | 1795 | set_thread_flag(TIF_MEMDIE); |
1695 | return; | 1796 | return; |
1696 | } | 1797 | } |
@@ -3114,12 +3215,12 @@ void memcg_release_cache(struct kmem_cache *s) | |||
3114 | 3215 | ||
3115 | root = s->memcg_params->root_cache; | 3216 | root = s->memcg_params->root_cache; |
3116 | root->memcg_params->memcg_caches[id] = NULL; | 3217 | root->memcg_params->memcg_caches[id] = NULL; |
3117 | mem_cgroup_put(memcg); | ||
3118 | 3218 | ||
3119 | mutex_lock(&memcg->slab_caches_mutex); | 3219 | mutex_lock(&memcg->slab_caches_mutex); |
3120 | list_del(&s->memcg_params->list); | 3220 | list_del(&s->memcg_params->list); |
3121 | mutex_unlock(&memcg->slab_caches_mutex); | 3221 | mutex_unlock(&memcg->slab_caches_mutex); |
3122 | 3222 | ||
3223 | mem_cgroup_put(memcg); | ||
3123 | out: | 3224 | out: |
3124 | kfree(s->memcg_params); | 3225 | kfree(s->memcg_params); |
3125 | } | 3226 | } |
@@ -3220,52 +3321,53 @@ void mem_cgroup_destroy_cache(struct kmem_cache *cachep) | |||
3220 | schedule_work(&cachep->memcg_params->destroy); | 3321 | schedule_work(&cachep->memcg_params->destroy); |
3221 | } | 3322 | } |
3222 | 3323 | ||
3223 | static char *memcg_cache_name(struct mem_cgroup *memcg, struct kmem_cache *s) | 3324 | /* |
3224 | { | 3325 | * This lock protects updaters, not readers. We want readers to be as fast as |
3225 | char *name; | 3326 | * they can, and they will either see NULL or a valid cache value. Our model |
3226 | struct dentry *dentry; | 3327 | * allow them to see NULL, in which case the root memcg will be selected. |
3227 | 3328 | * | |
3228 | rcu_read_lock(); | 3329 | * We need this lock because multiple allocations to the same cache from a non |
3229 | dentry = rcu_dereference(memcg->css.cgroup->dentry); | 3330 | * will span more than one worker. Only one of them can create the cache. |
3230 | rcu_read_unlock(); | 3331 | */ |
3231 | 3332 | static DEFINE_MUTEX(memcg_cache_mutex); | |
3232 | BUG_ON(dentry == NULL); | ||
3233 | |||
3234 | name = kasprintf(GFP_KERNEL, "%s(%d:%s)", s->name, | ||
3235 | memcg_cache_id(memcg), dentry->d_name.name); | ||
3236 | |||
3237 | return name; | ||
3238 | } | ||
3239 | 3333 | ||
3334 | /* | ||
3335 | * Called with memcg_cache_mutex held | ||
3336 | */ | ||
3240 | static struct kmem_cache *kmem_cache_dup(struct mem_cgroup *memcg, | 3337 | static struct kmem_cache *kmem_cache_dup(struct mem_cgroup *memcg, |
3241 | struct kmem_cache *s) | 3338 | struct kmem_cache *s) |
3242 | { | 3339 | { |
3243 | char *name; | ||
3244 | struct kmem_cache *new; | 3340 | struct kmem_cache *new; |
3341 | static char *tmp_name = NULL; | ||
3245 | 3342 | ||
3246 | name = memcg_cache_name(memcg, s); | 3343 | lockdep_assert_held(&memcg_cache_mutex); |
3247 | if (!name) | 3344 | |
3248 | return NULL; | 3345 | /* |
3346 | * kmem_cache_create_memcg duplicates the given name and | ||
3347 | * cgroup_name for this name requires RCU context. | ||
3348 | * This static temporary buffer is used to prevent from | ||
3349 | * pointless shortliving allocation. | ||
3350 | */ | ||
3351 | if (!tmp_name) { | ||
3352 | tmp_name = kmalloc(PATH_MAX, GFP_KERNEL); | ||
3353 | if (!tmp_name) | ||
3354 | return NULL; | ||
3355 | } | ||
3356 | |||
3357 | rcu_read_lock(); | ||
3358 | snprintf(tmp_name, PATH_MAX, "%s(%d:%s)", s->name, | ||
3359 | memcg_cache_id(memcg), cgroup_name(memcg->css.cgroup)); | ||
3360 | rcu_read_unlock(); | ||
3249 | 3361 | ||
3250 | new = kmem_cache_create_memcg(memcg, name, s->object_size, s->align, | 3362 | new = kmem_cache_create_memcg(memcg, tmp_name, s->object_size, s->align, |
3251 | (s->flags & ~SLAB_PANIC), s->ctor, s); | 3363 | (s->flags & ~SLAB_PANIC), s->ctor, s); |
3252 | 3364 | ||
3253 | if (new) | 3365 | if (new) |
3254 | new->allocflags |= __GFP_KMEMCG; | 3366 | new->allocflags |= __GFP_KMEMCG; |
3255 | 3367 | ||
3256 | kfree(name); | ||
3257 | return new; | 3368 | return new; |
3258 | } | 3369 | } |
3259 | 3370 | ||
3260 | /* | ||
3261 | * This lock protects updaters, not readers. We want readers to be as fast as | ||
3262 | * they can, and they will either see NULL or a valid cache value. Our model | ||
3263 | * allow them to see NULL, in which case the root memcg will be selected. | ||
3264 | * | ||
3265 | * We need this lock because multiple allocations to the same cache from a non | ||
3266 | * will span more than one worker. Only one of them can create the cache. | ||
3267 | */ | ||
3268 | static DEFINE_MUTEX(memcg_cache_mutex); | ||
3269 | static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg, | 3371 | static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg, |
3270 | struct kmem_cache *cachep) | 3372 | struct kmem_cache *cachep) |
3271 | { | 3373 | { |
@@ -3382,7 +3484,6 @@ static void memcg_create_cache_work_func(struct work_struct *w) | |||
3382 | 3484 | ||
3383 | /* | 3485 | /* |
3384 | * Enqueue the creation of a per-memcg kmem_cache. | 3486 | * Enqueue the creation of a per-memcg kmem_cache. |
3385 | * Called with rcu_read_lock. | ||
3386 | */ | 3487 | */ |
3387 | static void __memcg_create_cache_enqueue(struct mem_cgroup *memcg, | 3488 | static void __memcg_create_cache_enqueue(struct mem_cgroup *memcg, |
3388 | struct kmem_cache *cachep) | 3489 | struct kmem_cache *cachep) |
@@ -3390,12 +3491,8 @@ static void __memcg_create_cache_enqueue(struct mem_cgroup *memcg, | |||
3390 | struct create_work *cw; | 3491 | struct create_work *cw; |
3391 | 3492 | ||
3392 | cw = kmalloc(sizeof(struct create_work), GFP_NOWAIT); | 3493 | cw = kmalloc(sizeof(struct create_work), GFP_NOWAIT); |
3393 | if (cw == NULL) | 3494 | if (cw == NULL) { |
3394 | return; | 3495 | css_put(&memcg->css); |
3395 | |||
3396 | /* The corresponding put will be done in the workqueue. */ | ||
3397 | if (!css_tryget(&memcg->css)) { | ||
3398 | kfree(cw); | ||
3399 | return; | 3496 | return; |
3400 | } | 3497 | } |
3401 | 3498 | ||
@@ -3451,10 +3548,9 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep, | |||
3451 | 3548 | ||
3452 | rcu_read_lock(); | 3549 | rcu_read_lock(); |
3453 | memcg = mem_cgroup_from_task(rcu_dereference(current->mm->owner)); | 3550 | memcg = mem_cgroup_from_task(rcu_dereference(current->mm->owner)); |
3454 | rcu_read_unlock(); | ||
3455 | 3551 | ||
3456 | if (!memcg_can_account_kmem(memcg)) | 3552 | if (!memcg_can_account_kmem(memcg)) |
3457 | return cachep; | 3553 | goto out; |
3458 | 3554 | ||
3459 | idx = memcg_cache_id(memcg); | 3555 | idx = memcg_cache_id(memcg); |
3460 | 3556 | ||
@@ -3463,29 +3559,38 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep, | |||
3463 | * code updating memcg_caches will issue a write barrier to match this. | 3559 | * code updating memcg_caches will issue a write barrier to match this. |
3464 | */ | 3560 | */ |
3465 | read_barrier_depends(); | 3561 | read_barrier_depends(); |
3466 | if (unlikely(cachep->memcg_params->memcg_caches[idx] == NULL)) { | 3562 | if (likely(cachep->memcg_params->memcg_caches[idx])) { |
3467 | /* | 3563 | cachep = cachep->memcg_params->memcg_caches[idx]; |
3468 | * If we are in a safe context (can wait, and not in interrupt | 3564 | goto out; |
3469 | * context), we could be be predictable and return right away. | ||
3470 | * This would guarantee that the allocation being performed | ||
3471 | * already belongs in the new cache. | ||
3472 | * | ||
3473 | * However, there are some clashes that can arrive from locking. | ||
3474 | * For instance, because we acquire the slab_mutex while doing | ||
3475 | * kmem_cache_dup, this means no further allocation could happen | ||
3476 | * with the slab_mutex held. | ||
3477 | * | ||
3478 | * Also, because cache creation issue get_online_cpus(), this | ||
3479 | * creates a lock chain: memcg_slab_mutex -> cpu_hotplug_mutex, | ||
3480 | * that ends up reversed during cpu hotplug. (cpuset allocates | ||
3481 | * a bunch of GFP_KERNEL memory during cpuup). Due to all that, | ||
3482 | * better to defer everything. | ||
3483 | */ | ||
3484 | memcg_create_cache_enqueue(memcg, cachep); | ||
3485 | return cachep; | ||
3486 | } | 3565 | } |
3487 | 3566 | ||
3488 | return cachep->memcg_params->memcg_caches[idx]; | 3567 | /* The corresponding put will be done in the workqueue. */ |
3568 | if (!css_tryget(&memcg->css)) | ||
3569 | goto out; | ||
3570 | rcu_read_unlock(); | ||
3571 | |||
3572 | /* | ||
3573 | * If we are in a safe context (can wait, and not in interrupt | ||
3573 | * context), we could be predictable and return right away. | ||
3575 | * This would guarantee that the allocation being performed | ||
3576 | * already belongs in the new cache. | ||
3577 | * | ||
3578 | * However, there are some clashes that can arrive from locking. | ||
3579 | * For instance, because we acquire the slab_mutex while doing | ||
3580 | * kmem_cache_dup, this means no further allocation could happen | ||
3581 | * with the slab_mutex held. | ||
3582 | * | ||
3583 | * Also, because cache creation issues get_online_cpus(), this | ||
3584 | * creates a lock chain: memcg_slab_mutex -> cpu_hotplug_mutex, | ||
3585 | * that ends up reversed during cpu hotplug. (cpuset allocates | ||
3586 | * a bunch of GFP_KERNEL memory during cpuup). Due to all that, | ||
3587 | * better to defer everything. | ||
3588 | */ | ||
3589 | memcg_create_cache_enqueue(memcg, cachep); | ||
3590 | return cachep; | ||
3591 | out: | ||
3592 | rcu_read_unlock(); | ||
3593 | return cachep; | ||
3489 | } | 3594 | } |
3490 | EXPORT_SYMBOL(__memcg_kmem_get_cache); | 3595 | EXPORT_SYMBOL(__memcg_kmem_get_cache); |
3491 | 3596 | ||
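
The restructured lookup above keeps the RCU read section open until the memcg reference is taken, but the overall shape is unchanged: return the per-memcg cache when it already exists, otherwise fall back to the parent cache and defer the expensive creation to a workqueue. A minimal userspace sketch of that fast-path / deferred-creation shape (plain C11 atomics; every name here is invented for illustration, this is not the kernel code):

    #include <stdatomic.h>
    #include <stdio.h>

    struct cache { const char *name; };

    #define MAX_GROUPS 4
    static struct cache root_cache = { "root" };
    static _Atomic(struct cache *) group_caches[MAX_GROUPS];

    static void enqueue_creation(int idx)
    {
            /* stand-in for memcg_create_cache_enqueue(): a real version
             * would hand the work to a workqueue and publish the result */
            printf("deferring creation of cache %d\n", idx);
    }

    static struct cache *get_cache(int idx)
    {
            struct cache *c = atomic_load_explicit(&group_caches[idx],
                                                   memory_order_acquire);
            if (c)                          /* fast path: already created */
                    return c;
            enqueue_creation(idx);          /* slow path: defer the heavy work */
            return &root_cache;             /* serve this allocation from the root */
    }

    int main(void)
    {
            printf("%s\n", get_cache(1)->name);     /* prints "root" for now */
            return 0;
    }
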
@@ -4947,9 +5052,6 @@ static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft, | |||
4947 | type = MEMFILE_TYPE(cft->private); | 5052 | type = MEMFILE_TYPE(cft->private); |
4948 | name = MEMFILE_ATTR(cft->private); | 5053 | name = MEMFILE_ATTR(cft->private); |
4949 | 5054 | ||
4950 | if (!do_swap_account && type == _MEMSWAP) | ||
4951 | return -EOPNOTSUPP; | ||
4952 | |||
4953 | switch (type) { | 5055 | switch (type) { |
4954 | case _MEM: | 5056 | case _MEM: |
4955 | if (name == RES_USAGE) | 5057 | if (name == RES_USAGE) |
@@ -5084,9 +5186,6 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft, | |||
5084 | type = MEMFILE_TYPE(cft->private); | 5186 | type = MEMFILE_TYPE(cft->private); |
5085 | name = MEMFILE_ATTR(cft->private); | 5187 | name = MEMFILE_ATTR(cft->private); |
5086 | 5188 | ||
5087 | if (!do_swap_account && type == _MEMSWAP) | ||
5088 | return -EOPNOTSUPP; | ||
5089 | |||
5090 | switch (name) { | 5189 | switch (name) { |
5091 | case RES_LIMIT: | 5190 | case RES_LIMIT: |
5092 | if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */ | 5191 | if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */ |
@@ -5163,9 +5262,6 @@ static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) | |||
5163 | type = MEMFILE_TYPE(event); | 5262 | type = MEMFILE_TYPE(event); |
5164 | name = MEMFILE_ATTR(event); | 5263 | name = MEMFILE_ATTR(event); |
5165 | 5264 | ||
5166 | if (!do_swap_account && type == _MEMSWAP) | ||
5167 | return -EOPNOTSUPP; | ||
5168 | |||
5169 | switch (name) { | 5265 | switch (name) { |
5170 | case RES_MAX_USAGE: | 5266 | case RES_MAX_USAGE: |
5171 | if (type == _MEM) | 5267 | if (type == _MEM) |
@@ -5744,7 +5840,7 @@ static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) | |||
5744 | return ret; | 5840 | return ret; |
5745 | 5841 | ||
5746 | return mem_cgroup_sockets_init(memcg, ss); | 5842 | return mem_cgroup_sockets_init(memcg, ss); |
5747 | }; | 5843 | } |
5748 | 5844 | ||
5749 | static void kmem_cgroup_destroy(struct mem_cgroup *memcg) | 5845 | static void kmem_cgroup_destroy(struct mem_cgroup *memcg) |
5750 | { | 5846 | { |
@@ -5817,6 +5913,7 @@ static struct cftype mem_cgroup_files[] = { | |||
5817 | }, | 5913 | }, |
5818 | { | 5914 | { |
5819 | .name = "use_hierarchy", | 5915 | .name = "use_hierarchy", |
5916 | .flags = CFTYPE_INSANE, | ||
5820 | .write_u64 = mem_cgroup_hierarchy_write, | 5917 | .write_u64 = mem_cgroup_hierarchy_write, |
5821 | .read_u64 = mem_cgroup_hierarchy_read, | 5918 | .read_u64 = mem_cgroup_hierarchy_read, |
5822 | }, | 5919 | }, |
@@ -5838,6 +5935,11 @@ static struct cftype mem_cgroup_files[] = { | |||
5838 | .unregister_event = mem_cgroup_oom_unregister_event, | 5935 | .unregister_event = mem_cgroup_oom_unregister_event, |
5839 | .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL), | 5936 | .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL), |
5840 | }, | 5937 | }, |
5938 | { | ||
5939 | .name = "pressure_level", | ||
5940 | .register_event = vmpressure_register_event, | ||
5941 | .unregister_event = vmpressure_unregister_event, | ||
5942 | }, | ||
5841 | #ifdef CONFIG_NUMA | 5943 | #ifdef CONFIG_NUMA |
5842 | { | 5944 | { |
5843 | .name = "numa_stat", | 5945 | .name = "numa_stat", |
@@ -6119,6 +6221,7 @@ mem_cgroup_css_alloc(struct cgroup *cont) | |||
6119 | memcg->move_charge_at_immigrate = 0; | 6221 | memcg->move_charge_at_immigrate = 0; |
6120 | mutex_init(&memcg->thresholds_lock); | 6222 | mutex_init(&memcg->thresholds_lock); |
6121 | spin_lock_init(&memcg->move_lock); | 6223 | spin_lock_init(&memcg->move_lock); |
6224 | vmpressure_init(&memcg->vmpressure); | ||
6122 | 6225 | ||
6123 | return &memcg->css; | 6226 | return &memcg->css; |
6124 | 6227 | ||
@@ -6184,10 +6287,29 @@ mem_cgroup_css_online(struct cgroup *cont) | |||
6184 | return error; | 6287 | return error; |
6185 | } | 6288 | } |
6186 | 6289 | ||
6290 | /* | ||
6291 | * Announce to all parents that a group from their hierarchy is gone. | ||
6292 | */ | ||
6293 | static void mem_cgroup_invalidate_reclaim_iterators(struct mem_cgroup *memcg) | ||
6294 | { | ||
6295 | struct mem_cgroup *parent = memcg; | ||
6296 | |||
6297 | while ((parent = parent_mem_cgroup(parent))) | ||
6298 | atomic_inc(&parent->dead_count); | ||
6299 | |||
6300 | /* | ||
6301 | * if the root memcg is not hierarchical we have to check it | ||
6302 | * explicitly. | ||
6303 | */ | ||
6304 | if (!root_mem_cgroup->use_hierarchy) | ||
6305 | atomic_inc(&root_mem_cgroup->dead_count); | ||
6306 | } | ||
6307 | |||
6187 | static void mem_cgroup_css_offline(struct cgroup *cont) | 6308 | static void mem_cgroup_css_offline(struct cgroup *cont) |
6188 | { | 6309 | { |
6189 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); | 6310 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); |
6190 | 6311 | ||
6312 | mem_cgroup_invalidate_reclaim_iterators(memcg); | ||
6191 | mem_cgroup_reparent_charges(memcg); | 6313 | mem_cgroup_reparent_charges(memcg); |
6192 | mem_cgroup_destroy_all_caches(memcg); | 6314 | mem_cgroup_destroy_all_caches(memcg); |
6193 | } | 6315 | } |
@@ -6787,6 +6909,21 @@ static void mem_cgroup_move_task(struct cgroup *cont, | |||
6787 | } | 6909 | } |
6788 | #endif | 6910 | #endif |
6789 | 6911 | ||
6912 | /* | ||
6913 | * Cgroup retains root cgroups across [un]mount cycles making it necessary | ||
6914 | * to verify sane_behavior flag on each mount attempt. | ||
6915 | */ | ||
6916 | static void mem_cgroup_bind(struct cgroup *root) | ||
6917 | { | ||
6918 | /* | ||
6919 | * use_hierarchy is forced with sane_behavior. cgroup core | ||
6920 | * guarantees that @root doesn't have any children, so turning it | ||
6921 | * on for the root memcg is enough. | ||
6922 | */ | ||
6923 | if (cgroup_sane_behavior(root)) | ||
6924 | mem_cgroup_from_cont(root)->use_hierarchy = true; | ||
6925 | } | ||
6926 | |||
6790 | struct cgroup_subsys mem_cgroup_subsys = { | 6927 | struct cgroup_subsys mem_cgroup_subsys = { |
6791 | .name = "memory", | 6928 | .name = "memory", |
6792 | .subsys_id = mem_cgroup_subsys_id, | 6929 | .subsys_id = mem_cgroup_subsys_id, |
@@ -6797,6 +6934,7 @@ struct cgroup_subsys mem_cgroup_subsys = { | |||
6797 | .can_attach = mem_cgroup_can_attach, | 6934 | .can_attach = mem_cgroup_can_attach, |
6798 | .cancel_attach = mem_cgroup_cancel_attach, | 6935 | .cancel_attach = mem_cgroup_cancel_attach, |
6799 | .attach = mem_cgroup_move_task, | 6936 | .attach = mem_cgroup_move_task, |
6937 | .bind = mem_cgroup_bind, | ||
6800 | .base_cftypes = mem_cgroup_files, | 6938 | .base_cftypes = mem_cgroup_files, |
6801 | .early_init = 0, | 6939 | .early_init = 0, |
6802 | .use_id = 1, | 6940 | .use_id = 1, |
diff --git a/mm/memory-failure.c b/mm/memory-failure.c index df0694c6adef..ceb0c7f1932f 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c | |||
@@ -785,10 +785,10 @@ static struct page_state { | |||
785 | { sc|dirty, sc, "clean swapcache", me_swapcache_clean }, | 785 | { sc|dirty, sc, "clean swapcache", me_swapcache_clean }, |
786 | 786 | ||
787 | { mlock|dirty, mlock|dirty, "dirty mlocked LRU", me_pagecache_dirty }, | 787 | { mlock|dirty, mlock|dirty, "dirty mlocked LRU", me_pagecache_dirty }, |
788 | { mlock, mlock, "clean mlocked LRU", me_pagecache_clean }, | 788 | { mlock|dirty, mlock, "clean mlocked LRU", me_pagecache_clean }, |
789 | 789 | ||
790 | { unevict|dirty, unevict|dirty, "dirty unevictable LRU", me_pagecache_dirty }, | 790 | { unevict|dirty, unevict|dirty, "dirty unevictable LRU", me_pagecache_dirty }, |
791 | { unevict, unevict, "clean unevictable LRU", me_pagecache_clean }, | 791 | { unevict|dirty, unevict, "clean unevictable LRU", me_pagecache_clean }, |
792 | 792 | ||
793 | { lru|dirty, lru|dirty, "dirty LRU", me_pagecache_dirty }, | 793 | { lru|dirty, lru|dirty, "dirty LRU", me_pagecache_dirty }, |
794 | { lru|dirty, lru, "clean LRU", me_pagecache_clean }, | 794 | { lru|dirty, lru, "clean LRU", me_pagecache_clean }, |
diff --git a/mm/memory.c b/mm/memory.c index 494526ae024a..6dc1882fbd72 100644 --- a/mm/memory.c +++ b/mm/memory.c | |||
@@ -216,6 +216,7 @@ void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, bool fullmm) | |||
216 | tlb->mm = mm; | 216 | tlb->mm = mm; |
217 | 217 | ||
218 | tlb->fullmm = fullmm; | 218 | tlb->fullmm = fullmm; |
219 | tlb->need_flush_all = 0; | ||
219 | tlb->start = -1UL; | 220 | tlb->start = -1UL; |
220 | tlb->end = 0; | 221 | tlb->end = 0; |
221 | tlb->need_flush = 0; | 222 | tlb->need_flush = 0; |
@@ -714,11 +715,11 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, | |||
714 | * Choose text because data symbols depend on CONFIG_KALLSYMS_ALL=y | 715 | * Choose text because data symbols depend on CONFIG_KALLSYMS_ALL=y |
715 | */ | 716 | */ |
716 | if (vma->vm_ops) | 717 | if (vma->vm_ops) |
717 | print_symbol(KERN_ALERT "vma->vm_ops->fault: %s\n", | 718 | printk(KERN_ALERT "vma->vm_ops->fault: %pSR\n", |
718 | (unsigned long)vma->vm_ops->fault); | 719 | vma->vm_ops->fault); |
719 | if (vma->vm_file && vma->vm_file->f_op) | 720 | if (vma->vm_file && vma->vm_file->f_op) |
720 | print_symbol(KERN_ALERT "vma->vm_file->f_op->mmap: %s\n", | 721 | printk(KERN_ALERT "vma->vm_file->f_op->mmap: %pSR\n", |
721 | (unsigned long)vma->vm_file->f_op->mmap); | 722 | vma->vm_file->f_op->mmap); |
722 | dump_stack(); | 723 | dump_stack(); |
723 | add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); | 724 | add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); |
724 | } | 725 | } |
@@ -2392,6 +2393,53 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, | |||
2392 | } | 2393 | } |
2393 | EXPORT_SYMBOL(remap_pfn_range); | 2394 | EXPORT_SYMBOL(remap_pfn_range); |
2394 | 2395 | ||
2396 | /** | ||
2397 | * vm_iomap_memory - remap memory to userspace | ||
2398 | * @vma: user vma to map to | ||
2399 | * @start: start of area | ||
2400 | * @len: size of area | ||
2401 | * | ||
2402 | * This is a simplified io_remap_pfn_range() for common driver use. The | ||
2403 | * driver just needs to give us the physical memory range to be mapped, | ||
2404 | * we'll figure out the rest from the vma information. | ||
2405 | * | ||
2406 | * NOTE! Some drivers might want to tweak vma->vm_page_prot first to get | ||
2407 | * whatever write-combining details or similar. | ||
2408 | */ | ||
2409 | int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len) | ||
2410 | { | ||
2411 | unsigned long vm_len, pfn, pages; | ||
2412 | |||
2413 | /* Check that the physical memory area passed in looks valid */ | ||
2414 | if (start + len < start) | ||
2415 | return -EINVAL; | ||
2416 | /* | ||
2417 | * You *really* shouldn't map things that aren't page-aligned, | ||
2418 | * but we've historically allowed it because IO memory might | ||
2419 | * just have smaller alignment. | ||
2420 | */ | ||
2421 | len += start & ~PAGE_MASK; | ||
2422 | pfn = start >> PAGE_SHIFT; | ||
2423 | pages = (len + ~PAGE_MASK) >> PAGE_SHIFT; | ||
2424 | if (pfn + pages < pfn) | ||
2425 | return -EINVAL; | ||
2426 | |||
2427 | /* We start the mapping 'vm_pgoff' pages into the area */ | ||
2428 | if (vma->vm_pgoff > pages) | ||
2429 | return -EINVAL; | ||
2430 | pfn += vma->vm_pgoff; | ||
2431 | pages -= vma->vm_pgoff; | ||
2432 | |||
2433 | /* Can we fit all of the mapping? */ | ||
2434 | vm_len = vma->vm_end - vma->vm_start; | ||
2435 | if (vm_len >> PAGE_SHIFT > pages) | ||
2436 | return -EINVAL; | ||
2437 | |||
2438 | /* Ok, let it rip */ | ||
2439 | return io_remap_pfn_range(vma, vma->vm_start, pfn, vm_len, vma->vm_page_prot); | ||
2440 | } | ||
2441 | EXPORT_SYMBOL(vm_iomap_memory); | ||
2442 | |||
2395 | static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd, | 2443 | static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd, |
2396 | unsigned long addr, unsigned long end, | 2444 | unsigned long addr, unsigned long end, |
2397 | pte_fn_t fn, void *data) | 2445 | pte_fn_t fn, void *data) |
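
A sketch of how a driver's ->mmap() handler might use the new helper; the device window (DEMO_PHYS_BASE/DEMO_PHYS_LEN), the function name and the pgprot_noncached() tweak are illustrative assumptions, not taken from this patch:

    #include <linux/fs.h>
    #include <linux/mm.h>

    /* hypothetical device window; a real driver would take these from its
     * platform or PCI resources */
    #define DEMO_PHYS_BASE  0xfe000000UL
    #define DEMO_PHYS_LEN   0x00100000UL

    static int demo_mmap(struct file *file, struct vm_area_struct *vma)
    {
            /* register space usually wants an uncached mapping; tweak
             * vm_page_prot first, as the helper's kerneldoc suggests */
            vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);

            /* the helper validates the vma's size and offset against the
             * window and calls io_remap_pfn_range() with the computed pfn */
            return vm_iomap_memory(vma, DEMO_PHYS_BASE, DEMO_PHYS_LEN);
    }

Compared with open-coding io_remap_pfn_range(), the helper centralizes the size/offset sanity checks shown in the hunk above.
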
@@ -3196,6 +3244,11 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
3196 | page = alloc_zeroed_user_highpage_movable(vma, address); | 3244 | page = alloc_zeroed_user_highpage_movable(vma, address); |
3197 | if (!page) | 3245 | if (!page) |
3198 | goto oom; | 3246 | goto oom; |
3247 | /* | ||
3248 | * The memory barrier inside __SetPageUptodate makes sure that | ||
3249 | * preceding stores to the page contents become visible before | ||
3250 | * the set_pte_at() write. | ||
3251 | */ | ||
3199 | __SetPageUptodate(page); | 3252 | __SetPageUptodate(page); |
3200 | 3253 | ||
3201 | if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL)) | 3254 | if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL)) |
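
Loosely, the ordering the new comment documents is the usual publish pattern: initialize the data, then a write barrier, then the store that makes it reachable. A userspace analogy using a C11 release store in place of the barrier-plus-set_pte_at() pair (an analogy only, with invented names, not the kernel mechanism):

    #include <stdatomic.h>
    #include <string.h>

    struct fake_page { char data[64]; };
    static _Atomic(struct fake_page *) fake_pte;    /* stands in for the PTE */

    static void fault_in(struct fake_page *page)
    {
            memset(page->data, 0, sizeof(page->data));      /* "zero the page" */
            /* the release ordering plays the role of the barrier inside
             * __SetPageUptodate(): the stores above become visible before
             * the "PTE" that publishes the page */
            atomic_store_explicit(&fake_pte, page, memory_order_release);
    }
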
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index b81a367b9f39..a221fac1f47d 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c | |||
@@ -436,6 +436,40 @@ static int __meminit __add_section(int nid, struct zone *zone, | |||
436 | return register_new_memory(nid, __pfn_to_section(phys_start_pfn)); | 436 | return register_new_memory(nid, __pfn_to_section(phys_start_pfn)); |
437 | } | 437 | } |
438 | 438 | ||
439 | /* | ||
440 | * Reasonably generic function for adding memory. It is | ||
441 | * expected that archs that support memory hotplug will | ||
442 | * call this function after deciding the zone to which to | ||
443 | * add the new pages. | ||
444 | */ | ||
445 | int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn, | ||
446 | unsigned long nr_pages) | ||
447 | { | ||
448 | unsigned long i; | ||
449 | int err = 0; | ||
450 | int start_sec, end_sec; | ||
451 | /* during initialize mem_map, align hot-added range to section */ | ||
452 | start_sec = pfn_to_section_nr(phys_start_pfn); | ||
453 | end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1); | ||
454 | |||
455 | for (i = start_sec; i <= end_sec; i++) { | ||
456 | err = __add_section(nid, zone, i << PFN_SECTION_SHIFT); | ||
457 | |||
458 | /* | ||
459 | * EEXIST is finally dealt with by ioresource collision | ||
460 | * check. see add_memory() => register_memory_resource() | ||
461 | * Warning will be printed if there is collision. | ||
462 | */ | ||
463 | if (err && (err != -EEXIST)) | ||
464 | break; | ||
465 | err = 0; | ||
466 | } | ||
467 | |||
468 | return err; | ||
469 | } | ||
470 | EXPORT_SYMBOL_GPL(__add_pages); | ||
471 | |||
472 | #ifdef CONFIG_MEMORY_HOTREMOVE | ||
439 | /* find the smallest valid pfn in the range [start_pfn, end_pfn) */ | 473 | /* find the smallest valid pfn in the range [start_pfn, end_pfn) */ |
440 | static int find_smallest_section_pfn(int nid, struct zone *zone, | 474 | static int find_smallest_section_pfn(int nid, struct zone *zone, |
441 | unsigned long start_pfn, | 475 | unsigned long start_pfn, |
@@ -658,39 +692,6 @@ static int __remove_section(struct zone *zone, struct mem_section *ms) | |||
658 | return 0; | 692 | return 0; |
659 | } | 693 | } |
660 | 694 | ||
661 | /* | ||
662 | * Reasonably generic function for adding memory. It is | ||
663 | * expected that archs that support memory hotplug will | ||
664 | * call this function after deciding the zone to which to | ||
665 | * add the new pages. | ||
666 | */ | ||
667 | int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn, | ||
668 | unsigned long nr_pages) | ||
669 | { | ||
670 | unsigned long i; | ||
671 | int err = 0; | ||
672 | int start_sec, end_sec; | ||
673 | /* during initialize mem_map, align hot-added range to section */ | ||
674 | start_sec = pfn_to_section_nr(phys_start_pfn); | ||
675 | end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1); | ||
676 | |||
677 | for (i = start_sec; i <= end_sec; i++) { | ||
678 | err = __add_section(nid, zone, i << PFN_SECTION_SHIFT); | ||
679 | |||
680 | /* | ||
681 | * EEXIST is finally dealt with by ioresource collision | ||
682 | * check. see add_memory() => register_memory_resource() | ||
683 | * Warning will be printed if there is collision. | ||
684 | */ | ||
685 | if (err && (err != -EEXIST)) | ||
686 | break; | ||
687 | err = 0; | ||
688 | } | ||
689 | |||
690 | return err; | ||
691 | } | ||
692 | EXPORT_SYMBOL_GPL(__add_pages); | ||
693 | |||
694 | /** | 695 | /** |
695 | * __remove_pages() - remove sections of pages from a zone | 696 | * __remove_pages() - remove sections of pages from a zone |
696 | * @zone: zone from which pages need to be removed | 697 | * @zone: zone from which pages need to be removed |
@@ -705,8 +706,10 @@ EXPORT_SYMBOL_GPL(__add_pages); | |||
705 | int __remove_pages(struct zone *zone, unsigned long phys_start_pfn, | 706 | int __remove_pages(struct zone *zone, unsigned long phys_start_pfn, |
706 | unsigned long nr_pages) | 707 | unsigned long nr_pages) |
707 | { | 708 | { |
708 | unsigned long i, ret = 0; | 709 | unsigned long i; |
709 | int sections_to_remove; | 710 | int sections_to_remove; |
711 | resource_size_t start, size; | ||
712 | int ret = 0; | ||
710 | 713 | ||
711 | /* | 714 | /* |
712 | * We can only remove entire sections | 715 | * We can only remove entire sections |
@@ -714,7 +717,12 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn, | |||
714 | BUG_ON(phys_start_pfn & ~PAGE_SECTION_MASK); | 717 | BUG_ON(phys_start_pfn & ~PAGE_SECTION_MASK); |
715 | BUG_ON(nr_pages % PAGES_PER_SECTION); | 718 | BUG_ON(nr_pages % PAGES_PER_SECTION); |
716 | 719 | ||
717 | release_mem_region(phys_start_pfn << PAGE_SHIFT, nr_pages * PAGE_SIZE); | 720 | start = phys_start_pfn << PAGE_SHIFT; |
721 | size = nr_pages * PAGE_SIZE; | ||
722 | ret = release_mem_region_adjustable(&iomem_resource, start, size); | ||
723 | if (ret) | ||
724 | pr_warn("Unable to release resource <%016llx-%016llx> (%d)\n", | ||
725 | start, start + size - 1, ret); | ||
718 | 726 | ||
719 | sections_to_remove = nr_pages / PAGES_PER_SECTION; | 727 | sections_to_remove = nr_pages / PAGES_PER_SECTION; |
720 | for (i = 0; i < sections_to_remove; i++) { | 728 | for (i = 0; i < sections_to_remove; i++) { |
@@ -726,6 +734,7 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn, | |||
726 | return ret; | 734 | return ret; |
727 | } | 735 | } |
728 | EXPORT_SYMBOL_GPL(__remove_pages); | 736 | EXPORT_SYMBOL_GPL(__remove_pages); |
737 | #endif /* CONFIG_MEMORY_HOTREMOVE */ | ||
729 | 738 | ||
730 | int set_online_page_callback(online_page_callback_t callback) | 739 | int set_online_page_callback(online_page_callback_t callback) |
731 | { | 740 | { |
@@ -1613,7 +1622,7 @@ int offline_pages(unsigned long start_pfn, unsigned long nr_pages) | |||
1613 | /** | 1622 | /** |
1614 | * walk_memory_range - walks through all mem sections in [start_pfn, end_pfn) | 1623 | * walk_memory_range - walks through all mem sections in [start_pfn, end_pfn) |
1615 | * @start_pfn: start pfn of the memory range | 1624 | * @start_pfn: start pfn of the memory range |
1616 | * @end_pfn: end pft of the memory range | 1625 | * @end_pfn: end pfn of the memory range |
1617 | * @arg: argument passed to func | 1626 | * @arg: argument passed to func |
1618 | * @func: callback for each memory section walked | 1627 | * @func: callback for each memory section walked |
1619 | * | 1628 | * |
@@ -1681,11 +1690,15 @@ static int is_memblock_offlined_cb(struct memory_block *mem, void *arg) | |||
1681 | { | 1690 | { |
1682 | int ret = !is_memblock_offlined(mem); | 1691 | int ret = !is_memblock_offlined(mem); |
1683 | 1692 | ||
1684 | if (unlikely(ret)) | 1693 | if (unlikely(ret)) { |
1694 | phys_addr_t beginpa, endpa; | ||
1695 | |||
1696 | beginpa = PFN_PHYS(section_nr_to_pfn(mem->start_section_nr)); | ||
1697 | endpa = PFN_PHYS(section_nr_to_pfn(mem->end_section_nr + 1))-1; | ||
1685 | pr_warn("removing memory fails, because memory " | 1698 | pr_warn("removing memory fails, because memory " |
1686 | "[%#010llx-%#010llx] is onlined\n", | 1699 | "[%pa-%pa] is onlined\n", |
1687 | PFN_PHYS(section_nr_to_pfn(mem->start_section_nr)), | 1700 | &beginpa, &endpa); |
1688 | PFN_PHYS(section_nr_to_pfn(mem->end_section_nr + 1))-1); | 1701 | } |
1689 | 1702 | ||
1690 | return ret; | 1703 | return ret; |
1691 | } | 1704 | } |
@@ -1779,7 +1792,11 @@ void try_offline_node(int nid) | |||
1779 | for (i = 0; i < MAX_NR_ZONES; i++) { | 1792 | for (i = 0; i < MAX_NR_ZONES; i++) { |
1780 | struct zone *zone = pgdat->node_zones + i; | 1793 | struct zone *zone = pgdat->node_zones + i; |
1781 | 1794 | ||
1782 | if (zone->wait_table) | 1795 | /* |
1796 | * wait_table may be allocated from boot memory, | ||
1797 | * here only free if it's allocated by vmalloc. | ||
1798 | */ | ||
1799 | if (is_vmalloc_addr(zone->wait_table)) | ||
1783 | vfree(zone->wait_table); | 1800 | vfree(zone->wait_table); |
1784 | } | 1801 | } |
1785 | 1802 | ||
@@ -1801,7 +1818,7 @@ int __ref remove_memory(int nid, u64 start, u64 size) | |||
1801 | int retry = 1; | 1818 | int retry = 1; |
1802 | 1819 | ||
1803 | start_pfn = PFN_DOWN(start); | 1820 | start_pfn = PFN_DOWN(start); |
1804 | end_pfn = start_pfn + PFN_DOWN(size); | 1821 | end_pfn = PFN_UP(start + size - 1); |
1805 | 1822 | ||
1806 | /* | 1823 | /* |
1807 | * When CONFIG_MEMCG is on, one memory block may be used by other | 1824 | * When CONFIG_MEMCG is on, one memory block may be used by other |
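
The new end_pfn rounding matters when start + size does not end on a page boundary. With hypothetical values and 4 KiB pages: for start = 0x10000000 and size = 0x08000800, the old end_pfn = PFN_DOWN(start) + PFN_DOWN(size) = 0x10000 + 0x8000 = 0x18000 silently drops the trailing partial page, while the new end_pfn = PFN_UP(start + size - 1) = PFN_UP(0x180007ff) = 0x18001 covers it.
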
diff --git a/mm/migrate.c b/mm/migrate.c index 3bbaf5d230b0..27ed22579fd9 100644 --- a/mm/migrate.c +++ b/mm/migrate.c | |||
@@ -736,7 +736,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage, | |||
736 | 736 | ||
737 | if (PageWriteback(page)) { | 737 | if (PageWriteback(page)) { |
738 | /* | 738 | /* |
739 | * Only in the case of a full syncronous migration is it | 739 | * Only in the case of a full synchronous migration is it |
740 | * necessary to wait for PageWriteback. In the async case, | 740 | * necessary to wait for PageWriteback. In the async case, |
741 | * the retry loop is too short and in the sync-light case, | 741 | * the retry loop is too short and in the sync-light case, |
742 | * the overhead of stalling is too much | 742 | * the overhead of stalling is too much |
@@ -973,19 +973,23 @@ out: | |||
973 | } | 973 | } |
974 | 974 | ||
975 | /* | 975 | /* |
976 | * migrate_pages | 976 | * migrate_pages - migrate the pages specified in a list, to the free pages |
977 | * supplied as the target for the page migration | ||
977 | * | 978 | * |
978 | * The function takes one list of pages to migrate and a function | 979 | * @from: The list of pages to be migrated. |
979 | * that determines from the page to be migrated and the private data | 980 | * @get_new_page: The function used to allocate free pages to be used |
980 | * the target of the move and allocates the page. | 981 | * as the target of the page migration. |
982 | * @private: Private data to be passed on to get_new_page() | ||
983 | * @mode: The migration mode that specifies the constraints for | ||
984 | * page migration, if any. | ||
985 | * @reason: The reason for page migration. | ||
981 | * | 986 | * |
982 | * The function returns after 10 attempts or if no pages | 987 | * The function returns after 10 attempts or if no pages are movable any more |
983 | * are movable anymore because to has become empty | 988 | * because the list has become empty or no retryable pages exist any more. |
984 | * or no retryable pages exist anymore. | 989 | * The caller should call putback_lru_pages() to return pages to the LRU |
985 | * Caller should call putback_lru_pages to return pages to the LRU | ||
986 | * or free list only if ret != 0. | 990 | * or free list only if ret != 0. |
987 | * | 991 | * |
988 | * Return: Number of pages not migrated or error code. | 992 | * Returns the number of pages that were not migrated, or an error code. |
989 | */ | 993 | */ |
990 | int migrate_pages(struct list_head *from, new_page_t get_new_page, | 994 | int migrate_pages(struct list_head *from, new_page_t get_new_page, |
991 | unsigned long private, enum migrate_mode mode, int reason) | 995 | unsigned long private, enum migrate_mode mode, int reason) |
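
A sketch of a migrate_pages() caller against the signature documented above; the node choice, the function names and the use of MR_SYSCALL as the reason are illustrative assumptions:

    #include <linux/gfp.h>
    #include <linux/migrate.h>
    #include <linux/mm.h>

    /* new_page_t callback: allocate every destination page on node 0
     * (the node choice is arbitrary for this sketch) */
    static struct page *new_page_on_node0(struct page *page,
                                          unsigned long private, int **result)
    {
            return alloc_pages_node(0, GFP_HIGHUSER_MOVABLE, 0);
    }

    /* migrate a caller-built list of isolated pages */
    static int demo_migrate(struct list_head *pagelist)
    {
            int ret;

            ret = migrate_pages(pagelist, new_page_on_node0, 0,
                                MIGRATE_SYNC, MR_SYSCALL);
            if (ret)
                    /* some pages were not migrated: return them to the LRU */
                    putback_lru_pages(pagelist);
            return ret;
    }
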
diff --git a/mm/mlock.c b/mm/mlock.c index 1c5e33fce639..79b7cf7d1bca 100644 --- a/mm/mlock.c +++ b/mm/mlock.c | |||
@@ -358,7 +358,7 @@ static int do_mlock(unsigned long start, size_t len, int on) | |||
358 | 358 | ||
359 | newflags = vma->vm_flags & ~VM_LOCKED; | 359 | newflags = vma->vm_flags & ~VM_LOCKED; |
360 | if (on) | 360 | if (on) |
361 | newflags |= VM_LOCKED | VM_POPULATE; | 361 | newflags |= VM_LOCKED; |
362 | 362 | ||
363 | tmp = vma->vm_end; | 363 | tmp = vma->vm_end; |
364 | if (tmp > end) | 364 | if (tmp > end) |
@@ -418,8 +418,7 @@ int __mm_populate(unsigned long start, unsigned long len, int ignore_errors) | |||
418 | * range with the first VMA. Also, skip undesirable VMA types. | 418 | * range with the first VMA. Also, skip undesirable VMA types. |
419 | */ | 419 | */ |
420 | nend = min(end, vma->vm_end); | 420 | nend = min(end, vma->vm_end); |
421 | if ((vma->vm_flags & (VM_IO | VM_PFNMAP | VM_POPULATE)) != | 421 | if (vma->vm_flags & (VM_IO | VM_PFNMAP)) |
422 | VM_POPULATE) | ||
423 | continue; | 422 | continue; |
424 | if (nstart < vma->vm_start) | 423 | if (nstart < vma->vm_start) |
425 | nstart = vma->vm_start; | 424 | nstart = vma->vm_start; |
@@ -492,9 +491,9 @@ static int do_mlockall(int flags) | |||
492 | struct vm_area_struct * vma, * prev = NULL; | 491 | struct vm_area_struct * vma, * prev = NULL; |
493 | 492 | ||
494 | if (flags & MCL_FUTURE) | 493 | if (flags & MCL_FUTURE) |
495 | current->mm->def_flags |= VM_LOCKED | VM_POPULATE; | 494 | current->mm->def_flags |= VM_LOCKED; |
496 | else | 495 | else |
497 | current->mm->def_flags &= ~(VM_LOCKED | VM_POPULATE); | 496 | current->mm->def_flags &= ~VM_LOCKED; |
498 | if (flags == MCL_FUTURE) | 497 | if (flags == MCL_FUTURE) |
499 | goto out; | 498 | goto out; |
500 | 499 | ||
@@ -503,7 +502,7 @@ static int do_mlockall(int flags) | |||
503 | 502 | ||
504 | newflags = vma->vm_flags & ~VM_LOCKED; | 503 | newflags = vma->vm_flags & ~VM_LOCKED; |
505 | if (flags & MCL_CURRENT) | 504 | if (flags & MCL_CURRENT) |
506 | newflags |= VM_LOCKED | VM_POPULATE; | 505 | newflags |= VM_LOCKED; |
507 | 506 | ||
508 | /* Ignore errors */ | 507 | /* Ignore errors */ |
509 | mlock_fixup(vma, &prev, vma->vm_start, vma->vm_end, newflags); | 508 | mlock_fixup(vma, &prev, vma->vm_start, vma->vm_end, newflags); |
diff --git a/mm/mmap.c b/mm/mmap.c --- a/mm/mmap.c +++ b/mm/mmap.c | |||
@@ -6,6 +6,7 @@ | |||
6 | * Address space accounting code <alan@lxorguk.ukuu.org.uk> | 6 | * Address space accounting code <alan@lxorguk.ukuu.org.uk> |
7 | */ | 7 | */ |
8 | 8 | ||
9 | #include <linux/kernel.h> | ||
9 | #include <linux/slab.h> | 10 | #include <linux/slab.h> |
10 | #include <linux/backing-dev.h> | 11 | #include <linux/backing-dev.h> |
11 | #include <linux/mm.h> | 12 | #include <linux/mm.h> |
@@ -33,6 +34,8 @@ | |||
33 | #include <linux/uprobes.h> | 34 | #include <linux/uprobes.h> |
34 | #include <linux/rbtree_augmented.h> | 35 | #include <linux/rbtree_augmented.h> |
35 | #include <linux/sched/sysctl.h> | 36 | #include <linux/sched/sysctl.h> |
37 | #include <linux/notifier.h> | ||
38 | #include <linux/memory.h> | ||
36 | 39 | ||
37 | #include <asm/uaccess.h> | 40 | #include <asm/uaccess.h> |
38 | #include <asm/cacheflush.h> | 41 | #include <asm/cacheflush.h> |
@@ -84,6 +87,8 @@ EXPORT_SYMBOL(vm_get_page_prot); | |||
84 | int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS; /* heuristic overcommit */ | 87 | int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS; /* heuristic overcommit */ |
85 | int sysctl_overcommit_ratio __read_mostly = 50; /* default is 50% */ | 88 | int sysctl_overcommit_ratio __read_mostly = 50; /* default is 50% */ |
86 | int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; | 89 | int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; |
90 | unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */ | ||
91 | unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */ | ||
87 | /* | 92 | /* |
88 | * Make sure vm_committed_as in one cacheline and not cacheline shared with | 93 | * Make sure vm_committed_as in one cacheline and not cacheline shared with |
89 | * other variables. It can be updated by several CPUs frequently. | 94 | * other variables. It can be updated by several CPUs frequently. |
@@ -122,7 +127,7 @@ EXPORT_SYMBOL_GPL(vm_memory_committed); | |||
122 | */ | 127 | */ |
123 | int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) | 128 | int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) |
124 | { | 129 | { |
125 | unsigned long free, allowed; | 130 | unsigned long free, allowed, reserve; |
126 | 131 | ||
127 | vm_acct_memory(pages); | 132 | vm_acct_memory(pages); |
128 | 133 | ||
@@ -163,10 +168,10 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) | |||
163 | free -= totalreserve_pages; | 168 | free -= totalreserve_pages; |
164 | 169 | ||
165 | /* | 170 | /* |
166 | * Leave the last 3% for root | 171 | * Reserve some for root |
167 | */ | 172 | */ |
168 | if (!cap_sys_admin) | 173 | if (!cap_sys_admin) |
169 | free -= free / 32; | 174 | free -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10); |
170 | 175 | ||
171 | if (free > pages) | 176 | if (free > pages) |
172 | return 0; | 177 | return 0; |
@@ -177,16 +182,19 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) | |||
177 | allowed = (totalram_pages - hugetlb_total_pages()) | 182 | allowed = (totalram_pages - hugetlb_total_pages()) |
178 | * sysctl_overcommit_ratio / 100; | 183 | * sysctl_overcommit_ratio / 100; |
179 | /* | 184 | /* |
180 | * Leave the last 3% for root | 185 | * Reserve some for root |
181 | */ | 186 | */ |
182 | if (!cap_sys_admin) | 187 | if (!cap_sys_admin) |
183 | allowed -= allowed / 32; | 188 | allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10); |
184 | allowed += total_swap_pages; | 189 | allowed += total_swap_pages; |
185 | 190 | ||
186 | /* Don't let a single process grow too big: | 191 | /* |
187 | leave 3% of the size of this process for other processes */ | 192 | * Don't let a single process grow so big a user can't recover |
188 | if (mm) | 193 | */ |
189 | allowed -= mm->total_vm / 32; | 194 | if (mm) { |
195 | reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10); | ||
196 | allowed -= min(mm->total_vm / 32, reserve); | ||
197 | } | ||
190 | 198 | ||
191 | if (percpu_counter_read_positive(&vm_committed_as) < allowed) | 199 | if (percpu_counter_read_positive(&vm_committed_as) < allowed) |
192 | return 0; | 200 | return 0; |
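
As a hypothetical illustration of the new formula in OVERCOMMIT_NEVER mode: on a machine with 8 GiB of RAM (no hugepages), 2 GiB of swap and overcommit_ratio = 50, allowed = 8 GiB * 50% + 2 GiB = 6 GiB of commit. A non-root caller then loses admin_reserve_kbytes (8 MiB by default), and a process with, say, total_vm = 16 GiB of mappings loses min(16 GiB / 32, user_reserve_kbytes) = min(512 MiB, 128 MiB) = 128 MiB, instead of the full 512 MiB the old 3% rule would have subtracted.
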
@@ -543,6 +551,34 @@ static int find_vma_links(struct mm_struct *mm, unsigned long addr, | |||
543 | return 0; | 551 | return 0; |
544 | } | 552 | } |
545 | 553 | ||
554 | static unsigned long count_vma_pages_range(struct mm_struct *mm, | ||
555 | unsigned long addr, unsigned long end) | ||
556 | { | ||
557 | unsigned long nr_pages = 0; | ||
558 | struct vm_area_struct *vma; | ||
559 | |||
560 | /* Find first overlapping mapping */ | ||
561 | vma = find_vma_intersection(mm, addr, end); | ||
562 | if (!vma) | ||
563 | return 0; | ||
564 | |||
565 | nr_pages = (min(end, vma->vm_end) - | ||
566 | max(addr, vma->vm_start)) >> PAGE_SHIFT; | ||
567 | |||
568 | /* Iterate over the rest of the overlaps */ | ||
569 | for (vma = vma->vm_next; vma; vma = vma->vm_next) { | ||
570 | unsigned long overlap_len; | ||
571 | |||
572 | if (vma->vm_start > end) | ||
573 | break; | ||
574 | |||
575 | overlap_len = min(end, vma->vm_end) - vma->vm_start; | ||
576 | nr_pages += overlap_len >> PAGE_SHIFT; | ||
577 | } | ||
578 | |||
579 | return nr_pages; | ||
580 | } | ||
581 | |||
546 | void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma, | 582 | void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma, |
547 | struct rb_node **rb_link, struct rb_node *rb_parent) | 583 | struct rb_node **rb_link, struct rb_node *rb_parent) |
548 | { | 584 | { |
@@ -829,7 +865,7 @@ again: remove_next = 1 + (end > next->vm_end); | |||
829 | if (next->anon_vma) | 865 | if (next->anon_vma) |
830 | anon_vma_merge(vma, next); | 866 | anon_vma_merge(vma, next); |
831 | mm->map_count--; | 867 | mm->map_count--; |
832 | mpol_put(vma_policy(next)); | 868 | vma_set_policy(vma, vma_policy(next)); |
833 | kmem_cache_free(vm_area_cachep, next); | 869 | kmem_cache_free(vm_area_cachep, next); |
834 | /* | 870 | /* |
835 | * In mprotect's case 6 (see comments on vma_merge), | 871 | * In mprotect's case 6 (see comments on vma_merge), |
@@ -1306,7 +1342,9 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, | |||
1306 | } | 1342 | } |
1307 | 1343 | ||
1308 | addr = mmap_region(file, addr, len, vm_flags, pgoff); | 1344 | addr = mmap_region(file, addr, len, vm_flags, pgoff); |
1309 | if (!IS_ERR_VALUE(addr) && (vm_flags & VM_POPULATE)) | 1345 | if (!IS_ERR_VALUE(addr) && |
1346 | ((vm_flags & VM_LOCKED) || | ||
1347 | (flags & (MAP_POPULATE | MAP_NONBLOCK)) == MAP_POPULATE)) | ||
1310 | *populate = len; | 1348 | *populate = len; |
1311 | return addr; | 1349 | return addr; |
1312 | } | 1350 | } |
@@ -1433,6 +1471,23 @@ unsigned long mmap_region(struct file *file, unsigned long addr, | |||
1433 | unsigned long charged = 0; | 1471 | unsigned long charged = 0; |
1434 | struct inode *inode = file ? file_inode(file) : NULL; | 1472 | struct inode *inode = file ? file_inode(file) : NULL; |
1435 | 1473 | ||
1474 | /* Check against address space limit. */ | ||
1475 | if (!may_expand_vm(mm, len >> PAGE_SHIFT)) { | ||
1476 | unsigned long nr_pages; | ||
1477 | |||
1478 | /* | ||
1479 | * MAP_FIXED may remove pages of mappings that intersects with | ||
1480 | * requested mapping. Account for the pages it would unmap. | ||
1481 | */ | ||
1482 | if (!(vm_flags & MAP_FIXED)) | ||
1483 | return -ENOMEM; | ||
1484 | |||
1485 | nr_pages = count_vma_pages_range(mm, addr, addr + len); | ||
1486 | |||
1487 | if (!may_expand_vm(mm, (len >> PAGE_SHIFT) - nr_pages)) | ||
1488 | return -ENOMEM; | ||
1489 | } | ||
1490 | |||
1436 | /* Clear old maps */ | 1491 | /* Clear old maps */ |
1437 | error = -ENOMEM; | 1492 | error = -ENOMEM; |
1438 | munmap_back: | 1493 | munmap_back: |
@@ -1442,10 +1497,6 @@ munmap_back: | |||
1442 | goto munmap_back; | 1497 | goto munmap_back; |
1443 | } | 1498 | } |
1444 | 1499 | ||
1445 | /* Check against address space limit. */ | ||
1446 | if (!may_expand_vm(mm, len >> PAGE_SHIFT)) | ||
1447 | return -ENOMEM; | ||
1448 | |||
1449 | /* | 1500 | /* |
1450 | * Private writable mapping: check memory availability | 1501 | * Private writable mapping: check memory availability |
1451 | */ | 1502 | */ |
@@ -1933,12 +1984,9 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) | |||
1933 | { | 1984 | { |
1934 | struct vm_area_struct *vma = NULL; | 1985 | struct vm_area_struct *vma = NULL; |
1935 | 1986 | ||
1936 | if (WARN_ON_ONCE(!mm)) /* Remove this in linux-3.6 */ | ||
1937 | return NULL; | ||
1938 | |||
1939 | /* Check the cache first. */ | 1987 | /* Check the cache first. */ |
1940 | /* (Cache hit rate is typically around 35%.) */ | 1988 | /* (Cache hit rate is typically around 35%.) */ |
1941 | vma = mm->mmap_cache; | 1989 | vma = ACCESS_ONCE(mm->mmap_cache); |
1942 | if (!(vma && vma->vm_end > addr && vma->vm_start <= addr)) { | 1990 | if (!(vma && vma->vm_end > addr && vma->vm_start <= addr)) { |
1943 | struct rb_node *rb_node; | 1991 | struct rb_node *rb_node; |
1944 | 1992 | ||
@@ -2303,7 +2351,7 @@ static void unmap_region(struct mm_struct *mm, | |||
2303 | update_hiwater_rss(mm); | 2351 | update_hiwater_rss(mm); |
2304 | unmap_vmas(&tlb, vma, start, end); | 2352 | unmap_vmas(&tlb, vma, start, end); |
2305 | free_pgtables(&tlb, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS, | 2353 | free_pgtables(&tlb, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS, |
2306 | next ? next->vm_start : 0); | 2354 | next ? next->vm_start : USER_PGTABLES_CEILING); |
2307 | tlb_finish_mmu(&tlb, start, end); | 2355 | tlb_finish_mmu(&tlb, start, end); |
2308 | } | 2356 | } |
2309 | 2357 | ||
@@ -2683,7 +2731,7 @@ void exit_mmap(struct mm_struct *mm) | |||
2683 | /* Use -1 here to ensure all VMAs in the mm are unmapped */ | 2731 | /* Use -1 here to ensure all VMAs in the mm are unmapped */ |
2684 | unmap_vmas(&tlb, vma, 0, -1); | 2732 | unmap_vmas(&tlb, vma, 0, -1); |
2685 | 2733 | ||
2686 | free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, 0); | 2734 | free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, USER_PGTABLES_CEILING); |
2687 | tlb_finish_mmu(&tlb, 0, -1); | 2735 | tlb_finish_mmu(&tlb, 0, -1); |
2688 | 2736 | ||
2689 | /* | 2737 | /* |
@@ -3095,3 +3143,115 @@ void __init mmap_init(void) | |||
3095 | ret = percpu_counter_init(&vm_committed_as, 0); | 3143 | ret = percpu_counter_init(&vm_committed_as, 0); |
3096 | VM_BUG_ON(ret); | 3144 | VM_BUG_ON(ret); |
3097 | } | 3145 | } |
3146 | |||
3147 | /* | ||
3148 | * Initialise sysctl_user_reserve_kbytes. | ||
3149 | * | ||
3150 | * This is intended to prevent a user from starting a single memory hogging | ||
3151 | * process, such that they cannot recover (kill the hog) in OVERCOMMIT_NEVER | ||
3152 | * mode. | ||
3153 | * | ||
3154 | * The default value is min(3% of free memory, 128MB) | ||
3155 | * 128MB is enough to recover with sshd/login, bash, and top/kill. | ||
3156 | */ | ||
3157 | static int init_user_reserve(void) | ||
3158 | { | ||
3159 | unsigned long free_kbytes; | ||
3160 | |||
3161 | free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10); | ||
3162 | |||
3163 | sysctl_user_reserve_kbytes = min(free_kbytes / 32, 1UL << 17); | ||
3164 | return 0; | ||
3165 | } | ||
3166 | module_init(init_user_reserve) | ||
3167 | |||
3168 | /* | ||
3169 | * Initialise sysctl_admin_reserve_kbytes. | ||
3170 | * | ||
3171 | * The purpose of sysctl_admin_reserve_kbytes is to allow the sys admin | ||
3172 | * to log in and kill a memory hogging process. | ||
3173 | * | ||
3174 | * Systems with more than 256MB will reserve 8MB, enough to recover | ||
3175 | * with sshd, bash, and top in OVERCOMMIT_GUESS. Smaller systems will | ||
3176 | * only reserve 3% of free pages by default. | ||
3177 | */ | ||
3178 | static int init_admin_reserve(void) | ||
3179 | { | ||
3180 | unsigned long free_kbytes; | ||
3181 | |||
3182 | free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10); | ||
3183 | |||
3184 | sysctl_admin_reserve_kbytes = min(free_kbytes / 32, 1UL << 13); | ||
3185 | return 0; | ||
3186 | } | ||
3187 | module_init(init_admin_reserve) | ||
3188 | |||
3189 | /* | ||
3190 | * Reinitialise user and admin reserves if memory is added or removed. | ||
3191 | * | ||
3192 | * The default user reserve max is 128MB, and the default max for the | ||
3193 | * admin reserve is 8MB. These are usually, but not always, enough to | ||
3194 | * enable recovery from a memory hogging process using login/sshd, a shell, | ||
3195 | * and tools like top. It may make sense to increase or even disable the | ||
3196 | * reserve depending on the existence of swap or variations in the recovery | ||
3197 | * tools. So, the admin may have changed them. | ||
3198 | * | ||
3199 | * If memory is added and the reserves have been eliminated or increased above | ||
3200 | * the default max, then we'll trust the admin. | ||
3201 | * | ||
3202 | * If memory is removed and there isn't enough free memory, then we | ||
3203 | * need to reset the reserves. | ||
3204 | * | ||
3205 | * Otherwise keep the reserve set by the admin. | ||
3206 | */ | ||
3207 | static int reserve_mem_notifier(struct notifier_block *nb, | ||
3208 | unsigned long action, void *data) | ||
3209 | { | ||
3210 | unsigned long tmp, free_kbytes; | ||
3211 | |||
3212 | switch (action) { | ||
3213 | case MEM_ONLINE: | ||
3214 | /* Default max is 128MB. Leave alone if modified by operator. */ | ||
3215 | tmp = sysctl_user_reserve_kbytes; | ||
3216 | if (0 < tmp && tmp < (1UL << 17)) | ||
3217 | init_user_reserve(); | ||
3218 | |||
3219 | /* Default max is 8MB. Leave alone if modified by operator. */ | ||
3220 | tmp = sysctl_admin_reserve_kbytes; | ||
3221 | if (0 < tmp && tmp < (1UL << 13)) | ||
3222 | init_admin_reserve(); | ||
3223 | |||
3224 | break; | ||
3225 | case MEM_OFFLINE: | ||
3226 | free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10); | ||
3227 | |||
3228 | if (sysctl_user_reserve_kbytes > free_kbytes) { | ||
3229 | init_user_reserve(); | ||
3230 | pr_info("vm.user_reserve_kbytes reset to %lu\n", | ||
3231 | sysctl_user_reserve_kbytes); | ||
3232 | } | ||
3233 | |||
3234 | if (sysctl_admin_reserve_kbytes > free_kbytes) { | ||
3235 | init_admin_reserve(); | ||
3236 | pr_info("vm.admin_reserve_kbytes reset to %lu\n", | ||
3237 | sysctl_admin_reserve_kbytes); | ||
3238 | } | ||
3239 | break; | ||
3240 | default: | ||
3241 | break; | ||
3242 | } | ||
3243 | return NOTIFY_OK; | ||
3244 | } | ||
3245 | |||
3246 | static struct notifier_block reserve_mem_nb = { | ||
3247 | .notifier_call = reserve_mem_notifier, | ||
3248 | }; | ||
3249 | |||
3250 | static int __meminit init_reserve_notifier(void) | ||
3251 | { | ||
3252 | if (register_hotmemory_notifier(&reserve_mem_nb)) | ||
3253 | printk("Failed registering memory add/remove notifier for admin reserve\n"); | ||
3254 | |||
3255 | return 0; | ||
3256 | } | ||
3257 | module_init(init_reserve_notifier) | ||
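
Both knobs are meant to be tuned at runtime. A small userspace sketch that reads them back, assuming the matching sysctl table entries (outside this mm/ diff) expose them under /proc/sys/vm/:

    #include <stdio.h>

    static long read_knob(const char *path)
    {
            long val = -1;
            FILE *f = fopen(path, "r");

            if (f) {
                    if (fscanf(f, "%ld", &val) != 1)
                            val = -1;
                    fclose(f);
            }
            return val;
    }

    int main(void)
    {
            printf("vm.user_reserve_kbytes  = %ld\n",
                   read_knob("/proc/sys/vm/user_reserve_kbytes"));
            printf("vm.admin_reserve_kbytes = %ld\n",
                   read_knob("/proc/sys/vm/admin_reserve_kbytes"));
            return 0;
    }
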
diff --git a/mm/nobootmem.c b/mm/nobootmem.c index 5e07d36e381e..bdd3fa2fc73b 100644 --- a/mm/nobootmem.c +++ b/mm/nobootmem.c | |||
@@ -45,9 +45,9 @@ static void * __init __alloc_memory_core_early(int nid, u64 size, u64 align, | |||
45 | if (!addr) | 45 | if (!addr) |
46 | return NULL; | 46 | return NULL; |
47 | 47 | ||
48 | memblock_reserve(addr, size); | ||
48 | ptr = phys_to_virt(addr); | 49 | ptr = phys_to_virt(addr); |
49 | memset(ptr, 0, size); | 50 | memset(ptr, 0, size); |
50 | memblock_reserve(addr, size); | ||
51 | /* | 51 | /* |
52 | * The min_count is set to 0 so that bootmem allocated blocks | 52 | * The min_count is set to 0 so that bootmem allocated blocks |
53 | * are never reported as leaks. | 53 | * are never reported as leaks. |
@@ -120,7 +120,7 @@ static unsigned long __init __free_memory_core(phys_addr_t start, | |||
120 | return end_pfn - start_pfn; | 120 | return end_pfn - start_pfn; |
121 | } | 121 | } |
122 | 122 | ||
123 | unsigned long __init free_low_memory_core_early(int nodeid) | 123 | static unsigned long __init free_low_memory_core_early(void) |
124 | { | 124 | { |
125 | unsigned long count = 0; | 125 | unsigned long count = 0; |
126 | phys_addr_t start, end, size; | 126 | phys_addr_t start, end, size; |
@@ -170,7 +170,7 @@ unsigned long __init free_all_bootmem(void) | |||
170 | * because in some case like Node0 doesn't have RAM installed | 170 | * because in some case like Node0 doesn't have RAM installed |
171 | * low ram will be on Node1 | 171 | * low ram will be on Node1 |
172 | */ | 172 | */ |
173 | return free_low_memory_core_early(MAX_NUMNODES); | 173 | return free_low_memory_core_early(); |
174 | } | 174 | } |
175 | 175 | ||
176 | /** | 176 | /** |
diff --git a/mm/nommu.c b/mm/nommu.c index e19328087534..fbe3e2f317eb 100644 --- a/mm/nommu.c +++ b/mm/nommu.c | |||
@@ -63,6 +63,8 @@ int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */ | |||
63 | int sysctl_overcommit_ratio = 50; /* default is 50% */ | 63 | int sysctl_overcommit_ratio = 50; /* default is 50% */ |
64 | int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT; | 64 | int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT; |
65 | int sysctl_nr_trim_pages = CONFIG_NOMMU_INITIAL_TRIM_EXCESS; | 65 | int sysctl_nr_trim_pages = CONFIG_NOMMU_INITIAL_TRIM_EXCESS; |
66 | unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */ | ||
67 | unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */ | ||
66 | int heap_stack_gap = 0; | 68 | int heap_stack_gap = 0; |
67 | 69 | ||
68 | atomic_long_t mmap_pages_allocated; | 70 | atomic_long_t mmap_pages_allocated; |
@@ -228,8 +230,7 @@ int follow_pfn(struct vm_area_struct *vma, unsigned long address, | |||
228 | } | 230 | } |
229 | EXPORT_SYMBOL(follow_pfn); | 231 | EXPORT_SYMBOL(follow_pfn); |
230 | 232 | ||
231 | DEFINE_RWLOCK(vmlist_lock); | 233 | LIST_HEAD(vmap_area_list); |
232 | struct vm_struct *vmlist; | ||
233 | 234 | ||
234 | void vfree(const void *addr) | 235 | void vfree(const void *addr) |
235 | { | 236 | { |
@@ -821,7 +822,7 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) | |||
821 | struct vm_area_struct *vma; | 822 | struct vm_area_struct *vma; |
822 | 823 | ||
823 | /* check the cache first */ | 824 | /* check the cache first */ |
824 | vma = mm->mmap_cache; | 825 | vma = ACCESS_ONCE(mm->mmap_cache); |
825 | if (vma && vma->vm_start <= addr && vma->vm_end > addr) | 826 | if (vma && vma->vm_start <= addr && vma->vm_end > addr) |
826 | return vma; | 827 | return vma; |
827 | 828 | ||
@@ -1838,6 +1839,16 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, | |||
1838 | } | 1839 | } |
1839 | EXPORT_SYMBOL(remap_pfn_range); | 1840 | EXPORT_SYMBOL(remap_pfn_range); |
1840 | 1841 | ||
1842 | int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len) | ||
1843 | { | ||
1844 | unsigned long pfn = start >> PAGE_SHIFT; | ||
1845 | unsigned long vm_len = vma->vm_end - vma->vm_start; | ||
1846 | |||
1847 | pfn += vma->vm_pgoff; | ||
1848 | return io_remap_pfn_range(vma, vma->vm_start, pfn, vm_len, vma->vm_page_prot); | ||
1849 | } | ||
1850 | EXPORT_SYMBOL(vm_iomap_memory); | ||
1851 | |||
1841 | int remap_vmalloc_range(struct vm_area_struct *vma, void *addr, | 1852 | int remap_vmalloc_range(struct vm_area_struct *vma, void *addr, |
1842 | unsigned long pgoff) | 1853 | unsigned long pgoff) |
1843 | { | 1854 | { |
@@ -1888,7 +1899,7 @@ EXPORT_SYMBOL(unmap_mapping_range); | |||
1888 | */ | 1899 | */ |
1889 | int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) | 1900 | int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) |
1890 | { | 1901 | { |
1891 | unsigned long free, allowed; | 1902 | unsigned long free, allowed, reserve; |
1892 | 1903 | ||
1893 | vm_acct_memory(pages); | 1904 | vm_acct_memory(pages); |
1894 | 1905 | ||
@@ -1929,10 +1940,10 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) | |||
1929 | free -= totalreserve_pages; | 1940 | free -= totalreserve_pages; |
1930 | 1941 | ||
1931 | /* | 1942 | /* |
1932 | * Leave the last 3% for root | 1943 | * Reserve some for root |
1933 | */ | 1944 | */ |
1934 | if (!cap_sys_admin) | 1945 | if (!cap_sys_admin) |
1935 | free -= free / 32; | 1946 | free -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10); |
1936 | 1947 | ||
1937 | if (free > pages) | 1948 | if (free > pages) |
1938 | return 0; | 1949 | return 0; |
@@ -1942,16 +1953,19 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) | |||
1942 | 1953 | ||
1943 | allowed = totalram_pages * sysctl_overcommit_ratio / 100; | 1954 | allowed = totalram_pages * sysctl_overcommit_ratio / 100; |
1944 | /* | 1955 | /* |
1945 | * Leave the last 3% for root | 1956 | * Reserve some for root
1946 | */ | 1957 | */ |
1947 | if (!cap_sys_admin) | 1958 | if (!cap_sys_admin) |
1948 | allowed -= allowed / 32; | 1959 | allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10); |
1949 | allowed += total_swap_pages; | 1960 | allowed += total_swap_pages; |
1950 | 1961 | ||
1951 | /* Don't let a single process grow too big: | 1962 | /* |
1952 | leave 3% of the size of this process for other processes */ | 1963 | * Don't let a single process grow so big a user can't recover |
1953 | if (mm) | 1964 | */ |
1954 | allowed -= mm->total_vm / 32; | 1965 | if (mm) { |
1966 | reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10); | ||
1967 | allowed -= min(mm->total_vm / 32, reserve); | ||
1968 | } | ||
1955 | 1969 | ||
1956 | if (percpu_counter_read_positive(&vm_committed_as) < allowed) | 1970 | if (percpu_counter_read_positive(&vm_committed_as) < allowed) |
1957 | return 0; | 1971 | return 0; |
@@ -2113,3 +2127,45 @@ int nommu_shrink_inode_mappings(struct inode *inode, size_t size, | |||
2113 | up_write(&nommu_region_sem); | 2127 | up_write(&nommu_region_sem); |
2114 | return 0; | 2128 | return 0; |
2115 | } | 2129 | } |
2130 | |||
2131 | /* | ||
2132 | * Initialise sysctl_user_reserve_kbytes. | ||
2133 | * | ||
2134 | * This is intended to prevent a user from starting a single memory hogging | ||
2135 | * process, such that they cannot recover (kill the hog) in OVERCOMMIT_NEVER | ||
2136 | * mode. | ||
2137 | * | ||
2138 | * The default value is min(3% of free memory, 128MB) | ||
2139 | * 128MB is enough to recover with sshd/login, bash, and top/kill. | ||
2140 | */ | ||
2141 | static int __meminit init_user_reserve(void) | ||
2142 | { | ||
2143 | unsigned long free_kbytes; | ||
2144 | |||
2145 | free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10); | ||
2146 | |||
2147 | sysctl_user_reserve_kbytes = min(free_kbytes / 32, 1UL << 17); | ||
2148 | return 0; | ||
2149 | } | ||
2150 | module_init(init_user_reserve) | ||
2151 | |||
2152 | /* | ||
2153 | * Initialise sysctl_admin_reserve_kbytes. | ||
2154 | * | ||
2155 | * The purpose of sysctl_admin_reserve_kbytes is to allow the sys admin | ||
2156 | * to log in and kill a memory hogging process. | ||
2157 | * | ||
2158 | * Systems with more than 256MB will reserve 8MB, enough to recover | ||
2159 | * with sshd, bash, and top in OVERCOMMIT_GUESS. Smaller systems will | ||
2160 | * only reserve 3% of free pages by default. | ||
2161 | */ | ||
2162 | static int __meminit init_admin_reserve(void) | ||
2163 | { | ||
2164 | unsigned long free_kbytes; | ||
2165 | |||
2166 | free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10); | ||
2167 | |||
2168 | sysctl_admin_reserve_kbytes = min(free_kbytes / 32, 1UL << 13); | ||
2169 | return 0; | ||
2170 | } | ||
2171 | module_init(init_admin_reserve) | ||
diff --git a/mm/page-writeback.c b/mm/page-writeback.c index efe68148f621..4514ad7415c3 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c | |||
@@ -2311,10 +2311,6 @@ void wait_for_stable_page(struct page *page) | |||
2311 | 2311 | ||
2312 | if (!bdi_cap_stable_pages_required(bdi)) | 2312 | if (!bdi_cap_stable_pages_required(bdi)) |
2313 | return; | 2313 | return; |
2314 | #ifdef CONFIG_NEED_BOUNCE_POOL | ||
2315 | if (mapping->host->i_sb->s_flags & MS_SNAP_STABLE) | ||
2316 | return; | ||
2317 | #endif /* CONFIG_NEED_BOUNCE_POOL */ | ||
2318 | 2314 | ||
2319 | wait_on_page_writeback(page); | 2315 | wait_on_page_writeback(page); |
2320 | } | 2316 | } |
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8fcced7823fa..98cbdf6e5532 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
@@ -58,6 +58,7 @@ | |||
58 | #include <linux/prefetch.h> | 58 | #include <linux/prefetch.h> |
59 | #include <linux/migrate.h> | 59 | #include <linux/migrate.h> |
60 | #include <linux/page-debug-flags.h> | 60 | #include <linux/page-debug-flags.h> |
61 | #include <linux/hugetlb.h> | ||
61 | #include <linux/sched/rt.h> | 62 | #include <linux/sched/rt.h> |
62 | 63 | ||
63 | #include <asm/tlbflush.h> | 64 | #include <asm/tlbflush.h> |
@@ -1397,6 +1398,7 @@ void split_page(struct page *page, unsigned int order) | |||
1397 | for (i = 1; i < (1 << order); i++) | 1398 | for (i = 1; i < (1 << order); i++) |
1398 | set_page_refcounted(page + i); | 1399 | set_page_refcounted(page + i); |
1399 | } | 1400 | } |
1401 | EXPORT_SYMBOL_GPL(split_page); | ||
1400 | 1402 | ||
1401 | static int __isolate_free_page(struct page *page, unsigned int order) | 1403 | static int __isolate_free_page(struct page *page, unsigned int order) |
1402 | { | 1404 | { |
@@ -1940,9 +1942,24 @@ zonelist_scan: | |||
1940 | continue; | 1942 | continue; |
1941 | default: | 1943 | default: |
1942 | /* did we reclaim enough */ | 1944 | /* did we reclaim enough */ |
1943 | if (!zone_watermark_ok(zone, order, mark, | 1945 | if (zone_watermark_ok(zone, order, mark, |
1944 | classzone_idx, alloc_flags)) | 1946 | classzone_idx, alloc_flags)) |
1947 | goto try_this_zone; | ||
1948 | |||
1949 | /* | ||
1950 | * Failed to reclaim enough to meet watermark. | ||
1951 | * Only mark the zone full if checking the min | ||
1952 | * watermark or if we failed to reclaim just | ||
1953 | * 1<<order pages or else the page allocator | ||
1954 | * fastpath will prematurely mark zones full | ||
1955 | * when the watermark is between the low and | ||
1956 | * min watermarks. | ||
1957 | */ | ||
1958 | if (((alloc_flags & ALLOC_WMARK_MASK) == ALLOC_WMARK_MIN) || | ||
1959 | ret == ZONE_RECLAIM_SOME) | ||
1945 | goto this_zone_full; | 1960 | goto this_zone_full; |
1961 | |||
1962 | continue; | ||
1946 | } | 1963 | } |
1947 | } | 1964 | } |
1948 | 1965 | ||
@@ -2002,6 +2019,13 @@ void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...) | |||
2002 | return; | 2019 | return; |
2003 | 2020 | ||
2004 | /* | 2021 | /* |
2022 | * Walking all memory to count page types is very expensive and should | ||
2023 | * be inhibited in non-blockable contexts. | ||
2024 | */ | ||
2025 | if (!(gfp_mask & __GFP_WAIT)) | ||
2026 | filter |= SHOW_MEM_FILTER_PAGE_COUNT; | ||
2027 | |||
2028 | /* | ||
2005 | * This documents exceptions given to allocations in certain | 2029 | * This documents exceptions given to allocations in certain |
2006 | * contexts that are allowed to allocate outside current's set | 2030 | * contexts that are allowed to allocate outside current's set |
2007 | * of allowed nodes. | 2031 | * of allowed nodes. |
@@ -3105,6 +3129,8 @@ void show_free_areas(unsigned int filter) | |||
3105 | printk("= %lukB\n", K(total)); | 3129 | printk("= %lukB\n", K(total)); |
3106 | } | 3130 | } |
3107 | 3131 | ||
3132 | hugetlb_show_meminfo(); | ||
3133 | |||
3108 | printk("%ld total pagecache pages\n", global_page_state(NR_FILE_PAGES)); | 3134 | printk("%ld total pagecache pages\n", global_page_state(NR_FILE_PAGES)); |
3109 | 3135 | ||
3110 | show_swap_cache_info(); | 3136 | show_swap_cache_info(); |
@@ -4161,10 +4187,23 @@ int __meminit __early_pfn_to_nid(unsigned long pfn) | |||
4161 | { | 4187 | { |
4162 | unsigned long start_pfn, end_pfn; | 4188 | unsigned long start_pfn, end_pfn; |
4163 | int i, nid; | 4189 | int i, nid; |
4190 | /* | ||
4191 | * NOTE: The following SMP-unsafe globals are only used early in boot | ||
4192 | * when the kernel is running single-threaded. | ||
4193 | */ | ||
4194 | static unsigned long __meminitdata last_start_pfn, last_end_pfn; | ||
4195 | static int __meminitdata last_nid; | ||
4196 | |||
4197 | if (last_start_pfn <= pfn && pfn < last_end_pfn) | ||
4198 | return last_nid; | ||
4164 | 4199 | ||
4165 | for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) | 4200 | for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) |
4166 | if (start_pfn <= pfn && pfn < end_pfn) | 4201 | if (start_pfn <= pfn && pfn < end_pfn) { |
4202 | last_start_pfn = start_pfn; | ||
4203 | last_end_pfn = end_pfn; | ||
4204 | last_nid = nid; | ||
4167 | return nid; | 4205 | return nid; |
4206 | } | ||
4168 | /* This is a memory hole */ | 4207 | /* This is a memory hole */ |
4169 | return -1; | 4208 | return -1; |
4170 | } | 4209 | } |
@@ -4710,7 +4749,7 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, | |||
4710 | /* | 4749 | /* |
4711 | * Figure out the number of possible node ids. | 4750 | * Figure out the number of possible node ids. |
4712 | */ | 4751 | */ |
4713 | static void __init setup_nr_node_ids(void) | 4752 | void __init setup_nr_node_ids(void) |
4714 | { | 4753 | { |
4715 | unsigned int node; | 4754 | unsigned int node; |
4716 | unsigned int highest = 0; | 4755 | unsigned int highest = 0; |
@@ -4719,10 +4758,6 @@ static void __init setup_nr_node_ids(void) | |||
4719 | highest = node; | 4758 | highest = node; |
4720 | nr_node_ids = highest + 1; | 4759 | nr_node_ids = highest + 1; |
4721 | } | 4760 | } |
4722 | #else | ||
4723 | static inline void setup_nr_node_ids(void) | ||
4724 | { | ||
4725 | } | ||
4726 | #endif | 4761 | #endif |
4727 | 4762 | ||
4728 | /** | 4763 | /** |
@@ -5113,6 +5148,35 @@ early_param("movablecore", cmdline_parse_movablecore); | |||
5113 | 5148 | ||
5114 | #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ | 5149 | #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ |
5115 | 5150 | ||
5151 | unsigned long free_reserved_area(unsigned long start, unsigned long end, | ||
5152 | int poison, char *s) | ||
5153 | { | ||
5154 | unsigned long pages, pos; | ||
5155 | |||
5156 | pos = start = PAGE_ALIGN(start); | ||
5157 | end &= PAGE_MASK; | ||
5158 | for (pages = 0; pos < end; pos += PAGE_SIZE, pages++) { | ||
5159 | if (poison) | ||
5160 | memset((void *)pos, poison, PAGE_SIZE); | ||
5161 | free_reserved_page(virt_to_page(pos)); | ||
5162 | } | ||
5163 | |||
5164 | if (pages && s) | ||
5165 | pr_info("Freeing %s memory: %ldK (%lx - %lx)\n", | ||
5166 | s, pages << (PAGE_SHIFT - 10), start, end); | ||
5167 | |||
5168 | return pages; | ||
5169 | } | ||
5170 | |||
5171 | #ifdef CONFIG_HIGHMEM | ||
5172 | void free_highmem_page(struct page *page) | ||
5173 | { | ||
5174 | __free_reserved_page(page); | ||
5175 | totalram_pages++; | ||
5176 | totalhigh_pages++; | ||
5177 | } | ||
5178 | #endif | ||
5179 | |||
5116 | /** | 5180 | /** |
5117 | * set_dma_reserve - set the specified number of pages reserved in the first zone | 5181 | * set_dma_reserve - set the specified number of pages reserved in the first zone |
5118 | * @new_dma_reserve: The number of pages to mark reserved | 5182 | * @new_dma_reserve: The number of pages to mark reserved |
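The new free_reserved_area() helper above replaces the open-coded loops architectures use to hand reserved ranges (typically the __init sections) back to the buddy allocator. A minimal sketch of a caller under that assumption; the arch hook below and the "unused kernel" label are illustrative, not part of this diff, and a poison value of 0 simply skips the memset:

	/* Illustrative arch hook, not part of this patch. */
	void free_initmem(void)
	{
		/* poison == 0 skips poisoning; "unused kernel" is only a log label */
		free_reserved_area((unsigned long)&__init_begin,
				   (unsigned long)&__init_end,
				   0, "unused kernel");
	}

The helper returns the number of pages released, which a caller may log or account as it sees fit.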
diff --git a/mm/page_io.c b/mm/page_io.c index 78eee32ee486..bb5d75274686 100644 --- a/mm/page_io.c +++ b/mm/page_io.c | |||
@@ -42,7 +42,7 @@ static struct bio *get_swap_bio(gfp_t gfp_flags, | |||
42 | return bio; | 42 | return bio; |
43 | } | 43 | } |
44 | 44 | ||
45 | static void end_swap_bio_write(struct bio *bio, int err) | 45 | void end_swap_bio_write(struct bio *bio, int err) |
46 | { | 46 | { |
47 | const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); | 47 | const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); |
48 | struct page *page = bio->bi_io_vec[0].bv_page; | 48 | struct page *page = bio->bi_io_vec[0].bv_page; |
@@ -185,9 +185,7 @@ bad_bmap: | |||
185 | */ | 185 | */ |
186 | int swap_writepage(struct page *page, struct writeback_control *wbc) | 186 | int swap_writepage(struct page *page, struct writeback_control *wbc) |
187 | { | 187 | { |
188 | struct bio *bio; | 188 | int ret = 0; |
189 | int ret = 0, rw = WRITE; | ||
190 | struct swap_info_struct *sis = page_swap_info(page); | ||
191 | 189 | ||
192 | if (try_to_free_swap(page)) { | 190 | if (try_to_free_swap(page)) { |
193 | unlock_page(page); | 191 | unlock_page(page); |
@@ -199,6 +197,17 @@ int swap_writepage(struct page *page, struct writeback_control *wbc) | |||
199 | end_page_writeback(page); | 197 | end_page_writeback(page); |
200 | goto out; | 198 | goto out; |
201 | } | 199 | } |
200 | ret = __swap_writepage(page, wbc, end_swap_bio_write); | ||
201 | out: | ||
202 | return ret; | ||
203 | } | ||
204 | |||
205 | int __swap_writepage(struct page *page, struct writeback_control *wbc, | ||
206 | void (*end_write_func)(struct bio *, int)) | ||
207 | { | ||
208 | struct bio *bio; | ||
209 | int ret = 0, rw = WRITE; | ||
210 | struct swap_info_struct *sis = page_swap_info(page); | ||
202 | 211 | ||
203 | if (sis->flags & SWP_FILE) { | 212 | if (sis->flags & SWP_FILE) { |
204 | struct kiocb kiocb; | 213 | struct kiocb kiocb; |
@@ -214,6 +223,7 @@ int swap_writepage(struct page *page, struct writeback_control *wbc) | |||
214 | kiocb.ki_left = PAGE_SIZE; | 223 | kiocb.ki_left = PAGE_SIZE; |
215 | kiocb.ki_nbytes = PAGE_SIZE; | 224 | kiocb.ki_nbytes = PAGE_SIZE; |
216 | 225 | ||
226 | set_page_writeback(page); | ||
217 | unlock_page(page); | 227 | unlock_page(page); |
218 | ret = mapping->a_ops->direct_IO(KERNEL_WRITE, | 228 | ret = mapping->a_ops->direct_IO(KERNEL_WRITE, |
219 | &kiocb, &iov, | 229 | &kiocb, &iov, |
@@ -222,11 +232,27 @@ int swap_writepage(struct page *page, struct writeback_control *wbc) | |||
222 | if (ret == PAGE_SIZE) { | 232 | if (ret == PAGE_SIZE) { |
223 | count_vm_event(PSWPOUT); | 233 | count_vm_event(PSWPOUT); |
224 | ret = 0; | 234 | ret = 0; |
235 | } else { | ||
236 | /* | ||
237 | * In the case of swap-over-nfs, this can be a | ||
238 | * temporary failure if the system has limited | ||
239 | * memory for allocating transmit buffers. | ||
240 | * Mark the page dirty and avoid | ||
241 | * rotate_reclaimable_page; rate-limit the | ||
242 | * error messages but do not set PageError as | ||
243 | * in the normal direct-to-bio case, since the | ||
244 | * failure may be temporary. | ||
245 | */ | ||
246 | set_page_dirty(page); | ||
247 | ClearPageReclaim(page); | ||
248 | pr_err_ratelimited("Write error on dio swapfile (%Lu)\n", | ||
249 | page_file_offset(page)); | ||
225 | } | 250 | } |
251 | end_page_writeback(page); | ||
226 | return ret; | 252 | return ret; |
227 | } | 253 | } |
228 | 254 | ||
229 | bio = get_swap_bio(GFP_NOIO, page, end_swap_bio_write); | 255 | bio = get_swap_bio(GFP_NOIO, page, end_write_func); |
230 | if (bio == NULL) { | 256 | if (bio == NULL) { |
231 | set_page_dirty(page); | 257 | set_page_dirty(page); |
232 | unlock_page(page); | 258 | unlock_page(page); |
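swap_writepage() is split above into a thin wrapper plus __swap_writepage(), which takes the bio completion callback as a parameter, and end_swap_bio_write() loses its static marker. A hedged sketch of how another caller could reuse the swap write path with its own completion hook; the my_* names are hypothetical and only __swap_writepage()/end_swap_bio_write() come from this patch:

	static void my_end_swap_write(struct bio *bio, int err)
	{
		/* private bookkeeping would go here ... */
		end_swap_bio_write(bio, err);	/* then the stock completion */
	}

	static int my_swap_writepage(struct page *page,
				     struct writeback_control *wbc)
	{
		/* page is expected to be locked, as for swap_writepage() */
		return __swap_writepage(page, wbc, my_end_swap_write);
	}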
diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c index 926b46649749..fd26d0433509 100644 --- a/mm/process_vm_access.c +++ b/mm/process_vm_access.c | |||
@@ -429,12 +429,6 @@ compat_process_vm_rw(compat_pid_t pid, | |||
429 | if (flags != 0) | 429 | if (flags != 0) |
430 | return -EINVAL; | 430 | return -EINVAL; |
431 | 431 | ||
432 | if (!access_ok(VERIFY_READ, lvec, liovcnt * sizeof(*lvec))) | ||
433 | goto out; | ||
434 | |||
435 | if (!access_ok(VERIFY_READ, rvec, riovcnt * sizeof(*rvec))) | ||
436 | goto out; | ||
437 | |||
438 | if (vm_write) | 432 | if (vm_write) |
439 | rc = compat_rw_copy_check_uvector(WRITE, lvec, liovcnt, | 433 | rc = compat_rw_copy_check_uvector(WRITE, lvec, liovcnt, |
440 | UIO_FASTIOV, iovstack_l, | 434 | UIO_FASTIOV, iovstack_l, |
@@ -459,8 +453,6 @@ free_iovecs: | |||
459 | kfree(iov_r); | 453 | kfree(iov_r); |
460 | if (iov_l != iovstack_l) | 454 | if (iov_l != iovstack_l) |
461 | kfree(iov_l); | 455 | kfree(iov_l); |
462 | |||
463 | out: | ||
464 | return rc; | 456 | return rc; |
465 | } | 457 | } |
466 | 458 | ||
@@ -1513,6 +1513,9 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) | |||
1513 | unsigned long max_nl_size = 0; | 1513 | unsigned long max_nl_size = 0; |
1514 | unsigned int mapcount; | 1514 | unsigned int mapcount; |
1515 | 1515 | ||
1516 | if (PageHuge(page)) | ||
1517 | pgoff = page->index << compound_order(page); | ||
1518 | |||
1516 | mutex_lock(&mapping->i_mmap_mutex); | 1519 | mutex_lock(&mapping->i_mmap_mutex); |
1517 | vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { | 1520 | vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { |
1518 | unsigned long address = vma_address(page, vma); | 1521 | unsigned long address = vma_address(page, vma); |
diff --git a/mm/shmem.c b/mm/shmem.c index 1c44af71fcf5..39b2a0b86fe8 100644 --- a/mm/shmem.c +++ b/mm/shmem.c | |||
@@ -25,6 +25,7 @@ | |||
25 | #include <linux/init.h> | 25 | #include <linux/init.h> |
26 | #include <linux/vfs.h> | 26 | #include <linux/vfs.h> |
27 | #include <linux/mount.h> | 27 | #include <linux/mount.h> |
28 | #include <linux/ramfs.h> | ||
28 | #include <linux/pagemap.h> | 29 | #include <linux/pagemap.h> |
29 | #include <linux/file.h> | 30 | #include <linux/file.h> |
30 | #include <linux/mm.h> | 31 | #include <linux/mm.h> |
@@ -2830,8 +2831,6 @@ out4: | |||
2830 | * effectively equivalent, but much lighter weight. | 2831 | * effectively equivalent, but much lighter weight. |
2831 | */ | 2832 | */ |
2832 | 2833 | ||
2833 | #include <linux/ramfs.h> | ||
2834 | |||
2835 | static struct file_system_type shmem_fs_type = { | 2834 | static struct file_system_type shmem_fs_type = { |
2836 | .name = "tmpfs", | 2835 | .name = "tmpfs", |
2837 | .mount = ramfs_mount, | 2836 | .mount = ramfs_mount, |
@@ -2931,11 +2930,9 @@ struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags | |||
2931 | d_instantiate(path.dentry, inode); | 2930 | d_instantiate(path.dentry, inode); |
2932 | inode->i_size = size; | 2931 | inode->i_size = size; |
2933 | clear_nlink(inode); /* It is unlinked */ | 2932 | clear_nlink(inode); /* It is unlinked */ |
2934 | #ifndef CONFIG_MMU | ||
2935 | res = ERR_PTR(ramfs_nommu_expand_for_mapping(inode, size)); | 2933 | res = ERR_PTR(ramfs_nommu_expand_for_mapping(inode, size)); |
2936 | if (IS_ERR(res)) | 2934 | if (IS_ERR(res)) |
2937 | goto put_dentry; | 2935 | goto put_dentry; |
2938 | #endif | ||
2939 | 2936 | ||
2940 | res = alloc_file(&path, FMODE_WRITE | FMODE_READ, | 2937 | res = alloc_file(&path, FMODE_WRITE | FMODE_READ, |
2941 | &shmem_file_operations); | 2938 | &shmem_file_operations); |
@@ -2040,11 +2040,9 @@ static void print_objinfo(struct kmem_cache *cachep, void *objp, int lines) | |||
2040 | } | 2040 | } |
2041 | 2041 | ||
2042 | if (cachep->flags & SLAB_STORE_USER) { | 2042 | if (cachep->flags & SLAB_STORE_USER) { |
2043 | printk(KERN_ERR "Last user: [<%p>]", | 2043 | printk(KERN_ERR "Last user: [<%p>](%pSR)\n", |
2044 | *dbg_userword(cachep, objp)); | 2044 | *dbg_userword(cachep, objp), |
2045 | print_symbol("(%s)", | 2045 | *dbg_userword(cachep, objp)); |
2046 | (unsigned long)*dbg_userword(cachep, objp)); | ||
2047 | printk("\n"); | ||
2048 | } | 2046 | } |
2049 | realobj = (char *)objp + obj_offset(cachep); | 2047 | realobj = (char *)objp + obj_offset(cachep); |
2050 | size = cachep->object_size; | 2048 | size = cachep->object_size; |
@@ -18,6 +18,7 @@ | |||
18 | #include <linux/slab.h> | 18 | #include <linux/slab.h> |
19 | #include "slab.h" | 19 | #include "slab.h" |
20 | #include <linux/proc_fs.h> | 20 | #include <linux/proc_fs.h> |
21 | #include <linux/notifier.h> | ||
21 | #include <linux/seq_file.h> | 22 | #include <linux/seq_file.h> |
22 | #include <linux/kmemcheck.h> | 23 | #include <linux/kmemcheck.h> |
23 | #include <linux/cpu.h> | 24 | #include <linux/cpu.h> |
@@ -3483,7 +3484,6 @@ int kmem_cache_shrink(struct kmem_cache *s) | |||
3483 | } | 3484 | } |
3484 | EXPORT_SYMBOL(kmem_cache_shrink); | 3485 | EXPORT_SYMBOL(kmem_cache_shrink); |
3485 | 3486 | ||
3486 | #if defined(CONFIG_MEMORY_HOTPLUG) | ||
3487 | static int slab_mem_going_offline_callback(void *arg) | 3487 | static int slab_mem_going_offline_callback(void *arg) |
3488 | { | 3488 | { |
3489 | struct kmem_cache *s; | 3489 | struct kmem_cache *s; |
@@ -3598,7 +3598,10 @@ static int slab_memory_callback(struct notifier_block *self, | |||
3598 | return ret; | 3598 | return ret; |
3599 | } | 3599 | } |
3600 | 3600 | ||
3601 | #endif /* CONFIG_MEMORY_HOTPLUG */ | 3601 | static struct notifier_block slab_memory_callback_nb = { |
3602 | .notifier_call = slab_memory_callback, | ||
3603 | .priority = SLAB_CALLBACK_PRI, | ||
3604 | }; | ||
3602 | 3605 | ||
3603 | /******************************************************************** | 3606 | /******************************************************************** |
3604 | * Basic setup of slabs | 3607 | * Basic setup of slabs |
@@ -3651,7 +3654,7 @@ void __init kmem_cache_init(void) | |||
3651 | create_boot_cache(kmem_cache_node, "kmem_cache_node", | 3654 | create_boot_cache(kmem_cache_node, "kmem_cache_node", |
3652 | sizeof(struct kmem_cache_node), SLAB_HWCACHE_ALIGN); | 3655 | sizeof(struct kmem_cache_node), SLAB_HWCACHE_ALIGN); |
3653 | 3656 | ||
3654 | hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI); | 3657 | register_hotmemory_notifier(&slab_memory_callback_nb); |
3655 | 3658 | ||
3656 | /* Able to allocate the per node structures */ | 3659 | /* Able to allocate the per node structures */ |
3657 | slab_state = PARTIAL; | 3660 | slab_state = PARTIAL; |
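The slub.c conversion above swaps hotplug_memory_notifier() for a static notifier_block registered via register_hotmemory_notifier(), which is what lets the surrounding #ifdef CONFIG_MEMORY_HOTPLUG go away (the helper is expected to compile to a no-op when hotplug is not configured). A sketch of the same pattern in an arbitrary subsystem; the foo_* names are made up:

	static int foo_memory_callback(struct notifier_block *self,
				       unsigned long action, void *arg)
	{
		/* react to MEM_GOING_OFFLINE, MEM_ONLINE, ... as needed */
		return NOTIFY_OK;
	}

	static struct notifier_block foo_memory_nb = {
		.notifier_call = foo_memory_callback,
		.priority = 0,
	};

	static int __init foo_init(void)
	{
		/* was: hotplug_memory_notifier(foo_memory_callback, 0); */
		register_hotmemory_notifier(&foo_memory_nb);
		return 0;
	}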
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index 1b7e22ab9b09..27eeab3be757 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c | |||
@@ -53,10 +53,12 @@ void * __meminit vmemmap_alloc_block(unsigned long size, int node) | |||
53 | struct page *page; | 53 | struct page *page; |
54 | 54 | ||
55 | if (node_state(node, N_HIGH_MEMORY)) | 55 | if (node_state(node, N_HIGH_MEMORY)) |
56 | page = alloc_pages_node(node, | 56 | page = alloc_pages_node( |
57 | GFP_KERNEL | __GFP_ZERO, get_order(size)); | 57 | node, GFP_KERNEL | __GFP_ZERO | __GFP_REPEAT, |
58 | get_order(size)); | ||
58 | else | 59 | else |
59 | page = alloc_pages(GFP_KERNEL | __GFP_ZERO, | 60 | page = alloc_pages( |
61 | GFP_KERNEL | __GFP_ZERO | __GFP_REPEAT, | ||
60 | get_order(size)); | 62 | get_order(size)); |
61 | if (page) | 63 | if (page) |
62 | return page_address(page); | 64 | return page_address(page); |
@@ -145,11 +147,10 @@ pgd_t * __meminit vmemmap_pgd_populate(unsigned long addr, int node) | |||
145 | return pgd; | 147 | return pgd; |
146 | } | 148 | } |
147 | 149 | ||
148 | int __meminit vmemmap_populate_basepages(struct page *start_page, | 150 | int __meminit vmemmap_populate_basepages(unsigned long start, |
149 | unsigned long size, int node) | 151 | unsigned long end, int node) |
150 | { | 152 | { |
151 | unsigned long addr = (unsigned long)start_page; | 153 | unsigned long addr = start; |
152 | unsigned long end = (unsigned long)(start_page + size); | ||
153 | pgd_t *pgd; | 154 | pgd_t *pgd; |
154 | pud_t *pud; | 155 | pud_t *pud; |
155 | pmd_t *pmd; | 156 | pmd_t *pmd; |
@@ -176,9 +177,15 @@ int __meminit vmemmap_populate_basepages(struct page *start_page, | |||
176 | 177 | ||
177 | struct page * __meminit sparse_mem_map_populate(unsigned long pnum, int nid) | 178 | struct page * __meminit sparse_mem_map_populate(unsigned long pnum, int nid) |
178 | { | 179 | { |
179 | struct page *map = pfn_to_page(pnum * PAGES_PER_SECTION); | 180 | unsigned long start; |
180 | int error = vmemmap_populate(map, PAGES_PER_SECTION, nid); | 181 | unsigned long end; |
181 | if (error) | 182 | struct page *map; |
183 | |||
184 | map = pfn_to_page(pnum * PAGES_PER_SECTION); | ||
185 | start = (unsigned long)map; | ||
186 | end = (unsigned long)(map + PAGES_PER_SECTION); | ||
187 | |||
188 | if (vmemmap_populate(start, end, nid)) | ||
182 | return NULL; | 189 | return NULL; |
183 | 190 | ||
184 | return map; | 191 | return map; |
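vmemmap_populate_basepages() — and, per the callers above, vmemmap_populate()/vmemmap_free() — now take a [start, end) virtual address range instead of a page pointer plus a count. A minimal sketch of what an architecture without special huge-page vmemmap support would provide under the new convention; this mirrors the generic fallback and is illustrative only:

	int __meminit vmemmap_populate(unsigned long start, unsigned long end,
				       int node)
	{
		/* map the struct page range with base pages only */
		return vmemmap_populate_basepages(start, end, node);
	}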
diff --git a/mm/sparse.c b/mm/sparse.c index 7ca6dc847947..1c91f0d3f6ab 100644 --- a/mm/sparse.c +++ b/mm/sparse.c | |||
@@ -615,12 +615,20 @@ static inline struct page *kmalloc_section_memmap(unsigned long pnum, int nid, | |||
615 | } | 615 | } |
616 | static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages) | 616 | static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages) |
617 | { | 617 | { |
618 | vmemmap_free(memmap, nr_pages); | 618 | unsigned long start = (unsigned long)memmap; |
619 | unsigned long end = (unsigned long)(memmap + nr_pages); | ||
620 | |||
621 | vmemmap_free(start, end); | ||
619 | } | 622 | } |
623 | #ifdef CONFIG_MEMORY_HOTREMOVE | ||
620 | static void free_map_bootmem(struct page *memmap, unsigned long nr_pages) | 624 | static void free_map_bootmem(struct page *memmap, unsigned long nr_pages) |
621 | { | 625 | { |
622 | vmemmap_free(memmap, nr_pages); | 626 | unsigned long start = (unsigned long)memmap; |
627 | unsigned long end = (unsigned long)(memmap + nr_pages); | ||
628 | |||
629 | vmemmap_free(start, end); | ||
623 | } | 630 | } |
631 | #endif /* CONFIG_MEMORY_HOTREMOVE */ | ||
624 | #else | 632 | #else |
625 | static struct page *__kmalloc_section_memmap(unsigned long nr_pages) | 633 | static struct page *__kmalloc_section_memmap(unsigned long nr_pages) |
626 | { | 634 | { |
@@ -658,6 +666,7 @@ static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages) | |||
658 | get_order(sizeof(struct page) * nr_pages)); | 666 | get_order(sizeof(struct page) * nr_pages)); |
659 | } | 667 | } |
660 | 668 | ||
669 | #ifdef CONFIG_MEMORY_HOTREMOVE | ||
661 | static void free_map_bootmem(struct page *memmap, unsigned long nr_pages) | 670 | static void free_map_bootmem(struct page *memmap, unsigned long nr_pages) |
662 | { | 671 | { |
663 | unsigned long maps_section_nr, removing_section_nr, i; | 672 | unsigned long maps_section_nr, removing_section_nr, i; |
@@ -684,40 +693,9 @@ static void free_map_bootmem(struct page *memmap, unsigned long nr_pages) | |||
684 | put_page_bootmem(page); | 693 | put_page_bootmem(page); |
685 | } | 694 | } |
686 | } | 695 | } |
696 | #endif /* CONFIG_MEMORY_HOTREMOVE */ | ||
687 | #endif /* CONFIG_SPARSEMEM_VMEMMAP */ | 697 | #endif /* CONFIG_SPARSEMEM_VMEMMAP */ |
688 | 698 | ||
689 | static void free_section_usemap(struct page *memmap, unsigned long *usemap) | ||
690 | { | ||
691 | struct page *usemap_page; | ||
692 | unsigned long nr_pages; | ||
693 | |||
694 | if (!usemap) | ||
695 | return; | ||
696 | |||
697 | usemap_page = virt_to_page(usemap); | ||
698 | /* | ||
699 | * Check to see if allocation came from hot-plug-add | ||
700 | */ | ||
701 | if (PageSlab(usemap_page) || PageCompound(usemap_page)) { | ||
702 | kfree(usemap); | ||
703 | if (memmap) | ||
704 | __kfree_section_memmap(memmap, PAGES_PER_SECTION); | ||
705 | return; | ||
706 | } | ||
707 | |||
708 | /* | ||
709 | * The usemap came from bootmem. This is packed with other usemaps | ||
710 | * on the section which has pgdat at boot time. Just keep it as is now. | ||
711 | */ | ||
712 | |||
713 | if (memmap) { | ||
714 | nr_pages = PAGE_ALIGN(PAGES_PER_SECTION * sizeof(struct page)) | ||
715 | >> PAGE_SHIFT; | ||
716 | |||
717 | free_map_bootmem(memmap, nr_pages); | ||
718 | } | ||
719 | } | ||
720 | |||
721 | /* | 699 | /* |
722 | * returns the number of sections whose mem_maps were properly | 700 | * returns the number of sections whose mem_maps were properly |
723 | * set. If this is <=0, then that means that the passed-in | 701 | * set. If this is <=0, then that means that the passed-in |
@@ -794,6 +772,39 @@ static inline void clear_hwpoisoned_pages(struct page *memmap, int nr_pages) | |||
794 | } | 772 | } |
795 | #endif | 773 | #endif |
796 | 774 | ||
775 | #ifdef CONFIG_MEMORY_HOTREMOVE | ||
776 | static void free_section_usemap(struct page *memmap, unsigned long *usemap) | ||
777 | { | ||
778 | struct page *usemap_page; | ||
779 | unsigned long nr_pages; | ||
780 | |||
781 | if (!usemap) | ||
782 | return; | ||
783 | |||
784 | usemap_page = virt_to_page(usemap); | ||
785 | /* | ||
786 | * Check to see if allocation came from hot-plug-add | ||
787 | */ | ||
788 | if (PageSlab(usemap_page) || PageCompound(usemap_page)) { | ||
789 | kfree(usemap); | ||
790 | if (memmap) | ||
791 | __kfree_section_memmap(memmap, PAGES_PER_SECTION); | ||
792 | return; | ||
793 | } | ||
794 | |||
795 | /* | ||
796 | * The usemap came from bootmem. This is packed with other usemaps | ||
797 | * on the section which has pgdat at boot time. Just keep it as is now. | ||
798 | */ | ||
799 | |||
800 | if (memmap) { | ||
801 | nr_pages = PAGE_ALIGN(PAGES_PER_SECTION * sizeof(struct page)) | ||
802 | >> PAGE_SHIFT; | ||
803 | |||
804 | free_map_bootmem(memmap, nr_pages); | ||
805 | } | ||
806 | } | ||
807 | |||
797 | void sparse_remove_one_section(struct zone *zone, struct mem_section *ms) | 808 | void sparse_remove_one_section(struct zone *zone, struct mem_section *ms) |
798 | { | 809 | { |
799 | struct page *memmap = NULL; | 810 | struct page *memmap = NULL; |
@@ -813,4 +824,5 @@ void sparse_remove_one_section(struct zone *zone, struct mem_section *ms) | |||
813 | clear_hwpoisoned_pages(memmap, PAGES_PER_SECTION); | 824 | clear_hwpoisoned_pages(memmap, PAGES_PER_SECTION); |
814 | free_section_usemap(memmap, usemap); | 825 | free_section_usemap(memmap, usemap); |
815 | } | 826 | } |
816 | #endif | 827 | #endif /* CONFIG_MEMORY_HOTREMOVE */ |
828 | #endif /* CONFIG_MEMORY_HOTPLUG */ | ||
@@ -737,7 +737,7 @@ EXPORT_SYMBOL(__pagevec_release); | |||
737 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | 737 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE |
738 | /* used by __split_huge_page_refcount() */ | 738 | /* used by __split_huge_page_refcount() */ |
739 | void lru_add_page_tail(struct page *page, struct page *page_tail, | 739 | void lru_add_page_tail(struct page *page, struct page *page_tail, |
740 | struct lruvec *lruvec) | 740 | struct lruvec *lruvec, struct list_head *list) |
741 | { | 741 | { |
742 | int uninitialized_var(active); | 742 | int uninitialized_var(active); |
743 | enum lru_list lru; | 743 | enum lru_list lru; |
@@ -749,7 +749,8 @@ void lru_add_page_tail(struct page *page, struct page *page_tail, | |||
749 | VM_BUG_ON(NR_CPUS != 1 && | 749 | VM_BUG_ON(NR_CPUS != 1 && |
750 | !spin_is_locked(&lruvec_zone(lruvec)->lru_lock)); | 750 | !spin_is_locked(&lruvec_zone(lruvec)->lru_lock)); |
751 | 751 | ||
752 | SetPageLRU(page_tail); | 752 | if (!list) |
753 | SetPageLRU(page_tail); | ||
753 | 754 | ||
754 | if (page_evictable(page_tail)) { | 755 | if (page_evictable(page_tail)) { |
755 | if (PageActive(page)) { | 756 | if (PageActive(page)) { |
@@ -767,7 +768,11 @@ void lru_add_page_tail(struct page *page, struct page *page_tail, | |||
767 | 768 | ||
768 | if (likely(PageLRU(page))) | 769 | if (likely(PageLRU(page))) |
769 | list_add_tail(&page_tail->lru, &page->lru); | 770 | list_add_tail(&page_tail->lru, &page->lru); |
770 | else { | 771 | else if (list) { |
772 | /* page reclaim is reclaiming a huge page */ | ||
773 | get_page(page_tail); | ||
774 | list_add_tail(&page_tail->lru, list); | ||
775 | } else { | ||
771 | struct list_head *list_head; | 776 | struct list_head *list_head; |
772 | /* | 777 | /* |
773 | * Head page has not yet been counted, as an hpage, | 778 | * Head page has not yet been counted, as an hpage, |
diff --git a/mm/swap_state.c b/mm/swap_state.c index 7efcf1525921..b3d40dcf3624 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c | |||
@@ -78,7 +78,7 @@ void show_swap_cache_info(void) | |||
78 | * __add_to_swap_cache resembles add_to_page_cache_locked on swapper_space, | 78 | * __add_to_swap_cache resembles add_to_page_cache_locked on swapper_space, |
79 | * but sets SwapCache flag and private instead of mapping and index. | 79 | * but sets SwapCache flag and private instead of mapping and index. |
80 | */ | 80 | */ |
81 | static int __add_to_swap_cache(struct page *page, swp_entry_t entry) | 81 | int __add_to_swap_cache(struct page *page, swp_entry_t entry) |
82 | { | 82 | { |
83 | int error; | 83 | int error; |
84 | struct address_space *address_space; | 84 | struct address_space *address_space; |
@@ -160,7 +160,7 @@ void __delete_from_swap_cache(struct page *page) | |||
160 | * Allocate swap space for the page and add the page to the | 160 | * Allocate swap space for the page and add the page to the |
161 | * swap cache. Caller needs to hold the page lock. | 161 | * swap cache. Caller needs to hold the page lock. |
162 | */ | 162 | */ |
163 | int add_to_swap(struct page *page) | 163 | int add_to_swap(struct page *page, struct list_head *list) |
164 | { | 164 | { |
165 | swp_entry_t entry; | 165 | swp_entry_t entry; |
166 | int err; | 166 | int err; |
@@ -173,7 +173,7 @@ int add_to_swap(struct page *page) | |||
173 | return 0; | 173 | return 0; |
174 | 174 | ||
175 | if (unlikely(PageTransHuge(page))) | 175 | if (unlikely(PageTransHuge(page))) |
176 | if (unlikely(split_huge_page(page))) { | 176 | if (unlikely(split_huge_page_to_list(page, list))) { |
177 | swapcache_free(entry, NULL); | 177 | swapcache_free(entry, NULL); |
178 | return 0; | 178 | return 0; |
179 | } | 179 | } |
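add_to_swap() now takes the caller's page list so that a transparent huge page which must be split on its way to swap is queued on that list via split_huge_page_to_list(), not only on the LRU. A hedged sketch of the reclaim-side call; the matching shrink_page_list() hunk is not part of this excerpt and the helper below is illustrative:

	/* Illustrative fragment of a reclaim loop, not part of this patch. */
	static bool try_to_swap_out(struct page *page,
				    struct list_head *page_list)
	{
		if (PageAnon(page) && !PageSwapCache(page))
			/* tail pages of a split THP land on page_list */
			return add_to_swap(page, page_list) != 0;
		return true;
	}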
diff --git a/mm/swapfile.c b/mm/swapfile.c index a1f7772a01fc..d417efddfe74 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c | |||
@@ -2120,7 +2120,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) | |||
2120 | if (p->bdev) { | 2120 | if (p->bdev) { |
2121 | if (blk_queue_nonrot(bdev_get_queue(p->bdev))) { | 2121 | if (blk_queue_nonrot(bdev_get_queue(p->bdev))) { |
2122 | p->flags |= SWP_SOLIDSTATE; | 2122 | p->flags |= SWP_SOLIDSTATE; |
2123 | p->cluster_next = 1 + (random32() % p->highest_bit); | 2123 | p->cluster_next = 1 + (prandom_u32() % p->highest_bit); |
2124 | } | 2124 | } |
2125 | if ((swap_flags & SWAP_FLAG_DISCARD) && discard_swap(p) == 0) | 2125 | if ((swap_flags & SWAP_FLAG_DISCARD) && discard_swap(p) == 0) |
2126 | p->flags |= SWP_DISCARDABLE; | 2126 | p->flags |= SWP_DISCARDABLE; |
diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 0f751f2068c3..72043d6c88c0 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c | |||
@@ -249,19 +249,9 @@ EXPORT_SYMBOL(vmalloc_to_pfn); | |||
249 | #define VM_LAZY_FREEING 0x02 | 249 | #define VM_LAZY_FREEING 0x02 |
250 | #define VM_VM_AREA 0x04 | 250 | #define VM_VM_AREA 0x04 |
251 | 251 | ||
252 | struct vmap_area { | ||
253 | unsigned long va_start; | ||
254 | unsigned long va_end; | ||
255 | unsigned long flags; | ||
256 | struct rb_node rb_node; /* address sorted rbtree */ | ||
257 | struct list_head list; /* address sorted list */ | ||
258 | struct list_head purge_list; /* "lazy purge" list */ | ||
259 | struct vm_struct *vm; | ||
260 | struct rcu_head rcu_head; | ||
261 | }; | ||
262 | |||
263 | static DEFINE_SPINLOCK(vmap_area_lock); | 252 | static DEFINE_SPINLOCK(vmap_area_lock); |
264 | static LIST_HEAD(vmap_area_list); | 253 | /* Export for kexec only */ |
254 | LIST_HEAD(vmap_area_list); | ||
265 | static struct rb_root vmap_area_root = RB_ROOT; | 255 | static struct rb_root vmap_area_root = RB_ROOT; |
266 | 256 | ||
267 | /* The vmap cache globals are protected by vmap_area_lock */ | 257 | /* The vmap cache globals are protected by vmap_area_lock */ |
@@ -313,7 +303,7 @@ static void __insert_vmap_area(struct vmap_area *va) | |||
313 | rb_link_node(&va->rb_node, parent, p); | 303 | rb_link_node(&va->rb_node, parent, p); |
314 | rb_insert_color(&va->rb_node, &vmap_area_root); | 304 | rb_insert_color(&va->rb_node, &vmap_area_root); |
315 | 305 | ||
316 | /* address-sort this list so it is usable like the vmlist */ | 306 | /* address-sort this list */ |
317 | tmp = rb_prev(&va->rb_node); | 307 | tmp = rb_prev(&va->rb_node); |
318 | if (tmp) { | 308 | if (tmp) { |
319 | struct vmap_area *prev; | 309 | struct vmap_area *prev; |
@@ -1125,6 +1115,7 @@ void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t pro | |||
1125 | } | 1115 | } |
1126 | EXPORT_SYMBOL(vm_map_ram); | 1116 | EXPORT_SYMBOL(vm_map_ram); |
1127 | 1117 | ||
1118 | static struct vm_struct *vmlist __initdata; | ||
1128 | /** | 1119 | /** |
1129 | * vm_area_add_early - add vmap area early during boot | 1120 | * vm_area_add_early - add vmap area early during boot |
1130 | * @vm: vm_struct to add | 1121 | * @vm: vm_struct to add |
@@ -1283,41 +1274,35 @@ int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages) | |||
1283 | } | 1274 | } |
1284 | EXPORT_SYMBOL_GPL(map_vm_area); | 1275 | EXPORT_SYMBOL_GPL(map_vm_area); |
1285 | 1276 | ||
1286 | /*** Old vmalloc interfaces ***/ | ||
1287 | DEFINE_RWLOCK(vmlist_lock); | ||
1288 | struct vm_struct *vmlist; | ||
1289 | |||
1290 | static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va, | 1277 | static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va, |
1291 | unsigned long flags, const void *caller) | 1278 | unsigned long flags, const void *caller) |
1292 | { | 1279 | { |
1280 | spin_lock(&vmap_area_lock); | ||
1293 | vm->flags = flags; | 1281 | vm->flags = flags; |
1294 | vm->addr = (void *)va->va_start; | 1282 | vm->addr = (void *)va->va_start; |
1295 | vm->size = va->va_end - va->va_start; | 1283 | vm->size = va->va_end - va->va_start; |
1296 | vm->caller = caller; | 1284 | vm->caller = caller; |
1297 | va->vm = vm; | 1285 | va->vm = vm; |
1298 | va->flags |= VM_VM_AREA; | 1286 | va->flags |= VM_VM_AREA; |
1287 | spin_unlock(&vmap_area_lock); | ||
1299 | } | 1288 | } |
1300 | 1289 | ||
1301 | static void insert_vmalloc_vmlist(struct vm_struct *vm) | 1290 | static void clear_vm_unlist(struct vm_struct *vm) |
1302 | { | 1291 | { |
1303 | struct vm_struct *tmp, **p; | 1292 | /* |
1304 | 1293 | * Before removing VM_UNLIST, | |
1294 | * we should make sure that vm has proper values. | ||
1295 | * Pair with smp_rmb() in show_numa_info(). | ||
1296 | */ | ||
1297 | smp_wmb(); | ||
1305 | vm->flags &= ~VM_UNLIST; | 1298 | vm->flags &= ~VM_UNLIST; |
1306 | write_lock(&vmlist_lock); | ||
1307 | for (p = &vmlist; (tmp = *p) != NULL; p = &tmp->next) { | ||
1308 | if (tmp->addr >= vm->addr) | ||
1309 | break; | ||
1310 | } | ||
1311 | vm->next = *p; | ||
1312 | *p = vm; | ||
1313 | write_unlock(&vmlist_lock); | ||
1314 | } | 1299 | } |
1315 | 1300 | ||
1316 | static void insert_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va, | 1301 | static void insert_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va, |
1317 | unsigned long flags, const void *caller) | 1302 | unsigned long flags, const void *caller) |
1318 | { | 1303 | { |
1319 | setup_vmalloc_vm(vm, va, flags, caller); | 1304 | setup_vmalloc_vm(vm, va, flags, caller); |
1320 | insert_vmalloc_vmlist(vm); | 1305 | clear_vm_unlist(vm); |
1321 | } | 1306 | } |
1322 | 1307 | ||
1323 | static struct vm_struct *__get_vm_area_node(unsigned long size, | 1308 | static struct vm_struct *__get_vm_area_node(unsigned long size, |
@@ -1360,10 +1345,9 @@ static struct vm_struct *__get_vm_area_node(unsigned long size, | |||
1360 | 1345 | ||
1361 | /* | 1346 | /* |
1362 | * When this function is called from __vmalloc_node_range, | 1347 | * When this function is called from __vmalloc_node_range, |
1363 | * we do not add vm_struct to vmlist here to avoid | 1348 | * we add VM_UNLIST flag to avoid accessing uninitialized |
1364 | * accessing uninitialized members of vm_struct such as | 1349 | * members of vm_struct such as pages and nr_pages fields. |
1365 | * pages and nr_pages fields. They will be set later. | 1350 | * They will be set later. |
1366 | * To distinguish it from others, we use a VM_UNLIST flag. | ||
1367 | */ | 1351 | */ |
1368 | if (flags & VM_UNLIST) | 1352 | if (flags & VM_UNLIST) |
1369 | setup_vmalloc_vm(area, va, flags, caller); | 1353 | setup_vmalloc_vm(area, va, flags, caller); |
@@ -1447,19 +1431,10 @@ struct vm_struct *remove_vm_area(const void *addr) | |||
1447 | if (va && va->flags & VM_VM_AREA) { | 1431 | if (va && va->flags & VM_VM_AREA) { |
1448 | struct vm_struct *vm = va->vm; | 1432 | struct vm_struct *vm = va->vm; |
1449 | 1433 | ||
1450 | if (!(vm->flags & VM_UNLIST)) { | 1434 | spin_lock(&vmap_area_lock); |
1451 | struct vm_struct *tmp, **p; | 1435 | va->vm = NULL; |
1452 | /* | 1436 | va->flags &= ~VM_VM_AREA; |
1453 | * remove from list and disallow access to | 1437 | spin_unlock(&vmap_area_lock); |
1454 | * this vm_struct before unmap. (address range | ||
1455 | * confliction is maintained by vmap.) | ||
1456 | */ | ||
1457 | write_lock(&vmlist_lock); | ||
1458 | for (p = &vmlist; (tmp = *p) != vm; p = &tmp->next) | ||
1459 | ; | ||
1460 | *p = tmp->next; | ||
1461 | write_unlock(&vmlist_lock); | ||
1462 | } | ||
1463 | 1438 | ||
1464 | vmap_debug_free_range(va->va_start, va->va_end); | 1439 | vmap_debug_free_range(va->va_start, va->va_end); |
1465 | free_unmap_vmap_area(va); | 1440 | free_unmap_vmap_area(va); |
@@ -1680,10 +1655,11 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align, | |||
1680 | return NULL; | 1655 | return NULL; |
1681 | 1656 | ||
1682 | /* | 1657 | /* |
1683 | * In this function, newly allocated vm_struct is not added | 1658 | * In this function, newly allocated vm_struct has VM_UNLIST flag. |
1684 | * to vmlist at __get_vm_area_node(). so, it is added here. | 1659 | * It means that vm_struct is not fully initialized. |
1660 | * Now, it is fully initialized, so remove this flag here. | ||
1685 | */ | 1661 | */ |
1686 | insert_vmalloc_vmlist(area); | 1662 | clear_vm_unlist(area); |
1687 | 1663 | ||
1688 | /* | 1664 | /* |
1689 | * A ref_count = 3 is needed because the vm_struct and vmap_area | 1665 | * A ref_count = 3 is needed because the vm_struct and vmap_area |
@@ -2005,7 +1981,8 @@ static int aligned_vwrite(char *buf, char *addr, unsigned long count) | |||
2005 | 1981 | ||
2006 | long vread(char *buf, char *addr, unsigned long count) | 1982 | long vread(char *buf, char *addr, unsigned long count) |
2007 | { | 1983 | { |
2008 | struct vm_struct *tmp; | 1984 | struct vmap_area *va; |
1985 | struct vm_struct *vm; | ||
2009 | char *vaddr, *buf_start = buf; | 1986 | char *vaddr, *buf_start = buf; |
2010 | unsigned long buflen = count; | 1987 | unsigned long buflen = count; |
2011 | unsigned long n; | 1988 | unsigned long n; |
@@ -2014,10 +1991,17 @@ long vread(char *buf, char *addr, unsigned long count) | |||
2014 | if ((unsigned long) addr + count < count) | 1991 | if ((unsigned long) addr + count < count) |
2015 | count = -(unsigned long) addr; | 1992 | count = -(unsigned long) addr; |
2016 | 1993 | ||
2017 | read_lock(&vmlist_lock); | 1994 | spin_lock(&vmap_area_lock); |
2018 | for (tmp = vmlist; count && tmp; tmp = tmp->next) { | 1995 | list_for_each_entry(va, &vmap_area_list, list) { |
2019 | vaddr = (char *) tmp->addr; | 1996 | if (!count) |
2020 | if (addr >= vaddr + tmp->size - PAGE_SIZE) | 1997 | break; |
1998 | |||
1999 | if (!(va->flags & VM_VM_AREA)) | ||
2000 | continue; | ||
2001 | |||
2002 | vm = va->vm; | ||
2003 | vaddr = (char *) vm->addr; | ||
2004 | if (addr >= vaddr + vm->size - PAGE_SIZE) | ||
2021 | continue; | 2005 | continue; |
2022 | while (addr < vaddr) { | 2006 | while (addr < vaddr) { |
2023 | if (count == 0) | 2007 | if (count == 0) |
@@ -2027,10 +2011,10 @@ long vread(char *buf, char *addr, unsigned long count) | |||
2027 | addr++; | 2011 | addr++; |
2028 | count--; | 2012 | count--; |
2029 | } | 2013 | } |
2030 | n = vaddr + tmp->size - PAGE_SIZE - addr; | 2014 | n = vaddr + vm->size - PAGE_SIZE - addr; |
2031 | if (n > count) | 2015 | if (n > count) |
2032 | n = count; | 2016 | n = count; |
2033 | if (!(tmp->flags & VM_IOREMAP)) | 2017 | if (!(vm->flags & VM_IOREMAP)) |
2034 | aligned_vread(buf, addr, n); | 2018 | aligned_vread(buf, addr, n); |
2035 | else /* IOREMAP area is treated as memory hole */ | 2019 | else /* IOREMAP area is treated as memory hole */ |
2036 | memset(buf, 0, n); | 2020 | memset(buf, 0, n); |
@@ -2039,7 +2023,7 @@ long vread(char *buf, char *addr, unsigned long count) | |||
2039 | count -= n; | 2023 | count -= n; |
2040 | } | 2024 | } |
2041 | finished: | 2025 | finished: |
2042 | read_unlock(&vmlist_lock); | 2026 | spin_unlock(&vmap_area_lock); |
2043 | 2027 | ||
2044 | if (buf == buf_start) | 2028 | if (buf == buf_start) |
2045 | return 0; | 2029 | return 0; |
@@ -2078,7 +2062,8 @@ finished: | |||
2078 | 2062 | ||
2079 | long vwrite(char *buf, char *addr, unsigned long count) | 2063 | long vwrite(char *buf, char *addr, unsigned long count) |
2080 | { | 2064 | { |
2081 | struct vm_struct *tmp; | 2065 | struct vmap_area *va; |
2066 | struct vm_struct *vm; | ||
2082 | char *vaddr; | 2067 | char *vaddr; |
2083 | unsigned long n, buflen; | 2068 | unsigned long n, buflen; |
2084 | int copied = 0; | 2069 | int copied = 0; |
@@ -2088,10 +2073,17 @@ long vwrite(char *buf, char *addr, unsigned long count) | |||
2088 | count = -(unsigned long) addr; | 2073 | count = -(unsigned long) addr; |
2089 | buflen = count; | 2074 | buflen = count; |
2090 | 2075 | ||
2091 | read_lock(&vmlist_lock); | 2076 | spin_lock(&vmap_area_lock); |
2092 | for (tmp = vmlist; count && tmp; tmp = tmp->next) { | 2077 | list_for_each_entry(va, &vmap_area_list, list) { |
2093 | vaddr = (char *) tmp->addr; | 2078 | if (!count) |
2094 | if (addr >= vaddr + tmp->size - PAGE_SIZE) | 2079 | break; |
2080 | |||
2081 | if (!(va->flags & VM_VM_AREA)) | ||
2082 | continue; | ||
2083 | |||
2084 | vm = va->vm; | ||
2085 | vaddr = (char *) vm->addr; | ||
2086 | if (addr >= vaddr + vm->size - PAGE_SIZE) | ||
2095 | continue; | 2087 | continue; |
2096 | while (addr < vaddr) { | 2088 | while (addr < vaddr) { |
2097 | if (count == 0) | 2089 | if (count == 0) |
@@ -2100,10 +2092,10 @@ long vwrite(char *buf, char *addr, unsigned long count) | |||
2100 | addr++; | 2092 | addr++; |
2101 | count--; | 2093 | count--; |
2102 | } | 2094 | } |
2103 | n = vaddr + tmp->size - PAGE_SIZE - addr; | 2095 | n = vaddr + vm->size - PAGE_SIZE - addr; |
2104 | if (n > count) | 2096 | if (n > count) |
2105 | n = count; | 2097 | n = count; |
2106 | if (!(tmp->flags & VM_IOREMAP)) { | 2098 | if (!(vm->flags & VM_IOREMAP)) { |
2107 | aligned_vwrite(buf, addr, n); | 2099 | aligned_vwrite(buf, addr, n); |
2108 | copied++; | 2100 | copied++; |
2109 | } | 2101 | } |
@@ -2112,7 +2104,7 @@ long vwrite(char *buf, char *addr, unsigned long count) | |||
2112 | count -= n; | 2104 | count -= n; |
2113 | } | 2105 | } |
2114 | finished: | 2106 | finished: |
2115 | read_unlock(&vmlist_lock); | 2107 | spin_unlock(&vmap_area_lock); |
2116 | if (!copied) | 2108 | if (!copied) |
2117 | return 0; | 2109 | return 0; |
2118 | return buflen; | 2110 | return buflen; |
@@ -2519,19 +2511,19 @@ void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms) | |||
2519 | 2511 | ||
2520 | #ifdef CONFIG_PROC_FS | 2512 | #ifdef CONFIG_PROC_FS |
2521 | static void *s_start(struct seq_file *m, loff_t *pos) | 2513 | static void *s_start(struct seq_file *m, loff_t *pos) |
2522 | __acquires(&vmlist_lock) | 2514 | __acquires(&vmap_area_lock) |
2523 | { | 2515 | { |
2524 | loff_t n = *pos; | 2516 | loff_t n = *pos; |
2525 | struct vm_struct *v; | 2517 | struct vmap_area *va; |
2526 | 2518 | ||
2527 | read_lock(&vmlist_lock); | 2519 | spin_lock(&vmap_area_lock); |
2528 | v = vmlist; | 2520 | va = list_entry((&vmap_area_list)->next, typeof(*va), list); |
2529 | while (n > 0 && v) { | 2521 | while (n > 0 && &va->list != &vmap_area_list) { |
2530 | n--; | 2522 | n--; |
2531 | v = v->next; | 2523 | va = list_entry(va->list.next, typeof(*va), list); |
2532 | } | 2524 | } |
2533 | if (!n) | 2525 | if (!n && &va->list != &vmap_area_list) |
2534 | return v; | 2526 | return va; |
2535 | 2527 | ||
2536 | return NULL; | 2528 | return NULL; |
2537 | 2529 | ||
@@ -2539,16 +2531,20 @@ static void *s_start(struct seq_file *m, loff_t *pos) | |||
2539 | 2531 | ||
2540 | static void *s_next(struct seq_file *m, void *p, loff_t *pos) | 2532 | static void *s_next(struct seq_file *m, void *p, loff_t *pos) |
2541 | { | 2533 | { |
2542 | struct vm_struct *v = p; | 2534 | struct vmap_area *va = p, *next; |
2543 | 2535 | ||
2544 | ++*pos; | 2536 | ++*pos; |
2545 | return v->next; | 2537 | next = list_entry(va->list.next, typeof(*va), list); |
2538 | if (&next->list != &vmap_area_list) | ||
2539 | return next; | ||
2540 | |||
2541 | return NULL; | ||
2546 | } | 2542 | } |
2547 | 2543 | ||
2548 | static void s_stop(struct seq_file *m, void *p) | 2544 | static void s_stop(struct seq_file *m, void *p) |
2549 | __releases(&vmlist_lock) | 2545 | __releases(&vmap_area_lock) |
2550 | { | 2546 | { |
2551 | read_unlock(&vmlist_lock); | 2547 | spin_unlock(&vmap_area_lock); |
2552 | } | 2548 | } |
2553 | 2549 | ||
2554 | static void show_numa_info(struct seq_file *m, struct vm_struct *v) | 2550 | static void show_numa_info(struct seq_file *m, struct vm_struct *v) |
@@ -2559,6 +2555,11 @@ static void show_numa_info(struct seq_file *m, struct vm_struct *v) | |||
2559 | if (!counters) | 2555 | if (!counters) |
2560 | return; | 2556 | return; |
2561 | 2557 | ||
2558 | /* Pair with smp_wmb() in clear_vm_unlist() */ | ||
2559 | smp_rmb(); | ||
2560 | if (v->flags & VM_UNLIST) | ||
2561 | return; | ||
2562 | |||
2562 | memset(counters, 0, nr_node_ids * sizeof(unsigned int)); | 2563 | memset(counters, 0, nr_node_ids * sizeof(unsigned int)); |
2563 | 2564 | ||
2564 | for (nr = 0; nr < v->nr_pages; nr++) | 2565 | for (nr = 0; nr < v->nr_pages; nr++) |
@@ -2572,7 +2573,20 @@ static void show_numa_info(struct seq_file *m, struct vm_struct *v) | |||
2572 | 2573 | ||
2573 | static int s_show(struct seq_file *m, void *p) | 2574 | static int s_show(struct seq_file *m, void *p) |
2574 | { | 2575 | { |
2575 | struct vm_struct *v = p; | 2576 | struct vmap_area *va = p; |
2577 | struct vm_struct *v; | ||
2578 | |||
2579 | if (va->flags & (VM_LAZY_FREE | VM_LAZY_FREEING)) | ||
2580 | return 0; | ||
2581 | |||
2582 | if (!(va->flags & VM_VM_AREA)) { | ||
2583 | seq_printf(m, "0x%pK-0x%pK %7ld vm_map_ram\n", | ||
2584 | (void *)va->va_start, (void *)va->va_end, | ||
2585 | va->va_end - va->va_start); | ||
2586 | return 0; | ||
2587 | } | ||
2588 | |||
2589 | v = va->vm; | ||
2576 | 2590 | ||
2577 | seq_printf(m, "0x%pK-0x%pK %7ld", | 2591 | seq_printf(m, "0x%pK-0x%pK %7ld", |
2578 | v->addr, v->addr + v->size, v->size); | 2592 | v->addr, v->addr + v->size, v->size); |
@@ -2645,5 +2659,53 @@ static int __init proc_vmalloc_init(void) | |||
2645 | return 0; | 2659 | return 0; |
2646 | } | 2660 | } |
2647 | module_init(proc_vmalloc_init); | 2661 | module_init(proc_vmalloc_init); |
2662 | |||
2663 | void get_vmalloc_info(struct vmalloc_info *vmi) | ||
2664 | { | ||
2665 | struct vmap_area *va; | ||
2666 | unsigned long free_area_size; | ||
2667 | unsigned long prev_end; | ||
2668 | |||
2669 | vmi->used = 0; | ||
2670 | vmi->largest_chunk = 0; | ||
2671 | |||
2672 | prev_end = VMALLOC_START; | ||
2673 | |||
2674 | spin_lock(&vmap_area_lock); | ||
2675 | |||
2676 | if (list_empty(&vmap_area_list)) { | ||
2677 | vmi->largest_chunk = VMALLOC_TOTAL; | ||
2678 | goto out; | ||
2679 | } | ||
2680 | |||
2681 | list_for_each_entry(va, &vmap_area_list, list) { | ||
2682 | unsigned long addr = va->va_start; | ||
2683 | |||
2684 | /* | ||
2685 | * Some archs keep another range for modules in vmalloc space | ||
2686 | */ | ||
2687 | if (addr < VMALLOC_START) | ||
2688 | continue; | ||
2689 | if (addr >= VMALLOC_END) | ||
2690 | break; | ||
2691 | |||
2692 | if (va->flags & (VM_LAZY_FREE | VM_LAZY_FREEING)) | ||
2693 | continue; | ||
2694 | |||
2695 | vmi->used += (va->va_end - va->va_start); | ||
2696 | |||
2697 | free_area_size = addr - prev_end; | ||
2698 | if (vmi->largest_chunk < free_area_size) | ||
2699 | vmi->largest_chunk = free_area_size; | ||
2700 | |||
2701 | prev_end = va->va_end; | ||
2702 | } | ||
2703 | |||
2704 | if (VMALLOC_END - prev_end > vmi->largest_chunk) | ||
2705 | vmi->largest_chunk = VMALLOC_END - prev_end; | ||
2706 | |||
2707 | out: | ||
2708 | spin_unlock(&vmap_area_lock); | ||
2709 | } | ||
2648 | #endif | 2710 | #endif |
2649 | 2711 | ||
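With the vmlist gone, get_vmalloc_info() is implemented here by walking vmap_area_list under vmap_area_lock. A sketch of a consumer, assuming the two fields used above (used, largest_chunk) are what /proc/meminfo reports as VmallocUsed/VmallocChunk; m stands for the seq_file of a hypothetical proc handler:

	struct vmalloc_info vmi;

	get_vmalloc_info(&vmi);
	seq_printf(m, "VmallocUsed:  %8lu kB\n"
		      "VmallocChunk: %8lu kB\n",
		   vmi.used >> 10, vmi.largest_chunk >> 10);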
diff --git a/mm/vmpressure.c b/mm/vmpressure.c new file mode 100644 index 000000000000..736a6011c2c8 --- /dev/null +++ b/mm/vmpressure.c | |||
@@ -0,0 +1,374 @@ | |||
1 | /* | ||
2 | * Linux VM pressure | ||
3 | * | ||
4 | * Copyright 2012 Linaro Ltd. | ||
5 | * Anton Vorontsov <anton.vorontsov@linaro.org> | ||
6 | * | ||
7 | * Based on ideas from Andrew Morton, David Rientjes, KOSAKI Motohiro, | ||
8 | * Leonid Moiseichuk, Mel Gorman, Minchan Kim and Pekka Enberg. | ||
9 | * | ||
10 | * This program is free software; you can redistribute it and/or modify it | ||
11 | * under the terms of the GNU General Public License version 2 as published | ||
12 | * by the Free Software Foundation. | ||
13 | */ | ||
14 | |||
15 | #include <linux/cgroup.h> | ||
16 | #include <linux/fs.h> | ||
17 | #include <linux/log2.h> | ||
18 | #include <linux/sched.h> | ||
19 | #include <linux/mm.h> | ||
20 | #include <linux/vmstat.h> | ||
21 | #include <linux/eventfd.h> | ||
22 | #include <linux/swap.h> | ||
23 | #include <linux/printk.h> | ||
24 | #include <linux/vmpressure.h> | ||
25 | |||
26 | /* | ||
27 | * The window size (vmpressure_win) is the number of scanned pages before | ||
28 | * we try to analyze scanned/reclaimed ratio. So the window is used as a | ||
29 | * rate-limit tunable for the "low" level notification, and also for | ||
30 | * averaging the ratio for medium/critical levels. Using small window | ||
31 | * sizes can cause a lot of false positives, but too big a window size will | ||
32 | * delay the notifications. | ||
33 | * | ||
34 | * As the vmscan reclaimer logic works with chunks which are multiple of | ||
35 | * SWAP_CLUSTER_MAX, it makes sense to use it for the window size as well. | ||
36 | * | ||
37 | * TODO: Make the window size depend on machine size, as we do for vmstat | ||
38 | * thresholds. Currently we set it to 512 pages (2MB for 4KB pages). | ||
39 | */ | ||
40 | static const unsigned long vmpressure_win = SWAP_CLUSTER_MAX * 16; | ||
41 | |||
42 | /* | ||
43 | * These thresholds are used when we account memory pressure through | ||
44 | * scanned/reclaimed ratio. The current values were chosen empirically. In | ||
45 | * essence, they are percentages: the higher the value, the more | ||
46 | * unsuccessful reclaims there were. | ||
47 | */ | ||
48 | static const unsigned int vmpressure_level_med = 60; | ||
49 | static const unsigned int vmpressure_level_critical = 95; | ||
50 | |||
51 | /* | ||
52 | * When there are too few pages left to scan, vmpressure() may miss the | ||
53 | * critical pressure as the number of pages will be less than the "window size". | ||
54 | * However, in that case the vmscan priority will rise fast as the | ||
55 | * reclaimer will try to scan LRUs more deeply. | ||
56 | * | ||
57 | * The vmscan logic considers these special priorities: | ||
58 | * | ||
59 | * prio == DEF_PRIORITY (12): reclaimer starts with that value | ||
60 | * prio <= DEF_PRIORITY - 2 : kswapd becomes somewhat overwhelmed | ||
61 | * prio == 0 : close to OOM, kernel scans every page in an lru | ||
62 | * | ||
63 | * Any value in this range is acceptable for this tunable (i.e. from 12 to | ||
64 | * 0). Current value for the vmpressure_level_critical_prio is chosen | ||
65 | * empirically, but the number, in essence, means that we consider | ||
66 | * critical level when scanning depth is ~10% of the lru size (vmscan | ||
67 | * scans 'lru_size >> prio' pages, so it is actually 12.5%, or one | ||
68 | * eighth). | ||
69 | */ | ||
70 | static const unsigned int vmpressure_level_critical_prio = ilog2(100 / 10); | ||
71 | |||
72 | static struct vmpressure *work_to_vmpressure(struct work_struct *work) | ||
73 | { | ||
74 | return container_of(work, struct vmpressure, work); | ||
75 | } | ||
76 | |||
77 | static struct vmpressure *cg_to_vmpressure(struct cgroup *cg) | ||
78 | { | ||
79 | return css_to_vmpressure(cgroup_subsys_state(cg, mem_cgroup_subsys_id)); | ||
80 | } | ||
81 | |||
82 | static struct vmpressure *vmpressure_parent(struct vmpressure *vmpr) | ||
83 | { | ||
84 | struct cgroup *cg = vmpressure_to_css(vmpr)->cgroup; | ||
85 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cg); | ||
86 | |||
87 | memcg = parent_mem_cgroup(memcg); | ||
88 | if (!memcg) | ||
89 | return NULL; | ||
90 | return memcg_to_vmpressure(memcg); | ||
91 | } | ||
92 | |||
93 | enum vmpressure_levels { | ||
94 | VMPRESSURE_LOW = 0, | ||
95 | VMPRESSURE_MEDIUM, | ||
96 | VMPRESSURE_CRITICAL, | ||
97 | VMPRESSURE_NUM_LEVELS, | ||
98 | }; | ||
99 | |||
100 | static const char * const vmpressure_str_levels[] = { | ||
101 | [VMPRESSURE_LOW] = "low", | ||
102 | [VMPRESSURE_MEDIUM] = "medium", | ||
103 | [VMPRESSURE_CRITICAL] = "critical", | ||
104 | }; | ||
105 | |||
106 | static enum vmpressure_levels vmpressure_level(unsigned long pressure) | ||
107 | { | ||
108 | if (pressure >= vmpressure_level_critical) | ||
109 | return VMPRESSURE_CRITICAL; | ||
110 | else if (pressure >= vmpressure_level_med) | ||
111 | return VMPRESSURE_MEDIUM; | ||
112 | return VMPRESSURE_LOW; | ||
113 | } | ||
114 | |||
115 | static enum vmpressure_levels vmpressure_calc_level(unsigned long scanned, | ||
116 | unsigned long reclaimed) | ||
117 | { | ||
118 | unsigned long scale = scanned + reclaimed; | ||
119 | unsigned long pressure; | ||
120 | |||
121 | /* | ||
122 | * We calculate the ratio (in percents) of how many pages were | ||
123 | * scanned vs. reclaimed in a given time frame (window). Note that | ||
124 | * time is in VM reclaimer's "ticks", i.e. number of pages | ||
125 | * scanned. This makes it possible to set desired reaction time | ||
126 | * and serves as a ratelimit. | ||
127 | */ | ||
128 | pressure = scale - (reclaimed * scale / scanned); | ||
129 | pressure = pressure * 100 / scale; | ||
130 | |||
131 | pr_debug("%s: %3lu (s: %lu r: %lu)\n", __func__, pressure, | ||
132 | scanned, reclaimed); | ||
133 | |||
134 | return vmpressure_level(pressure); | ||
135 | } | ||
136 | |||
137 | struct vmpressure_event { | ||
138 | struct eventfd_ctx *efd; | ||
139 | enum vmpressure_levels level; | ||
140 | struct list_head node; | ||
141 | }; | ||
142 | |||
143 | static bool vmpressure_event(struct vmpressure *vmpr, | ||
144 | unsigned long scanned, unsigned long reclaimed) | ||
145 | { | ||
146 | struct vmpressure_event *ev; | ||
147 | enum vmpressure_levels level; | ||
148 | bool signalled = false; | ||
149 | |||
150 | level = vmpressure_calc_level(scanned, reclaimed); | ||
151 | |||
152 | mutex_lock(&vmpr->events_lock); | ||
153 | |||
154 | list_for_each_entry(ev, &vmpr->events, node) { | ||
155 | if (level >= ev->level) { | ||
156 | eventfd_signal(ev->efd, 1); | ||
157 | signalled = true; | ||
158 | } | ||
159 | } | ||
160 | |||
161 | mutex_unlock(&vmpr->events_lock); | ||
162 | |||
163 | return signalled; | ||
164 | } | ||
165 | |||
166 | static void vmpressure_work_fn(struct work_struct *work) | ||
167 | { | ||
168 | struct vmpressure *vmpr = work_to_vmpressure(work); | ||
169 | unsigned long scanned; | ||
170 | unsigned long reclaimed; | ||
171 | |||
172 | /* | ||
173 | * Several contexts might be calling vmpressure(), so it is | ||
174 | * possible that the work was rescheduled again before the old | ||
175 | * work context cleared the counters. In that case we will run | ||
176 | * just after the old work returns, but then scanned might be zero | ||
177 | * here. No need for any locks here since we don't care if | ||
178 | * vmpr->reclaimed is in sync. | ||
179 | */ | ||
180 | if (!vmpr->scanned) | ||
181 | return; | ||
182 | |||
183 | mutex_lock(&vmpr->sr_lock); | ||
184 | scanned = vmpr->scanned; | ||
185 | reclaimed = vmpr->reclaimed; | ||
186 | vmpr->scanned = 0; | ||
187 | vmpr->reclaimed = 0; | ||
188 | mutex_unlock(&vmpr->sr_lock); | ||
189 | |||
190 | do { | ||
191 | if (vmpressure_event(vmpr, scanned, reclaimed)) | ||
192 | break; | ||
193 | /* | ||
194 | * If not handled, propagate the event upward into the | ||
195 | * hierarchy. | ||
196 | */ | ||
197 | } while ((vmpr = vmpressure_parent(vmpr))); | ||
198 | } | ||
199 | |||
200 | /** | ||
201 | * vmpressure() - Account memory pressure through scanned/reclaimed ratio | ||
202 | * @gfp: reclaimer's gfp mask | ||
203 | * @memcg: cgroup memory controller handle | ||
204 | * @scanned: number of pages scanned | ||
205 | * @reclaimed: number of pages reclaimed | ||
206 | * | ||
207 | * This function should be called from the vmscan reclaim path to account | ||
208 | * "instantaneous" memory pressure (scanned/reclaimed ratio). The raw | ||
209 | * pressure index is then further refined and averaged over time. | ||
210 | * | ||
211 | * This function does not return any value. | ||
212 | */ | ||
213 | void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, | ||
214 | unsigned long scanned, unsigned long reclaimed) | ||
215 | { | ||
216 | struct vmpressure *vmpr = memcg_to_vmpressure(memcg); | ||
217 | |||
218 | /* | ||
219 | * Here we only want to account pressure that userland is able to | ||
220 | * help us with. For example, suppose that DMA zone is under | ||
221 | * pressure; if we notify userland about that kind of pressure, | ||
222 | * then it will be mostly a waste as it will trigger unnecessary | ||
223 | * freeing of memory by userland (since userland is more likely to | ||
224 | * have HIGHMEM/MOVABLE pages instead of the DMA fallback). That | ||
225 | * is why we include only movable, highmem and FS/IO pages. | ||
226 | * Indirect reclaim (kswapd) sets sc->gfp_mask to GFP_KERNEL, so | ||
227 | * we account it too. | ||
228 | */ | ||
229 | if (!(gfp & (__GFP_HIGHMEM | __GFP_MOVABLE | __GFP_IO | __GFP_FS))) | ||
230 | return; | ||
231 | |||
232 | /* | ||
233 | * If we got here with no pages scanned, then that is an indicator | ||
234 | * that reclaimer was unable to find any shrinkable LRUs at the | ||
235 | * current scanning depth. But it does not mean that we should | ||
236 | * report the critical pressure, yet. If the scanning priority | ||
237 | * (scanning depth) goes too high (deep), we will be notified | ||
238 | * through vmpressure_prio(). But so far, keep calm. | ||
239 | */ | ||
240 | if (!scanned) | ||
241 | return; | ||
242 | |||
243 | mutex_lock(&vmpr->sr_lock); | ||
244 | vmpr->scanned += scanned; | ||
245 | vmpr->reclaimed += reclaimed; | ||
246 | scanned = vmpr->scanned; | ||
247 | mutex_unlock(&vmpr->sr_lock); | ||
248 | |||
249 | if (scanned < vmpressure_win || work_pending(&vmpr->work)) | ||
250 | return; | ||
251 | schedule_work(&vmpr->work); | ||
252 | } | ||
253 | |||
254 | /** | ||
255 | * vmpressure_prio() - Account memory pressure through reclaimer priority level | ||
256 | * @gfp: reclaimer's gfp mask | ||
257 | * @memcg: cgroup memory controller handle | ||
258 | * @prio: reclaimer's priority | ||
259 | * | ||
260 | * This function should be called from the reclaim path every time when | ||
261 | * the vmscan's reclaiming priority (scanning depth) changes. | ||
262 | * | ||
263 | * This function does not return any value. | ||
264 | */ | ||
265 | void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio) | ||
266 | { | ||
267 | /* | ||
268 | * We only use prio for accounting the critical level. For more info | ||
269 | * see comment for vmpressure_level_critical_prio variable above. | ||
270 | */ | ||
271 | if (prio > vmpressure_level_critical_prio) | ||
272 | return; | ||
273 | |||
274 | /* | ||
275 | * OK, the prio is below the threshold; update the vmpressure | ||
276 | * information before the reclaimer dives into a long, deep | ||
277 | * vmscan pass. Passing scanned = vmpressure_win, reclaimed = 0 | ||
278 | * to the vmpressure() basically means that we signal 'critical' | ||
279 | * level. | ||
280 | */ | ||
281 | vmpressure(gfp, memcg, vmpressure_win, 0); | ||
282 | } | ||
283 | |||
284 | /** | ||
285 | * vmpressure_register_event() - Bind vmpressure notifications to an eventfd | ||
286 | * @cg: cgroup that is interested in vmpressure notifications | ||
287 | * @cft: cgroup control files handle | ||
288 | * @eventfd: eventfd context to link notifications with | ||
289 | * @args: event arguments (used to set up a pressure level threshold) | ||
290 | * | ||
291 | * This function associates eventfd context with the vmpressure | ||
292 | * infrastructure, so that the notifications will be delivered to the | ||
293 | * @eventfd. The @args parameter is a string that denotes pressure level | ||
294 | * threshold (one of vmpressure_str_levels, i.e. "low", "medium", or | ||
295 | * "critical"). | ||
296 | * | ||
297 | * This function should not be used directly; just pass it to (struct | ||
298 | * cftype).register_event, and the cgroup core will handle everything by | ||
299 | * itself. | ||
300 | */ | ||
301 | int vmpressure_register_event(struct cgroup *cg, struct cftype *cft, | ||
302 | struct eventfd_ctx *eventfd, const char *args) | ||
303 | { | ||
304 | struct vmpressure *vmpr = cg_to_vmpressure(cg); | ||
305 | struct vmpressure_event *ev; | ||
306 | int level; | ||
307 | |||
308 | for (level = 0; level < VMPRESSURE_NUM_LEVELS; level++) { | ||
309 | if (!strcmp(vmpressure_str_levels[level], args)) | ||
310 | break; | ||
311 | } | ||
312 | |||
313 | if (level >= VMPRESSURE_NUM_LEVELS) | ||
314 | return -EINVAL; | ||
315 | |||
316 | ev = kzalloc(sizeof(*ev), GFP_KERNEL); | ||
317 | if (!ev) | ||
318 | return -ENOMEM; | ||
319 | |||
320 | ev->efd = eventfd; | ||
321 | ev->level = level; | ||
322 | |||
323 | mutex_lock(&vmpr->events_lock); | ||
324 | list_add(&ev->node, &vmpr->events); | ||
325 | mutex_unlock(&vmpr->events_lock); | ||
326 | |||
327 | return 0; | ||
328 | } | ||
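
For reference, a userspace sketch of the registration flow this enables. It assumes the memcg event plumbing that lives outside this file (cgroup.event_control and memory.pressure_level) and a /sys/fs/cgroup/memory mount point; both are assumptions of the example, not anything defined here. The flow is: create an eventfd, open memory.pressure_level, write "<event_fd> <pressure_level_fd> <level>" to cgroup.event_control, then read the eventfd for notifications.

/* Userspace sketch, not part of this patch; paths and mount point assumed. */
#include <sys/eventfd.h>
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	int efd = eventfd(0, 0);
	int pfd = open("/sys/fs/cgroup/memory/memory.pressure_level", O_RDONLY);
	int cfd = open("/sys/fs/cgroup/memory/cgroup.event_control", O_WRONLY);
	char line[64];
	uint64_t ticks;

	if (efd < 0 || pfd < 0 || cfd < 0)
		return 1;

	/* "<event_fd> <pressure_level_fd> <level>" registers the listener. */
	snprintf(line, sizeof(line), "%d %d low", efd, pfd);
	if (write(cfd, line, strlen(line)) < 0)
		return 1;

	/* Each read drains the counter of notifications delivered so far. */
	while (read(efd, &ticks, sizeof(ticks)) == sizeof(ticks))
		printf("vmpressure: low or worse (%llu notification(s))\n",
		       (unsigned long long)ticks);
	return 0;
}

Registering for "low" means the eventfd is signalled for low and for every higher level, since an event fires whenever the computed level is at or above the registered threshold.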
329 | |||
330 | /** | ||
331 | * vmpressure_unregister_event() - Unbind eventfd from vmpressure | ||
332 | * @cg: cgroup handle | ||
333 | * @cft: cgroup control files handle | ||
334 | * @eventfd: eventfd context that was used to link vmpressure with the @cg | ||
335 | * | ||
336 | * This function does internal manipulations to detach the @eventfd from | ||
337 | * the vmpressure notifications, and then frees internal resources | ||
338 | * associated with the @eventfd (but the @eventfd itself is not freed). | ||
339 | * | ||
340 | * This function should not be used directly; just pass it to (struct | ||
341 | * cftype).unregister_event, and the cgroup core will handle everything | ||
342 | * by itself. | ||
343 | */ | ||
344 | void vmpressure_unregister_event(struct cgroup *cg, struct cftype *cft, | ||
345 | struct eventfd_ctx *eventfd) | ||
346 | { | ||
347 | struct vmpressure *vmpr = cg_to_vmpressure(cg); | ||
348 | struct vmpressure_event *ev; | ||
349 | |||
350 | mutex_lock(&vmpr->events_lock); | ||
351 | list_for_each_entry(ev, &vmpr->events, node) { | ||
352 | if (ev->efd != eventfd) | ||
353 | continue; | ||
354 | list_del(&ev->node); | ||
355 | kfree(ev); | ||
356 | break; | ||
357 | } | ||
358 | mutex_unlock(&vmpr->events_lock); | ||
359 | } | ||
360 | |||
361 | /** | ||
362 | * vmpressure_init() - Initialize vmpressure control structure | ||
363 | * @vmpr: Structure to be initialized | ||
364 | * | ||
365 | * This function should be called on every allocated vmpressure structure | ||
366 | * before any usage. | ||
367 | */ | ||
368 | void vmpressure_init(struct vmpressure *vmpr) | ||
369 | { | ||
370 | mutex_init(&vmpr->sr_lock); | ||
371 | mutex_init(&vmpr->events_lock); | ||
372 | INIT_LIST_HEAD(&vmpr->events); | ||
373 | INIT_WORK(&vmpr->work, vmpressure_work_fn); | ||
374 | } | ||
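
Finally, a minimal caller sketch. The structure names below are hypothetical; the real consumer is the memcg code outside this file, but any owner follows the same shape: embed a struct vmpressure and initialize it once before the accounting entry points above can target it.

/*
 * Caller sketch with hypothetical names (the real memcg hookup lives
 * outside this file): embed a struct vmpressure and initialize it once
 * before any vmpressure()/vmpressure_prio() calls can target it.
 */
#include <linux/vmpressure.h>

struct my_group {
	struct vmpressure vmpressure;
	/* ... other per-group state ... */
};

static void my_group_init(struct my_group *grp)
{
	vmpressure_init(&grp->vmpressure);
}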
diff --git a/mm/vmscan.c b/mm/vmscan.c index 88c5fed8b9a4..fa6a85378ee4 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
@@ -19,6 +19,7 @@ | |||
19 | #include <linux/pagemap.h> | 19 | #include <linux/pagemap.h> |
20 | #include <linux/init.h> | 20 | #include <linux/init.h> |
21 | #include <linux/highmem.h> | 21 | #include <linux/highmem.h> |
22 | #include <linux/vmpressure.h> | ||
22 | #include <linux/vmstat.h> | 23 | #include <linux/vmstat.h> |
23 | #include <linux/file.h> | 24 | #include <linux/file.h> |
24 | #include <linux/writeback.h> | 25 | #include <linux/writeback.h> |
@@ -780,7 +781,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
780 | if (PageAnon(page) && !PageSwapCache(page)) { | 781 | if (PageAnon(page) && !PageSwapCache(page)) { |
781 | if (!(sc->gfp_mask & __GFP_IO)) | 782 | if (!(sc->gfp_mask & __GFP_IO)) |
782 | goto keep_locked; | 783 | goto keep_locked; |
783 | if (!add_to_swap(page)) | 784 | if (!add_to_swap(page, page_list)) |
784 | goto activate_locked; | 785 | goto activate_locked; |
785 | may_enter_fs = 1; | 786 | may_enter_fs = 1; |
786 | } | 787 | } |
@@ -1982,6 +1983,11 @@ static void shrink_zone(struct zone *zone, struct scan_control *sc) | |||
1982 | } | 1983 | } |
1983 | memcg = mem_cgroup_iter(root, memcg, &reclaim); | 1984 | memcg = mem_cgroup_iter(root, memcg, &reclaim); |
1984 | } while (memcg); | 1985 | } while (memcg); |
1986 | |||
1987 | vmpressure(sc->gfp_mask, sc->target_mem_cgroup, | ||
1988 | sc->nr_scanned - nr_scanned, | ||
1989 | sc->nr_reclaimed - nr_reclaimed); | ||
1990 | |||
1985 | } while (should_continue_reclaim(zone, sc->nr_reclaimed - nr_reclaimed, | 1991 | } while (should_continue_reclaim(zone, sc->nr_reclaimed - nr_reclaimed, |
1986 | sc->nr_scanned - nr_scanned, sc)); | 1992 | sc->nr_scanned - nr_scanned, sc)); |
1987 | } | 1993 | } |
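
The call added in this hunk reports per-pass deltas rather than cumulative totals: sc->nr_scanned and sc->nr_reclaimed are snapshotted into the local nr_scanned/nr_reclaimed before the memcg iteration, and only the difference is handed to vmpressure(). A simplified sketch of that pattern follows; the helper and hook names are hypothetical stand-ins, not the real vmscan interfaces.

/*
 * Simplified sketch of the delta-reporting pattern above; names are
 * hypothetical stand-ins for the real scan_control/memcg machinery.
 */
struct pass_counters {
	unsigned long nr_scanned;
	unsigned long nr_reclaimed;
};

/* Assumed helper: performs one reclaim pass and bumps the counters. */
void reclaim_pass(struct pass_counters *c);

/* Assumed reporting hook with the same shape as vmpressure(). */
void report_pressure(unsigned long scanned, unsigned long reclaimed);

static void shrink_and_report(struct pass_counters *c)
{
	unsigned long scanned = c->nr_scanned;
	unsigned long reclaimed = c->nr_reclaimed;

	reclaim_pass(c);

	/* Report only the work done in this pass. */
	report_pressure(c->nr_scanned - scanned,
			c->nr_reclaimed - reclaimed);
}

Reporting deltas keeps each vmpressure window proportional to the work of a single reclaim pass instead of growing monotonically over the whole reclaim run.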
@@ -2167,6 +2173,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, | |||
2167 | count_vm_event(ALLOCSTALL); | 2173 | count_vm_event(ALLOCSTALL); |
2168 | 2174 | ||
2169 | do { | 2175 | do { |
2176 | vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup, | ||
2177 | sc->priority); | ||
2170 | sc->nr_scanned = 0; | 2178 | sc->nr_scanned = 0; |
2171 | aborted_reclaim = shrink_zones(zonelist, sc); | 2179 | aborted_reclaim = shrink_zones(zonelist, sc); |
2172 | 2180 | ||
@@ -2619,7 +2627,6 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, | |||
2619 | bool pgdat_is_balanced = false; | 2627 | bool pgdat_is_balanced = false; |
2620 | int i; | 2628 | int i; |
2621 | int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ | 2629 | int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ |
2622 | unsigned long total_scanned; | ||
2623 | struct reclaim_state *reclaim_state = current->reclaim_state; | 2630 | struct reclaim_state *reclaim_state = current->reclaim_state; |
2624 | unsigned long nr_soft_reclaimed; | 2631 | unsigned long nr_soft_reclaimed; |
2625 | unsigned long nr_soft_scanned; | 2632 | unsigned long nr_soft_scanned; |
@@ -2639,7 +2646,6 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, | |||
2639 | .gfp_mask = sc.gfp_mask, | 2646 | .gfp_mask = sc.gfp_mask, |
2640 | }; | 2647 | }; |
2641 | loop_again: | 2648 | loop_again: |
2642 | total_scanned = 0; | ||
2643 | sc.priority = DEF_PRIORITY; | 2649 | sc.priority = DEF_PRIORITY; |
2644 | sc.nr_reclaimed = 0; | 2650 | sc.nr_reclaimed = 0; |
2645 | sc.may_writepage = !laptop_mode; | 2651 | sc.may_writepage = !laptop_mode; |
@@ -2730,7 +2736,6 @@ loop_again: | |||
2730 | order, sc.gfp_mask, | 2736 | order, sc.gfp_mask, |
2731 | &nr_soft_scanned); | 2737 | &nr_soft_scanned); |
2732 | sc.nr_reclaimed += nr_soft_reclaimed; | 2738 | sc.nr_reclaimed += nr_soft_reclaimed; |
2733 | total_scanned += nr_soft_scanned; | ||
2734 | 2739 | ||
2735 | /* | 2740 | /* |
2736 | * We put equal pressure on every zone, unless | 2741 | * We put equal pressure on every zone, unless |
@@ -2765,7 +2770,6 @@ loop_again: | |||
2765 | reclaim_state->reclaimed_slab = 0; | 2770 | reclaim_state->reclaimed_slab = 0; |
2766 | nr_slab = shrink_slab(&shrink, sc.nr_scanned, lru_pages); | 2771 | nr_slab = shrink_slab(&shrink, sc.nr_scanned, lru_pages); |
2767 | sc.nr_reclaimed += reclaim_state->reclaimed_slab; | 2772 | sc.nr_reclaimed += reclaim_state->reclaimed_slab; |
2768 | total_scanned += sc.nr_scanned; | ||
2769 | 2773 | ||
2770 | if (nr_slab == 0 && !zone_reclaimable(zone)) | 2774 | if (nr_slab == 0 && !zone_reclaimable(zone)) |
2771 | zone->all_unreclaimable = 1; | 2775 | zone->all_unreclaimable = 1; |
@@ -3188,9 +3192,9 @@ int kswapd_run(int nid) | |||
3188 | if (IS_ERR(pgdat->kswapd)) { | 3192 | if (IS_ERR(pgdat->kswapd)) { |
3189 | /* failure at boot is fatal */ | 3193 | /* failure at boot is fatal */ |
3190 | BUG_ON(system_state == SYSTEM_BOOTING); | 3194 | BUG_ON(system_state == SYSTEM_BOOTING); |
3191 | pgdat->kswapd = NULL; | ||
3192 | pr_err("Failed to start kswapd on node %d\n", nid); | 3195 | pr_err("Failed to start kswapd on node %d\n", nid); |
3193 | ret = PTR_ERR(pgdat->kswapd); | 3196 | ret = PTR_ERR(pgdat->kswapd); |
3197 | pgdat->kswapd = NULL; | ||
3194 | } | 3198 | } |
3195 | return ret; | 3199 | return ret; |
3196 | } | 3200 | } |
diff --git a/mm/vmstat.c b/mm/vmstat.c index e1d8ed172c42..f42745e65780 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c | |||
@@ -52,7 +52,6 @@ void all_vm_events(unsigned long *ret) | |||
52 | } | 52 | } |
53 | EXPORT_SYMBOL_GPL(all_vm_events); | 53 | EXPORT_SYMBOL_GPL(all_vm_events); |
54 | 54 | ||
55 | #ifdef CONFIG_HOTPLUG | ||
56 | /* | 55 | /* |
57 | * Fold the foreign cpu events into our own. | 56 | * Fold the foreign cpu events into our own. |
58 | * | 57 | * |
@@ -69,7 +68,6 @@ void vm_events_fold_cpu(int cpu) | |||
69 | fold_state->event[i] = 0; | 68 | fold_state->event[i] = 0; |
70 | } | 69 | } |
71 | } | 70 | } |
72 | #endif /* CONFIG_HOTPLUG */ | ||
73 | 71 | ||
74 | #endif /* CONFIG_VM_EVENT_COUNTERS */ | 72 | #endif /* CONFIG_VM_EVENT_COUNTERS */ |
75 | 73 | ||
@@ -495,6 +493,10 @@ void refresh_cpu_vm_stats(int cpu) | |||
495 | atomic_long_add(global_diff[i], &vm_stat[i]); | 493 | atomic_long_add(global_diff[i], &vm_stat[i]); |
496 | } | 494 | } |
497 | 495 | ||
496 | /* | ||
497 | * This is only called if !populated_zone(zone), which implies that no | ||
498 | * other users of pset->vm_stat_diff[] exist. | ||
499 | */ | ||
498 | void drain_zonestat(struct zone *zone, struct per_cpu_pageset *pset) | 500 | void drain_zonestat(struct zone *zone, struct per_cpu_pageset *pset) |
499 | { | 501 | { |
500 | int i; | 502 | int i; |