Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig          |    4
-rw-r--r--  mm/bootmem.c        |  195
-rw-r--r--  mm/fadvise.c        |   10
-rw-r--r--  mm/failslab.c       |   18
-rw-r--r--  mm/filemap.c        |    4
-rw-r--r--  mm/filemap_xip.c    |    2
-rw-r--r--  mm/fremap.c         |    2
-rw-r--r--  mm/highmem.c        |    2
-rw-r--r--  mm/hugetlb.c        |    4
-rw-r--r--  mm/ksm.c            |   12
-rw-r--r--  mm/memcontrol.c     | 1388
-rw-r--r--  mm/memory-failure.c |    5
-rw-r--r--  mm/memory.c         |  180
-rw-r--r--  mm/memory_hotplug.c |   10
-rw-r--r--  mm/mempolicy.c      |  112
-rw-r--r--  mm/migrate.c        |    6
-rw-r--r--  mm/mlock.c          |   12
-rw-r--r--  mm/mmap.c           |  175
-rw-r--r--  mm/mmu_context.c    |    3
-rw-r--r--  mm/mremap.c         |    9
-rw-r--r--  mm/nommu.c          |   30
-rw-r--r--  mm/oom_kill.c       |   14
-rw-r--r--  mm/page_alloc.c     |  401
-rw-r--r--  mm/page_cgroup.c    |   34
-rw-r--r--  mm/percpu.c         |   36
-rw-r--r--  mm/readahead.c      |    6
-rw-r--r--  mm/rmap.c           |  185
-rw-r--r--  mm/slab.c           |   13
-rw-r--r--  mm/slub.c           |  343
-rw-r--r--  mm/sparse-vmemmap.c |   76
-rw-r--r--  mm/sparse.c         |  196
-rw-r--r--  mm/swap.c           |    2
-rw-r--r--  mm/swapfile.c       |   71
-rw-r--r--  mm/vmscan.c         |  177
-rw-r--r--  mm/vmstat.c         |   17
35 files changed, 2777 insertions, 977 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index d34c2b971032..9c61158308dc 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -115,6 +115,10 @@ config SPARSEMEM_EXTREME | |||
115 | config SPARSEMEM_VMEMMAP_ENABLE | 115 | config SPARSEMEM_VMEMMAP_ENABLE |
116 | bool | 116 | bool |
117 | 117 | ||
118 | config SPARSEMEM_ALLOC_MEM_MAP_TOGETHER | ||
119 | def_bool y | ||
120 | depends on SPARSEMEM && X86_64 | ||
121 | |||
118 | config SPARSEMEM_VMEMMAP | 122 | config SPARSEMEM_VMEMMAP |
119 | bool "Sparse Memory virtual memmap" | 123 | bool "Sparse Memory virtual memmap" |
120 | depends on SPARSEMEM && SPARSEMEM_VMEMMAP_ENABLE | 124 | depends on SPARSEMEM && SPARSEMEM_VMEMMAP_ENABLE |
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 7d1486875e1c..d7c791ef0036 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -13,6 +13,7 @@ | |||
13 | #include <linux/bootmem.h> | 13 | #include <linux/bootmem.h> |
14 | #include <linux/module.h> | 14 | #include <linux/module.h> |
15 | #include <linux/kmemleak.h> | 15 | #include <linux/kmemleak.h> |
16 | #include <linux/range.h> | ||
16 | 17 | ||
17 | #include <asm/bug.h> | 18 | #include <asm/bug.h> |
18 | #include <asm/io.h> | 19 | #include <asm/io.h> |
@@ -32,6 +33,7 @@ unsigned long max_pfn; | |||
32 | unsigned long saved_max_pfn; | 33 | unsigned long saved_max_pfn; |
33 | #endif | 34 | #endif |
34 | 35 | ||
36 | #ifndef CONFIG_NO_BOOTMEM | ||
35 | bootmem_data_t bootmem_node_data[MAX_NUMNODES] __initdata; | 37 | bootmem_data_t bootmem_node_data[MAX_NUMNODES] __initdata; |
36 | 38 | ||
37 | static struct list_head bdata_list __initdata = LIST_HEAD_INIT(bdata_list); | 39 | static struct list_head bdata_list __initdata = LIST_HEAD_INIT(bdata_list); |
@@ -142,7 +144,7 @@ unsigned long __init init_bootmem(unsigned long start, unsigned long pages) | |||
142 | min_low_pfn = start; | 144 | min_low_pfn = start; |
143 | return init_bootmem_core(NODE_DATA(0)->bdata, start, 0, pages); | 145 | return init_bootmem_core(NODE_DATA(0)->bdata, start, 0, pages); |
144 | } | 146 | } |
145 | 147 | #endif | |
146 | /* | 148 | /* |
147 | * free_bootmem_late - free bootmem pages directly to page allocator | 149 | * free_bootmem_late - free bootmem pages directly to page allocator |
148 | * @addr: starting address of the range | 150 | * @addr: starting address of the range |
@@ -167,6 +169,60 @@ void __init free_bootmem_late(unsigned long addr, unsigned long size) | |||
167 | } | 169 | } |
168 | } | 170 | } |
169 | 171 | ||
172 | #ifdef CONFIG_NO_BOOTMEM | ||
173 | static void __init __free_pages_memory(unsigned long start, unsigned long end) | ||
174 | { | ||
175 | int i; | ||
176 | unsigned long start_aligned, end_aligned; | ||
177 | int order = ilog2(BITS_PER_LONG); | ||
178 | |||
179 | start_aligned = (start + (BITS_PER_LONG - 1)) & ~(BITS_PER_LONG - 1); | ||
180 | end_aligned = end & ~(BITS_PER_LONG - 1); | ||
181 | |||
182 | if (end_aligned <= start_aligned) { | ||
183 | #if 1 | ||
184 | printk(KERN_DEBUG " %lx - %lx\n", start, end); | ||
185 | #endif | ||
186 | for (i = start; i < end; i++) | ||
187 | __free_pages_bootmem(pfn_to_page(i), 0); | ||
188 | |||
189 | return; | ||
190 | } | ||
191 | |||
192 | #if 1 | ||
193 | printk(KERN_DEBUG " %lx %lx - %lx %lx\n", | ||
194 | start, start_aligned, end_aligned, end); | ||
195 | #endif | ||
196 | for (i = start; i < start_aligned; i++) | ||
197 | __free_pages_bootmem(pfn_to_page(i), 0); | ||
198 | |||
199 | for (i = start_aligned; i < end_aligned; i += BITS_PER_LONG) | ||
200 | __free_pages_bootmem(pfn_to_page(i), order); | ||
201 | |||
202 | for (i = end_aligned; i < end; i++) | ||
203 | __free_pages_bootmem(pfn_to_page(i), 0); | ||
204 | } | ||
205 | |||
206 | unsigned long __init free_all_memory_core_early(int nodeid) | ||
207 | { | ||
208 | int i; | ||
209 | u64 start, end; | ||
210 | unsigned long count = 0; | ||
211 | struct range *range = NULL; | ||
212 | int nr_range; | ||
213 | |||
214 | nr_range = get_free_all_memory_range(&range, nodeid); | ||
215 | |||
216 | for (i = 0; i < nr_range; i++) { | ||
217 | start = range[i].start; | ||
218 | end = range[i].end; | ||
219 | count += end - start; | ||
220 | __free_pages_memory(start, end); | ||
221 | } | ||
222 | |||
223 | return count; | ||
224 | } | ||
225 | #else | ||
170 | static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) | 226 | static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) |
171 | { | 227 | { |
172 | int aligned; | 228 | int aligned; |
@@ -227,6 +283,7 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) | |||
227 | 283 | ||
228 | return count; | 284 | return count; |
229 | } | 285 | } |
286 | #endif | ||
230 | 287 | ||
231 | /** | 288 | /** |
232 | * free_all_bootmem_node - release a node's free pages to the buddy allocator | 289 | * free_all_bootmem_node - release a node's free pages to the buddy allocator |
@@ -237,7 +294,12 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) | |||
237 | unsigned long __init free_all_bootmem_node(pg_data_t *pgdat) | 294 | unsigned long __init free_all_bootmem_node(pg_data_t *pgdat) |
238 | { | 295 | { |
239 | register_page_bootmem_info_node(pgdat); | 296 | register_page_bootmem_info_node(pgdat); |
297 | #ifdef CONFIG_NO_BOOTMEM | ||
298 | /* free_all_memory_core_early(MAX_NUMNODES) will be called later */ | ||
299 | return 0; | ||
300 | #else | ||
240 | return free_all_bootmem_core(pgdat->bdata); | 301 | return free_all_bootmem_core(pgdat->bdata); |
302 | #endif | ||
241 | } | 303 | } |
242 | 304 | ||
243 | /** | 305 | /** |
@@ -247,9 +309,14 @@ unsigned long __init free_all_bootmem_node(pg_data_t *pgdat) | |||
247 | */ | 309 | */ |
248 | unsigned long __init free_all_bootmem(void) | 310 | unsigned long __init free_all_bootmem(void) |
249 | { | 311 | { |
312 | #ifdef CONFIG_NO_BOOTMEM | ||
313 | return free_all_memory_core_early(NODE_DATA(0)->node_id); | ||
314 | #else | ||
250 | return free_all_bootmem_core(NODE_DATA(0)->bdata); | 315 | return free_all_bootmem_core(NODE_DATA(0)->bdata); |
316 | #endif | ||
251 | } | 317 | } |
252 | 318 | ||
319 | #ifndef CONFIG_NO_BOOTMEM | ||
253 | static void __init __free(bootmem_data_t *bdata, | 320 | static void __init __free(bootmem_data_t *bdata, |
254 | unsigned long sidx, unsigned long eidx) | 321 | unsigned long sidx, unsigned long eidx) |
255 | { | 322 | { |
@@ -344,6 +411,7 @@ static int __init mark_bootmem(unsigned long start, unsigned long end, | |||
344 | } | 411 | } |
345 | BUG(); | 412 | BUG(); |
346 | } | 413 | } |
414 | #endif | ||
347 | 415 | ||
348 | /** | 416 | /** |
349 | * free_bootmem_node - mark a page range as usable | 417 | * free_bootmem_node - mark a page range as usable |
@@ -358,6 +426,12 @@ static int __init mark_bootmem(unsigned long start, unsigned long end, | |||
358 | void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, | 426 | void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, |
359 | unsigned long size) | 427 | unsigned long size) |
360 | { | 428 | { |
429 | #ifdef CONFIG_NO_BOOTMEM | ||
430 | free_early(physaddr, physaddr + size); | ||
431 | #if 0 | ||
432 | printk(KERN_DEBUG "free %lx %lx\n", physaddr, size); | ||
433 | #endif | ||
434 | #else | ||
361 | unsigned long start, end; | 435 | unsigned long start, end; |
362 | 436 | ||
363 | kmemleak_free_part(__va(physaddr), size); | 437 | kmemleak_free_part(__va(physaddr), size); |
@@ -366,6 +440,7 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, | |||
366 | end = PFN_DOWN(physaddr + size); | 440 | end = PFN_DOWN(physaddr + size); |
367 | 441 | ||
368 | mark_bootmem_node(pgdat->bdata, start, end, 0, 0); | 442 | mark_bootmem_node(pgdat->bdata, start, end, 0, 0); |
443 | #endif | ||
369 | } | 444 | } |
370 | 445 | ||
371 | /** | 446 | /** |
@@ -379,6 +454,12 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, | |||
379 | */ | 454 | */ |
380 | void __init free_bootmem(unsigned long addr, unsigned long size) | 455 | void __init free_bootmem(unsigned long addr, unsigned long size) |
381 | { | 456 | { |
457 | #ifdef CONFIG_NO_BOOTMEM | ||
458 | free_early(addr, addr + size); | ||
459 | #if 0 | ||
460 | printk(KERN_DEBUG "free %lx %lx\n", addr, size); | ||
461 | #endif | ||
462 | #else | ||
382 | unsigned long start, end; | 463 | unsigned long start, end; |
383 | 464 | ||
384 | kmemleak_free_part(__va(addr), size); | 465 | kmemleak_free_part(__va(addr), size); |
@@ -387,6 +468,7 @@ void __init free_bootmem(unsigned long addr, unsigned long size) | |||
387 | end = PFN_DOWN(addr + size); | 468 | end = PFN_DOWN(addr + size); |
388 | 469 | ||
389 | mark_bootmem(start, end, 0, 0); | 470 | mark_bootmem(start, end, 0, 0); |
471 | #endif | ||
390 | } | 472 | } |
391 | 473 | ||
392 | /** | 474 | /** |
@@ -403,12 +485,17 @@ void __init free_bootmem(unsigned long addr, unsigned long size) | |||
403 | int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, | 485 | int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, |
404 | unsigned long size, int flags) | 486 | unsigned long size, int flags) |
405 | { | 487 | { |
488 | #ifdef CONFIG_NO_BOOTMEM | ||
489 | panic("no bootmem"); | ||
490 | return 0; | ||
491 | #else | ||
406 | unsigned long start, end; | 492 | unsigned long start, end; |
407 | 493 | ||
408 | start = PFN_DOWN(physaddr); | 494 | start = PFN_DOWN(physaddr); |
409 | end = PFN_UP(physaddr + size); | 495 | end = PFN_UP(physaddr + size); |
410 | 496 | ||
411 | return mark_bootmem_node(pgdat->bdata, start, end, 1, flags); | 497 | return mark_bootmem_node(pgdat->bdata, start, end, 1, flags); |
498 | #endif | ||
412 | } | 499 | } |
413 | 500 | ||
414 | /** | 501 | /** |
@@ -424,14 +511,20 @@ int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, | |||
424 | int __init reserve_bootmem(unsigned long addr, unsigned long size, | 511 | int __init reserve_bootmem(unsigned long addr, unsigned long size, |
425 | int flags) | 512 | int flags) |
426 | { | 513 | { |
514 | #ifdef CONFIG_NO_BOOTMEM | ||
515 | panic("no bootmem"); | ||
516 | return 0; | ||
517 | #else | ||
427 | unsigned long start, end; | 518 | unsigned long start, end; |
428 | 519 | ||
429 | start = PFN_DOWN(addr); | 520 | start = PFN_DOWN(addr); |
430 | end = PFN_UP(addr + size); | 521 | end = PFN_UP(addr + size); |
431 | 522 | ||
432 | return mark_bootmem(start, end, 1, flags); | 523 | return mark_bootmem(start, end, 1, flags); |
524 | #endif | ||
433 | } | 525 | } |
434 | 526 | ||
527 | #ifndef CONFIG_NO_BOOTMEM | ||
435 | static unsigned long __init align_idx(struct bootmem_data *bdata, | 528 | static unsigned long __init align_idx(struct bootmem_data *bdata, |
436 | unsigned long idx, unsigned long step) | 529 | unsigned long idx, unsigned long step) |
437 | { | 530 | { |
@@ -582,12 +675,33 @@ static void * __init alloc_arch_preferred_bootmem(bootmem_data_t *bdata, | |||
582 | #endif | 675 | #endif |
583 | return NULL; | 676 | return NULL; |
584 | } | 677 | } |
678 | #endif | ||
585 | 679 | ||
586 | static void * __init ___alloc_bootmem_nopanic(unsigned long size, | 680 | static void * __init ___alloc_bootmem_nopanic(unsigned long size, |
587 | unsigned long align, | 681 | unsigned long align, |
588 | unsigned long goal, | 682 | unsigned long goal, |
589 | unsigned long limit) | 683 | unsigned long limit) |
590 | { | 684 | { |
685 | #ifdef CONFIG_NO_BOOTMEM | ||
686 | void *ptr; | ||
687 | |||
688 | if (WARN_ON_ONCE(slab_is_available())) | ||
689 | return kzalloc(size, GFP_NOWAIT); | ||
690 | |||
691 | restart: | ||
692 | |||
693 | ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align, goal, limit); | ||
694 | |||
695 | if (ptr) | ||
696 | return ptr; | ||
697 | |||
698 | if (goal != 0) { | ||
699 | goal = 0; | ||
700 | goto restart; | ||
701 | } | ||
702 | |||
703 | return NULL; | ||
704 | #else | ||
591 | bootmem_data_t *bdata; | 705 | bootmem_data_t *bdata; |
592 | void *region; | 706 | void *region; |
593 | 707 | ||
@@ -613,6 +727,7 @@ restart: | |||
613 | } | 727 | } |
614 | 728 | ||
615 | return NULL; | 729 | return NULL; |
730 | #endif | ||
616 | } | 731 | } |
617 | 732 | ||
618 | /** | 733 | /** |
@@ -631,7 +746,13 @@ restart: | |||
631 | void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align, | 746 | void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align, |
632 | unsigned long goal) | 747 | unsigned long goal) |
633 | { | 748 | { |
634 | return ___alloc_bootmem_nopanic(size, align, goal, 0); | 749 | unsigned long limit = 0; |
750 | |||
751 | #ifdef CONFIG_NO_BOOTMEM | ||
752 | limit = -1UL; | ||
753 | #endif | ||
754 | |||
755 | return ___alloc_bootmem_nopanic(size, align, goal, limit); | ||
635 | } | 756 | } |
636 | 757 | ||
637 | static void * __init ___alloc_bootmem(unsigned long size, unsigned long align, | 758 | static void * __init ___alloc_bootmem(unsigned long size, unsigned long align, |
@@ -665,9 +786,16 @@ static void * __init ___alloc_bootmem(unsigned long size, unsigned long align, | |||
665 | void * __init __alloc_bootmem(unsigned long size, unsigned long align, | 786 | void * __init __alloc_bootmem(unsigned long size, unsigned long align, |
666 | unsigned long goal) | 787 | unsigned long goal) |
667 | { | 788 | { |
668 | return ___alloc_bootmem(size, align, goal, 0); | 789 | unsigned long limit = 0; |
790 | |||
791 | #ifdef CONFIG_NO_BOOTMEM | ||
792 | limit = -1UL; | ||
793 | #endif | ||
794 | |||
795 | return ___alloc_bootmem(size, align, goal, limit); | ||
669 | } | 796 | } |
670 | 797 | ||
798 | #ifndef CONFIG_NO_BOOTMEM | ||
671 | static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata, | 799 | static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata, |
672 | unsigned long size, unsigned long align, | 800 | unsigned long size, unsigned long align, |
673 | unsigned long goal, unsigned long limit) | 801 | unsigned long goal, unsigned long limit) |
@@ -684,6 +812,7 @@ static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata, | |||
684 | 812 | ||
685 | return ___alloc_bootmem(size, align, goal, limit); | 813 | return ___alloc_bootmem(size, align, goal, limit); |
686 | } | 814 | } |
815 | #endif | ||
687 | 816 | ||
688 | /** | 817 | /** |
689 | * __alloc_bootmem_node - allocate boot memory from a specific node | 818 | * __alloc_bootmem_node - allocate boot memory from a specific node |
@@ -706,7 +835,46 @@ void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, | |||
706 | if (WARN_ON_ONCE(slab_is_available())) | 835 | if (WARN_ON_ONCE(slab_is_available())) |
707 | return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); | 836 | return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); |
708 | 837 | ||
838 | #ifdef CONFIG_NO_BOOTMEM | ||
839 | return __alloc_memory_core_early(pgdat->node_id, size, align, | ||
840 | goal, -1ULL); | ||
841 | #else | ||
709 | return ___alloc_bootmem_node(pgdat->bdata, size, align, goal, 0); | 842 | return ___alloc_bootmem_node(pgdat->bdata, size, align, goal, 0); |
843 | #endif | ||
844 | } | ||
845 | |||
846 | void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size, | ||
847 | unsigned long align, unsigned long goal) | ||
848 | { | ||
849 | #ifdef MAX_DMA32_PFN | ||
850 | unsigned long end_pfn; | ||
851 | |||
852 | if (WARN_ON_ONCE(slab_is_available())) | ||
853 | return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); | ||
854 | |||
855 | /* update goal according ...MAX_DMA32_PFN */ | ||
856 | end_pfn = pgdat->node_start_pfn + pgdat->node_spanned_pages; | ||
857 | |||
858 | if (end_pfn > MAX_DMA32_PFN + (128 >> (20 - PAGE_SHIFT)) && | ||
859 | (goal >> PAGE_SHIFT) < MAX_DMA32_PFN) { | ||
860 | void *ptr; | ||
861 | unsigned long new_goal; | ||
862 | |||
863 | new_goal = MAX_DMA32_PFN << PAGE_SHIFT; | ||
864 | #ifdef CONFIG_NO_BOOTMEM | ||
865 | ptr = __alloc_memory_core_early(pgdat->node_id, size, align, | ||
866 | new_goal, -1ULL); | ||
867 | #else | ||
868 | ptr = alloc_bootmem_core(pgdat->bdata, size, align, | ||
869 | new_goal, 0); | ||
870 | #endif | ||
871 | if (ptr) | ||
872 | return ptr; | ||
873 | } | ||
874 | #endif | ||
875 | |||
876 | return __alloc_bootmem_node(pgdat, size, align, goal); | ||
877 | |||
710 | } | 878 | } |
711 | 879 | ||
712 | #ifdef CONFIG_SPARSEMEM | 880 | #ifdef CONFIG_SPARSEMEM |
@@ -720,6 +888,16 @@ void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, | |||
720 | void * __init alloc_bootmem_section(unsigned long size, | 888 | void * __init alloc_bootmem_section(unsigned long size, |
721 | unsigned long section_nr) | 889 | unsigned long section_nr) |
722 | { | 890 | { |
891 | #ifdef CONFIG_NO_BOOTMEM | ||
892 | unsigned long pfn, goal, limit; | ||
893 | |||
894 | pfn = section_nr_to_pfn(section_nr); | ||
895 | goal = pfn << PAGE_SHIFT; | ||
896 | limit = section_nr_to_pfn(section_nr + 1) << PAGE_SHIFT; | ||
897 | |||
898 | return __alloc_memory_core_early(early_pfn_to_nid(pfn), size, | ||
899 | SMP_CACHE_BYTES, goal, limit); | ||
900 | #else | ||
723 | bootmem_data_t *bdata; | 901 | bootmem_data_t *bdata; |
724 | unsigned long pfn, goal, limit; | 902 | unsigned long pfn, goal, limit; |
725 | 903 | ||
@@ -729,6 +907,7 @@ void * __init alloc_bootmem_section(unsigned long size, | |||
729 | bdata = &bootmem_node_data[early_pfn_to_nid(pfn)]; | 907 | bdata = &bootmem_node_data[early_pfn_to_nid(pfn)]; |
730 | 908 | ||
731 | return alloc_bootmem_core(bdata, size, SMP_CACHE_BYTES, goal, limit); | 909 | return alloc_bootmem_core(bdata, size, SMP_CACHE_BYTES, goal, limit); |
910 | #endif | ||
732 | } | 911 | } |
733 | #endif | 912 | #endif |
734 | 913 | ||
@@ -740,11 +919,16 @@ void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size, | |||
740 | if (WARN_ON_ONCE(slab_is_available())) | 919 | if (WARN_ON_ONCE(slab_is_available())) |
741 | return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); | 920 | return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); |
742 | 921 | ||
922 | #ifdef CONFIG_NO_BOOTMEM | ||
923 | ptr = __alloc_memory_core_early(pgdat->node_id, size, align, | ||
924 | goal, -1ULL); | ||
925 | #else | ||
743 | ptr = alloc_arch_preferred_bootmem(pgdat->bdata, size, align, goal, 0); | 926 | ptr = alloc_arch_preferred_bootmem(pgdat->bdata, size, align, goal, 0); |
744 | if (ptr) | 927 | if (ptr) |
745 | return ptr; | 928 | return ptr; |
746 | 929 | ||
747 | ptr = alloc_bootmem_core(pgdat->bdata, size, align, goal, 0); | 930 | ptr = alloc_bootmem_core(pgdat->bdata, size, align, goal, 0); |
931 | #endif | ||
748 | if (ptr) | 932 | if (ptr) |
749 | return ptr; | 933 | return ptr; |
750 | 934 | ||
@@ -795,6 +979,11 @@ void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size, | |||
795 | if (WARN_ON_ONCE(slab_is_available())) | 979 | if (WARN_ON_ONCE(slab_is_available())) |
796 | return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); | 980 | return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); |
797 | 981 | ||
982 | #ifdef CONFIG_NO_BOOTMEM | ||
983 | return __alloc_memory_core_early(pgdat->node_id, size, align, | ||
984 | goal, ARCH_LOW_ADDRESS_LIMIT); | ||
985 | #else | ||
798 | return ___alloc_bootmem_node(pgdat->bdata, size, align, | 986 | return ___alloc_bootmem_node(pgdat->bdata, size, align, |
799 | goal, ARCH_LOW_ADDRESS_LIMIT); | 987 | goal, ARCH_LOW_ADDRESS_LIMIT); |
988 | #endif | ||
800 | } | 989 | } |
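The __free_pages_memory() helper added above splits every free range into an unaligned head, a BITS_PER_LONG-aligned middle released in order-ilog2(BITS_PER_LONG) blocks, and an unaligned tail. The stand-alone sketch below only illustrates that rounding arithmetic; the pfn range and the 64-bit BITS_PER_LONG value are assumptions chosen for the example, not something taken from the patch.

#include <stdio.h>

#define BITS_PER_LONG 64UL		/* assumed 64-bit build */

int main(void)
{
	/* hypothetical pfn range, picked only to show the three-way split */
	unsigned long start = 100, end = 300;
	unsigned long start_aligned, end_aligned;

	/* same rounding as __free_pages_memory(): head rounded up, tail down */
	start_aligned = (start + (BITS_PER_LONG - 1)) & ~(BITS_PER_LONG - 1);
	end_aligned = end & ~(BITS_PER_LONG - 1);

	printf("head  : pfns %lu-%lu freed as single (order-0) pages\n",
	       start, start_aligned - 1);
	printf("middle: pfns %lu-%lu freed as order-6 (64-page) blocks\n",
	       start_aligned, end_aligned - 1);
	printf("tail  : pfns %lu-%lu freed as single (order-0) pages\n",
	       end_aligned, end - 1);
	return 0;
}

For the range 100-300 this prints a head of 100-127, a middle of 128-255 (two 64-page blocks), and a tail of 256-299, matching the three loops in the hunk above.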
diff --git a/mm/fadvise.c b/mm/fadvise.c
index e43359214f6f..8d723c9e8b75 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -77,12 +77,20 @@ SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice) | |||
77 | switch (advice) { | 77 | switch (advice) { |
78 | case POSIX_FADV_NORMAL: | 78 | case POSIX_FADV_NORMAL: |
79 | file->f_ra.ra_pages = bdi->ra_pages; | 79 | file->f_ra.ra_pages = bdi->ra_pages; |
80 | spin_lock(&file->f_lock); | ||
81 | file->f_mode &= ~FMODE_RANDOM; | ||
82 | spin_unlock(&file->f_lock); | ||
80 | break; | 83 | break; |
81 | case POSIX_FADV_RANDOM: | 84 | case POSIX_FADV_RANDOM: |
82 | file->f_ra.ra_pages = 0; | 85 | spin_lock(&file->f_lock); |
86 | file->f_mode |= FMODE_RANDOM; | ||
87 | spin_unlock(&file->f_lock); | ||
83 | break; | 88 | break; |
84 | case POSIX_FADV_SEQUENTIAL: | 89 | case POSIX_FADV_SEQUENTIAL: |
85 | file->f_ra.ra_pages = bdi->ra_pages * 2; | 90 | file->f_ra.ra_pages = bdi->ra_pages * 2; |
91 | spin_lock(&file->f_lock); | ||
92 | file->f_mode &= ~FMODE_RANDOM; | ||
93 | spin_unlock(&file->f_lock); | ||
86 | break; | 94 | break; |
87 | case POSIX_FADV_WILLNEED: | 95 | case POSIX_FADV_WILLNEED: |
88 | if (!mapping->a_ops->readpage) { | 96 | if (!mapping->a_ops->readpage) { |
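The behaviour change above is reachable from ordinary userspace: POSIX_FADV_RANDOM now sets FMODE_RANDOM on the struct file (under f_lock) instead of zeroing f_ra.ra_pages, and POSIX_FADV_NORMAL/SEQUENTIAL clear the flag again. A minimal caller is sketched below; the file path is a placeholder, not part of the patch.

#define _XOPEN_SOURCE 600
#include <fcntl.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	int err;
	int fd = open("/var/tmp/data.bin", O_RDONLY);	/* hypothetical file */

	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* With the hunk above, this marks the open file FMODE_RANDOM rather
	 * than forcing its readahead window to zero pages. */
	err = posix_fadvise(fd, 0, 0, POSIX_FADV_RANDOM);
	if (err)	/* posix_fadvise() returns the error number directly */
		fprintf(stderr, "posix_fadvise: %s\n", strerror(err));
	return 0;
}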
diff --git a/mm/failslab.c b/mm/failslab.c
index 9339de5f0a91..bb41f98dd8b7 100644
--- a/mm/failslab.c
+++ b/mm/failslab.c
@@ -1,18 +1,22 @@ | |||
1 | #include <linux/fault-inject.h> | 1 | #include <linux/fault-inject.h> |
2 | #include <linux/gfp.h> | 2 | #include <linux/gfp.h> |
3 | #include <linux/slab.h> | ||
3 | 4 | ||
4 | static struct { | 5 | static struct { |
5 | struct fault_attr attr; | 6 | struct fault_attr attr; |
6 | u32 ignore_gfp_wait; | 7 | u32 ignore_gfp_wait; |
8 | int cache_filter; | ||
7 | #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS | 9 | #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS |
8 | struct dentry *ignore_gfp_wait_file; | 10 | struct dentry *ignore_gfp_wait_file; |
11 | struct dentry *cache_filter_file; | ||
9 | #endif | 12 | #endif |
10 | } failslab = { | 13 | } failslab = { |
11 | .attr = FAULT_ATTR_INITIALIZER, | 14 | .attr = FAULT_ATTR_INITIALIZER, |
12 | .ignore_gfp_wait = 1, | 15 | .ignore_gfp_wait = 1, |
16 | .cache_filter = 0, | ||
13 | }; | 17 | }; |
14 | 18 | ||
15 | bool should_failslab(size_t size, gfp_t gfpflags) | 19 | bool should_failslab(size_t size, gfp_t gfpflags, unsigned long cache_flags) |
16 | { | 20 | { |
17 | if (gfpflags & __GFP_NOFAIL) | 21 | if (gfpflags & __GFP_NOFAIL) |
18 | return false; | 22 | return false; |
@@ -20,6 +24,9 @@ bool should_failslab(size_t size, gfp_t gfpflags) | |||
20 | if (failslab.ignore_gfp_wait && (gfpflags & __GFP_WAIT)) | 24 | if (failslab.ignore_gfp_wait && (gfpflags & __GFP_WAIT)) |
21 | return false; | 25 | return false; |
22 | 26 | ||
27 | if (failslab.cache_filter && !(cache_flags & SLAB_FAILSLAB)) | ||
28 | return false; | ||
29 | |||
23 | return should_fail(&failslab.attr, size); | 30 | return should_fail(&failslab.attr, size); |
24 | } | 31 | } |
25 | 32 | ||
@@ -30,7 +37,6 @@ static int __init setup_failslab(char *str) | |||
30 | __setup("failslab=", setup_failslab); | 37 | __setup("failslab=", setup_failslab); |
31 | 38 | ||
32 | #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS | 39 | #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS |
33 | |||
34 | static int __init failslab_debugfs_init(void) | 40 | static int __init failslab_debugfs_init(void) |
35 | { | 41 | { |
36 | mode_t mode = S_IFREG | S_IRUSR | S_IWUSR; | 42 | mode_t mode = S_IFREG | S_IRUSR | S_IWUSR; |
@@ -46,8 +52,14 @@ static int __init failslab_debugfs_init(void) | |||
46 | debugfs_create_bool("ignore-gfp-wait", mode, dir, | 52 | debugfs_create_bool("ignore-gfp-wait", mode, dir, |
47 | &failslab.ignore_gfp_wait); | 53 | &failslab.ignore_gfp_wait); |
48 | 54 | ||
49 | if (!failslab.ignore_gfp_wait_file) { | 55 | failslab.cache_filter_file = |
56 | debugfs_create_bool("cache-filter", mode, dir, | ||
57 | &failslab.cache_filter); | ||
58 | |||
59 | if (!failslab.ignore_gfp_wait_file || | ||
60 | !failslab.cache_filter_file) { | ||
50 | err = -ENOMEM; | 61 | err = -ENOMEM; |
62 | debugfs_remove(failslab.cache_filter_file); | ||
51 | debugfs_remove(failslab.ignore_gfp_wait_file); | 63 | debugfs_remove(failslab.ignore_gfp_wait_file); |
52 | cleanup_fault_attr_dentries(&failslab.attr); | 64 | cleanup_fault_attr_dentries(&failslab.attr); |
53 | } | 65 | } |
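The new cache-filter switch only has an effect together with a slab cache that opts in via SLAB_FAILSLAB, the flag tested in should_failslab() above. Below is a hedged sketch of such an opt-in written as a trivial module; the cache name, object size, and the /sys/kernel/debug/failslab/cache-filter path are illustrative assumptions, not details shown in this diff.

#include <linux/module.h>
#include <linux/slab.h>

static struct kmem_cache *demo_cache;	/* hypothetical cache */

static int __init failslab_demo_init(void)
{
	/*
	 * SLAB_FAILSLAB opts this cache in to slab fault injection.  When
	 * the cache-filter bool added above is enabled, should_failslab()
	 * skips every cache that does not carry this flag.
	 */
	demo_cache = kmem_cache_create("failslab_demo", 256, 0,
				       SLAB_FAILSLAB, NULL);
	return demo_cache ? 0 : -ENOMEM;
}

static void __exit failslab_demo_exit(void)
{
	kmem_cache_destroy(demo_cache);
}

module_init(failslab_demo_init);
module_exit(failslab_demo_exit);
MODULE_LICENSE("GPL");

With failslab enabled on the kernel command line and the assumed debugfs cache-filter bool set to 1, injected allocation failures are then restricted to caches flagged this way.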
diff --git a/mm/filemap.c b/mm/filemap.c
index 698ea80f2102..045b31c37653 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1117,7 +1117,7 @@ readpage: | |||
1117 | if (!PageUptodate(page)) { | 1117 | if (!PageUptodate(page)) { |
1118 | if (page->mapping == NULL) { | 1118 | if (page->mapping == NULL) { |
1119 | /* | 1119 | /* |
1120 | * invalidate_inode_pages got it | 1120 | * invalidate_mapping_pages got it |
1121 | */ | 1121 | */ |
1122 | unlock_page(page); | 1122 | unlock_page(page); |
1123 | page_cache_release(page); | 1123 | page_cache_release(page); |
@@ -1986,7 +1986,7 @@ EXPORT_SYMBOL(iov_iter_single_seg_count); | |||
1986 | inline int generic_write_checks(struct file *file, loff_t *pos, size_t *count, int isblk) | 1986 | inline int generic_write_checks(struct file *file, loff_t *pos, size_t *count, int isblk) |
1987 | { | 1987 | { |
1988 | struct inode *inode = file->f_mapping->host; | 1988 | struct inode *inode = file->f_mapping->host; |
1989 | unsigned long limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; | 1989 | unsigned long limit = rlimit(RLIMIT_FSIZE); |
1990 | 1990 | ||
1991 | if (unlikely(*pos < 0)) | 1991 | if (unlikely(*pos < 0)) |
1992 | return -EINVAL; | 1992 | return -EINVAL; |
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index 1888b2d71bb8..78b94f0b6d5d 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -194,7 +194,7 @@ retry: | |||
194 | flush_cache_page(vma, address, pte_pfn(*pte)); | 194 | flush_cache_page(vma, address, pte_pfn(*pte)); |
195 | pteval = ptep_clear_flush_notify(vma, address, pte); | 195 | pteval = ptep_clear_flush_notify(vma, address, pte); |
196 | page_remove_rmap(page); | 196 | page_remove_rmap(page); |
197 | dec_mm_counter(mm, file_rss); | 197 | dec_mm_counter(mm, MM_FILEPAGES); |
198 | BUG_ON(pte_dirty(pteval)); | 198 | BUG_ON(pte_dirty(pteval)); |
199 | pte_unmap_unlock(pte, ptl); | 199 | pte_unmap_unlock(pte, ptl); |
200 | page_cache_release(page); | 200 | page_cache_release(page); |
diff --git a/mm/fremap.c b/mm/fremap.c
index b6ec85abbb39..46f5dacf90a2 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -40,7 +40,7 @@ static void zap_pte(struct mm_struct *mm, struct vm_area_struct *vma, | |||
40 | page_remove_rmap(page); | 40 | page_remove_rmap(page); |
41 | page_cache_release(page); | 41 | page_cache_release(page); |
42 | update_hiwater_rss(mm); | 42 | update_hiwater_rss(mm); |
43 | dec_mm_counter(mm, file_rss); | 43 | dec_mm_counter(mm, MM_FILEPAGES); |
44 | } | 44 | } |
45 | } else { | 45 | } else { |
46 | if (!pte_file(pte)) | 46 | if (!pte_file(pte)) |
diff --git a/mm/highmem.c b/mm/highmem.c
index 9c1e627f282e..bed8a8bfd01f 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -220,7 +220,7 @@ EXPORT_SYMBOL(kmap_high); | |||
220 | * @page: &struct page to pin | 220 | * @page: &struct page to pin |
221 | * | 221 | * |
222 | * Returns the page's current virtual memory address, or NULL if no mapping | 222 | * Returns the page's current virtual memory address, or NULL if no mapping |
223 | * exists. When and only when a non null address is returned then a | 223 | * exists. If and only if a non null address is returned then a |
224 | * matching call to kunmap_high() is necessary. | 224 | * matching call to kunmap_high() is necessary. |
225 | * | 225 | * |
226 | * This can be called from any context. | 226 | * This can be called from any context. |
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 2d16fa6b8c2d..3a5aeb37c110 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -2087,7 +2087,7 @@ static void set_huge_ptep_writable(struct vm_area_struct *vma, | |||
2087 | 2087 | ||
2088 | entry = pte_mkwrite(pte_mkdirty(huge_ptep_get(ptep))); | 2088 | entry = pte_mkwrite(pte_mkdirty(huge_ptep_get(ptep))); |
2089 | if (huge_ptep_set_access_flags(vma, address, ptep, entry, 1)) { | 2089 | if (huge_ptep_set_access_flags(vma, address, ptep, entry, 1)) { |
2090 | update_mmu_cache(vma, address, entry); | 2090 | update_mmu_cache(vma, address, ptep); |
2091 | } | 2091 | } |
2092 | } | 2092 | } |
2093 | 2093 | ||
@@ -2558,7 +2558,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2558 | entry = pte_mkyoung(entry); | 2558 | entry = pte_mkyoung(entry); |
2559 | if (huge_ptep_set_access_flags(vma, address, ptep, entry, | 2559 | if (huge_ptep_set_access_flags(vma, address, ptep, entry, |
2560 | flags & FAULT_FLAG_WRITE)) | 2560 | flags & FAULT_FLAG_WRITE)) |
2561 | update_mmu_cache(vma, address, entry); | 2561 | update_mmu_cache(vma, address, ptep); |
2562 | 2562 | ||
2563 | out_page_table_lock: | 2563 | out_page_table_lock: |
2564 | spin_unlock(&mm->page_table_lock); | 2564 | spin_unlock(&mm->page_table_lock); |
diff --git a/mm/ksm.c b/mm/ksm.c
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -1563,10 +1563,12 @@ int page_referenced_ksm(struct page *page, struct mem_cgroup *memcg, | |||
1563 | again: | 1563 | again: |
1564 | hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) { | 1564 | hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) { |
1565 | struct anon_vma *anon_vma = rmap_item->anon_vma; | 1565 | struct anon_vma *anon_vma = rmap_item->anon_vma; |
1566 | struct anon_vma_chain *vmac; | ||
1566 | struct vm_area_struct *vma; | 1567 | struct vm_area_struct *vma; |
1567 | 1568 | ||
1568 | spin_lock(&anon_vma->lock); | 1569 | spin_lock(&anon_vma->lock); |
1569 | list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { | 1570 | list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) { |
1571 | vma = vmac->vma; | ||
1570 | if (rmap_item->address < vma->vm_start || | 1572 | if (rmap_item->address < vma->vm_start || |
1571 | rmap_item->address >= vma->vm_end) | 1573 | rmap_item->address >= vma->vm_end) |
1572 | continue; | 1574 | continue; |
@@ -1614,10 +1616,12 @@ int try_to_unmap_ksm(struct page *page, enum ttu_flags flags) | |||
1614 | again: | 1616 | again: |
1615 | hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) { | 1617 | hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) { |
1616 | struct anon_vma *anon_vma = rmap_item->anon_vma; | 1618 | struct anon_vma *anon_vma = rmap_item->anon_vma; |
1619 | struct anon_vma_chain *vmac; | ||
1617 | struct vm_area_struct *vma; | 1620 | struct vm_area_struct *vma; |
1618 | 1621 | ||
1619 | spin_lock(&anon_vma->lock); | 1622 | spin_lock(&anon_vma->lock); |
1620 | list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { | 1623 | list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) { |
1624 | vma = vmac->vma; | ||
1621 | if (rmap_item->address < vma->vm_start || | 1625 | if (rmap_item->address < vma->vm_start || |
1622 | rmap_item->address >= vma->vm_end) | 1626 | rmap_item->address >= vma->vm_end) |
1623 | continue; | 1627 | continue; |
@@ -1664,10 +1668,12 @@ int rmap_walk_ksm(struct page *page, int (*rmap_one)(struct page *, | |||
1664 | again: | 1668 | again: |
1665 | hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) { | 1669 | hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) { |
1666 | struct anon_vma *anon_vma = rmap_item->anon_vma; | 1670 | struct anon_vma *anon_vma = rmap_item->anon_vma; |
1671 | struct anon_vma_chain *vmac; | ||
1667 | struct vm_area_struct *vma; | 1672 | struct vm_area_struct *vma; |
1668 | 1673 | ||
1669 | spin_lock(&anon_vma->lock); | 1674 | spin_lock(&anon_vma->lock); |
1670 | list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { | 1675 | list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) { |
1676 | vma = vmac->vma; | ||
1671 | if (rmap_item->address < vma->vm_start || | 1677 | if (rmap_item->address < vma->vm_start || |
1672 | rmap_item->address >= vma->vm_end) | 1678 | rmap_item->address >= vma->vm_end) |
1673 | continue; | 1679 | continue; |
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 954032b80bed..7973b5221fb8 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -6,6 +6,10 @@ | |||
6 | * Copyright 2007 OpenVZ SWsoft Inc | 6 | * Copyright 2007 OpenVZ SWsoft Inc |
7 | * Author: Pavel Emelianov <xemul@openvz.org> | 7 | * Author: Pavel Emelianov <xemul@openvz.org> |
8 | * | 8 | * |
9 | * Memory thresholds | ||
10 | * Copyright (C) 2009 Nokia Corporation | ||
11 | * Author: Kirill A. Shutemov | ||
12 | * | ||
9 | * This program is free software; you can redistribute it and/or modify | 13 | * This program is free software; you can redistribute it and/or modify |
10 | * it under the terms of the GNU General Public License as published by | 14 | * it under the terms of the GNU General Public License as published by |
11 | * the Free Software Foundation; either version 2 of the License, or | 15 | * the Free Software Foundation; either version 2 of the License, or |
@@ -21,6 +25,7 @@ | |||
21 | #include <linux/memcontrol.h> | 25 | #include <linux/memcontrol.h> |
22 | #include <linux/cgroup.h> | 26 | #include <linux/cgroup.h> |
23 | #include <linux/mm.h> | 27 | #include <linux/mm.h> |
28 | #include <linux/hugetlb.h> | ||
24 | #include <linux/pagemap.h> | 29 | #include <linux/pagemap.h> |
25 | #include <linux/smp.h> | 30 | #include <linux/smp.h> |
26 | #include <linux/page-flags.h> | 31 | #include <linux/page-flags.h> |
@@ -32,7 +37,10 @@ | |||
32 | #include <linux/rbtree.h> | 37 | #include <linux/rbtree.h> |
33 | #include <linux/slab.h> | 38 | #include <linux/slab.h> |
34 | #include <linux/swap.h> | 39 | #include <linux/swap.h> |
40 | #include <linux/swapops.h> | ||
35 | #include <linux/spinlock.h> | 41 | #include <linux/spinlock.h> |
42 | #include <linux/eventfd.h> | ||
43 | #include <linux/sort.h> | ||
36 | #include <linux/fs.h> | 44 | #include <linux/fs.h> |
37 | #include <linux/seq_file.h> | 45 | #include <linux/seq_file.h> |
38 | #include <linux/vmalloc.h> | 46 | #include <linux/vmalloc.h> |
@@ -55,7 +63,15 @@ static int really_do_swap_account __initdata = 1; /* for remember boot option*/ | |||
55 | #define do_swap_account (0) | 63 | #define do_swap_account (0) |
56 | #endif | 64 | #endif |
57 | 65 | ||
58 | #define SOFTLIMIT_EVENTS_THRESH (1000) | 66 | /* |
67 | * Per memcg event counter is incremented at every pagein/pageout. This counter | ||
68 | * is used for trigger some periodic events. This is straightforward and better | ||
69 | * than using jiffies etc. to handle periodic memcg event. | ||
70 | * | ||
71 | * These values will be used as !((event) & ((1 <<(thresh)) - 1)) | ||
72 | */ | ||
73 | #define THRESHOLDS_EVENTS_THRESH (7) /* once in 128 */ | ||
74 | #define SOFTLIMIT_EVENTS_THRESH (10) /* once in 1024 */ | ||
59 | 75 | ||
60 | /* | 76 | /* |
61 | * Statistics for memory cgroup. | 77 | * Statistics for memory cgroup. |
@@ -69,62 +85,16 @@ enum mem_cgroup_stat_index { | |||
69 | MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */ | 85 | MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */ |
70 | MEM_CGROUP_STAT_PGPGIN_COUNT, /* # of pages paged in */ | 86 | MEM_CGROUP_STAT_PGPGIN_COUNT, /* # of pages paged in */ |
71 | MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */ | 87 | MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */ |
72 | MEM_CGROUP_STAT_EVENTS, /* sum of pagein + pageout for internal use */ | ||
73 | MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */ | 88 | MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */ |
89 | MEM_CGROUP_EVENTS, /* incremented at every pagein/pageout */ | ||
74 | 90 | ||
75 | MEM_CGROUP_STAT_NSTATS, | 91 | MEM_CGROUP_STAT_NSTATS, |
76 | }; | 92 | }; |
77 | 93 | ||
78 | struct mem_cgroup_stat_cpu { | 94 | struct mem_cgroup_stat_cpu { |
79 | s64 count[MEM_CGROUP_STAT_NSTATS]; | 95 | s64 count[MEM_CGROUP_STAT_NSTATS]; |
80 | } ____cacheline_aligned_in_smp; | ||
81 | |||
82 | struct mem_cgroup_stat { | ||
83 | struct mem_cgroup_stat_cpu cpustat[0]; | ||
84 | }; | 96 | }; |
85 | 97 | ||
86 | static inline void | ||
87 | __mem_cgroup_stat_reset_safe(struct mem_cgroup_stat_cpu *stat, | ||
88 | enum mem_cgroup_stat_index idx) | ||
89 | { | ||
90 | stat->count[idx] = 0; | ||
91 | } | ||
92 | |||
93 | static inline s64 | ||
94 | __mem_cgroup_stat_read_local(struct mem_cgroup_stat_cpu *stat, | ||
95 | enum mem_cgroup_stat_index idx) | ||
96 | { | ||
97 | return stat->count[idx]; | ||
98 | } | ||
99 | |||
100 | /* | ||
101 | * For accounting under irq disable, no need for increment preempt count. | ||
102 | */ | ||
103 | static inline void __mem_cgroup_stat_add_safe(struct mem_cgroup_stat_cpu *stat, | ||
104 | enum mem_cgroup_stat_index idx, int val) | ||
105 | { | ||
106 | stat->count[idx] += val; | ||
107 | } | ||
108 | |||
109 | static s64 mem_cgroup_read_stat(struct mem_cgroup_stat *stat, | ||
110 | enum mem_cgroup_stat_index idx) | ||
111 | { | ||
112 | int cpu; | ||
113 | s64 ret = 0; | ||
114 | for_each_possible_cpu(cpu) | ||
115 | ret += stat->cpustat[cpu].count[idx]; | ||
116 | return ret; | ||
117 | } | ||
118 | |||
119 | static s64 mem_cgroup_local_usage(struct mem_cgroup_stat *stat) | ||
120 | { | ||
121 | s64 ret; | ||
122 | |||
123 | ret = mem_cgroup_read_stat(stat, MEM_CGROUP_STAT_CACHE); | ||
124 | ret += mem_cgroup_read_stat(stat, MEM_CGROUP_STAT_RSS); | ||
125 | return ret; | ||
126 | } | ||
127 | |||
128 | /* | 98 | /* |
129 | * per-zone information in memory controller. | 99 | * per-zone information in memory controller. |
130 | */ | 100 | */ |
@@ -174,6 +144,22 @@ struct mem_cgroup_tree { | |||
174 | 144 | ||
175 | static struct mem_cgroup_tree soft_limit_tree __read_mostly; | 145 | static struct mem_cgroup_tree soft_limit_tree __read_mostly; |
176 | 146 | ||
147 | struct mem_cgroup_threshold { | ||
148 | struct eventfd_ctx *eventfd; | ||
149 | u64 threshold; | ||
150 | }; | ||
151 | |||
152 | struct mem_cgroup_threshold_ary { | ||
153 | /* An array index points to threshold just below usage. */ | ||
154 | atomic_t current_threshold; | ||
155 | /* Size of entries[] */ | ||
156 | unsigned int size; | ||
157 | /* Array of thresholds */ | ||
158 | struct mem_cgroup_threshold entries[0]; | ||
159 | }; | ||
160 | |||
161 | static void mem_cgroup_threshold(struct mem_cgroup *mem); | ||
162 | |||
177 | /* | 163 | /* |
178 | * The memory controller data structure. The memory controller controls both | 164 | * The memory controller data structure. The memory controller controls both |
179 | * page cache and RSS per cgroup. We would eventually like to provide | 165 | * page cache and RSS per cgroup. We would eventually like to provide |
@@ -217,7 +203,7 @@ struct mem_cgroup { | |||
217 | * Should the accounting and control be hierarchical, per subtree? | 203 | * Should the accounting and control be hierarchical, per subtree? |
218 | */ | 204 | */ |
219 | bool use_hierarchy; | 205 | bool use_hierarchy; |
220 | unsigned long last_oom_jiffies; | 206 | atomic_t oom_lock; |
221 | atomic_t refcnt; | 207 | atomic_t refcnt; |
222 | 208 | ||
223 | unsigned int swappiness; | 209 | unsigned int swappiness; |
@@ -225,10 +211,48 @@ struct mem_cgroup { | |||
225 | /* set when res.limit == memsw.limit */ | 211 | /* set when res.limit == memsw.limit */ |
226 | bool memsw_is_minimum; | 212 | bool memsw_is_minimum; |
227 | 213 | ||
214 | /* protect arrays of thresholds */ | ||
215 | struct mutex thresholds_lock; | ||
216 | |||
217 | /* thresholds for memory usage. RCU-protected */ | ||
218 | struct mem_cgroup_threshold_ary *thresholds; | ||
219 | |||
220 | /* thresholds for mem+swap usage. RCU-protected */ | ||
221 | struct mem_cgroup_threshold_ary *memsw_thresholds; | ||
222 | |||
228 | /* | 223 | /* |
229 | * statistics. This must be placed at the end of memcg. | 224 | * Should we move charges of a task when a task is moved into this |
225 | * mem_cgroup ? And what type of charges should we move ? | ||
230 | */ | 226 | */ |
231 | struct mem_cgroup_stat stat; | 227 | unsigned long move_charge_at_immigrate; |
228 | |||
229 | /* | ||
230 | * percpu counter. | ||
231 | */ | ||
232 | struct mem_cgroup_stat_cpu *stat; | ||
233 | }; | ||
234 | |||
235 | /* Stuffs for move charges at task migration. */ | ||
236 | /* | ||
237 | * Types of charges to be moved. "move_charge_at_immitgrate" is treated as a | ||
238 | * left-shifted bitmap of these types. | ||
239 | */ | ||
240 | enum move_type { | ||
241 | MOVE_CHARGE_TYPE_ANON, /* private anonymous page and swap of it */ | ||
242 | NR_MOVE_TYPE, | ||
243 | }; | ||
244 | |||
245 | /* "mc" and its members are protected by cgroup_mutex */ | ||
246 | static struct move_charge_struct { | ||
247 | struct mem_cgroup *from; | ||
248 | struct mem_cgroup *to; | ||
249 | unsigned long precharge; | ||
250 | unsigned long moved_charge; | ||
251 | unsigned long moved_swap; | ||
252 | struct task_struct *moving_task; /* a task moving charges */ | ||
253 | wait_queue_head_t waitq; /* a waitq for other context */ | ||
254 | } mc = { | ||
255 | .waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq), | ||
232 | }; | 256 | }; |
233 | 257 | ||
234 | /* | 258 | /* |
@@ -371,23 +395,6 @@ mem_cgroup_remove_exceeded(struct mem_cgroup *mem, | |||
371 | spin_unlock(&mctz->lock); | 395 | spin_unlock(&mctz->lock); |
372 | } | 396 | } |
373 | 397 | ||
374 | static bool mem_cgroup_soft_limit_check(struct mem_cgroup *mem) | ||
375 | { | ||
376 | bool ret = false; | ||
377 | int cpu; | ||
378 | s64 val; | ||
379 | struct mem_cgroup_stat_cpu *cpustat; | ||
380 | |||
381 | cpu = get_cpu(); | ||
382 | cpustat = &mem->stat.cpustat[cpu]; | ||
383 | val = __mem_cgroup_stat_read_local(cpustat, MEM_CGROUP_STAT_EVENTS); | ||
384 | if (unlikely(val > SOFTLIMIT_EVENTS_THRESH)) { | ||
385 | __mem_cgroup_stat_reset_safe(cpustat, MEM_CGROUP_STAT_EVENTS); | ||
386 | ret = true; | ||
387 | } | ||
388 | put_cpu(); | ||
389 | return ret; | ||
390 | } | ||
391 | 398 | ||
392 | static void mem_cgroup_update_tree(struct mem_cgroup *mem, struct page *page) | 399 | static void mem_cgroup_update_tree(struct mem_cgroup *mem, struct page *page) |
393 | { | 400 | { |
@@ -481,17 +488,31 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) | |||
481 | return mz; | 488 | return mz; |
482 | } | 489 | } |
483 | 490 | ||
491 | static s64 mem_cgroup_read_stat(struct mem_cgroup *mem, | ||
492 | enum mem_cgroup_stat_index idx) | ||
493 | { | ||
494 | int cpu; | ||
495 | s64 val = 0; | ||
496 | |||
497 | for_each_possible_cpu(cpu) | ||
498 | val += per_cpu(mem->stat->count[idx], cpu); | ||
499 | return val; | ||
500 | } | ||
501 | |||
502 | static s64 mem_cgroup_local_usage(struct mem_cgroup *mem) | ||
503 | { | ||
504 | s64 ret; | ||
505 | |||
506 | ret = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS); | ||
507 | ret += mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE); | ||
508 | return ret; | ||
509 | } | ||
510 | |||
484 | static void mem_cgroup_swap_statistics(struct mem_cgroup *mem, | 511 | static void mem_cgroup_swap_statistics(struct mem_cgroup *mem, |
485 | bool charge) | 512 | bool charge) |
486 | { | 513 | { |
487 | int val = (charge) ? 1 : -1; | 514 | int val = (charge) ? 1 : -1; |
488 | struct mem_cgroup_stat *stat = &mem->stat; | 515 | this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_SWAPOUT], val); |
489 | struct mem_cgroup_stat_cpu *cpustat; | ||
490 | int cpu = get_cpu(); | ||
491 | |||
492 | cpustat = &stat->cpustat[cpu]; | ||
493 | __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_SWAPOUT, val); | ||
494 | put_cpu(); | ||
495 | } | 516 | } |
496 | 517 | ||
497 | static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, | 518 | static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, |
@@ -499,24 +520,21 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, | |||
499 | bool charge) | 520 | bool charge) |
500 | { | 521 | { |
501 | int val = (charge) ? 1 : -1; | 522 | int val = (charge) ? 1 : -1; |
502 | struct mem_cgroup_stat *stat = &mem->stat; | ||
503 | struct mem_cgroup_stat_cpu *cpustat; | ||
504 | int cpu = get_cpu(); | ||
505 | 523 | ||
506 | cpustat = &stat->cpustat[cpu]; | 524 | preempt_disable(); |
525 | |||
507 | if (PageCgroupCache(pc)) | 526 | if (PageCgroupCache(pc)) |
508 | __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_CACHE, val); | 527 | __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_CACHE], val); |
509 | else | 528 | else |
510 | __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_RSS, val); | 529 | __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_RSS], val); |
511 | 530 | ||
512 | if (charge) | 531 | if (charge) |
513 | __mem_cgroup_stat_add_safe(cpustat, | 532 | __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGIN_COUNT]); |
514 | MEM_CGROUP_STAT_PGPGIN_COUNT, 1); | ||
515 | else | 533 | else |
516 | __mem_cgroup_stat_add_safe(cpustat, | 534 | __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGOUT_COUNT]); |
517 | MEM_CGROUP_STAT_PGPGOUT_COUNT, 1); | 535 | __this_cpu_inc(mem->stat->count[MEM_CGROUP_EVENTS]); |
518 | __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_EVENTS, 1); | 536 | |
519 | put_cpu(); | 537 | preempt_enable(); |
520 | } | 538 | } |
521 | 539 | ||
522 | static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem, | 540 | static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem, |
@@ -534,6 +552,29 @@ static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem, | |||
534 | return total; | 552 | return total; |
535 | } | 553 | } |
536 | 554 | ||
555 | static bool __memcg_event_check(struct mem_cgroup *mem, int event_mask_shift) | ||
556 | { | ||
557 | s64 val; | ||
558 | |||
559 | val = this_cpu_read(mem->stat->count[MEM_CGROUP_EVENTS]); | ||
560 | |||
561 | return !(val & ((1 << event_mask_shift) - 1)); | ||
562 | } | ||
563 | |||
564 | /* | ||
565 | * Check events in order. | ||
566 | * | ||
567 | */ | ||
568 | static void memcg_check_events(struct mem_cgroup *mem, struct page *page) | ||
569 | { | ||
570 | /* threshold event is triggered in finer grain than soft limit */ | ||
571 | if (unlikely(__memcg_event_check(mem, THRESHOLDS_EVENTS_THRESH))) { | ||
572 | mem_cgroup_threshold(mem); | ||
573 | if (unlikely(__memcg_event_check(mem, SOFTLIMIT_EVENTS_THRESH))) | ||
574 | mem_cgroup_update_tree(mem, page); | ||
575 | } | ||
576 | } | ||
577 | |||
537 | static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont) | 578 | static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont) |
538 | { | 579 | { |
539 | return container_of(cgroup_subsys_state(cont, | 580 | return container_of(cgroup_subsys_state(cont, |
@@ -1000,7 +1041,7 @@ static int mem_cgroup_count_children_cb(struct mem_cgroup *mem, void *data) | |||
1000 | } | 1041 | } |
1001 | 1042 | ||
1002 | /** | 1043 | /** |
1003 | * mem_cgroup_print_mem_info: Called from OOM with tasklist_lock held in read mode. | 1044 | * mem_cgroup_print_oom_info: Called from OOM with tasklist_lock held in read mode. |
1004 | * @memcg: The memory cgroup that went over limit | 1045 | * @memcg: The memory cgroup that went over limit |
1005 | * @p: Task that is going to be killed | 1046 | * @p: Task that is going to be killed |
1006 | * | 1047 | * |
@@ -1174,7 +1215,7 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, | |||
1174 | } | 1215 | } |
1175 | } | 1216 | } |
1176 | } | 1217 | } |
1177 | if (!mem_cgroup_local_usage(&victim->stat)) { | 1218 | if (!mem_cgroup_local_usage(victim)) { |
1178 | /* this cgroup's local usage == 0 */ | 1219 | /* this cgroup's local usage == 0 */ |
1179 | css_put(&victim->css); | 1220 | css_put(&victim->css); |
1180 | continue; | 1221 | continue; |
@@ -1205,32 +1246,102 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, | |||
1205 | return total; | 1246 | return total; |
1206 | } | 1247 | } |
1207 | 1248 | ||
1208 | bool mem_cgroup_oom_called(struct task_struct *task) | 1249 | static int mem_cgroup_oom_lock_cb(struct mem_cgroup *mem, void *data) |
1209 | { | 1250 | { |
1210 | bool ret = false; | 1251 | int *val = (int *)data; |
1211 | struct mem_cgroup *mem; | 1252 | int x; |
1212 | struct mm_struct *mm; | 1253 | /* |
1254 | * Logically, we can stop scanning immediately when we find | ||
1255 | * a memcg is already locked. But condidering unlock ops and | ||
1256 | * creation/removal of memcg, scan-all is simple operation. | ||
1257 | */ | ||
1258 | x = atomic_inc_return(&mem->oom_lock); | ||
1259 | *val = max(x, *val); | ||
1260 | return 0; | ||
1261 | } | ||
1262 | /* | ||
1263 | * Check OOM-Killer is already running under our hierarchy. | ||
1264 | * If someone is running, return false. | ||
1265 | */ | ||
1266 | static bool mem_cgroup_oom_lock(struct mem_cgroup *mem) | ||
1267 | { | ||
1268 | int lock_count = 0; | ||
1213 | 1269 | ||
1214 | rcu_read_lock(); | 1270 | mem_cgroup_walk_tree(mem, &lock_count, mem_cgroup_oom_lock_cb); |
1215 | mm = task->mm; | 1271 | |
1216 | if (!mm) | 1272 | if (lock_count == 1) |
1217 | mm = &init_mm; | 1273 | return true; |
1218 | mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); | 1274 | return false; |
1219 | if (mem && time_before(jiffies, mem->last_oom_jiffies + HZ/10)) | ||
1220 | ret = true; | ||
1221 | rcu_read_unlock(); | ||
1222 | return ret; | ||
1223 | } | 1275 | } |
1224 | 1276 | ||
1225 | static int record_last_oom_cb(struct mem_cgroup *mem, void *data) | 1277 | static int mem_cgroup_oom_unlock_cb(struct mem_cgroup *mem, void *data) |
1226 | { | 1278 | { |
1227 | mem->last_oom_jiffies = jiffies; | 1279 | /* |
1280 | * When a new child is created while the hierarchy is under oom, | ||
1281 | * mem_cgroup_oom_lock() may not be called. We have to use | ||
1282 | * atomic_add_unless() here. | ||
1283 | */ | ||
1284 | atomic_add_unless(&mem->oom_lock, -1, 0); | ||
1228 | return 0; | 1285 | return 0; |
1229 | } | 1286 | } |
1230 | 1287 | ||
1231 | static void record_last_oom(struct mem_cgroup *mem) | 1288 | static void mem_cgroup_oom_unlock(struct mem_cgroup *mem) |
1232 | { | 1289 | { |
1233 | mem_cgroup_walk_tree(mem, NULL, record_last_oom_cb); | 1290 | mem_cgroup_walk_tree(mem, NULL, mem_cgroup_oom_unlock_cb); |
1291 | } | ||
1292 | |||
1293 | static DEFINE_MUTEX(memcg_oom_mutex); | ||
1294 | static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq); | ||
1295 | |||
1296 | /* | ||
1297 | * try to call OOM killer. returns false if we should exit memory-reclaim loop. | ||
1298 | */ | ||
1299 | bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask) | ||
1300 | { | ||
1301 | DEFINE_WAIT(wait); | ||
1302 | bool locked; | ||
1303 | |||
1304 | /* At first, try to OOM lock hierarchy under mem.*/ | ||
1305 | mutex_lock(&memcg_oom_mutex); | ||
1306 | locked = mem_cgroup_oom_lock(mem); | ||
1307 | /* | ||
1308 | * Even if signal_pending(), we can't quit charge() loop without | ||
1309 | * accounting. So, UNINTERRUPTIBLE is appropriate. But SIGKILL | ||
1310 | * under OOM is always welcomed, use TASK_KILLABLE here. | ||
1311 | */ | ||
1312 | if (!locked) | ||
1313 | prepare_to_wait(&memcg_oom_waitq, &wait, TASK_KILLABLE); | ||
1314 | mutex_unlock(&memcg_oom_mutex); | ||
1315 | |||
1316 | if (locked) | ||
1317 | mem_cgroup_out_of_memory(mem, mask); | ||
1318 | else { | ||
1319 | schedule(); | ||
1320 | finish_wait(&memcg_oom_waitq, &wait); | ||
1321 | } | ||
1322 | mutex_lock(&memcg_oom_mutex); | ||
1323 | mem_cgroup_oom_unlock(mem); | ||
1324 | /* | ||
1325 | * Here, we use global waitq .....more fine grained waitq ? | ||
1326 | * Assume following hierarchy. | ||
1327 | * A/ | ||
1328 | * 01 | ||
1329 | * 02 | ||
1330 | * assume OOM happens both in A and 01 at the same time. Tthey are | ||
1331 | * mutually exclusive by lock. (kill in 01 helps A.) | ||
1332 | * When we use per memcg waitq, we have to wake up waiters on A and 02 | ||
1333 | * in addtion to waiters on 01. We use global waitq for avoiding mess. | ||
1334 | * It will not be a big problem. | ||
1335 | * (And a task may be moved to other groups while it's waiting for OOM.) | ||
1336 | */ | ||
1337 | wake_up_all(&memcg_oom_waitq); | ||
1338 | mutex_unlock(&memcg_oom_mutex); | ||
1339 | |||
1340 | if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current)) | ||
1341 | return false; | ||
1342 | /* Give chance to dying process */ | ||
1343 | schedule_timeout(1); | ||
1344 | return true; | ||
1234 | } | 1345 | } |
1235 | 1346 | ||
1236 | /* | 1347 | /* |
@@ -1240,9 +1351,6 @@ static void record_last_oom(struct mem_cgroup *mem) | |||
1240 | void mem_cgroup_update_file_mapped(struct page *page, int val) | 1351 | void mem_cgroup_update_file_mapped(struct page *page, int val) |
1241 | { | 1352 | { |
1242 | struct mem_cgroup *mem; | 1353 | struct mem_cgroup *mem; |
1243 | struct mem_cgroup_stat *stat; | ||
1244 | struct mem_cgroup_stat_cpu *cpustat; | ||
1245 | int cpu; | ||
1246 | struct page_cgroup *pc; | 1354 | struct page_cgroup *pc; |
1247 | 1355 | ||
1248 | pc = lookup_page_cgroup(page); | 1356 | pc = lookup_page_cgroup(page); |
@@ -1258,13 +1366,10 @@ void mem_cgroup_update_file_mapped(struct page *page, int val) | |||
1258 | goto done; | 1366 | goto done; |
1259 | 1367 | ||
1260 | /* | 1368 | /* |
1261 | * Preemption is already disabled, we don't need get_cpu() | 1369 | * Preemption is already disabled. We can use __this_cpu_xxx |
1262 | */ | 1370 | */ |
1263 | cpu = smp_processor_id(); | 1371 | __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], val); |
1264 | stat = &mem->stat; | ||
1265 | cpustat = &stat->cpustat[cpu]; | ||
1266 | 1372 | ||
1267 | __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_FILE_MAPPED, val); | ||
1268 | done: | 1373 | done: |
1269 | unlock_page_cgroup(pc); | 1374 | unlock_page_cgroup(pc); |
1270 | } | 1375 | } |
@@ -1401,19 +1506,21 @@ static int __cpuinit memcg_stock_cpu_callback(struct notifier_block *nb, | |||
1401 | * oom-killer can be invoked. | 1506 | * oom-killer can be invoked. |
1402 | */ | 1507 | */ |
1403 | static int __mem_cgroup_try_charge(struct mm_struct *mm, | 1508 | static int __mem_cgroup_try_charge(struct mm_struct *mm, |
1404 | gfp_t gfp_mask, struct mem_cgroup **memcg, | 1509 | gfp_t gfp_mask, struct mem_cgroup **memcg, bool oom) |
1405 | bool oom, struct page *page) | ||
1406 | { | 1510 | { |
1407 | struct mem_cgroup *mem, *mem_over_limit; | 1511 | struct mem_cgroup *mem, *mem_over_limit; |
1408 | int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; | 1512 | int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; |
1409 | struct res_counter *fail_res; | 1513 | struct res_counter *fail_res; |
1410 | int csize = CHARGE_SIZE; | 1514 | int csize = CHARGE_SIZE; |
1411 | 1515 | ||
1412 | if (unlikely(test_thread_flag(TIF_MEMDIE))) { | 1516 | /* |
1413 | /* Don't account this! */ | 1517 | * Unlike gloval-vm's OOM-kill, we're not in memory shortage |
1414 | *memcg = NULL; | 1518 | * in system level. So, allow to go ahead dying process in addition to |
1415 | return 0; | 1519 | * MEMDIE process. |
1416 | } | 1520 | */ |
1521 | if (unlikely(test_thread_flag(TIF_MEMDIE) | ||
1522 | || fatal_signal_pending(current))) | ||
1523 | goto bypass; | ||
1417 | 1524 | ||
1418 | /* | 1525 | /* |
1419 | * We always charge the cgroup the mm_struct belongs to. | 1526 | * We always charge the cgroup the mm_struct belongs to. |
@@ -1440,7 +1547,7 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, | |||
1440 | unsigned long flags = 0; | 1547 | unsigned long flags = 0; |
1441 | 1548 | ||
1442 | if (consume_stock(mem)) | 1549 | if (consume_stock(mem)) |
1443 | goto charged; | 1550 | goto done; |
1444 | 1551 | ||
1445 | ret = res_counter_charge(&mem->res, csize, &fail_res); | 1552 | ret = res_counter_charge(&mem->res, csize, &fail_res); |
1446 | if (likely(!ret)) { | 1553 | if (likely(!ret)) { |
@@ -1483,28 +1590,70 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, | |||
1483 | if (mem_cgroup_check_under_limit(mem_over_limit)) | 1590 | if (mem_cgroup_check_under_limit(mem_over_limit)) |
1484 | continue; | 1591 | continue; |
1485 | 1592 | ||
1593 | /* try to avoid oom while someone is moving charge */ | ||
1594 | if (mc.moving_task && current != mc.moving_task) { | ||
1595 | struct mem_cgroup *from, *to; | ||
1596 | bool do_continue = false; | ||
1597 | /* | ||
1598 | * There is a small race that "from" or "to" can be | ||
1599 | * freed by rmdir, so we use css_tryget(). | ||
1600 | */ | ||
1601 | rcu_read_lock(); | ||
1602 | from = mc.from; | ||
1603 | to = mc.to; | ||
1604 | if (from && css_tryget(&from->css)) { | ||
1605 | if (mem_over_limit->use_hierarchy) | ||
1606 | do_continue = css_is_ancestor( | ||
1607 | &from->css, | ||
1608 | &mem_over_limit->css); | ||
1609 | else | ||
1610 | do_continue = (from == mem_over_limit); | ||
1611 | css_put(&from->css); | ||
1612 | } | ||
1613 | if (!do_continue && to && css_tryget(&to->css)) { | ||
1614 | if (mem_over_limit->use_hierarchy) | ||
1615 | do_continue = css_is_ancestor( | ||
1616 | &to->css, | ||
1617 | &mem_over_limit->css); | ||
1618 | else | ||
1619 | do_continue = (to == mem_over_limit); | ||
1620 | css_put(&to->css); | ||
1621 | } | ||
1622 | rcu_read_unlock(); | ||
1623 | if (do_continue) { | ||
1624 | DEFINE_WAIT(wait); | ||
1625 | prepare_to_wait(&mc.waitq, &wait, | ||
1626 | TASK_INTERRUPTIBLE); | ||
1627 | /* moving charge context might have finished. */ | ||
1628 | if (mc.moving_task) | ||
1629 | schedule(); | ||
1630 | finish_wait(&mc.waitq, &wait); | ||
1631 | continue; | ||
1632 | } | ||
1633 | } | ||
1634 | |||
1486 | if (!nr_retries--) { | 1635 | if (!nr_retries--) { |
1487 | if (oom) { | 1636 | if (!oom) |
1488 | mem_cgroup_out_of_memory(mem_over_limit, gfp_mask); | 1637 | goto nomem; |
1489 | record_last_oom(mem_over_limit); | 1638 | if (mem_cgroup_handle_oom(mem_over_limit, gfp_mask)) { |
1639 | nr_retries = MEM_CGROUP_RECLAIM_RETRIES; | ||
1640 | continue; | ||
1490 | } | 1641 | } |
1491 | goto nomem; | 1642 | /* When we reach here, the current task is dying. */ |
1643 | css_put(&mem->css); | ||
1644 | goto bypass; | ||
1492 | } | 1645 | } |
1493 | } | 1646 | } |
1494 | if (csize > PAGE_SIZE) | 1647 | if (csize > PAGE_SIZE) |
1495 | refill_stock(mem, csize - PAGE_SIZE); | 1648 | refill_stock(mem, csize - PAGE_SIZE); |
1496 | charged: | ||
1497 | /* | ||
1498 | * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree. | ||
1499 | * if they exceeds softlimit. | ||
1500 | */ | ||
1501 | if (mem_cgroup_soft_limit_check(mem)) | ||
1502 | mem_cgroup_update_tree(mem, page); | ||
1503 | done: | 1649 | done: |
1504 | return 0; | 1650 | return 0; |
1505 | nomem: | 1651 | nomem: |
1506 | css_put(&mem->css); | 1652 | css_put(&mem->css); |
1507 | return -ENOMEM; | 1653 | return -ENOMEM; |
1654 | bypass: | ||
1655 | *memcg = NULL; | ||
1656 | return 0; | ||
1508 | } | 1657 | } |
1509 | 1658 | ||
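
The reworked __mem_cgroup_try_charge() above now has three ways out: "done" when the charge succeeds, "nomem" when reclaim and the OOM handler cannot help, and the new "bypass" label that waves dying tasks through without accounting. Below is a condensed user-space sketch of that control flow; the counter struct, the reclaim stub and the retry count are illustrative stand-ins, not the res_counter API.

    #include <stdbool.h>
    #include <stdio.h>

    #define PAGE    4096
    #define RETRIES 5

    struct counter { long usage, limit; };

    /* stand-in for res_counter_charge(): fail if the limit would be exceeded */
    static bool charge(struct counter *c, long bytes)
    {
        if (c->usage + bytes > c->limit)
            return false;
        c->usage += bytes;
        return true;
    }

    /* stand-in for memory reclaim; pretend nothing could be freed */
    static long reclaim(struct counter *c) { (void)c; return 0; }

    /* 0: charged, 0 with *bypassed set: dying task let through, -1: -ENOMEM */
    static int try_charge(struct counter *c, bool dying, bool *bypassed)
    {
        *bypassed = false;
        if (dying) {                    /* TIF_MEMDIE or fatal signal pending */
            *bypassed = true;           /* the "bypass" label: don't account */
            return 0;
        }
        for (int retry = RETRIES; ; retry--) {
            if (charge(c, PAGE))
                return 0;               /* "done" */
            if (reclaim(c) > 0)
                continue;               /* made progress, try again */
            if (!retry)
                return -1;              /* "nomem" */
        }
    }

    int main(void)
    {
        struct counter memcg = { .usage = 0, .limit = 2 * PAGE };
        bool bypassed;

        printf("%d\n", try_charge(&memcg, false, &bypassed));  /* 0 */
        printf("%d\n", try_charge(&memcg, false, &bypassed));  /* 0 */
        printf("%d\n", try_charge(&memcg, false, &bypassed));  /* -1, over limit */
        int r = try_charge(&memcg, true, &bypassed);            /* dying task */
        printf("%d %d\n", r, bypassed);                          /* 0 1 */
        return 0;
    }
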
1510 | /* | 1659 | /* |
@@ -1512,14 +1661,23 @@ nomem: | |||
1512 | * This function is for that and do uncharge, put css's refcnt. | 1661 | * This function is for that and do uncharge, put css's refcnt. |
1513 | * gotten by try_charge(). | 1662 | * gotten by try_charge(). |
1514 | */ | 1663 | */ |
1515 | static void mem_cgroup_cancel_charge(struct mem_cgroup *mem) | 1664 | static void __mem_cgroup_cancel_charge(struct mem_cgroup *mem, |
1665 | unsigned long count) | ||
1516 | { | 1666 | { |
1517 | if (!mem_cgroup_is_root(mem)) { | 1667 | if (!mem_cgroup_is_root(mem)) { |
1518 | res_counter_uncharge(&mem->res, PAGE_SIZE); | 1668 | res_counter_uncharge(&mem->res, PAGE_SIZE * count); |
1519 | if (do_swap_account) | 1669 | if (do_swap_account) |
1520 | res_counter_uncharge(&mem->memsw, PAGE_SIZE); | 1670 | res_counter_uncharge(&mem->memsw, PAGE_SIZE * count); |
1671 | VM_BUG_ON(test_bit(CSS_ROOT, &mem->css.flags)); | ||
1672 | WARN_ON_ONCE(count > INT_MAX); | ||
1673 | __css_put(&mem->css, (int)count); | ||
1521 | } | 1674 | } |
1522 | css_put(&mem->css); | 1675 | /* we don't need css_put for root */ |
1676 | } | ||
1677 | |||
1678 | static void mem_cgroup_cancel_charge(struct mem_cgroup *mem) | ||
1679 | { | ||
1680 | __mem_cgroup_cancel_charge(mem, 1); | ||
1523 | } | 1681 | } |
1524 | 1682 | ||
1525 | /* | 1683 | /* |
@@ -1615,6 +1773,12 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, | |||
1615 | mem_cgroup_charge_statistics(mem, pc, true); | 1773 | mem_cgroup_charge_statistics(mem, pc, true); |
1616 | 1774 | ||
1617 | unlock_page_cgroup(pc); | 1775 | unlock_page_cgroup(pc); |
1776 | /* | ||
1777 | * "charge_statistics" updated event counter. Then, check it. | ||
1778 | * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree. | ||
1779 | * if they exceeds softlimit. | ||
1780 | */ | ||
1781 | memcg_check_events(mem, pc->page); | ||
1618 | } | 1782 | } |
1619 | 1783 | ||
1620 | /** | 1784 | /** |
@@ -1622,22 +1786,22 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, | |||
1622 | * @pc: page_cgroup of the page. | 1786 | * @pc: page_cgroup of the page. |
1623 | * @from: mem_cgroup which the page is moved from. | 1787 | * @from: mem_cgroup which the page is moved from. |
1624 | * @to: mem_cgroup which the page is moved to. @from != @to. | 1788 | * @to: mem_cgroup which the page is moved to. @from != @to. |
1789 | * @uncharge: whether we should call uncharge and css_put against @from. | ||
1625 | * | 1790 | * |
1626 | * The caller must confirm following. | 1791 | * The caller must confirm following. |
1627 | * - page is not on LRU (isolate_page() is useful.) | 1792 | * - page is not on LRU (isolate_page() is useful.) |
1628 | * - the pc is locked, used, and ->mem_cgroup points to @from. | 1793 | * - the pc is locked, used, and ->mem_cgroup points to @from. |
1629 | * | 1794 | * |
1630 | * This function does "uncharge" from old cgroup but doesn't do "charge" to | 1795 | * This function doesn't do "charge" or css_get to the new cgroup. That |
1631 | * new cgroup. It should be done by a caller. | 1796 | * should be done by the caller (__mem_cgroup_try_charge would be useful). |
1797 | * If @uncharge is true, this function does "uncharge" from the old cgroup; | ||
1798 | * if @uncharge is false, the caller should do the "uncharge". | ||
1632 | */ | 1799 | */ |
1633 | 1800 | ||
1634 | static void __mem_cgroup_move_account(struct page_cgroup *pc, | 1801 | static void __mem_cgroup_move_account(struct page_cgroup *pc, |
1635 | struct mem_cgroup *from, struct mem_cgroup *to) | 1802 | struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge) |
1636 | { | 1803 | { |
1637 | struct page *page; | 1804 | struct page *page; |
1638 | int cpu; | ||
1639 | struct mem_cgroup_stat *stat; | ||
1640 | struct mem_cgroup_stat_cpu *cpustat; | ||
1641 | 1805 | ||
1642 | VM_BUG_ON(from == to); | 1806 | VM_BUG_ON(from == to); |
1643 | VM_BUG_ON(PageLRU(pc->page)); | 1807 | VM_BUG_ON(PageLRU(pc->page)); |
@@ -1645,38 +1809,28 @@ static void __mem_cgroup_move_account(struct page_cgroup *pc, | |||
1645 | VM_BUG_ON(!PageCgroupUsed(pc)); | 1809 | VM_BUG_ON(!PageCgroupUsed(pc)); |
1646 | VM_BUG_ON(pc->mem_cgroup != from); | 1810 | VM_BUG_ON(pc->mem_cgroup != from); |
1647 | 1811 | ||
1648 | if (!mem_cgroup_is_root(from)) | ||
1649 | res_counter_uncharge(&from->res, PAGE_SIZE); | ||
1650 | mem_cgroup_charge_statistics(from, pc, false); | ||
1651 | |||
1652 | page = pc->page; | 1812 | page = pc->page; |
1653 | if (page_mapped(page) && !PageAnon(page)) { | 1813 | if (page_mapped(page) && !PageAnon(page)) { |
1654 | cpu = smp_processor_id(); | 1814 | /* Update mapped_file data for mem_cgroup */ |
1655 | /* Update mapped_file data for mem_cgroup "from" */ | 1815 | preempt_disable(); |
1656 | stat = &from->stat; | 1816 | __this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); |
1657 | cpustat = &stat->cpustat[cpu]; | 1817 | __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); |
1658 | __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_FILE_MAPPED, | 1818 | preempt_enable(); |
1659 | -1); | ||
1660 | |||
1661 | /* Update mapped_file data for mem_cgroup "to" */ | ||
1662 | stat = &to->stat; | ||
1663 | cpustat = &stat->cpustat[cpu]; | ||
1664 | __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_FILE_MAPPED, | ||
1665 | 1); | ||
1666 | } | 1819 | } |
1820 | mem_cgroup_charge_statistics(from, pc, false); | ||
1821 | if (uncharge) | ||
1822 | /* This is not "cancel", but cancel_charge does all we need. */ | ||
1823 | mem_cgroup_cancel_charge(from); | ||
1667 | 1824 | ||
1668 | if (do_swap_account && !mem_cgroup_is_root(from)) | 1825 | /* caller should have done css_get */ |
1669 | res_counter_uncharge(&from->memsw, PAGE_SIZE); | ||
1670 | css_put(&from->css); | ||
1671 | |||
1672 | css_get(&to->css); | ||
1673 | pc->mem_cgroup = to; | 1826 | pc->mem_cgroup = to; |
1674 | mem_cgroup_charge_statistics(to, pc, true); | 1827 | mem_cgroup_charge_statistics(to, pc, true); |
1675 | /* | 1828 | /* |
1676 | * We charge against "to" which may not have any tasks. Then, "to" | 1829 | * We charge against "to" which may not have any tasks. Then, "to" |
1677 | * can be under rmdir(). But in current implementation, caller of | 1830 | * can be under rmdir(). But in current implementation, caller of |
1678 | * this function is just force_empty() and it's garanteed that | 1831 | * this function is just force_empty() and move charge, so it's |
1679 | * "to" is never removed. So, we don't check rmdir status here. | 1832 | * garanteed that "to" is never removed. So, we don't check rmdir |
1833 | * status here. | ||
1680 | */ | 1834 | */ |
1681 | } | 1835 | } |
1682 | 1836 | ||
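
__mem_cgroup_move_account() now transfers the FILE_MAPPED statistic with a __this_cpu_dec()/__this_cpu_inc() pair and only gives the charge back to the source group when @uncharge is true; the destination is assumed to have been charged by the caller. A toy version of "move the accounting of one page from one group to another" under those rules follows (plain C, illustrative field names, no locking).

    #include <stdbool.h>
    #include <stdio.h>

    #define PAGE 4096

    struct group {
        const char *name;
        long usage;             /* bytes charged, like res->usage */
        long file_mapped;       /* per-group statistic */
    };

    /* move accounting of one mapped file page from @from to @to */
    static void move_account(struct group *from, struct group *to,
                             bool page_is_mapped_file, bool uncharge)
    {
        if (page_is_mapped_file) {
            from->file_mapped--;        /* __this_cpu_dec() in the kernel */
            to->file_mapped++;          /* __this_cpu_inc() in the kernel */
        }
        if (uncharge)
            from->usage -= PAGE;        /* mem_cgroup_cancel_charge(from) */
        /* the caller is expected to have charged @to already */
    }

    int main(void)
    {
        struct group a = { "A", PAGE, 1 };
        struct group b = { "B", PAGE, 0 };      /* caller precharged B */

        move_account(&a, &b, true, true);
        printf("%s: usage=%ld mapped=%ld\n", a.name, a.usage, a.file_mapped);
        printf("%s: usage=%ld mapped=%ld\n", b.name, b.usage, b.file_mapped);
        return 0;
    }
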
@@ -1685,15 +1839,20 @@ static void __mem_cgroup_move_account(struct page_cgroup *pc, | |||
1685 | * __mem_cgroup_move_account() | 1839 | * __mem_cgroup_move_account() |
1686 | */ | 1840 | */ |
1687 | static int mem_cgroup_move_account(struct page_cgroup *pc, | 1841 | static int mem_cgroup_move_account(struct page_cgroup *pc, |
1688 | struct mem_cgroup *from, struct mem_cgroup *to) | 1842 | struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge) |
1689 | { | 1843 | { |
1690 | int ret = -EINVAL; | 1844 | int ret = -EINVAL; |
1691 | lock_page_cgroup(pc); | 1845 | lock_page_cgroup(pc); |
1692 | if (PageCgroupUsed(pc) && pc->mem_cgroup == from) { | 1846 | if (PageCgroupUsed(pc) && pc->mem_cgroup == from) { |
1693 | __mem_cgroup_move_account(pc, from, to); | 1847 | __mem_cgroup_move_account(pc, from, to, uncharge); |
1694 | ret = 0; | 1848 | ret = 0; |
1695 | } | 1849 | } |
1696 | unlock_page_cgroup(pc); | 1850 | unlock_page_cgroup(pc); |
1851 | /* | ||
1852 | * check events | ||
1853 | */ | ||
1854 | memcg_check_events(to, pc->page); | ||
1855 | memcg_check_events(from, pc->page); | ||
1697 | return ret; | 1856 | return ret; |
1698 | } | 1857 | } |
1699 | 1858 | ||
@@ -1722,15 +1881,13 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc, | |||
1722 | goto put; | 1881 | goto put; |
1723 | 1882 | ||
1724 | parent = mem_cgroup_from_cont(pcg); | 1883 | parent = mem_cgroup_from_cont(pcg); |
1725 | ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false, page); | 1884 | ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false); |
1726 | if (ret || !parent) | 1885 | if (ret || !parent) |
1727 | goto put_back; | 1886 | goto put_back; |
1728 | 1887 | ||
1729 | ret = mem_cgroup_move_account(pc, child, parent); | 1888 | ret = mem_cgroup_move_account(pc, child, parent, true); |
1730 | if (!ret) | 1889 | if (ret) |
1731 | css_put(&parent->css); /* drop extra refcnt by try_charge() */ | 1890 | mem_cgroup_cancel_charge(parent); |
1732 | else | ||
1733 | mem_cgroup_cancel_charge(parent); /* does css_put */ | ||
1734 | put_back: | 1891 | put_back: |
1735 | putback_lru_page(page); | 1892 | putback_lru_page(page); |
1736 | put: | 1893 | put: |
@@ -1760,7 +1917,7 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, | |||
1760 | prefetchw(pc); | 1917 | prefetchw(pc); |
1761 | 1918 | ||
1762 | mem = memcg; | 1919 | mem = memcg; |
1763 | ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true, page); | 1920 | ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true); |
1764 | if (ret || !mem) | 1921 | if (ret || !mem) |
1765 | return ret; | 1922 | return ret; |
1766 | 1923 | ||
@@ -1880,14 +2037,14 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm, | |||
1880 | if (!mem) | 2037 | if (!mem) |
1881 | goto charge_cur_mm; | 2038 | goto charge_cur_mm; |
1882 | *ptr = mem; | 2039 | *ptr = mem; |
1883 | ret = __mem_cgroup_try_charge(NULL, mask, ptr, true, page); | 2040 | ret = __mem_cgroup_try_charge(NULL, mask, ptr, true); |
1884 | /* drop extra refcnt from tryget */ | 2041 | /* drop extra refcnt from tryget */ |
1885 | css_put(&mem->css); | 2042 | css_put(&mem->css); |
1886 | return ret; | 2043 | return ret; |
1887 | charge_cur_mm: | 2044 | charge_cur_mm: |
1888 | if (unlikely(!mm)) | 2045 | if (unlikely(!mm)) |
1889 | mm = &init_mm; | 2046 | mm = &init_mm; |
1890 | return __mem_cgroup_try_charge(mm, mask, ptr, true, page); | 2047 | return __mem_cgroup_try_charge(mm, mask, ptr, true); |
1891 | } | 2048 | } |
1892 | 2049 | ||
1893 | static void | 2050 | static void |
@@ -2064,8 +2221,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) | |||
2064 | mz = page_cgroup_zoneinfo(pc); | 2221 | mz = page_cgroup_zoneinfo(pc); |
2065 | unlock_page_cgroup(pc); | 2222 | unlock_page_cgroup(pc); |
2066 | 2223 | ||
2067 | if (mem_cgroup_soft_limit_check(mem)) | 2224 | memcg_check_events(mem, page); |
2068 | mem_cgroup_update_tree(mem, page); | ||
2069 | /* at swapout, this memcg will be accessed to record to swap */ | 2225 | /* at swapout, this memcg will be accessed to record to swap */ |
2070 | if (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT) | 2226 | if (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT) |
2071 | css_put(&mem->css); | 2227 | css_put(&mem->css); |
@@ -2192,6 +2348,64 @@ void mem_cgroup_uncharge_swap(swp_entry_t ent) | |||
2192 | } | 2348 | } |
2193 | rcu_read_unlock(); | 2349 | rcu_read_unlock(); |
2194 | } | 2350 | } |
2351 | |||
2352 | /** | ||
2353 | * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record. | ||
2354 | * @entry: swap entry to be moved | ||
2355 | * @from: mem_cgroup which the entry is moved from | ||
2356 | * @to: mem_cgroup which the entry is moved to | ||
2357 | * @need_fixup: whether we should fixup res_counters and refcounts. | ||
2358 | * | ||
2359 | * It succeeds only when the swap_cgroup's record for this entry is the same | ||
2360 | * as the mem_cgroup's id of @from. | ||
2361 | * | ||
2362 | * Returns 0 on success, -EINVAL on failure. | ||
2363 | * | ||
2364 | * The caller must have charged to @to, IOW, called res_counter_charge() about | ||
2365 | * both res and memsw, and called css_get(). | ||
2366 | */ | ||
2367 | static int mem_cgroup_move_swap_account(swp_entry_t entry, | ||
2368 | struct mem_cgroup *from, struct mem_cgroup *to, bool need_fixup) | ||
2369 | { | ||
2370 | unsigned short old_id, new_id; | ||
2371 | |||
2372 | old_id = css_id(&from->css); | ||
2373 | new_id = css_id(&to->css); | ||
2374 | |||
2375 | if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) { | ||
2376 | mem_cgroup_swap_statistics(from, false); | ||
2377 | mem_cgroup_swap_statistics(to, true); | ||
2378 | /* | ||
2379 | * This function is only called from task migration context now. | ||
2380 | * It postpones res_counter and refcount handling till the end | ||
2381 | * of task migration (mem_cgroup_clear_mc()) for performance | ||
2382 | * improvement. But we cannot postpone mem_cgroup_get(to) | ||
2383 | * because if the process that has been moved to @to does | ||
2384 | * swap-in, the refcount of @to might be decreased to 0. | ||
2385 | */ | ||
2386 | mem_cgroup_get(to); | ||
2387 | if (need_fixup) { | ||
2388 | if (!mem_cgroup_is_root(from)) | ||
2389 | res_counter_uncharge(&from->memsw, PAGE_SIZE); | ||
2390 | mem_cgroup_put(from); | ||
2391 | /* | ||
2392 | * we charged both to->res and to->memsw, so we should | ||
2393 | * uncharge to->res. | ||
2394 | */ | ||
2395 | if (!mem_cgroup_is_root(to)) | ||
2396 | res_counter_uncharge(&to->res, PAGE_SIZE); | ||
2397 | css_put(&to->css); | ||
2398 | } | ||
2399 | return 0; | ||
2400 | } | ||
2401 | return -EINVAL; | ||
2402 | } | ||
2403 | #else | ||
2404 | static inline int mem_cgroup_move_swap_account(swp_entry_t entry, | ||
2405 | struct mem_cgroup *from, struct mem_cgroup *to, bool need_fixup) | ||
2406 | { | ||
2407 | return -EINVAL; | ||
2408 | } | ||
2195 | #endif | 2409 | #endif |
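
mem_cgroup_move_swap_account() only relabels a swap entry if swap_cgroup_cmpxchg() confirms that the record still belongs to @from, so the hand-over happens only when nobody changed the record in the meantime. The same "claim it only if it is still mine" step is shown below in stand-alone C11, with the swap_cgroup record array reduced to one atomic id per slot; this is an illustration, not the kernel helper.

    #include <stdatomic.h>
    #include <stdio.h>

    /* one owner id per swap slot, like the swap_cgroup records */
    static _Atomic unsigned short owner[8];

    /* return the old id; the record changes only if it was still old_id */
    static unsigned short swap_cmpxchg(int slot, unsigned short old_id,
                                       unsigned short new_id)
    {
        unsigned short expected = old_id;
        atomic_compare_exchange_strong(&owner[slot], &expected, new_id);
        return expected;    /* == old_id on success, current owner otherwise */
    }

    static int move_swap_account(int slot, unsigned short from, unsigned short to)
    {
        if (swap_cmpxchg(slot, from, to) == from)
            return 0;       /* record now points at "to" */
        return -1;          /* somebody else owned it: -EINVAL in the kernel */
    }

    int main(void)
    {
        atomic_store(&owner[3], 5);
        printf("%d\n", move_swap_account(3, 5, 7));   /* 0: moved */
        printf("%d\n", move_swap_account(3, 5, 7));   /* -1: no longer owned by 5 */
        printf("owner=%u\n", (unsigned)atomic_load(&owner[3]));   /* 7 */
        return 0;
    }
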
2196 | 2410 | ||
2197 | /* | 2411 | /* |
@@ -2216,8 +2430,7 @@ int mem_cgroup_prepare_migration(struct page *page, struct mem_cgroup **ptr) | |||
2216 | unlock_page_cgroup(pc); | 2430 | unlock_page_cgroup(pc); |
2217 | 2431 | ||
2218 | if (mem) { | 2432 | if (mem) { |
2219 | ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false, | 2433 | ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false); |
2220 | page); | ||
2221 | css_put(&mem->css); | 2434 | css_put(&mem->css); |
2222 | } | 2435 | } |
2223 | *ptr = mem; | 2436 | *ptr = mem; |
@@ -2545,7 +2758,7 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *mem, | |||
2545 | pc = list_entry(list->prev, struct page_cgroup, lru); | 2758 | pc = list_entry(list->prev, struct page_cgroup, lru); |
2546 | if (busy == pc) { | 2759 | if (busy == pc) { |
2547 | list_move(&pc->lru, list); | 2760 | list_move(&pc->lru, list); |
2548 | busy = 0; | 2761 | busy = NULL; |
2549 | spin_unlock_irqrestore(&zone->lru_lock, flags); | 2762 | spin_unlock_irqrestore(&zone->lru_lock, flags); |
2550 | continue; | 2763 | continue; |
2551 | } | 2764 | } |
@@ -2704,7 +2917,7 @@ static int | |||
2704 | mem_cgroup_get_idx_stat(struct mem_cgroup *mem, void *data) | 2917 | mem_cgroup_get_idx_stat(struct mem_cgroup *mem, void *data) |
2705 | { | 2918 | { |
2706 | struct mem_cgroup_idx_data *d = data; | 2919 | struct mem_cgroup_idx_data *d = data; |
2707 | d->val += mem_cgroup_read_stat(&mem->stat, d->idx); | 2920 | d->val += mem_cgroup_read_stat(mem, d->idx); |
2708 | return 0; | 2921 | return 0; |
2709 | } | 2922 | } |
2710 | 2923 | ||
@@ -2719,40 +2932,50 @@ mem_cgroup_get_recursive_idx_stat(struct mem_cgroup *mem, | |||
2719 | *val = d.val; | 2932 | *val = d.val; |
2720 | } | 2933 | } |
2721 | 2934 | ||
2935 | static inline u64 mem_cgroup_usage(struct mem_cgroup *mem, bool swap) | ||
2936 | { | ||
2937 | u64 idx_val, val; | ||
2938 | |||
2939 | if (!mem_cgroup_is_root(mem)) { | ||
2940 | if (!swap) | ||
2941 | return res_counter_read_u64(&mem->res, RES_USAGE); | ||
2942 | else | ||
2943 | return res_counter_read_u64(&mem->memsw, RES_USAGE); | ||
2944 | } | ||
2945 | |||
2946 | mem_cgroup_get_recursive_idx_stat(mem, MEM_CGROUP_STAT_CACHE, &idx_val); | ||
2947 | val = idx_val; | ||
2948 | mem_cgroup_get_recursive_idx_stat(mem, MEM_CGROUP_STAT_RSS, &idx_val); | ||
2949 | val += idx_val; | ||
2950 | |||
2951 | if (swap) { | ||
2952 | mem_cgroup_get_recursive_idx_stat(mem, | ||
2953 | MEM_CGROUP_STAT_SWAPOUT, &idx_val); | ||
2954 | val += idx_val; | ||
2955 | } | ||
2956 | |||
2957 | return val << PAGE_SHIFT; | ||
2958 | } | ||
2959 | |||
2722 | static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft) | 2960 | static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft) |
2723 | { | 2961 | { |
2724 | struct mem_cgroup *mem = mem_cgroup_from_cont(cont); | 2962 | struct mem_cgroup *mem = mem_cgroup_from_cont(cont); |
2725 | u64 idx_val, val; | 2963 | u64 val; |
2726 | int type, name; | 2964 | int type, name; |
2727 | 2965 | ||
2728 | type = MEMFILE_TYPE(cft->private); | 2966 | type = MEMFILE_TYPE(cft->private); |
2729 | name = MEMFILE_ATTR(cft->private); | 2967 | name = MEMFILE_ATTR(cft->private); |
2730 | switch (type) { | 2968 | switch (type) { |
2731 | case _MEM: | 2969 | case _MEM: |
2732 | if (name == RES_USAGE && mem_cgroup_is_root(mem)) { | 2970 | if (name == RES_USAGE) |
2733 | mem_cgroup_get_recursive_idx_stat(mem, | 2971 | val = mem_cgroup_usage(mem, false); |
2734 | MEM_CGROUP_STAT_CACHE, &idx_val); | 2972 | else |
2735 | val = idx_val; | ||
2736 | mem_cgroup_get_recursive_idx_stat(mem, | ||
2737 | MEM_CGROUP_STAT_RSS, &idx_val); | ||
2738 | val += idx_val; | ||
2739 | val <<= PAGE_SHIFT; | ||
2740 | } else | ||
2741 | val = res_counter_read_u64(&mem->res, name); | 2973 | val = res_counter_read_u64(&mem->res, name); |
2742 | break; | 2974 | break; |
2743 | case _MEMSWAP: | 2975 | case _MEMSWAP: |
2744 | if (name == RES_USAGE && mem_cgroup_is_root(mem)) { | 2976 | if (name == RES_USAGE) |
2745 | mem_cgroup_get_recursive_idx_stat(mem, | 2977 | val = mem_cgroup_usage(mem, true); |
2746 | MEM_CGROUP_STAT_CACHE, &idx_val); | 2978 | else |
2747 | val = idx_val; | ||
2748 | mem_cgroup_get_recursive_idx_stat(mem, | ||
2749 | MEM_CGROUP_STAT_RSS, &idx_val); | ||
2750 | val += idx_val; | ||
2751 | mem_cgroup_get_recursive_idx_stat(mem, | ||
2752 | MEM_CGROUP_STAT_SWAPOUT, &idx_val); | ||
2753 | val += idx_val; | ||
2754 | val <<= PAGE_SHIFT; | ||
2755 | } else | ||
2756 | val = res_counter_read_u64(&mem->memsw, name); | 2979 | val = res_counter_read_u64(&mem->memsw, name); |
2757 | break; | 2980 | break; |
2758 | default: | 2981 | default: |
@@ -2865,6 +3088,39 @@ static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) | |||
2865 | return 0; | 3088 | return 0; |
2866 | } | 3089 | } |
2867 | 3090 | ||
3091 | static u64 mem_cgroup_move_charge_read(struct cgroup *cgrp, | ||
3092 | struct cftype *cft) | ||
3093 | { | ||
3094 | return mem_cgroup_from_cont(cgrp)->move_charge_at_immigrate; | ||
3095 | } | ||
3096 | |||
3097 | #ifdef CONFIG_MMU | ||
3098 | static int mem_cgroup_move_charge_write(struct cgroup *cgrp, | ||
3099 | struct cftype *cft, u64 val) | ||
3100 | { | ||
3101 | struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); | ||
3102 | |||
3103 | if (val >= (1 << NR_MOVE_TYPE)) | ||
3104 | return -EINVAL; | ||
3105 | /* | ||
3106 | * We check this value several times, both in can_attach() and | ||
3107 | * attach(), so we need cgroup lock to prevent this value from being | ||
3108 | * inconsistent. | ||
3109 | */ | ||
3110 | cgroup_lock(); | ||
3111 | mem->move_charge_at_immigrate = val; | ||
3112 | cgroup_unlock(); | ||
3113 | |||
3114 | return 0; | ||
3115 | } | ||
3116 | #else | ||
3117 | static int mem_cgroup_move_charge_write(struct cgroup *cgrp, | ||
3118 | struct cftype *cft, u64 val) | ||
3119 | { | ||
3120 | return -ENOSYS; | ||
3121 | } | ||
3122 | #endif | ||
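
The move_charge_at_immigrate handlers above accept a small bitmask (it must stay below 1 << NR_MOVE_TYPE; this series only defines the bit for anonymous pages) and take cgroup_lock() so that can_attach()/attach() see a stable value. Assuming a v1 memory controller mounted at /sys/fs/cgroup/memory, which is an assumption about the running system rather than anything this patch dictates, user space could switch the feature on like this:

    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
        /* hypothetical mount point and group name */
        const char *path =
            "/sys/fs/cgroup/memory/mygroup/memory.move_charge_at_immigrate";
        int fd = open(path, O_WRONLY);

        if (fd < 0) {
            perror("open");
            return 1;
        }
        /* bit 0: move charges of anonymous pages when a task migrates in */
        if (write(fd, "1", 1) != 1)
            perror("write");
        close(fd);
        return 0;
    }

Once the bit is set, moving a task into "mygroup" also migrates the charges of its anonymous pages, which is what the can_attach()/attach() callbacks added later in this patch implement.
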
3123 | |||
2868 | 3124 | ||
2869 | /* For read statistics */ | 3125 | /* For read statistics */ |
2870 | enum { | 3126 | enum { |
@@ -2910,18 +3166,18 @@ static int mem_cgroup_get_local_stat(struct mem_cgroup *mem, void *data) | |||
2910 | s64 val; | 3166 | s64 val; |
2911 | 3167 | ||
2912 | /* per cpu stat */ | 3168 | /* per cpu stat */ |
2913 | val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_CACHE); | 3169 | val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE); |
2914 | s->stat[MCS_CACHE] += val * PAGE_SIZE; | 3170 | s->stat[MCS_CACHE] += val * PAGE_SIZE; |
2915 | val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_RSS); | 3171 | val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS); |
2916 | s->stat[MCS_RSS] += val * PAGE_SIZE; | 3172 | s->stat[MCS_RSS] += val * PAGE_SIZE; |
2917 | val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_FILE_MAPPED); | 3173 | val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_FILE_MAPPED); |
2918 | s->stat[MCS_FILE_MAPPED] += val * PAGE_SIZE; | 3174 | s->stat[MCS_FILE_MAPPED] += val * PAGE_SIZE; |
2919 | val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGIN_COUNT); | 3175 | val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_PGPGIN_COUNT); |
2920 | s->stat[MCS_PGPGIN] += val; | 3176 | s->stat[MCS_PGPGIN] += val; |
2921 | val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGOUT_COUNT); | 3177 | val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_PGPGOUT_COUNT); |
2922 | s->stat[MCS_PGPGOUT] += val; | 3178 | s->stat[MCS_PGPGOUT] += val; |
2923 | if (do_swap_account) { | 3179 | if (do_swap_account) { |
2924 | val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_SWAPOUT); | 3180 | val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_SWAPOUT); |
2925 | s->stat[MCS_SWAP] += val * PAGE_SIZE; | 3181 | s->stat[MCS_SWAP] += val * PAGE_SIZE; |
2926 | } | 3182 | } |
2927 | 3183 | ||
@@ -3049,12 +3305,249 @@ static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft, | |||
3049 | return 0; | 3305 | return 0; |
3050 | } | 3306 | } |
3051 | 3307 | ||
3308 | static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap) | ||
3309 | { | ||
3310 | struct mem_cgroup_threshold_ary *t; | ||
3311 | u64 usage; | ||
3312 | int i; | ||
3313 | |||
3314 | rcu_read_lock(); | ||
3315 | if (!swap) | ||
3316 | t = rcu_dereference(memcg->thresholds); | ||
3317 | else | ||
3318 | t = rcu_dereference(memcg->memsw_thresholds); | ||
3319 | |||
3320 | if (!t) | ||
3321 | goto unlock; | ||
3322 | |||
3323 | usage = mem_cgroup_usage(memcg, swap); | ||
3324 | |||
3325 | /* | ||
3326 | * current_threshold points to threshold just below usage. | ||
3327 | * If it's not true, a threshold was crossed after last | ||
3328 | * call of __mem_cgroup_threshold(). | ||
3329 | */ | ||
3330 | i = atomic_read(&t->current_threshold); | ||
3331 | |||
3332 | /* | ||
3333 | * Iterate backward over array of thresholds starting from | ||
3334 | * current_threshold and check if a threshold is crossed. | ||
3335 | * If none of thresholds below usage is crossed, we read | ||
3336 | * only one element of the array here. | ||
3337 | */ | ||
3338 | for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--) | ||
3339 | eventfd_signal(t->entries[i].eventfd, 1); | ||
3340 | |||
3341 | /* i = current_threshold + 1 */ | ||
3342 | i++; | ||
3343 | |||
3344 | /* | ||
3345 | * Iterate forward over array of thresholds starting from | ||
3346 | * current_threshold+1 and check if a threshold is crossed. | ||
3347 | * If none of thresholds above usage is crossed, we read | ||
3348 | * only one element of the array here. | ||
3349 | */ | ||
3350 | for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++) | ||
3351 | eventfd_signal(t->entries[i].eventfd, 1); | ||
3352 | |||
3353 | /* Update current_threshold */ | ||
3354 | atomic_set(&t->current_threshold, i - 1); | ||
3355 | unlock: | ||
3356 | rcu_read_unlock(); | ||
3357 | } | ||
3358 | |||
3359 | static void mem_cgroup_threshold(struct mem_cgroup *memcg) | ||
3360 | { | ||
3361 | __mem_cgroup_threshold(memcg, false); | ||
3362 | if (do_swap_account) | ||
3363 | __mem_cgroup_threshold(memcg, true); | ||
3364 | } | ||
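
__mem_cgroup_threshold() keeps the registered thresholds sorted and caches, in current_threshold, roughly the index of the last threshold that was not above the usage seen at the previous check. On each call it walks downward over thresholds the usage has fallen under and upward over thresholds the usage has risen past, signalling every one it crosses. A runnable sketch of just that scan, with eventfd_signal() replaced by a printf():

    #include <stdio.h>

    /* sorted thresholds plus the index of the largest one not above usage */
    struct thresholds {
        int current;            /* like t->current_threshold */
        int size;
        unsigned long entries[8];
    };

    /* signal every threshold crossed (in either direction) since the last call */
    static void check(struct thresholds *t, unsigned long usage)
    {
        int i = t->current;

        /* usage dropped: walk down over thresholds now above usage */
        for (; i >= 0 && t->entries[i] > usage; i--)
            printf("signal (fell below %lu)\n", t->entries[i]);

        /* usage grew: walk up over thresholds now at or below usage */
        for (i++; i < t->size && t->entries[i] <= usage; i++)
            printf("signal (rose past %lu)\n", t->entries[i]);

        t->current = i - 1;     /* largest threshold <= usage, or -1 */
    }

    int main(void)
    {
        struct thresholds t = { .current = -1, .size = 3,
                                .entries = { 100, 200, 300 } };
        check(&t, 250);   /* signals 100 and 200 */
        check(&t, 250);   /* nothing new crossed */
        check(&t, 50);    /* signals 200 and 100 on the way down */
        return 0;
    }
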
3365 | |||
3366 | static int compare_thresholds(const void *a, const void *b) | ||
3367 | { | ||
3368 | const struct mem_cgroup_threshold *_a = a; | ||
3369 | const struct mem_cgroup_threshold *_b = b; | ||
3370 | |||
3371 | return _a->threshold - _b->threshold; | ||
3372 | } | ||
3373 | |||
3374 | static int mem_cgroup_register_event(struct cgroup *cgrp, struct cftype *cft, | ||
3375 | struct eventfd_ctx *eventfd, const char *args) | ||
3376 | { | ||
3377 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); | ||
3378 | struct mem_cgroup_threshold_ary *thresholds, *thresholds_new; | ||
3379 | int type = MEMFILE_TYPE(cft->private); | ||
3380 | u64 threshold, usage; | ||
3381 | int size; | ||
3382 | int i, ret; | ||
3383 | |||
3384 | ret = res_counter_memparse_write_strategy(args, &threshold); | ||
3385 | if (ret) | ||
3386 | return ret; | ||
3387 | |||
3388 | mutex_lock(&memcg->thresholds_lock); | ||
3389 | if (type == _MEM) | ||
3390 | thresholds = memcg->thresholds; | ||
3391 | else if (type == _MEMSWAP) | ||
3392 | thresholds = memcg->memsw_thresholds; | ||
3393 | else | ||
3394 | BUG(); | ||
3395 | |||
3396 | usage = mem_cgroup_usage(memcg, type == _MEMSWAP); | ||
3397 | |||
3398 | /* Check if a threshold crossed before adding a new one */ | ||
3399 | if (thresholds) | ||
3400 | __mem_cgroup_threshold(memcg, type == _MEMSWAP); | ||
3401 | |||
3402 | if (thresholds) | ||
3403 | size = thresholds->size + 1; | ||
3404 | else | ||
3405 | size = 1; | ||
3406 | |||
3407 | /* Allocate memory for new array of thresholds */ | ||
3408 | thresholds_new = kmalloc(sizeof(*thresholds_new) + | ||
3409 | size * sizeof(struct mem_cgroup_threshold), | ||
3410 | GFP_KERNEL); | ||
3411 | if (!thresholds_new) { | ||
3412 | ret = -ENOMEM; | ||
3413 | goto unlock; | ||
3414 | } | ||
3415 | thresholds_new->size = size; | ||
3416 | |||
3417 | /* Copy thresholds (if any) to new array */ | ||
3418 | if (thresholds) | ||
3419 | memcpy(thresholds_new->entries, thresholds->entries, | ||
3420 | thresholds->size * | ||
3421 | sizeof(struct mem_cgroup_threshold)); | ||
3422 | /* Add new threshold */ | ||
3423 | thresholds_new->entries[size - 1].eventfd = eventfd; | ||
3424 | thresholds_new->entries[size - 1].threshold = threshold; | ||
3425 | |||
3426 | /* Sort thresholds. Registering of new threshold isn't time-critical */ | ||
3427 | sort(thresholds_new->entries, size, | ||
3428 | sizeof(struct mem_cgroup_threshold), | ||
3429 | compare_thresholds, NULL); | ||
3430 | |||
3431 | /* Find current threshold */ | ||
3432 | atomic_set(&thresholds_new->current_threshold, -1); | ||
3433 | for (i = 0; i < size; i++) { | ||
3434 | if (thresholds_new->entries[i].threshold < usage) { | ||
3435 | /* | ||
3436 | * thresholds_new->current_threshold will not be used | ||
3437 | * until rcu_assign_pointer(), so it's safe to increment | ||
3438 | * it here. | ||
3439 | */ | ||
3440 | atomic_inc(&thresholds_new->current_threshold); | ||
3441 | } | ||
3442 | } | ||
3443 | |||
3444 | if (type == _MEM) | ||
3445 | rcu_assign_pointer(memcg->thresholds, thresholds_new); | ||
3446 | else | ||
3447 | rcu_assign_pointer(memcg->memsw_thresholds, thresholds_new); | ||
3448 | |||
3449 | /* To be sure that nobody uses thresholds before freeing it */ | ||
3450 | synchronize_rcu(); | ||
3451 | |||
3452 | kfree(thresholds); | ||
3453 | unlock: | ||
3454 | mutex_unlock(&memcg->thresholds_lock); | ||
3455 | |||
3456 | return ret; | ||
3457 | } | ||
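
mem_cgroup_register_event() is the memcg side of the eventfd-based cgroup notification API added in the same series: user space creates an eventfd, opens memory.usage_in_bytes, writes "<event_fd> <usage_fd> <threshold>" into cgroup.event_control, and then blocks on the eventfd. The sketch below walks through that sequence; the mount point and group name are assumptions, and it presumes the companion cgroup.event_control support is present.

    #include <fcntl.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/eventfd.h>
    #include <unistd.h>

    int main(void)
    {
        /* hypothetical v1 memcg mount point */
        const char *dir = "/sys/fs/cgroup/memory/mygroup";
        char ctrl[256], usage[256], cmd[64];
        int efd, ufd, cfd;
        uint64_t ticks;

        snprintf(usage, sizeof(usage), "%s/memory.usage_in_bytes", dir);
        snprintf(ctrl, sizeof(ctrl), "%s/cgroup.event_control", dir);

        efd = eventfd(0, 0);            /* notification channel */
        ufd = open(usage, O_RDONLY);    /* file being watched */
        cfd = open(ctrl, O_WRONLY);
        if (efd < 0 || ufd < 0 || cfd < 0) {
            perror("setup");
            return 1;
        }

        /* "<event_fd> <usage_fd> <threshold in bytes>" */
        snprintf(cmd, sizeof(cmd), "%d %d %llu", efd, ufd, 64ULL << 20);
        if (write(cfd, cmd, strlen(cmd)) < 0) {
            perror("register");
            return 1;
        }

        /* blocks until usage crosses the 64 MiB threshold in either direction */
        if (read(efd, &ticks, sizeof(ticks)) == sizeof(ticks))
            printf("threshold crossed %llu time(s)\n",
                   (unsigned long long)ticks);
        return 0;
    }
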
3458 | |||
3459 | static int mem_cgroup_unregister_event(struct cgroup *cgrp, struct cftype *cft, | ||
3460 | struct eventfd_ctx *eventfd) | ||
3461 | { | ||
3462 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); | ||
3463 | struct mem_cgroup_threshold_ary *thresholds, *thresholds_new; | ||
3464 | int type = MEMFILE_TYPE(cft->private); | ||
3465 | u64 usage; | ||
3466 | int size = 0; | ||
3467 | int i, j, ret; | ||
3468 | |||
3469 | mutex_lock(&memcg->thresholds_lock); | ||
3470 | if (type == _MEM) | ||
3471 | thresholds = memcg->thresholds; | ||
3472 | else if (type == _MEMSWAP) | ||
3473 | thresholds = memcg->memsw_thresholds; | ||
3474 | else | ||
3475 | BUG(); | ||
3476 | |||
3477 | /* | ||
3478 | * Something went wrong if we're trying to unregister a threshold | ||
3479 | * when we don't have any thresholds. | ||
3480 | */ | ||
3481 | BUG_ON(!thresholds); | ||
3482 | |||
3483 | usage = mem_cgroup_usage(memcg, type == _MEMSWAP); | ||
3484 | |||
3485 | /* Check if a threshold crossed before removing */ | ||
3486 | __mem_cgroup_threshold(memcg, type == _MEMSWAP); | ||
3487 | |||
3488 | /* Calculate new number of threshold */ | ||
3489 | for (i = 0; i < thresholds->size; i++) { | ||
3490 | if (thresholds->entries[i].eventfd != eventfd) | ||
3491 | size++; | ||
3492 | } | ||
3493 | |||
3494 | /* Set thresholds array to NULL if we don't have thresholds */ | ||
3495 | if (!size) { | ||
3496 | thresholds_new = NULL; | ||
3497 | goto assign; | ||
3498 | } | ||
3499 | |||
3500 | /* Allocate memory for new array of thresholds */ | ||
3501 | thresholds_new = kmalloc(sizeof(*thresholds_new) + | ||
3502 | size * sizeof(struct mem_cgroup_threshold), | ||
3503 | GFP_KERNEL); | ||
3504 | if (!thresholds_new) { | ||
3505 | ret = -ENOMEM; | ||
3506 | goto unlock; | ||
3507 | } | ||
3508 | thresholds_new->size = size; | ||
3509 | |||
3510 | /* Copy thresholds and find current threshold */ | ||
3511 | atomic_set(&thresholds_new->current_threshold, -1); | ||
3512 | for (i = 0, j = 0; i < thresholds->size; i++) { | ||
3513 | if (thresholds->entries[i].eventfd == eventfd) | ||
3514 | continue; | ||
3515 | |||
3516 | thresholds_new->entries[j] = thresholds->entries[i]; | ||
3517 | if (thresholds_new->entries[j].threshold < usage) { | ||
3518 | /* | ||
3519 | * thresholds_new->current_threshold will not be used | ||
3520 | * until rcu_assign_pointer(), so it's safe to increment | ||
3521 | * it here. | ||
3522 | */ | ||
3523 | atomic_inc(&thresholds_new->current_threshold); | ||
3524 | } | ||
3525 | j++; | ||
3526 | } | ||
3527 | |||
3528 | assign: | ||
3529 | if (type == _MEM) | ||
3530 | rcu_assign_pointer(memcg->thresholds, thresholds_new); | ||
3531 | else | ||
3532 | rcu_assign_pointer(memcg->memsw_thresholds, thresholds_new); | ||
3533 | |||
3534 | /* To be sure that nobody uses thresholds before freeing it */ | ||
3535 | synchronize_rcu(); | ||
3536 | |||
3537 | kfree(thresholds); | ||
3538 | unlock: | ||
3539 | mutex_unlock(&memcg->thresholds_lock); | ||
3540 | |||
3541 | return ret; | ||
3542 | } | ||
3052 | 3543 | ||
3053 | static struct cftype mem_cgroup_files[] = { | 3544 | static struct cftype mem_cgroup_files[] = { |
3054 | { | 3545 | { |
3055 | .name = "usage_in_bytes", | 3546 | .name = "usage_in_bytes", |
3056 | .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), | 3547 | .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), |
3057 | .read_u64 = mem_cgroup_read, | 3548 | .read_u64 = mem_cgroup_read, |
3549 | .register_event = mem_cgroup_register_event, | ||
3550 | .unregister_event = mem_cgroup_unregister_event, | ||
3058 | }, | 3551 | }, |
3059 | { | 3552 | { |
3060 | .name = "max_usage_in_bytes", | 3553 | .name = "max_usage_in_bytes", |
@@ -3098,6 +3591,11 @@ static struct cftype mem_cgroup_files[] = { | |||
3098 | .read_u64 = mem_cgroup_swappiness_read, | 3591 | .read_u64 = mem_cgroup_swappiness_read, |
3099 | .write_u64 = mem_cgroup_swappiness_write, | 3592 | .write_u64 = mem_cgroup_swappiness_write, |
3100 | }, | 3593 | }, |
3594 | { | ||
3595 | .name = "move_charge_at_immigrate", | ||
3596 | .read_u64 = mem_cgroup_move_charge_read, | ||
3597 | .write_u64 = mem_cgroup_move_charge_write, | ||
3598 | }, | ||
3101 | }; | 3599 | }; |
3102 | 3600 | ||
3103 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP | 3601 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP |
@@ -3106,6 +3604,8 @@ static struct cftype memsw_cgroup_files[] = { | |||
3106 | .name = "memsw.usage_in_bytes", | 3604 | .name = "memsw.usage_in_bytes", |
3107 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), | 3605 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), |
3108 | .read_u64 = mem_cgroup_read, | 3606 | .read_u64 = mem_cgroup_read, |
3607 | .register_event = mem_cgroup_register_event, | ||
3608 | .unregister_event = mem_cgroup_unregister_event, | ||
3109 | }, | 3609 | }, |
3110 | { | 3610 | { |
3111 | .name = "memsw.max_usage_in_bytes", | 3611 | .name = "memsw.max_usage_in_bytes", |
@@ -3180,17 +3680,12 @@ static void free_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node) | |||
3180 | kfree(mem->info.nodeinfo[node]); | 3680 | kfree(mem->info.nodeinfo[node]); |
3181 | } | 3681 | } |
3182 | 3682 | ||
3183 | static int mem_cgroup_size(void) | ||
3184 | { | ||
3185 | int cpustat_size = nr_cpu_ids * sizeof(struct mem_cgroup_stat_cpu); | ||
3186 | return sizeof(struct mem_cgroup) + cpustat_size; | ||
3187 | } | ||
3188 | |||
3189 | static struct mem_cgroup *mem_cgroup_alloc(void) | 3683 | static struct mem_cgroup *mem_cgroup_alloc(void) |
3190 | { | 3684 | { |
3191 | struct mem_cgroup *mem; | 3685 | struct mem_cgroup *mem; |
3192 | int size = mem_cgroup_size(); | 3686 | int size = sizeof(struct mem_cgroup); |
3193 | 3687 | ||
3688 | /* Can be very big if MAX_NUMNODES is very big */ | ||
3194 | if (size < PAGE_SIZE) | 3689 | if (size < PAGE_SIZE) |
3195 | mem = kmalloc(size, GFP_KERNEL); | 3690 | mem = kmalloc(size, GFP_KERNEL); |
3196 | else | 3691 | else |
@@ -3198,6 +3693,14 @@ static struct mem_cgroup *mem_cgroup_alloc(void) | |||
3198 | 3693 | ||
3199 | if (mem) | 3694 | if (mem) |
3200 | memset(mem, 0, size); | 3695 | memset(mem, 0, size); |
3696 | mem->stat = alloc_percpu(struct mem_cgroup_stat_cpu); | ||
3697 | if (!mem->stat) { | ||
3698 | if (size < PAGE_SIZE) | ||
3699 | kfree(mem); | ||
3700 | else | ||
3701 | vfree(mem); | ||
3702 | mem = NULL; | ||
3703 | } | ||
3201 | return mem; | 3704 | return mem; |
3202 | } | 3705 | } |
3203 | 3706 | ||
@@ -3222,7 +3725,8 @@ static void __mem_cgroup_free(struct mem_cgroup *mem) | |||
3222 | for_each_node_state(node, N_POSSIBLE) | 3725 | for_each_node_state(node, N_POSSIBLE) |
3223 | free_mem_cgroup_per_zone_info(mem, node); | 3726 | free_mem_cgroup_per_zone_info(mem, node); |
3224 | 3727 | ||
3225 | if (mem_cgroup_size() < PAGE_SIZE) | 3728 | free_percpu(mem->stat); |
3729 | if (sizeof(struct mem_cgroup) < PAGE_SIZE) | ||
3226 | kfree(mem); | 3730 | kfree(mem); |
3227 | else | 3731 | else |
3228 | vfree(mem); | 3732 | vfree(mem); |
@@ -3233,9 +3737,9 @@ static void mem_cgroup_get(struct mem_cgroup *mem) | |||
3233 | atomic_inc(&mem->refcnt); | 3737 | atomic_inc(&mem->refcnt); |
3234 | } | 3738 | } |
3235 | 3739 | ||
3236 | static void mem_cgroup_put(struct mem_cgroup *mem) | 3740 | static void __mem_cgroup_put(struct mem_cgroup *mem, int count) |
3237 | { | 3741 | { |
3238 | if (atomic_dec_and_test(&mem->refcnt)) { | 3742 | if (atomic_sub_and_test(count, &mem->refcnt)) { |
3239 | struct mem_cgroup *parent = parent_mem_cgroup(mem); | 3743 | struct mem_cgroup *parent = parent_mem_cgroup(mem); |
3240 | __mem_cgroup_free(mem); | 3744 | __mem_cgroup_free(mem); |
3241 | if (parent) | 3745 | if (parent) |
@@ -3243,6 +3747,11 @@ static void mem_cgroup_put(struct mem_cgroup *mem) | |||
3243 | } | 3747 | } |
3244 | } | 3748 | } |
3245 | 3749 | ||
3750 | static void mem_cgroup_put(struct mem_cgroup *mem) | ||
3751 | { | ||
3752 | __mem_cgroup_put(mem, 1); | ||
3753 | } | ||
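
__mem_cgroup_put() generalizes "drop one reference" into "drop @count references at once" by replacing atomic_dec_and_test() with atomic_sub_and_test(), which lets the move-charge teardown return all the references taken for moved swap entries in a single call. A minimal C11 analogue of that batched put, with illustrative names only:

    #include <stdatomic.h>
    #include <stdio.h>

    static atomic_int refcnt;

    /* drop @count references; "free" when the last one goes away */
    static void put_many(int count)
    {
        /* like atomic_sub_and_test(): true when the counter reaches zero */
        if (atomic_fetch_sub(&refcnt, count) == count)
            printf("last reference dropped, freeing\n");
    }

    static void put_one(void) { put_many(1); }

    int main(void)
    {
        atomic_store(&refcnt, 5);   /* e.g. 1 initial ref + 4 swap records */
        put_many(4);                /* one call instead of four put_one()s */
        put_one();                  /* frees here */
        return 0;
    }
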
3754 | |||
3246 | /* | 3755 | /* |
3247 | * Returns the parent mem_cgroup in memcgroup hierarchy with hierarchy enabled. | 3756 | * Returns the parent mem_cgroup in memcgroup hierarchy with hierarchy enabled. |
3248 | */ | 3757 | */ |
@@ -3319,7 +3828,6 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) | |||
3319 | INIT_WORK(&stock->work, drain_local_stock); | 3828 | INIT_WORK(&stock->work, drain_local_stock); |
3320 | } | 3829 | } |
3321 | hotcpu_notifier(memcg_stock_cpu_callback, 0); | 3830 | hotcpu_notifier(memcg_stock_cpu_callback, 0); |
3322 | |||
3323 | } else { | 3831 | } else { |
3324 | parent = mem_cgroup_from_cont(cont->parent); | 3832 | parent = mem_cgroup_from_cont(cont->parent); |
3325 | mem->use_hierarchy = parent->use_hierarchy; | 3833 | mem->use_hierarchy = parent->use_hierarchy; |
@@ -3345,6 +3853,8 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) | |||
3345 | if (parent) | 3853 | if (parent) |
3346 | mem->swappiness = get_swappiness(parent); | 3854 | mem->swappiness = get_swappiness(parent); |
3347 | atomic_set(&mem->refcnt, 1); | 3855 | atomic_set(&mem->refcnt, 1); |
3856 | mem->move_charge_at_immigrate = 0; | ||
3857 | mutex_init(&mem->thresholds_lock); | ||
3348 | return &mem->css; | 3858 | return &mem->css; |
3349 | free_out: | 3859 | free_out: |
3350 | __mem_cgroup_free(mem); | 3860 | __mem_cgroup_free(mem); |
@@ -3381,16 +3891,444 @@ static int mem_cgroup_populate(struct cgroup_subsys *ss, | |||
3381 | return ret; | 3891 | return ret; |
3382 | } | 3892 | } |
3383 | 3893 | ||
3894 | #ifdef CONFIG_MMU | ||
3895 | /* Handlers for move charge at task migration. */ | ||
3896 | #define PRECHARGE_COUNT_AT_ONCE 256 | ||
3897 | static int mem_cgroup_do_precharge(unsigned long count) | ||
3898 | { | ||
3899 | int ret = 0; | ||
3900 | int batch_count = PRECHARGE_COUNT_AT_ONCE; | ||
3901 | struct mem_cgroup *mem = mc.to; | ||
3902 | |||
3903 | if (mem_cgroup_is_root(mem)) { | ||
3904 | mc.precharge += count; | ||
3905 | /* we don't need css_get for root */ | ||
3906 | return ret; | ||
3907 | } | ||
3908 | /* try to charge at once */ | ||
3909 | if (count > 1) { | ||
3910 | struct res_counter *dummy; | ||
3911 | /* | ||
3912 | * "mem" cannot be under rmdir() because we've already checked | ||
3913 | * by cgroup_lock_live_cgroup() that it is not removed and we | ||
3914 | * are still under the same cgroup_mutex. So we can postpone | ||
3915 | * css_get(). | ||
3916 | */ | ||
3917 | if (res_counter_charge(&mem->res, PAGE_SIZE * count, &dummy)) | ||
3918 | goto one_by_one; | ||
3919 | if (do_swap_account && res_counter_charge(&mem->memsw, | ||
3920 | PAGE_SIZE * count, &dummy)) { | ||
3921 | res_counter_uncharge(&mem->res, PAGE_SIZE * count); | ||
3922 | goto one_by_one; | ||
3923 | } | ||
3924 | mc.precharge += count; | ||
3925 | VM_BUG_ON(test_bit(CSS_ROOT, &mem->css.flags)); | ||
3926 | WARN_ON_ONCE(count > INT_MAX); | ||
3927 | __css_get(&mem->css, (int)count); | ||
3928 | return ret; | ||
3929 | } | ||
3930 | one_by_one: | ||
3931 | /* fall back to one by one charge */ | ||
3932 | while (count--) { | ||
3933 | if (signal_pending(current)) { | ||
3934 | ret = -EINTR; | ||
3935 | break; | ||
3936 | } | ||
3937 | if (!batch_count--) { | ||
3938 | batch_count = PRECHARGE_COUNT_AT_ONCE; | ||
3939 | cond_resched(); | ||
3940 | } | ||
3941 | ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false); | ||
3942 | if (ret || !mem) | ||
3943 | /* mem_cgroup_clear_mc() will do uncharge later */ | ||
3944 | return -ENOMEM; | ||
3945 | mc.precharge++; | ||
3946 | } | ||
3947 | return ret; | ||
3948 | } | ||
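
mem_cgroup_do_precharge() first tries to reserve every page it will need with a single res_counter_charge() call and only falls back to page-by-page charging, with periodic rescheduling and signal checks, when the bulk reservation fails. The toy reservation function below follows the same shape; the counter type and the batch constant are stand-ins, not kernel interfaces.

    #include <stdbool.h>
    #include <stdio.h>

    #define PAGE    4096
    #define BATCH   256     /* like PRECHARGE_COUNT_AT_ONCE */

    struct counter { long usage, limit; };

    static bool charge(struct counter *c, long bytes)
    {
        if (c->usage + bytes > c->limit)
            return false;
        c->usage += bytes;
        return true;
    }

    /* reserve @count pages: try all at once, else fall back to one by one */
    static long precharge(struct counter *c, unsigned long count)
    {
        long reserved = 0;
        int batch = BATCH;

        if (count > 1 && charge(c, (long)count * PAGE))
            return count;           /* bulk reservation succeeded */

        while (count--) {           /* slow path, one page at a time */
            if (!charge(c, PAGE))
                break;              /* caller unwinds what was reserved */
            if (!--batch)
                batch = BATCH;      /* in the kernel: cond_resched() here */
            reserved++;
        }
        return reserved;
    }

    int main(void)
    {
        struct counter c = { .usage = 0, .limit = 10 * PAGE };
        printf("%ld\n", precharge(&c, 8));   /* 8: fits in one go */
        printf("%ld\n", precharge(&c, 8));   /* 2: only two pages left */
        return 0;
    }
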
3949 | #else /* !CONFIG_MMU */ | ||
3950 | static int mem_cgroup_can_attach(struct cgroup_subsys *ss, | ||
3951 | struct cgroup *cgroup, | ||
3952 | struct task_struct *p, | ||
3953 | bool threadgroup) | ||
3954 | { | ||
3955 | return 0; | ||
3956 | } | ||
3957 | static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss, | ||
3958 | struct cgroup *cgroup, | ||
3959 | struct task_struct *p, | ||
3960 | bool threadgroup) | ||
3961 | { | ||
3962 | } | ||
3384 | static void mem_cgroup_move_task(struct cgroup_subsys *ss, | 3963 | static void mem_cgroup_move_task(struct cgroup_subsys *ss, |
3385 | struct cgroup *cont, | 3964 | struct cgroup *cont, |
3386 | struct cgroup *old_cont, | 3965 | struct cgroup *old_cont, |
3387 | struct task_struct *p, | 3966 | struct task_struct *p, |
3388 | bool threadgroup) | 3967 | bool threadgroup) |
3389 | { | 3968 | { |
3969 | } | ||
3970 | #endif | ||
3971 | |||
3972 | /** | ||
3973 | * is_target_pte_for_mc - check a pte whether it is valid for move charge | ||
3974 | * @vma: the vma the pte to be checked belongs | ||
3975 | * @addr: the address corresponding to the pte to be checked | ||
3976 | * @ptent: the pte to be checked | ||
3977 | * @target: the pointer where the target page or swap entry will be stored (can be NULL) | ||
3978 | * | ||
3979 | * Returns | ||
3980 | * 0(MC_TARGET_NONE): if the pte is not a target for move charge. | ||
3981 | * 1(MC_TARGET_PAGE): if the page corresponding to this pte is a target for | ||
3982 | * move charge. If @target is not NULL, the page is stored in target->page | ||
3983 | * with an extra refcount taken (callers should handle it). | ||
3984 | * 2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a | ||
3985 | * target for charge migration. If @target is not NULL, the entry is stored | ||
3986 | * in target->ent. | ||
3987 | * | ||
3988 | * Called with pte lock held. | ||
3989 | */ | ||
3990 | union mc_target { | ||
3991 | struct page *page; | ||
3992 | swp_entry_t ent; | ||
3993 | }; | ||
3994 | |||
3995 | enum mc_target_type { | ||
3996 | MC_TARGET_NONE, /* not used */ | ||
3997 | MC_TARGET_PAGE, | ||
3998 | MC_TARGET_SWAP, | ||
3999 | }; | ||
4000 | |||
4001 | static int is_target_pte_for_mc(struct vm_area_struct *vma, | ||
4002 | unsigned long addr, pte_t ptent, union mc_target *target) | ||
4003 | { | ||
4004 | struct page *page = NULL; | ||
4005 | struct page_cgroup *pc; | ||
4006 | int ret = 0; | ||
4007 | swp_entry_t ent = { .val = 0 }; | ||
4008 | int usage_count = 0; | ||
4009 | bool move_anon = test_bit(MOVE_CHARGE_TYPE_ANON, | ||
4010 | &mc.to->move_charge_at_immigrate); | ||
4011 | |||
4012 | if (!pte_present(ptent)) { | ||
4013 | /* TODO: handle swap of shmem/tmpfs */ | ||
4014 | if (pte_none(ptent) || pte_file(ptent)) | ||
4015 | return 0; | ||
4016 | else if (is_swap_pte(ptent)) { | ||
4017 | ent = pte_to_swp_entry(ptent); | ||
4018 | if (!move_anon || non_swap_entry(ent)) | ||
4019 | return 0; | ||
4020 | usage_count = mem_cgroup_count_swap_user(ent, &page); | ||
4021 | } | ||
4022 | } else { | ||
4023 | page = vm_normal_page(vma, addr, ptent); | ||
4024 | if (!page || !page_mapped(page)) | ||
4025 | return 0; | ||
4026 | /* | ||
4027 | * TODO: We don't move charges of file(including shmem/tmpfs) | ||
4028 | * pages for now. | ||
4029 | */ | ||
4030 | if (!move_anon || !PageAnon(page)) | ||
4031 | return 0; | ||
4032 | if (!get_page_unless_zero(page)) | ||
4033 | return 0; | ||
4034 | usage_count = page_mapcount(page); | ||
4035 | } | ||
4036 | if (usage_count > 1) { | ||
4037 | /* | ||
4038 | * TODO: We don't move charges of shared (used by multiple | ||
4039 | * processes) pages for now. | ||
4040 | */ | ||
4041 | if (page) | ||
4042 | put_page(page); | ||
4043 | return 0; | ||
4044 | } | ||
4045 | if (page) { | ||
4046 | pc = lookup_page_cgroup(page); | ||
4047 | /* | ||
4048 | * Do only loose check w/o page_cgroup lock. | ||
4049 | * mem_cgroup_move_account() checks the pc is valid or not under | ||
4050 | * the lock. | ||
4051 | */ | ||
4052 | if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) { | ||
4053 | ret = MC_TARGET_PAGE; | ||
4054 | if (target) | ||
4055 | target->page = page; | ||
4056 | } | ||
4057 | if (!ret || !target) | ||
4058 | put_page(page); | ||
4059 | } | ||
4060 | /* fall through */ | ||
4061 | if (ent.val && do_swap_account && !ret && | ||
4062 | css_id(&mc.from->css) == lookup_swap_cgroup(ent)) { | ||
4063 | ret = MC_TARGET_SWAP; | ||
4064 | if (target) | ||
4065 | target->ent = ent; | ||
4066 | } | ||
4067 | return ret; | ||
4068 | } | ||
4069 | |||
4070 | static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd, | ||
4071 | unsigned long addr, unsigned long end, | ||
4072 | struct mm_walk *walk) | ||
4073 | { | ||
4074 | struct vm_area_struct *vma = walk->private; | ||
4075 | pte_t *pte; | ||
4076 | spinlock_t *ptl; | ||
4077 | |||
4078 | pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); | ||
4079 | for (; addr != end; pte++, addr += PAGE_SIZE) | ||
4080 | if (is_target_pte_for_mc(vma, addr, *pte, NULL)) | ||
4081 | mc.precharge++; /* increment precharge temporarily */ | ||
4082 | pte_unmap_unlock(pte - 1, ptl); | ||
4083 | cond_resched(); | ||
4084 | |||
4085 | return 0; | ||
4086 | } | ||
4087 | |||
4088 | static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm) | ||
4089 | { | ||
4090 | unsigned long precharge; | ||
4091 | struct vm_area_struct *vma; | ||
4092 | |||
4093 | down_read(&mm->mmap_sem); | ||
4094 | for (vma = mm->mmap; vma; vma = vma->vm_next) { | ||
4095 | struct mm_walk mem_cgroup_count_precharge_walk = { | ||
4096 | .pmd_entry = mem_cgroup_count_precharge_pte_range, | ||
4097 | .mm = mm, | ||
4098 | .private = vma, | ||
4099 | }; | ||
4100 | if (is_vm_hugetlb_page(vma)) | ||
4101 | continue; | ||
4102 | /* TODO: We don't move charges of shmem/tmpfs pages for now. */ | ||
4103 | if (vma->vm_flags & VM_SHARED) | ||
4104 | continue; | ||
4105 | walk_page_range(vma->vm_start, vma->vm_end, | ||
4106 | &mem_cgroup_count_precharge_walk); | ||
4107 | } | ||
4108 | up_read(&mm->mmap_sem); | ||
4109 | |||
4110 | precharge = mc.precharge; | ||
4111 | mc.precharge = 0; | ||
4112 | |||
4113 | return precharge; | ||
4114 | } | ||
4115 | |||
4116 | static int mem_cgroup_precharge_mc(struct mm_struct *mm) | ||
4117 | { | ||
4118 | return mem_cgroup_do_precharge(mem_cgroup_count_precharge(mm)); | ||
4119 | } | ||
4120 | |||
4121 | static void mem_cgroup_clear_mc(void) | ||
4122 | { | ||
4123 | /* we must uncharge all the leftover precharges from mc.to */ | ||
4124 | if (mc.precharge) { | ||
4125 | __mem_cgroup_cancel_charge(mc.to, mc.precharge); | ||
4126 | mc.precharge = 0; | ||
4127 | } | ||
3390 | /* | 4128 | /* |
3391 | * FIXME: It's better to move charges of this process from old | 4129 | * we didn't uncharge from mc.from at mem_cgroup_move_account(), so |
3392 | * memcg to new memcg. But it's just on TODO-List now. | 4130 | * we must uncharge here. |
3393 | */ | 4131 | */ |
4132 | if (mc.moved_charge) { | ||
4133 | __mem_cgroup_cancel_charge(mc.from, mc.moved_charge); | ||
4134 | mc.moved_charge = 0; | ||
4135 | } | ||
4136 | /* we must fixup refcnts and charges */ | ||
4137 | if (mc.moved_swap) { | ||
4138 | WARN_ON_ONCE(mc.moved_swap > INT_MAX); | ||
4139 | /* uncharge swap account from the old cgroup */ | ||
4140 | if (!mem_cgroup_is_root(mc.from)) | ||
4141 | res_counter_uncharge(&mc.from->memsw, | ||
4142 | PAGE_SIZE * mc.moved_swap); | ||
4143 | __mem_cgroup_put(mc.from, mc.moved_swap); | ||
4144 | |||
4145 | if (!mem_cgroup_is_root(mc.to)) { | ||
4146 | /* | ||
4147 | * we charged both to->res and to->memsw, so we should | ||
4148 | * uncharge to->res. | ||
4149 | */ | ||
4150 | res_counter_uncharge(&mc.to->res, | ||
4151 | PAGE_SIZE * mc.moved_swap); | ||
4152 | VM_BUG_ON(test_bit(CSS_ROOT, &mc.to->css.flags)); | ||
4153 | __css_put(&mc.to->css, mc.moved_swap); | ||
4154 | } | ||
4155 | /* we've already done mem_cgroup_get(mc.to) */ | ||
4156 | |||
4157 | mc.moved_swap = 0; | ||
4158 | } | ||
4159 | mc.from = NULL; | ||
4160 | mc.to = NULL; | ||
4161 | mc.moving_task = NULL; | ||
4162 | wake_up_all(&mc.waitq); | ||
4163 | } | ||
4164 | |||
4165 | static int mem_cgroup_can_attach(struct cgroup_subsys *ss, | ||
4166 | struct cgroup *cgroup, | ||
4167 | struct task_struct *p, | ||
4168 | bool threadgroup) | ||
4169 | { | ||
4170 | int ret = 0; | ||
4171 | struct mem_cgroup *mem = mem_cgroup_from_cont(cgroup); | ||
4172 | |||
4173 | if (mem->move_charge_at_immigrate) { | ||
4174 | struct mm_struct *mm; | ||
4175 | struct mem_cgroup *from = mem_cgroup_from_task(p); | ||
4176 | |||
4177 | VM_BUG_ON(from == mem); | ||
4178 | |||
4179 | mm = get_task_mm(p); | ||
4180 | if (!mm) | ||
4181 | return 0; | ||
4182 | /* We move charges only when we move the owner of the mm */ | ||
4183 | if (mm->owner == p) { | ||
4184 | VM_BUG_ON(mc.from); | ||
4185 | VM_BUG_ON(mc.to); | ||
4186 | VM_BUG_ON(mc.precharge); | ||
4187 | VM_BUG_ON(mc.moved_charge); | ||
4188 | VM_BUG_ON(mc.moved_swap); | ||
4189 | VM_BUG_ON(mc.moving_task); | ||
4190 | mc.from = from; | ||
4191 | mc.to = mem; | ||
4192 | mc.precharge = 0; | ||
4193 | mc.moved_charge = 0; | ||
4194 | mc.moved_swap = 0; | ||
4195 | mc.moving_task = current; | ||
4196 | |||
4197 | ret = mem_cgroup_precharge_mc(mm); | ||
4198 | if (ret) | ||
4199 | mem_cgroup_clear_mc(); | ||
4200 | } | ||
4201 | mmput(mm); | ||
4202 | } | ||
4203 | return ret; | ||
4204 | } | ||
4205 | |||
4206 | static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss, | ||
4207 | struct cgroup *cgroup, | ||
4208 | struct task_struct *p, | ||
4209 | bool threadgroup) | ||
4210 | { | ||
4211 | mem_cgroup_clear_mc(); | ||
4212 | } | ||
4213 | |||
4214 | static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, | ||
4215 | unsigned long addr, unsigned long end, | ||
4216 | struct mm_walk *walk) | ||
4217 | { | ||
4218 | int ret = 0; | ||
4219 | struct vm_area_struct *vma = walk->private; | ||
4220 | pte_t *pte; | ||
4221 | spinlock_t *ptl; | ||
4222 | |||
4223 | retry: | ||
4224 | pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); | ||
4225 | for (; addr != end; addr += PAGE_SIZE) { | ||
4226 | pte_t ptent = *(pte++); | ||
4227 | union mc_target target; | ||
4228 | int type; | ||
4229 | struct page *page; | ||
4230 | struct page_cgroup *pc; | ||
4231 | swp_entry_t ent; | ||
4232 | |||
4233 | if (!mc.precharge) | ||
4234 | break; | ||
4235 | |||
4236 | type = is_target_pte_for_mc(vma, addr, ptent, &target); | ||
4237 | switch (type) { | ||
4238 | case MC_TARGET_PAGE: | ||
4239 | page = target.page; | ||
4240 | if (isolate_lru_page(page)) | ||
4241 | goto put; | ||
4242 | pc = lookup_page_cgroup(page); | ||
4243 | if (!mem_cgroup_move_account(pc, | ||
4244 | mc.from, mc.to, false)) { | ||
4245 | mc.precharge--; | ||
4246 | /* we uncharge from mc.from later. */ | ||
4247 | mc.moved_charge++; | ||
4248 | } | ||
4249 | putback_lru_page(page); | ||
4250 | put: /* is_target_pte_for_mc() gets the page */ | ||
4251 | put_page(page); | ||
4252 | break; | ||
4253 | case MC_TARGET_SWAP: | ||
4254 | ent = target.ent; | ||
4255 | if (!mem_cgroup_move_swap_account(ent, | ||
4256 | mc.from, mc.to, false)) { | ||
4257 | mc.precharge--; | ||
4258 | /* we fixup refcnts and charges later. */ | ||
4259 | mc.moved_swap++; | ||
4260 | } | ||
4261 | break; | ||
4262 | default: | ||
4263 | break; | ||
4264 | } | ||
4265 | } | ||
4266 | pte_unmap_unlock(pte - 1, ptl); | ||
4267 | cond_resched(); | ||
4268 | |||
4269 | if (addr != end) { | ||
4270 | /* | ||
4271 | * We have consumed all precharges we got in can_attach(). | ||
4272 | * We try charge one by one, but don't do any additional | ||
4273 | * charges to mc.to if we have failed in charge once in attach() | ||
4274 | * phase. | ||
4275 | */ | ||
4276 | ret = mem_cgroup_do_precharge(1); | ||
4277 | if (!ret) | ||
4278 | goto retry; | ||
4279 | } | ||
4280 | |||
4281 | return ret; | ||
4282 | } | ||
4283 | |||
4284 | static void mem_cgroup_move_charge(struct mm_struct *mm) | ||
4285 | { | ||
4286 | struct vm_area_struct *vma; | ||
4287 | |||
4288 | lru_add_drain_all(); | ||
4289 | down_read(&mm->mmap_sem); | ||
4290 | for (vma = mm->mmap; vma; vma = vma->vm_next) { | ||
4291 | int ret; | ||
4292 | struct mm_walk mem_cgroup_move_charge_walk = { | ||
4293 | .pmd_entry = mem_cgroup_move_charge_pte_range, | ||
4294 | .mm = mm, | ||
4295 | .private = vma, | ||
4296 | }; | ||
4297 | if (is_vm_hugetlb_page(vma)) | ||
4298 | continue; | ||
4299 | /* TODO: We don't move charges of shmem/tmpfs pages for now. */ | ||
4300 | if (vma->vm_flags & VM_SHARED) | ||
4301 | continue; | ||
4302 | ret = walk_page_range(vma->vm_start, vma->vm_end, | ||
4303 | &mem_cgroup_move_charge_walk); | ||
4304 | if (ret) | ||
4305 | /* | ||
4306 | * This means we have consumed all precharges and failed to | ||
4307 | * charge any more. Just abandon here. | ||
4308 | */ | ||
4309 | break; | ||
4310 | } | ||
4311 | up_read(&mm->mmap_sem); | ||
4312 | } | ||
4313 | |||
4314 | static void mem_cgroup_move_task(struct cgroup_subsys *ss, | ||
4315 | struct cgroup *cont, | ||
4316 | struct cgroup *old_cont, | ||
4317 | struct task_struct *p, | ||
4318 | bool threadgroup) | ||
4319 | { | ||
4320 | struct mm_struct *mm; | ||
4321 | |||
4322 | if (!mc.to) | ||
4323 | /* no need to move charge */ | ||
4324 | return; | ||
4325 | |||
4326 | mm = get_task_mm(p); | ||
4327 | if (mm) { | ||
4328 | mem_cgroup_move_charge(mm); | ||
4329 | mmput(mm); | ||
4330 | } | ||
4331 | mem_cgroup_clear_mc(); | ||
3394 | } | 4332 | } |
3395 | 4333 | ||
3396 | struct cgroup_subsys mem_cgroup_subsys = { | 4334 | struct cgroup_subsys mem_cgroup_subsys = { |
@@ -3400,6 +4338,8 @@ struct cgroup_subsys mem_cgroup_subsys = { | |||
3400 | .pre_destroy = mem_cgroup_pre_destroy, | 4338 | .pre_destroy = mem_cgroup_pre_destroy, |
3401 | .destroy = mem_cgroup_destroy, | 4339 | .destroy = mem_cgroup_destroy, |
3402 | .populate = mem_cgroup_populate, | 4340 | .populate = mem_cgroup_populate, |
4341 | .can_attach = mem_cgroup_can_attach, | ||
4342 | .cancel_attach = mem_cgroup_cancel_attach, | ||
3403 | .attach = mem_cgroup_move_task, | 4343 | .attach = mem_cgroup_move_task, |
3404 | .early_init = 0, | 4344 | .early_init = 0, |
3405 | .use_id = 1, | 4345 | .use_id = 1, |
diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 17299fd4577c..d1f335162976 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c | |||
@@ -383,9 +383,12 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill, | |||
383 | if (av == NULL) /* Not actually mapped anymore */ | 383 | if (av == NULL) /* Not actually mapped anymore */ |
384 | goto out; | 384 | goto out; |
385 | for_each_process (tsk) { | 385 | for_each_process (tsk) { |
386 | struct anon_vma_chain *vmac; | ||
387 | |||
386 | if (!task_early_kill(tsk)) | 388 | if (!task_early_kill(tsk)) |
387 | continue; | 389 | continue; |
388 | list_for_each_entry (vma, &av->head, anon_vma_node) { | 390 | list_for_each_entry(vmac, &av->head, same_anon_vma) { |
391 | vma = vmac->vma; | ||
389 | if (!page_mapped_in_vma(page, vma)) | 392 | if (!page_mapped_in_vma(page, vma)) |
390 | continue; | 393 | continue; |
391 | if (vma->vm_mm == tsk->mm) | 394 | if (vma->vm_mm == tsk->mm) |
diff --git a/mm/memory.c b/mm/memory.c index 09e4b1be7b67..5b7f2002e54b 100644 --- a/mm/memory.c +++ b/mm/memory.c | |||
@@ -121,6 +121,77 @@ static int __init init_zero_pfn(void) | |||
121 | } | 121 | } |
122 | core_initcall(init_zero_pfn); | 122 | core_initcall(init_zero_pfn); |
123 | 123 | ||
124 | |||
125 | #if defined(SPLIT_RSS_COUNTING) | ||
126 | |||
127 | void __sync_task_rss_stat(struct task_struct *task, struct mm_struct *mm) | ||
128 | { | ||
129 | int i; | ||
130 | |||
131 | for (i = 0; i < NR_MM_COUNTERS; i++) { | ||
132 | if (task->rss_stat.count[i]) { | ||
133 | add_mm_counter(mm, i, task->rss_stat.count[i]); | ||
134 | task->rss_stat.count[i] = 0; | ||
135 | } | ||
136 | } | ||
137 | task->rss_stat.events = 0; | ||
138 | } | ||
139 | |||
140 | static void add_mm_counter_fast(struct mm_struct *mm, int member, int val) | ||
141 | { | ||
142 | struct task_struct *task = current; | ||
143 | |||
144 | if (likely(task->mm == mm)) | ||
145 | task->rss_stat.count[member] += val; | ||
146 | else | ||
147 | add_mm_counter(mm, member, val); | ||
148 | } | ||
149 | #define inc_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, 1) | ||
150 | #define dec_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, -1) | ||
151 | |||
152 | /* sync counter once per 64 page faults */ | ||
153 | #define TASK_RSS_EVENTS_THRESH (64) | ||
154 | static void check_sync_rss_stat(struct task_struct *task) | ||
155 | { | ||
156 | if (unlikely(task != current)) | ||
157 | return; | ||
158 | if (unlikely(task->rss_stat.events++ > TASK_RSS_EVENTS_THRESH)) | ||
159 | __sync_task_rss_stat(task, task->mm); | ||
160 | } | ||
161 | |||
162 | unsigned long get_mm_counter(struct mm_struct *mm, int member) | ||
163 | { | ||
164 | long val = 0; | ||
165 | |||
166 | /* | ||
167 | * Don't use task->mm here, to avoid having to take a reference with get_task_mm(). | ||
168 | * The caller must guarantee that task->mm is still valid. | ||
169 | */ | ||
170 | val = atomic_long_read(&mm->rss_stat.count[member]); | ||
171 | /* | ||
172 | * The counter is updated asynchronously and may temporarily go negative, | ||
173 | * but a negative value is never what callers expect, so clamp it to zero. | ||
174 | */ | ||
175 | if (val < 0) | ||
176 | return 0; | ||
177 | return (unsigned long)val; | ||
178 | } | ||
179 | |||
180 | void sync_mm_rss(struct task_struct *task, struct mm_struct *mm) | ||
181 | { | ||
182 | __sync_task_rss_stat(task, mm); | ||
183 | } | ||
184 | #else | ||
185 | |||
186 | #define inc_mm_counter_fast(mm, member) inc_mm_counter(mm, member) | ||
187 | #define dec_mm_counter_fast(mm, member) dec_mm_counter(mm, member) | ||
188 | |||
189 | static void check_sync_rss_stat(struct task_struct *task) | ||
190 | { | ||
191 | } | ||
192 | |||
193 | #endif | ||
194 | |||
124 | /* | 195 | /* |
125 | * If a p?d_bad entry is found while walking page tables, report | 196 | * If a p?d_bad entry is found while walking page tables, report |
126 | * the error, before resetting entry to p?d_none. Usually (but | 197 | * the error, before resetting entry to p?d_none. Usually (but |
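
The SPLIT_RSS_COUNTING code above caches RSS deltas in the faulting task and only folds them into the shared per-mm counters once more than 64 events have accumulated, trading a slightly stale (and possibly briefly negative) reading for far fewer atomic updates on the fault path. A rough userspace analogue of that scheme, assuming C11 atomics and thread-local storage (the names here are made up, not the kernel's):

    #include <stdatomic.h>
    #include <stdio.h>

    enum { MM_FILEPAGES, MM_ANONPAGES, MM_SWAPENTS, NR_MM_COUNTERS };

    #define RSS_EVENTS_THRESH 64    /* sync roughly once per 64 "faults" */

    /* Shared counters, updated with atomic adds (stands in for mm->rss_stat). */
    static atomic_long shared_rss[NR_MM_COUNTERS];

    /* Per-thread cache of not-yet-published deltas (stands in for task->rss_stat). */
    static _Thread_local long cached_rss[NR_MM_COUNTERS];
    static _Thread_local int rss_events;

    static void sync_rss(void)
    {
        for (int i = 0; i < NR_MM_COUNTERS; i++) {
            if (cached_rss[i]) {
                atomic_fetch_add(&shared_rss[i], cached_rss[i]);
                cached_rss[i] = 0;
            }
        }
        rss_events = 0;
    }

    /* Fast path: no atomics, just bump the local cache. */
    static void add_rss_fast(int member, long val)
    {
        cached_rss[member] += val;
        if (++rss_events > RSS_EVENTS_THRESH)
            sync_rss();
    }

    /* Readers clamp at zero, since unpublished deltas can make the sum lag. */
    static unsigned long get_rss(int member)
    {
        long val = atomic_load(&shared_rss[member]);
        return val < 0 ? 0 : (unsigned long)val;
    }

    int main(void)
    {
        for (int i = 0; i < 1000; i++)
            add_rss_fast(MM_ANONPAGES, 1);      /* simulate anonymous faults */
        printf("published: %lu (cache still holds %ld)\n",
               get_rss(MM_ANONPAGES), cached_rss[MM_ANONPAGES]);
        sync_rss();
        printf("after final sync: %lu\n", get_rss(MM_ANONPAGES));
        return 0;
    }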
@@ -300,7 +371,7 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma, | |||
300 | * Hide vma from rmap and truncate_pagecache before freeing | 371 | * Hide vma from rmap and truncate_pagecache before freeing |
301 | * pgtables | 372 | * pgtables |
302 | */ | 373 | */ |
303 | anon_vma_unlink(vma); | 374 | unlink_anon_vmas(vma); |
304 | unlink_file_vma(vma); | 375 | unlink_file_vma(vma); |
305 | 376 | ||
306 | if (is_vm_hugetlb_page(vma)) { | 377 | if (is_vm_hugetlb_page(vma)) { |
@@ -314,7 +385,7 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma, | |||
314 | && !is_vm_hugetlb_page(next)) { | 385 | && !is_vm_hugetlb_page(next)) { |
315 | vma = next; | 386 | vma = next; |
316 | next = vma->vm_next; | 387 | next = vma->vm_next; |
317 | anon_vma_unlink(vma); | 388 | unlink_anon_vmas(vma); |
318 | unlink_file_vma(vma); | 389 | unlink_file_vma(vma); |
319 | } | 390 | } |
320 | free_pgd_range(tlb, addr, vma->vm_end, | 391 | free_pgd_range(tlb, addr, vma->vm_end, |
@@ -376,12 +447,20 @@ int __pte_alloc_kernel(pmd_t *pmd, unsigned long address) | |||
376 | return 0; | 447 | return 0; |
377 | } | 448 | } |
378 | 449 | ||
379 | static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss) | 450 | static inline void init_rss_vec(int *rss) |
380 | { | 451 | { |
381 | if (file_rss) | 452 | memset(rss, 0, sizeof(int) * NR_MM_COUNTERS); |
382 | add_mm_counter(mm, file_rss, file_rss); | 453 | } |
383 | if (anon_rss) | 454 | |
384 | add_mm_counter(mm, anon_rss, anon_rss); | 455 | static inline void add_mm_rss_vec(struct mm_struct *mm, int *rss) |
456 | { | ||
457 | int i; | ||
458 | |||
459 | if (current->mm == mm) | ||
460 | sync_mm_rss(current, mm); | ||
461 | for (i = 0; i < NR_MM_COUNTERS; i++) | ||
462 | if (rss[i]) | ||
463 | add_mm_counter(mm, i, rss[i]); | ||
385 | } | 464 | } |
386 | 465 | ||
387 | /* | 466 | /* |
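
init_rss_vec()/add_mm_rss_vec() replace the old file_rss/anon_rss pair with a small array indexed by counter type, so the pte loops can tally any number of counter kinds locally and publish the totals once after the scan. A bare-bones sketch of that batching pattern (the page list and the non-atomic shared counters are stand-ins, not kernel structures):

    #include <stdio.h>
    #include <string.h>

    enum { MM_FILEPAGES, MM_ANONPAGES, MM_SWAPENTS, NR_MM_COUNTERS };

    static long mm_counters[NR_MM_COUNTERS];    /* "expensive" shared state */

    static void add_mm_counter(int member, int val)
    {
        mm_counters[member] += val;             /* imagine this being atomic */
    }

    struct fake_page { int anon; };             /* 1 = anonymous, 0 = file-backed */

    static void zap_range(struct fake_page *pages, int n)
    {
        int rss[NR_MM_COUNTERS];

        memset(rss, 0, sizeof(rss));            /* init_rss_vec() */
        for (int i = 0; i < n; i++)             /* tally inside the "locked" loop */
            rss[pages[i].anon ? MM_ANONPAGES : MM_FILEPAGES]--;
        for (int i = 0; i < NR_MM_COUNTERS; i++)    /* add_mm_rss_vec() */
            if (rss[i])
                add_mm_counter(i, rss[i]);
    }

    int main(void)
    {
        struct fake_page pages[] = { {1}, {1}, {0}, {1} };

        zap_range(pages, 4);
        printf("anon %ld, file %ld\n",
               mm_counters[MM_ANONPAGES], mm_counters[MM_FILEPAGES]);
        return 0;
    }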
@@ -430,12 +509,8 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, | |||
430 | "BUG: Bad page map in process %s pte:%08llx pmd:%08llx\n", | 509 | "BUG: Bad page map in process %s pte:%08llx pmd:%08llx\n", |
431 | current->comm, | 510 | current->comm, |
432 | (long long)pte_val(pte), (long long)pmd_val(*pmd)); | 511 | (long long)pte_val(pte), (long long)pmd_val(*pmd)); |
433 | if (page) { | 512 | if (page) |
434 | printk(KERN_ALERT | 513 | dump_page(page); |
435 | "page:%p flags:%p count:%d mapcount:%d mapping:%p index:%lx\n", | ||
436 | page, (void *)page->flags, page_count(page), | ||
437 | page_mapcount(page), page->mapping, page->index); | ||
438 | } | ||
439 | printk(KERN_ALERT | 514 | printk(KERN_ALERT |
440 | "addr:%p vm_flags:%08lx anon_vma:%p mapping:%p index:%lx\n", | 515 | "addr:%p vm_flags:%08lx anon_vma:%p mapping:%p index:%lx\n", |
441 | (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index); | 516 | (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index); |
@@ -597,7 +672,9 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, | |||
597 | &src_mm->mmlist); | 672 | &src_mm->mmlist); |
598 | spin_unlock(&mmlist_lock); | 673 | spin_unlock(&mmlist_lock); |
599 | } | 674 | } |
600 | if (is_write_migration_entry(entry) && | 675 | if (likely(!non_swap_entry(entry))) |
676 | rss[MM_SWAPENTS]++; | ||
677 | else if (is_write_migration_entry(entry) && | ||
601 | is_cow_mapping(vm_flags)) { | 678 | is_cow_mapping(vm_flags)) { |
602 | /* | 679 | /* |
603 | * COW mappings require pages in both parent | 680 | * COW mappings require pages in both parent |
@@ -632,7 +709,10 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, | |||
632 | if (page) { | 709 | if (page) { |
633 | get_page(page); | 710 | get_page(page); |
634 | page_dup_rmap(page); | 711 | page_dup_rmap(page); |
635 | rss[PageAnon(page)]++; | 712 | if (PageAnon(page)) |
713 | rss[MM_ANONPAGES]++; | ||
714 | else | ||
715 | rss[MM_FILEPAGES]++; | ||
636 | } | 716 | } |
637 | 717 | ||
638 | out_set_pte: | 718 | out_set_pte: |
@@ -648,11 +728,12 @@ static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, | |||
648 | pte_t *src_pte, *dst_pte; | 728 | pte_t *src_pte, *dst_pte; |
649 | spinlock_t *src_ptl, *dst_ptl; | 729 | spinlock_t *src_ptl, *dst_ptl; |
650 | int progress = 0; | 730 | int progress = 0; |
651 | int rss[2]; | 731 | int rss[NR_MM_COUNTERS]; |
652 | swp_entry_t entry = (swp_entry_t){0}; | 732 | swp_entry_t entry = (swp_entry_t){0}; |
653 | 733 | ||
654 | again: | 734 | again: |
655 | rss[1] = rss[0] = 0; | 735 | init_rss_vec(rss); |
736 | |||
656 | dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl); | 737 | dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl); |
657 | if (!dst_pte) | 738 | if (!dst_pte) |
658 | return -ENOMEM; | 739 | return -ENOMEM; |
@@ -688,7 +769,7 @@ again: | |||
688 | arch_leave_lazy_mmu_mode(); | 769 | arch_leave_lazy_mmu_mode(); |
689 | spin_unlock(src_ptl); | 770 | spin_unlock(src_ptl); |
690 | pte_unmap_nested(orig_src_pte); | 771 | pte_unmap_nested(orig_src_pte); |
691 | add_mm_rss(dst_mm, rss[0], rss[1]); | 772 | add_mm_rss_vec(dst_mm, rss); |
692 | pte_unmap_unlock(orig_dst_pte, dst_ptl); | 773 | pte_unmap_unlock(orig_dst_pte, dst_ptl); |
693 | cond_resched(); | 774 | cond_resched(); |
694 | 775 | ||
@@ -816,8 +897,9 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, | |||
816 | struct mm_struct *mm = tlb->mm; | 897 | struct mm_struct *mm = tlb->mm; |
817 | pte_t *pte; | 898 | pte_t *pte; |
818 | spinlock_t *ptl; | 899 | spinlock_t *ptl; |
819 | int file_rss = 0; | 900 | int rss[NR_MM_COUNTERS]; |
820 | int anon_rss = 0; | 901 | |
902 | init_rss_vec(rss); | ||
821 | 903 | ||
822 | pte = pte_offset_map_lock(mm, pmd, addr, &ptl); | 904 | pte = pte_offset_map_lock(mm, pmd, addr, &ptl); |
823 | arch_enter_lazy_mmu_mode(); | 905 | arch_enter_lazy_mmu_mode(); |
@@ -863,14 +945,14 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, | |||
863 | set_pte_at(mm, addr, pte, | 945 | set_pte_at(mm, addr, pte, |
864 | pgoff_to_pte(page->index)); | 946 | pgoff_to_pte(page->index)); |
865 | if (PageAnon(page)) | 947 | if (PageAnon(page)) |
866 | anon_rss--; | 948 | rss[MM_ANONPAGES]--; |
867 | else { | 949 | else { |
868 | if (pte_dirty(ptent)) | 950 | if (pte_dirty(ptent)) |
869 | set_page_dirty(page); | 951 | set_page_dirty(page); |
870 | if (pte_young(ptent) && | 952 | if (pte_young(ptent) && |
871 | likely(!VM_SequentialReadHint(vma))) | 953 | likely(!VM_SequentialReadHint(vma))) |
872 | mark_page_accessed(page); | 954 | mark_page_accessed(page); |
873 | file_rss--; | 955 | rss[MM_FILEPAGES]--; |
874 | } | 956 | } |
875 | page_remove_rmap(page); | 957 | page_remove_rmap(page); |
876 | if (unlikely(page_mapcount(page) < 0)) | 958 | if (unlikely(page_mapcount(page) < 0)) |
@@ -887,13 +969,18 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, | |||
887 | if (pte_file(ptent)) { | 969 | if (pte_file(ptent)) { |
888 | if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) | 970 | if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) |
889 | print_bad_pte(vma, addr, ptent, NULL); | 971 | print_bad_pte(vma, addr, ptent, NULL); |
890 | } else if | 972 | } else { |
891 | (unlikely(!free_swap_and_cache(pte_to_swp_entry(ptent)))) | 973 | swp_entry_t entry = pte_to_swp_entry(ptent); |
892 | print_bad_pte(vma, addr, ptent, NULL); | 974 | |
975 | if (!non_swap_entry(entry)) | ||
976 | rss[MM_SWAPENTS]--; | ||
977 | if (unlikely(!free_swap_and_cache(entry))) | ||
978 | print_bad_pte(vma, addr, ptent, NULL); | ||
979 | } | ||
893 | pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); | 980 | pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); |
894 | } while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0)); | 981 | } while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0)); |
895 | 982 | ||
896 | add_mm_rss(mm, file_rss, anon_rss); | 983 | add_mm_rss_vec(mm, rss); |
897 | arch_leave_lazy_mmu_mode(); | 984 | arch_leave_lazy_mmu_mode(); |
898 | pte_unmap_unlock(pte - 1, ptl); | 985 | pte_unmap_unlock(pte - 1, ptl); |
899 | 986 | ||
@@ -1527,7 +1614,7 @@ static int insert_page(struct vm_area_struct *vma, unsigned long addr, | |||
1527 | 1614 | ||
1528 | /* Ok, finally just insert the thing.. */ | 1615 | /* Ok, finally just insert the thing.. */ |
1529 | get_page(page); | 1616 | get_page(page); |
1530 | inc_mm_counter(mm, file_rss); | 1617 | inc_mm_counter_fast(mm, MM_FILEPAGES); |
1531 | page_add_file_rmap(page); | 1618 | page_add_file_rmap(page); |
1532 | set_pte_at(mm, addr, pte, mk_pte(page, prot)); | 1619 | set_pte_at(mm, addr, pte, mk_pte(page, prot)); |
1533 | 1620 | ||
@@ -1593,7 +1680,7 @@ static int insert_pfn(struct vm_area_struct *vma, unsigned long addr, | |||
1593 | /* Ok, finally just insert the thing.. */ | 1680 | /* Ok, finally just insert the thing.. */ |
1594 | entry = pte_mkspecial(pfn_pte(pfn, prot)); | 1681 | entry = pte_mkspecial(pfn_pte(pfn, prot)); |
1595 | set_pte_at(mm, addr, pte, entry); | 1682 | set_pte_at(mm, addr, pte, entry); |
1596 | update_mmu_cache(vma, addr, entry); /* XXX: why not for insert_page? */ | 1683 | update_mmu_cache(vma, addr, pte); /* XXX: why not for insert_page? */ |
1597 | 1684 | ||
1598 | retval = 0; | 1685 | retval = 0; |
1599 | out_unlock: | 1686 | out_unlock: |
@@ -2044,6 +2131,13 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2044 | page_cache_release(old_page); | 2131 | page_cache_release(old_page); |
2045 | } | 2132 | } |
2046 | reuse = reuse_swap_page(old_page); | 2133 | reuse = reuse_swap_page(old_page); |
2134 | if (reuse) | ||
2135 | /* | ||
2136 | * The page is all ours. Move it to our anon_vma so | ||
2137 | * the rmap code will not search our parent or siblings. | ||
2138 | * Protected against the rmap code by the page lock. | ||
2139 | */ | ||
2140 | page_move_anon_rmap(old_page, vma, address); | ||
2047 | unlock_page(old_page); | 2141 | unlock_page(old_page); |
2048 | } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) == | 2142 | } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) == |
2049 | (VM_WRITE|VM_SHARED))) { | 2143 | (VM_WRITE|VM_SHARED))) { |
@@ -2116,7 +2210,7 @@ reuse: | |||
2116 | entry = pte_mkyoung(orig_pte); | 2210 | entry = pte_mkyoung(orig_pte); |
2117 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); | 2211 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); |
2118 | if (ptep_set_access_flags(vma, address, page_table, entry,1)) | 2212 | if (ptep_set_access_flags(vma, address, page_table, entry,1)) |
2119 | update_mmu_cache(vma, address, entry); | 2213 | update_mmu_cache(vma, address, page_table); |
2120 | ret |= VM_FAULT_WRITE; | 2214 | ret |= VM_FAULT_WRITE; |
2121 | goto unlock; | 2215 | goto unlock; |
2122 | } | 2216 | } |
@@ -2163,11 +2257,11 @@ gotten: | |||
2163 | if (likely(pte_same(*page_table, orig_pte))) { | 2257 | if (likely(pte_same(*page_table, orig_pte))) { |
2164 | if (old_page) { | 2258 | if (old_page) { |
2165 | if (!PageAnon(old_page)) { | 2259 | if (!PageAnon(old_page)) { |
2166 | dec_mm_counter(mm, file_rss); | 2260 | dec_mm_counter_fast(mm, MM_FILEPAGES); |
2167 | inc_mm_counter(mm, anon_rss); | 2261 | inc_mm_counter_fast(mm, MM_ANONPAGES); |
2168 | } | 2262 | } |
2169 | } else | 2263 | } else |
2170 | inc_mm_counter(mm, anon_rss); | 2264 | inc_mm_counter_fast(mm, MM_ANONPAGES); |
2171 | flush_cache_page(vma, address, pte_pfn(orig_pte)); | 2265 | flush_cache_page(vma, address, pte_pfn(orig_pte)); |
2172 | entry = mk_pte(new_page, vma->vm_page_prot); | 2266 | entry = mk_pte(new_page, vma->vm_page_prot); |
2173 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); | 2267 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); |
@@ -2185,7 +2279,7 @@ gotten: | |||
2185 | * new page to be mapped directly into the secondary page table. | 2279 | * new page to be mapped directly into the secondary page table. |
2186 | */ | 2280 | */ |
2187 | set_pte_at_notify(mm, address, page_table, entry); | 2281 | set_pte_at_notify(mm, address, page_table, entry); |
2188 | update_mmu_cache(vma, address, entry); | 2282 | update_mmu_cache(vma, address, page_table); |
2189 | if (old_page) { | 2283 | if (old_page) { |
2190 | /* | 2284 | /* |
2191 | * Only after switching the pte to the new page may | 2285 | * Only after switching the pte to the new page may |
@@ -2604,7 +2698,8 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2604 | * discarded at swap_free(). | 2698 | * discarded at swap_free(). |
2605 | */ | 2699 | */ |
2606 | 2700 | ||
2607 | inc_mm_counter(mm, anon_rss); | 2701 | inc_mm_counter_fast(mm, MM_ANONPAGES); |
2702 | dec_mm_counter_fast(mm, MM_SWAPENTS); | ||
2608 | pte = mk_pte(page, vma->vm_page_prot); | 2703 | pte = mk_pte(page, vma->vm_page_prot); |
2609 | if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) { | 2704 | if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) { |
2610 | pte = maybe_mkwrite(pte_mkdirty(pte), vma); | 2705 | pte = maybe_mkwrite(pte_mkdirty(pte), vma); |
@@ -2629,7 +2724,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2629 | } | 2724 | } |
2630 | 2725 | ||
2631 | /* No need to invalidate - it was non-present before */ | 2726 | /* No need to invalidate - it was non-present before */ |
2632 | update_mmu_cache(vma, address, pte); | 2727 | update_mmu_cache(vma, address, page_table); |
2633 | unlock: | 2728 | unlock: |
2634 | pte_unmap_unlock(page_table, ptl); | 2729 | pte_unmap_unlock(page_table, ptl); |
2635 | out: | 2730 | out: |
@@ -2688,13 +2783,13 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2688 | if (!pte_none(*page_table)) | 2783 | if (!pte_none(*page_table)) |
2689 | goto release; | 2784 | goto release; |
2690 | 2785 | ||
2691 | inc_mm_counter(mm, anon_rss); | 2786 | inc_mm_counter_fast(mm, MM_ANONPAGES); |
2692 | page_add_new_anon_rmap(page, vma, address); | 2787 | page_add_new_anon_rmap(page, vma, address); |
2693 | setpte: | 2788 | setpte: |
2694 | set_pte_at(mm, address, page_table, entry); | 2789 | set_pte_at(mm, address, page_table, entry); |
2695 | 2790 | ||
2696 | /* No need to invalidate - it was non-present before */ | 2791 | /* No need to invalidate - it was non-present before */ |
2697 | update_mmu_cache(vma, address, entry); | 2792 | update_mmu_cache(vma, address, page_table); |
2698 | unlock: | 2793 | unlock: |
2699 | pte_unmap_unlock(page_table, ptl); | 2794 | pte_unmap_unlock(page_table, ptl); |
2700 | return 0; | 2795 | return 0; |
@@ -2842,10 +2937,10 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2842 | if (flags & FAULT_FLAG_WRITE) | 2937 | if (flags & FAULT_FLAG_WRITE) |
2843 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); | 2938 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); |
2844 | if (anon) { | 2939 | if (anon) { |
2845 | inc_mm_counter(mm, anon_rss); | 2940 | inc_mm_counter_fast(mm, MM_ANONPAGES); |
2846 | page_add_new_anon_rmap(page, vma, address); | 2941 | page_add_new_anon_rmap(page, vma, address); |
2847 | } else { | 2942 | } else { |
2848 | inc_mm_counter(mm, file_rss); | 2943 | inc_mm_counter_fast(mm, MM_FILEPAGES); |
2849 | page_add_file_rmap(page); | 2944 | page_add_file_rmap(page); |
2850 | if (flags & FAULT_FLAG_WRITE) { | 2945 | if (flags & FAULT_FLAG_WRITE) { |
2851 | dirty_page = page; | 2946 | dirty_page = page; |
@@ -2855,7 +2950,7 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2855 | set_pte_at(mm, address, page_table, entry); | 2950 | set_pte_at(mm, address, page_table, entry); |
2856 | 2951 | ||
2857 | /* no need to invalidate: a not-present page won't be cached */ | 2952 | /* no need to invalidate: a not-present page won't be cached */ |
2858 | update_mmu_cache(vma, address, entry); | 2953 | update_mmu_cache(vma, address, page_table); |
2859 | } else { | 2954 | } else { |
2860 | if (charged) | 2955 | if (charged) |
2861 | mem_cgroup_uncharge_page(page); | 2956 | mem_cgroup_uncharge_page(page); |
@@ -2992,7 +3087,7 @@ static inline int handle_pte_fault(struct mm_struct *mm, | |||
2992 | } | 3087 | } |
2993 | entry = pte_mkyoung(entry); | 3088 | entry = pte_mkyoung(entry); |
2994 | if (ptep_set_access_flags(vma, address, pte, entry, flags & FAULT_FLAG_WRITE)) { | 3089 | if (ptep_set_access_flags(vma, address, pte, entry, flags & FAULT_FLAG_WRITE)) { |
2995 | update_mmu_cache(vma, address, entry); | 3090 | update_mmu_cache(vma, address, pte); |
2996 | } else { | 3091 | } else { |
2997 | /* | 3092 | /* |
2998 | * This is needed only for protection faults but the arch code | 3093 | * This is needed only for protection faults but the arch code |
@@ -3023,6 +3118,9 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
3023 | 3118 | ||
3024 | count_vm_event(PGFAULT); | 3119 | count_vm_event(PGFAULT); |
3025 | 3120 | ||
3121 | /* do counter updates before entering the really critical section. */ | ||
3122 | check_sync_rss_stat(current); | ||
3123 | |||
3026 | if (unlikely(is_vm_hugetlb_page(vma))) | 3124 | if (unlikely(is_vm_hugetlb_page(vma))) |
3027 | return hugetlb_fault(mm, vma, address, flags); | 3125 | return hugetlb_fault(mm, vma, address, flags); |
3028 | 3126 | ||
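
A change repeated throughout this file is that update_mmu_cache() now receives the pte pointer rather than a copied pte value, presumably so the architecture hook can inspect or revalidate the live entry in place. A tiny illustration of the difference between the two calling conventions (plain C, not the real arch hook):

    #include <stdio.h>

    typedef unsigned long pte_t;

    /* Old style: the hook only ever sees the snapshot the caller took. */
    static void update_cache_by_value(pte_t entry)
    {
        printf("cached snapshot value %#lx\n", entry);
    }

    /* New style: the hook can re-read the live entry through the pointer. */
    static void update_cache_by_ptr(pte_t *ptep)
    {
        printf("cached current value %#lx\n", *ptep);
    }

    int main(void)
    {
        pte_t pte = 0x1000;
        pte_t snapshot = pte;

        pte |= 0x2;                     /* entry changes after the snapshot */
        update_cache_by_value(snapshot);
        update_cache_by_ptr(&pte);
        return 0;
    }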
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 030ce8a5bb0e..be211a582930 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c | |||
@@ -28,6 +28,7 @@ | |||
28 | #include <linux/pfn.h> | 28 | #include <linux/pfn.h> |
29 | #include <linux/suspend.h> | 29 | #include <linux/suspend.h> |
30 | #include <linux/mm_inline.h> | 30 | #include <linux/mm_inline.h> |
31 | #include <linux/firmware-map.h> | ||
31 | 32 | ||
32 | #include <asm/tlbflush.h> | 33 | #include <asm/tlbflush.h> |
33 | 34 | ||
@@ -523,6 +524,9 @@ int __ref add_memory(int nid, u64 start, u64 size) | |||
523 | BUG_ON(ret); | 524 | BUG_ON(ret); |
524 | } | 525 | } |
525 | 526 | ||
527 | /* create new memmap entry */ | ||
528 | firmware_map_add_hotplug(start, start + size, "System RAM"); | ||
529 | |||
526 | goto out; | 530 | goto out; |
527 | 531 | ||
528 | error: | 532 | error: |
@@ -684,9 +688,9 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) | |||
684 | if (page_count(page)) | 688 | if (page_count(page)) |
685 | not_managed++; | 689 | not_managed++; |
686 | #ifdef CONFIG_DEBUG_VM | 690 | #ifdef CONFIG_DEBUG_VM |
687 | printk(KERN_INFO "removing from LRU failed" | 691 | printk(KERN_ALERT "removing pfn %lx from LRU failed\n", |
688 | " %lx/%d/%lx\n", | 692 | pfn); |
689 | pfn, page_count(page), page->flags); | 693 | dump_page(page); |
690 | #endif | 694 | #endif |
691 | } | 695 | } |
692 | } | 696 | } |
diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 3cec080faa23..643f66e10187 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c | |||
@@ -563,24 +563,50 @@ static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new) | |||
563 | } | 563 | } |
564 | 564 | ||
565 | /* Step 2: apply policy to a range and do splits. */ | 565 | /* Step 2: apply policy to a range and do splits. */ |
566 | static int mbind_range(struct vm_area_struct *vma, unsigned long start, | 566 | static int mbind_range(struct mm_struct *mm, unsigned long start, |
567 | unsigned long end, struct mempolicy *new) | 567 | unsigned long end, struct mempolicy *new_pol) |
568 | { | 568 | { |
569 | struct vm_area_struct *next; | 569 | struct vm_area_struct *next; |
570 | int err; | 570 | struct vm_area_struct *prev; |
571 | struct vm_area_struct *vma; | ||
572 | int err = 0; | ||
573 | pgoff_t pgoff; | ||
574 | unsigned long vmstart; | ||
575 | unsigned long vmend; | ||
571 | 576 | ||
572 | err = 0; | 577 | vma = find_vma_prev(mm, start, &prev); |
573 | for (; vma && vma->vm_start < end; vma = next) { | 578 | if (!vma || vma->vm_start > start) |
579 | return -EFAULT; | ||
580 | |||
581 | for (; vma && vma->vm_start < end; prev = vma, vma = next) { | ||
574 | next = vma->vm_next; | 582 | next = vma->vm_next; |
575 | if (vma->vm_start < start) | 583 | vmstart = max(start, vma->vm_start); |
576 | err = split_vma(vma->vm_mm, vma, start, 1); | 584 | vmend = min(end, vma->vm_end); |
577 | if (!err && vma->vm_end > end) | 585 | |
578 | err = split_vma(vma->vm_mm, vma, end, 0); | 586 | pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); |
579 | if (!err) | 587 | prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags, |
580 | err = policy_vma(vma, new); | 588 | vma->anon_vma, vma->vm_file, pgoff, new_pol); |
589 | if (prev) { | ||
590 | vma = prev; | ||
591 | next = vma->vm_next; | ||
592 | continue; | ||
593 | } | ||
594 | if (vma->vm_start != vmstart) { | ||
595 | err = split_vma(vma->vm_mm, vma, vmstart, 1); | ||
596 | if (err) | ||
597 | goto out; | ||
598 | } | ||
599 | if (vma->vm_end != vmend) { | ||
600 | err = split_vma(vma->vm_mm, vma, vmend, 0); | ||
601 | if (err) | ||
602 | goto out; | ||
603 | } | ||
604 | err = policy_vma(vma, new_pol); | ||
581 | if (err) | 605 | if (err) |
582 | break; | 606 | goto out; |
583 | } | 607 | } |
608 | |||
609 | out: | ||
584 | return err; | 610 | return err; |
585 | } | 611 | } |
586 | 612 | ||
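
The reworked mbind_range() no longer blindly splits: for each VMA it clamps the request to [vmstart, vmend), first tries to merge the new policy into a neighbouring VMA, and only calls split_vma() when a boundary genuinely cuts through the VMA. A small sketch of the clamp-and-decide arithmetic follows; it assumes fixed 4K pages and an invented range struct, and does not model vma_merge() itself.

    #include <stdio.h>

    #define PAGE_SHIFT 12

    struct range { unsigned long start, end, pgoff; };

    static void apply_policy(struct range *vmas, int n,
                             unsigned long start, unsigned long end)
    {
        for (int i = 0; i < n; i++) {
            struct range *vma = &vmas[i];

            if (vma->end <= start || vma->start >= end)
                continue;                               /* no overlap */

            unsigned long vmstart = start > vma->start ? start : vma->start;
            unsigned long vmend   = end   < vma->end   ? end   : vma->end;
            /* page offset of the clamped start within the mapping */
            unsigned long pgoff = vma->pgoff +
                    ((vmstart - vma->start) >> PAGE_SHIFT);

            if (vma->start != vmstart)
                printf("would split at %#lx\n", vmstart);
            if (vma->end != vmend)
                printf("would split at %#lx\n", vmend);
            printf("policy on %#lx-%#lx (pgoff %lu)\n", vmstart, vmend, pgoff);
        }
    }

    int main(void)
    {
        struct range vmas[] = {
            { 0x10000, 0x20000, 0 },
            { 0x20000, 0x40000, 16 },
        };

        apply_policy(vmas, 2, 0x18000, 0x30000);
        return 0;
    }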
@@ -862,36 +888,36 @@ int do_migrate_pages(struct mm_struct *mm, | |||
862 | if (err) | 888 | if (err) |
863 | goto out; | 889 | goto out; |
864 | 890 | ||
865 | /* | 891 | /* |
866 | * Find a 'source' bit set in 'tmp' whose corresponding 'dest' | 892 | * Find a 'source' bit set in 'tmp' whose corresponding 'dest' |
867 | * bit in 'to' is not also set in 'tmp'. Clear the found 'source' | 893 | * bit in 'to' is not also set in 'tmp'. Clear the found 'source' |
868 | * bit in 'tmp', and return that <source, dest> pair for migration. | 894 | * bit in 'tmp', and return that <source, dest> pair for migration. |
869 | * The pair of nodemasks 'to' and 'from' define the map. | 895 | * The pair of nodemasks 'to' and 'from' define the map. |
870 | * | 896 | * |
871 | * If no pair of bits is found that way, fallback to picking some | 897 | * If no pair of bits is found that way, fallback to picking some |
872 | * pair of 'source' and 'dest' bits that are not the same. If the | 898 | * pair of 'source' and 'dest' bits that are not the same. If the |
873 | * 'source' and 'dest' bits are the same, this represents a node | 899 | * 'source' and 'dest' bits are the same, this represents a node |
874 | * that will be migrating to itself, so no pages need move. | 900 | * that will be migrating to itself, so no pages need move. |
875 | * | 901 | * |
876 | * If no bits are left in 'tmp', or if all remaining bits left | 902 | * If no bits are left in 'tmp', or if all remaining bits left |
877 | * in 'tmp' correspond to the same bit in 'to', return false | 903 | * in 'tmp' correspond to the same bit in 'to', return false |
878 | * (nothing left to migrate). | 904 | * (nothing left to migrate). |
879 | * | 905 | * |
880 | * This lets us pick a pair of nodes to migrate between, such that | 906 | * This lets us pick a pair of nodes to migrate between, such that |
881 | * if possible the dest node is not already occupied by some other | 907 | * if possible the dest node is not already occupied by some other |
882 | * source node, minimizing the risk of overloading the memory on a | 908 | * source node, minimizing the risk of overloading the memory on a |
883 | * node that would happen if we migrated incoming memory to a node | 909 | * node that would happen if we migrated incoming memory to a node |
884 | * before migrating outgoing memory source that same node. | 910 | * before migrating outgoing memory source that same node. |
885 | * | 911 | * |
886 | * A single scan of tmp is sufficient. As we go, we remember the | 912 | * A single scan of tmp is sufficient. As we go, we remember the |
887 | * most recent <s, d> pair that moved (s != d). If we find a pair | 913 | * most recent <s, d> pair that moved (s != d). If we find a pair |
888 | * that not only moved, but what's better, moved to an empty slot | 914 | * that not only moved, but what's better, moved to an empty slot |
889 | * (d is not set in tmp), then we break out then, with that pair. | 915 | * (d is not set in tmp), then we break out then, with that pair. |
890 | * Otherwise when we finish scanning from_tmp, we at least have the | 916 | * Otherwise when we finish scanning from_tmp, we at least have the |
891 | * most recent <s, d> pair that moved. If we get all the way through | 917 | * most recent <s, d> pair that moved. If we get all the way through |
892 | * the scan of tmp without finding any node that moved, much less | 918 | * the scan of tmp without finding any node that moved, much less |
893 | * moved to an empty node, then there is nothing left worth migrating. | 919 | * moved to an empty node, then there is nothing left worth migrating. |
894 | */ | 920 | */ |
895 | 921 | ||
896 | tmp = *from_nodes; | 922 | tmp = *from_nodes; |
897 | while (!nodes_empty(tmp)) { | 923 | while (!nodes_empty(tmp)) { |
@@ -1047,7 +1073,7 @@ static long do_mbind(unsigned long start, unsigned long len, | |||
1047 | if (!IS_ERR(vma)) { | 1073 | if (!IS_ERR(vma)) { |
1048 | int nr_failed = 0; | 1074 | int nr_failed = 0; |
1049 | 1075 | ||
1050 | err = mbind_range(vma, start, end, new); | 1076 | err = mbind_range(mm, start, end, new); |
1051 | 1077 | ||
1052 | if (!list_empty(&pagelist)) | 1078 | if (!list_empty(&pagelist)) |
1053 | nr_failed = migrate_pages(&pagelist, new_vma_page, | 1079 | nr_failed = migrate_pages(&pagelist, new_vma_page, |
diff --git a/mm/migrate.c b/mm/migrate.c index 880bd592d38e..88000b89fc9a 100644 --- a/mm/migrate.c +++ b/mm/migrate.c | |||
@@ -134,7 +134,7 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma, | |||
134 | page_add_file_rmap(new); | 134 | page_add_file_rmap(new); |
135 | 135 | ||
136 | /* No need to invalidate - it was non-present before */ | 136 | /* No need to invalidate - it was non-present before */ |
137 | update_mmu_cache(vma, addr, pte); | 137 | update_mmu_cache(vma, addr, ptep); |
138 | unlock: | 138 | unlock: |
139 | pte_unmap_unlock(ptep, ptl); | 139 | pte_unmap_unlock(ptep, ptl); |
140 | out: | 140 | out: |
@@ -275,8 +275,6 @@ static int migrate_page_move_mapping(struct address_space *mapping, | |||
275 | */ | 275 | */ |
276 | static void migrate_page_copy(struct page *newpage, struct page *page) | 276 | static void migrate_page_copy(struct page *newpage, struct page *page) |
277 | { | 277 | { |
278 | int anon; | ||
279 | |||
280 | copy_highpage(newpage, page); | 278 | copy_highpage(newpage, page); |
281 | 279 | ||
282 | if (PageError(page)) | 280 | if (PageError(page)) |
@@ -313,8 +311,6 @@ static void migrate_page_copy(struct page *newpage, struct page *page) | |||
313 | ClearPageSwapCache(page); | 311 | ClearPageSwapCache(page); |
314 | ClearPagePrivate(page); | 312 | ClearPagePrivate(page); |
315 | set_page_private(page, 0); | 313 | set_page_private(page, 0); |
316 | /* page->mapping contains a flag for PageAnon() */ | ||
317 | anon = PageAnon(page); | ||
318 | page->mapping = NULL; | 314 | page->mapping = NULL; |
319 | 315 | ||
320 | /* | 316 | /* |
diff --git a/mm/mlock.c b/mm/mlock.c index 2b8335a89400..8f4e2dfceec1 100644 --- a/mm/mlock.c +++ b/mm/mlock.c | |||
@@ -25,7 +25,7 @@ int can_do_mlock(void) | |||
25 | { | 25 | { |
26 | if (capable(CAP_IPC_LOCK)) | 26 | if (capable(CAP_IPC_LOCK)) |
27 | return 1; | 27 | return 1; |
28 | if (current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur != 0) | 28 | if (rlimit(RLIMIT_MEMLOCK) != 0) |
29 | return 1; | 29 | return 1; |
30 | return 0; | 30 | return 0; |
31 | } | 31 | } |
@@ -487,7 +487,7 @@ SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len) | |||
487 | locked = len >> PAGE_SHIFT; | 487 | locked = len >> PAGE_SHIFT; |
488 | locked += current->mm->locked_vm; | 488 | locked += current->mm->locked_vm; |
489 | 489 | ||
490 | lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; | 490 | lock_limit = rlimit(RLIMIT_MEMLOCK); |
491 | lock_limit >>= PAGE_SHIFT; | 491 | lock_limit >>= PAGE_SHIFT; |
492 | 492 | ||
493 | /* check against resource limits */ | 493 | /* check against resource limits */ |
@@ -550,7 +550,7 @@ SYSCALL_DEFINE1(mlockall, int, flags) | |||
550 | 550 | ||
551 | down_write(¤t->mm->mmap_sem); | 551 | down_write(¤t->mm->mmap_sem); |
552 | 552 | ||
553 | lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; | 553 | lock_limit = rlimit(RLIMIT_MEMLOCK); |
554 | lock_limit >>= PAGE_SHIFT; | 554 | lock_limit >>= PAGE_SHIFT; |
555 | 555 | ||
556 | ret = -ENOMEM; | 556 | ret = -ENOMEM; |
@@ -584,7 +584,7 @@ int user_shm_lock(size_t size, struct user_struct *user) | |||
584 | int allowed = 0; | 584 | int allowed = 0; |
585 | 585 | ||
586 | locked = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; | 586 | locked = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; |
587 | lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; | 587 | lock_limit = rlimit(RLIMIT_MEMLOCK); |
588 | if (lock_limit == RLIM_INFINITY) | 588 | if (lock_limit == RLIM_INFINITY) |
589 | allowed = 1; | 589 | allowed = 1; |
590 | lock_limit >>= PAGE_SHIFT; | 590 | lock_limit >>= PAGE_SHIFT; |
@@ -618,12 +618,12 @@ int account_locked_memory(struct mm_struct *mm, struct rlimit *rlim, | |||
618 | 618 | ||
619 | down_write(&mm->mmap_sem); | 619 | down_write(&mm->mmap_sem); |
620 | 620 | ||
621 | lim = rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT; | 621 | lim = ACCESS_ONCE(rlim[RLIMIT_AS].rlim_cur) >> PAGE_SHIFT; |
622 | vm = mm->total_vm + pgsz; | 622 | vm = mm->total_vm + pgsz; |
623 | if (lim < vm) | 623 | if (lim < vm) |
624 | goto out; | 624 | goto out; |
625 | 625 | ||
626 | lim = rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT; | 626 | lim = ACCESS_ONCE(rlim[RLIMIT_MEMLOCK].rlim_cur) >> PAGE_SHIFT; |
627 | vm = mm->locked_vm + pgsz; | 627 | vm = mm->locked_vm + pgsz; |
628 | if (lim < vm) | 628 | if (lim < vm) |
629 | goto out; | 629 | goto out; |
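
The hunks above swap the open-coded current->signal->rlim[...].rlim_cur reads for the rlimit() accessor (with ACCESS_ONCE where the slot is still read directly, since the limit can change underneath the caller); the shape of the memlock check itself is unchanged. A userspace rendition of that check using getrlimit(), not the kernel helper, and without the CAP_IPC_LOCK override:

    #include <stdio.h>
    #include <sys/resource.h>
    #include <unistd.h>

    /* Would mlock()ing "len" more bytes stay within RLIMIT_MEMLOCK?
     * already_locked is in bytes; the CAP_IPC_LOCK bypass is not modeled. */
    static int memlock_ok(size_t already_locked, size_t len)
    {
        struct rlimit rl;
        long page = sysconf(_SC_PAGESIZE);
        size_t locked, lock_limit;

        if (getrlimit(RLIMIT_MEMLOCK, &rl) != 0)
            return 0;
        if (rl.rlim_cur == RLIM_INFINITY)
            return 1;

        /* same shape as the kernel check: compare in pages */
        locked = (len + page - 1) / page + already_locked / page;
        lock_limit = rl.rlim_cur / page;
        return locked <= lock_limit;
    }

    int main(void)
    {
        printf("64 KiB more: %s\n",
               memlock_ok(0, 64 * 1024) ? "ok" : "over limit");
        return 0;
    }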
diff --git a/mm/mmap.c b/mm/mmap.c --- a/mm/mmap.c +++ b/mm/mmap.c | |||
@@ -265,7 +265,7 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) | |||
265 | * segment grow beyond its set limit in the case where the limit is | 265 | * segment grow beyond its set limit in the case where the limit is |
266 | * not page aligned -Ram Gupta | 266 | * not page aligned -Ram Gupta |
267 | */ | 267 | */ |
268 | rlim = current->signal->rlim[RLIMIT_DATA].rlim_cur; | 268 | rlim = rlimit(RLIMIT_DATA); |
269 | if (rlim < RLIM_INFINITY && (brk - mm->start_brk) + | 269 | if (rlim < RLIM_INFINITY && (brk - mm->start_brk) + |
270 | (mm->end_data - mm->start_data) > rlim) | 270 | (mm->end_data - mm->start_data) > rlim) |
271 | goto out; | 271 | goto out; |
@@ -437,7 +437,6 @@ __vma_link(struct mm_struct *mm, struct vm_area_struct *vma, | |||
437 | { | 437 | { |
438 | __vma_link_list(mm, vma, prev, rb_parent); | 438 | __vma_link_list(mm, vma, prev, rb_parent); |
439 | __vma_link_rb(mm, vma, rb_link, rb_parent); | 439 | __vma_link_rb(mm, vma, rb_link, rb_parent); |
440 | __anon_vma_link(vma); | ||
441 | } | 440 | } |
442 | 441 | ||
443 | static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma, | 442 | static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma, |
@@ -499,7 +498,7 @@ __vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma, | |||
499 | * are necessary. The "insert" vma (if any) is to be inserted | 498 | * are necessary. The "insert" vma (if any) is to be inserted |
500 | * before we drop the necessary locks. | 499 | * before we drop the necessary locks. |
501 | */ | 500 | */ |
502 | void vma_adjust(struct vm_area_struct *vma, unsigned long start, | 501 | int vma_adjust(struct vm_area_struct *vma, unsigned long start, |
503 | unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert) | 502 | unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert) |
504 | { | 503 | { |
505 | struct mm_struct *mm = vma->vm_mm; | 504 | struct mm_struct *mm = vma->vm_mm; |
@@ -542,6 +541,26 @@ again: remove_next = 1 + (end > next->vm_end); | |||
542 | } | 541 | } |
543 | } | 542 | } |
544 | 543 | ||
544 | /* | ||
545 | * When changing only vma->vm_end, we don't really need anon_vma lock. | ||
546 | */ | ||
547 | if (vma->anon_vma && (insert || importer || start != vma->vm_start)) | ||
548 | anon_vma = vma->anon_vma; | ||
549 | if (anon_vma) { | ||
550 | /* | ||
551 | * Easily overlooked: when mprotect shifts the boundary, | ||
552 | * make sure the expanding vma has anon_vma set if the | ||
553 | * shrinking vma had, to cover any anon pages imported. | ||
554 | */ | ||
555 | if (importer && !importer->anon_vma) { | ||
556 | /* Block reverse map lookups until things are set up. */ | ||
557 | if (anon_vma_clone(importer, vma)) { | ||
558 | return -ENOMEM; | ||
559 | } | ||
560 | importer->anon_vma = anon_vma; | ||
561 | } | ||
562 | } | ||
563 | |||
545 | if (file) { | 564 | if (file) { |
546 | mapping = file->f_mapping; | 565 | mapping = file->f_mapping; |
547 | if (!(vma->vm_flags & VM_NONLINEAR)) | 566 | if (!(vma->vm_flags & VM_NONLINEAR)) |
@@ -567,25 +586,6 @@ again: remove_next = 1 + (end > next->vm_end); | |||
567 | } | 586 | } |
568 | } | 587 | } |
569 | 588 | ||
570 | /* | ||
571 | * When changing only vma->vm_end, we don't really need | ||
572 | * anon_vma lock. | ||
573 | */ | ||
574 | if (vma->anon_vma && (insert || importer || start != vma->vm_start)) | ||
575 | anon_vma = vma->anon_vma; | ||
576 | if (anon_vma) { | ||
577 | spin_lock(&anon_vma->lock); | ||
578 | /* | ||
579 | * Easily overlooked: when mprotect shifts the boundary, | ||
580 | * make sure the expanding vma has anon_vma set if the | ||
581 | * shrinking vma had, to cover any anon pages imported. | ||
582 | */ | ||
583 | if (importer && !importer->anon_vma) { | ||
584 | importer->anon_vma = anon_vma; | ||
585 | __anon_vma_link(importer); | ||
586 | } | ||
587 | } | ||
588 | |||
589 | if (root) { | 589 | if (root) { |
590 | flush_dcache_mmap_lock(mapping); | 590 | flush_dcache_mmap_lock(mapping); |
591 | vma_prio_tree_remove(vma, root); | 591 | vma_prio_tree_remove(vma, root); |
@@ -616,8 +616,6 @@ again: remove_next = 1 + (end > next->vm_end); | |||
616 | __vma_unlink(mm, next, vma); | 616 | __vma_unlink(mm, next, vma); |
617 | if (file) | 617 | if (file) |
618 | __remove_shared_vm_struct(next, file, mapping); | 618 | __remove_shared_vm_struct(next, file, mapping); |
619 | if (next->anon_vma) | ||
620 | __anon_vma_merge(vma, next); | ||
621 | } else if (insert) { | 619 | } else if (insert) { |
622 | /* | 620 | /* |
623 | * split_vma has split insert from vma, and needs | 621 | * split_vma has split insert from vma, and needs |
@@ -627,8 +625,6 @@ again: remove_next = 1 + (end > next->vm_end); | |||
627 | __insert_vm_struct(mm, insert); | 625 | __insert_vm_struct(mm, insert); |
628 | } | 626 | } |
629 | 627 | ||
630 | if (anon_vma) | ||
631 | spin_unlock(&anon_vma->lock); | ||
632 | if (mapping) | 628 | if (mapping) |
633 | spin_unlock(&mapping->i_mmap_lock); | 629 | spin_unlock(&mapping->i_mmap_lock); |
634 | 630 | ||
@@ -638,6 +634,8 @@ again: remove_next = 1 + (end > next->vm_end); | |||
638 | if (next->vm_flags & VM_EXECUTABLE) | 634 | if (next->vm_flags & VM_EXECUTABLE) |
639 | removed_exe_file_vma(mm); | 635 | removed_exe_file_vma(mm); |
640 | } | 636 | } |
637 | if (next->anon_vma) | ||
638 | anon_vma_merge(vma, next); | ||
641 | mm->map_count--; | 639 | mm->map_count--; |
642 | mpol_put(vma_policy(next)); | 640 | mpol_put(vma_policy(next)); |
643 | kmem_cache_free(vm_area_cachep, next); | 641 | kmem_cache_free(vm_area_cachep, next); |
@@ -653,6 +651,8 @@ again: remove_next = 1 + (end > next->vm_end); | |||
653 | } | 651 | } |
654 | 652 | ||
655 | validate_mm(mm); | 653 | validate_mm(mm); |
654 | |||
655 | return 0; | ||
656 | } | 656 | } |
657 | 657 | ||
658 | /* | 658 | /* |
@@ -759,6 +759,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, | |||
759 | { | 759 | { |
760 | pgoff_t pglen = (end - addr) >> PAGE_SHIFT; | 760 | pgoff_t pglen = (end - addr) >> PAGE_SHIFT; |
761 | struct vm_area_struct *area, *next; | 761 | struct vm_area_struct *area, *next; |
762 | int err; | ||
762 | 763 | ||
763 | /* | 764 | /* |
764 | * We later require that vma->vm_flags == vm_flags, | 765 | * We later require that vma->vm_flags == vm_flags, |
@@ -792,11 +793,13 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, | |||
792 | is_mergeable_anon_vma(prev->anon_vma, | 793 | is_mergeable_anon_vma(prev->anon_vma, |
793 | next->anon_vma)) { | 794 | next->anon_vma)) { |
794 | /* cases 1, 6 */ | 795 | /* cases 1, 6 */ |
795 | vma_adjust(prev, prev->vm_start, | 796 | err = vma_adjust(prev, prev->vm_start, |
796 | next->vm_end, prev->vm_pgoff, NULL); | 797 | next->vm_end, prev->vm_pgoff, NULL); |
797 | } else /* cases 2, 5, 7 */ | 798 | } else /* cases 2, 5, 7 */ |
798 | vma_adjust(prev, prev->vm_start, | 799 | err = vma_adjust(prev, prev->vm_start, |
799 | end, prev->vm_pgoff, NULL); | 800 | end, prev->vm_pgoff, NULL); |
801 | if (err) | ||
802 | return NULL; | ||
800 | return prev; | 803 | return prev; |
801 | } | 804 | } |
802 | 805 | ||
@@ -808,11 +811,13 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, | |||
808 | can_vma_merge_before(next, vm_flags, | 811 | can_vma_merge_before(next, vm_flags, |
809 | anon_vma, file, pgoff+pglen)) { | 812 | anon_vma, file, pgoff+pglen)) { |
810 | if (prev && addr < prev->vm_end) /* case 4 */ | 813 | if (prev && addr < prev->vm_end) /* case 4 */ |
811 | vma_adjust(prev, prev->vm_start, | 814 | err = vma_adjust(prev, prev->vm_start, |
812 | addr, prev->vm_pgoff, NULL); | 815 | addr, prev->vm_pgoff, NULL); |
813 | else /* cases 3, 8 */ | 816 | else /* cases 3, 8 */ |
814 | vma_adjust(area, addr, next->vm_end, | 817 | err = vma_adjust(area, addr, next->vm_end, |
815 | next->vm_pgoff - pglen, NULL); | 818 | next->vm_pgoff - pglen, NULL); |
819 | if (err) | ||
820 | return NULL; | ||
816 | return area; | 821 | return area; |
817 | } | 822 | } |
818 | 823 | ||
@@ -967,7 +972,7 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, | |||
967 | unsigned long locked, lock_limit; | 972 | unsigned long locked, lock_limit; |
968 | locked = len >> PAGE_SHIFT; | 973 | locked = len >> PAGE_SHIFT; |
969 | locked += mm->locked_vm; | 974 | locked += mm->locked_vm; |
970 | lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; | 975 | lock_limit = rlimit(RLIMIT_MEMLOCK); |
971 | lock_limit >>= PAGE_SHIFT; | 976 | lock_limit >>= PAGE_SHIFT; |
972 | if (locked > lock_limit && !capable(CAP_IPC_LOCK)) | 977 | if (locked > lock_limit && !capable(CAP_IPC_LOCK)) |
973 | return -EAGAIN; | 978 | return -EAGAIN; |
@@ -1083,6 +1088,30 @@ out: | |||
1083 | return retval; | 1088 | return retval; |
1084 | } | 1089 | } |
1085 | 1090 | ||
1091 | #ifdef __ARCH_WANT_SYS_OLD_MMAP | ||
1092 | struct mmap_arg_struct { | ||
1093 | unsigned long addr; | ||
1094 | unsigned long len; | ||
1095 | unsigned long prot; | ||
1096 | unsigned long flags; | ||
1097 | unsigned long fd; | ||
1098 | unsigned long offset; | ||
1099 | }; | ||
1100 | |||
1101 | SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg) | ||
1102 | { | ||
1103 | struct mmap_arg_struct a; | ||
1104 | |||
1105 | if (copy_from_user(&a, arg, sizeof(a))) | ||
1106 | return -EFAULT; | ||
1107 | if (a.offset & ~PAGE_MASK) | ||
1108 | return -EINVAL; | ||
1109 | |||
1110 | return sys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd, | ||
1111 | a.offset >> PAGE_SHIFT); | ||
1112 | } | ||
1113 | #endif /* __ARCH_WANT_SYS_OLD_MMAP */ | ||
1114 | |||
1086 | /* | 1115 | /* |
1087 | * Some shared mappings will want the pages marked read-only | 1116 | * Some shared mappings will want the pages marked read-only |
1088 | * to track write events. If so, we'll downgrade vm_page_prot | 1117 | * to track write events. If so, we'll downgrade vm_page_prot |
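
The old_mmap entry point added in this hunk serves architectures whose legacy mmap() took a single pointer to a six-field argument block instead of six separate arguments: the struct is copied in, a non-page-aligned offset is rejected, and the call is forwarded as a page offset. A userspace mock of that unpacking (the struct layout mirrors the hunk; the 4K page size and the stub are assumptions):

    #include <errno.h>
    #include <stdio.h>
    #include <string.h>

    #define DEMO_PAGE_SIZE 4096UL   /* assume 4K pages for the demo */

    struct mmap_arg_struct {
        unsigned long addr, len, prot, flags, fd, offset;
    };

    /* Stand-in for sys_mmap_pgoff(): just report what it would be asked to do. */
    static long mmap_pgoff_stub(unsigned long addr, unsigned long len,
                                unsigned long prot, unsigned long flags,
                                unsigned long fd, unsigned long pgoff)
    {
        printf("mmap_pgoff(addr=%#lx, len=%lu, fd=%lu, pgoff=%lu)\n",
               addr, len, fd, pgoff);
        return 0;
    }

    static long old_mmap(const struct mmap_arg_struct *user_arg)
    {
        struct mmap_arg_struct a;

        memcpy(&a, user_arg, sizeof(a));        /* copy_from_user() stand-in */
        if (a.offset & (DEMO_PAGE_SIZE - 1))
            return -EINVAL;                     /* offset must be page aligned */
        return mmap_pgoff_stub(a.addr, a.len, a.prot, a.flags, a.fd,
                               a.offset / DEMO_PAGE_SIZE);
    }

    int main(void)
    {
        struct mmap_arg_struct ok  = { 0, 8192, 3, 1, 4, 4096 };
        struct mmap_arg_struct bad = { 0, 8192, 3, 1, 4, 123 };

        printf("aligned: %ld\n", old_mmap(&ok));
        printf("unaligned: %ld\n", old_mmap(&bad));
        return 0;
    }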
@@ -1205,6 +1234,7 @@ munmap_back: | |||
1205 | vma->vm_flags = vm_flags; | 1234 | vma->vm_flags = vm_flags; |
1206 | vma->vm_page_prot = vm_get_page_prot(vm_flags); | 1235 | vma->vm_page_prot = vm_get_page_prot(vm_flags); |
1207 | vma->vm_pgoff = pgoff; | 1236 | vma->vm_pgoff = pgoff; |
1237 | INIT_LIST_HEAD(&vma->anon_vma_chain); | ||
1208 | 1238 | ||
1209 | if (file) { | 1239 | if (file) { |
1210 | error = -EINVAL; | 1240 | error = -EINVAL; |
@@ -1265,13 +1295,8 @@ out: | |||
1265 | mm->total_vm += len >> PAGE_SHIFT; | 1295 | mm->total_vm += len >> PAGE_SHIFT; |
1266 | vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT); | 1296 | vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT); |
1267 | if (vm_flags & VM_LOCKED) { | 1297 | if (vm_flags & VM_LOCKED) { |
1268 | /* | 1298 | if (!mlock_vma_pages_range(vma, addr, addr + len)) |
1269 | * makes pages present; downgrades, drops, reacquires mmap_sem | 1299 | mm->locked_vm += (len >> PAGE_SHIFT); |
1270 | */ | ||
1271 | long nr_pages = mlock_vma_pages_range(vma, addr, addr + len); | ||
1272 | if (nr_pages < 0) | ||
1273 | return nr_pages; /* vma gone! */ | ||
1274 | mm->locked_vm += (len >> PAGE_SHIFT) - nr_pages; | ||
1275 | } else if ((flags & MAP_POPULATE) && !(flags & MAP_NONBLOCK)) | 1300 | } else if ((flags & MAP_POPULATE) && !(flags & MAP_NONBLOCK)) |
1276 | make_pages_present(addr, addr + len); | 1301 | make_pages_present(addr, addr + len); |
1277 | return addr; | 1302 | return addr; |
@@ -1599,7 +1624,7 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns | |||
1599 | return -ENOMEM; | 1624 | return -ENOMEM; |
1600 | 1625 | ||
1601 | /* Stack limit test */ | 1626 | /* Stack limit test */ |
1602 | if (size > rlim[RLIMIT_STACK].rlim_cur) | 1627 | if (size > ACCESS_ONCE(rlim[RLIMIT_STACK].rlim_cur)) |
1603 | return -ENOMEM; | 1628 | return -ENOMEM; |
1604 | 1629 | ||
1605 | /* mlock limit tests */ | 1630 | /* mlock limit tests */ |
@@ -1607,7 +1632,8 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns | |||
1607 | unsigned long locked; | 1632 | unsigned long locked; |
1608 | unsigned long limit; | 1633 | unsigned long limit; |
1609 | locked = mm->locked_vm + grow; | 1634 | locked = mm->locked_vm + grow; |
1610 | limit = rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT; | 1635 | limit = ACCESS_ONCE(rlim[RLIMIT_MEMLOCK].rlim_cur); |
1636 | limit >>= PAGE_SHIFT; | ||
1611 | if (locked > limit && !capable(CAP_IPC_LOCK)) | 1637 | if (locked > limit && !capable(CAP_IPC_LOCK)) |
1612 | return -ENOMEM; | 1638 | return -ENOMEM; |
1613 | } | 1639 | } |
@@ -1754,8 +1780,7 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr) | |||
1754 | if (!prev || expand_stack(prev, addr)) | 1780 | if (!prev || expand_stack(prev, addr)) |
1755 | return NULL; | 1781 | return NULL; |
1756 | if (prev->vm_flags & VM_LOCKED) { | 1782 | if (prev->vm_flags & VM_LOCKED) { |
1757 | if (mlock_vma_pages_range(prev, addr, prev->vm_end) < 0) | 1783 | mlock_vma_pages_range(prev, addr, prev->vm_end); |
1758 | return NULL; /* vma gone! */ | ||
1759 | } | 1784 | } |
1760 | return prev; | 1785 | return prev; |
1761 | } | 1786 | } |
@@ -1783,8 +1808,7 @@ find_extend_vma(struct mm_struct * mm, unsigned long addr) | |||
1783 | if (expand_stack(vma, addr)) | 1808 | if (expand_stack(vma, addr)) |
1784 | return NULL; | 1809 | return NULL; |
1785 | if (vma->vm_flags & VM_LOCKED) { | 1810 | if (vma->vm_flags & VM_LOCKED) { |
1786 | if (mlock_vma_pages_range(vma, addr, start) < 0) | 1811 | mlock_vma_pages_range(vma, addr, start); |
1787 | return NULL; /* vma gone! */ | ||
1788 | } | 1812 | } |
1789 | return vma; | 1813 | return vma; |
1790 | } | 1814 | } |
@@ -1871,6 +1895,7 @@ static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma, | |||
1871 | { | 1895 | { |
1872 | struct mempolicy *pol; | 1896 | struct mempolicy *pol; |
1873 | struct vm_area_struct *new; | 1897 | struct vm_area_struct *new; |
1898 | int err = -ENOMEM; | ||
1874 | 1899 | ||
1875 | if (is_vm_hugetlb_page(vma) && (addr & | 1900 | if (is_vm_hugetlb_page(vma) && (addr & |
1876 | ~(huge_page_mask(hstate_vma(vma))))) | 1901 | ~(huge_page_mask(hstate_vma(vma))))) |
@@ -1878,11 +1903,13 @@ static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma, | |||
1878 | 1903 | ||
1879 | new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); | 1904 | new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); |
1880 | if (!new) | 1905 | if (!new) |
1881 | return -ENOMEM; | 1906 | goto out_err; |
1882 | 1907 | ||
1883 | /* most fields are the same, copy all, and then fixup */ | 1908 | /* most fields are the same, copy all, and then fixup */ |
1884 | *new = *vma; | 1909 | *new = *vma; |
1885 | 1910 | ||
1911 | INIT_LIST_HEAD(&new->anon_vma_chain); | ||
1912 | |||
1886 | if (new_below) | 1913 | if (new_below) |
1887 | new->vm_end = addr; | 1914 | new->vm_end = addr; |
1888 | else { | 1915 | else { |
@@ -1892,11 +1919,14 @@ static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma, | |||
1892 | 1919 | ||
1893 | pol = mpol_dup(vma_policy(vma)); | 1920 | pol = mpol_dup(vma_policy(vma)); |
1894 | if (IS_ERR(pol)) { | 1921 | if (IS_ERR(pol)) { |
1895 | kmem_cache_free(vm_area_cachep, new); | 1922 | err = PTR_ERR(pol); |
1896 | return PTR_ERR(pol); | 1923 | goto out_free_vma; |
1897 | } | 1924 | } |
1898 | vma_set_policy(new, pol); | 1925 | vma_set_policy(new, pol); |
1899 | 1926 | ||
1927 | if (anon_vma_clone(new, vma)) | ||
1928 | goto out_free_mpol; | ||
1929 | |||
1900 | if (new->vm_file) { | 1930 | if (new->vm_file) { |
1901 | get_file(new->vm_file); | 1931 | get_file(new->vm_file); |
1902 | if (vma->vm_flags & VM_EXECUTABLE) | 1932 | if (vma->vm_flags & VM_EXECUTABLE) |
@@ -1907,12 +1937,28 @@ static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma, | |||
1907 | new->vm_ops->open(new); | 1937 | new->vm_ops->open(new); |
1908 | 1938 | ||
1909 | if (new_below) | 1939 | if (new_below) |
1910 | vma_adjust(vma, addr, vma->vm_end, vma->vm_pgoff + | 1940 | err = vma_adjust(vma, addr, vma->vm_end, vma->vm_pgoff + |
1911 | ((addr - new->vm_start) >> PAGE_SHIFT), new); | 1941 | ((addr - new->vm_start) >> PAGE_SHIFT), new); |
1912 | else | 1942 | else |
1913 | vma_adjust(vma, vma->vm_start, addr, vma->vm_pgoff, new); | 1943 | err = vma_adjust(vma, vma->vm_start, addr, vma->vm_pgoff, new); |
1914 | 1944 | ||
1915 | return 0; | 1945 | /* Success. */ |
1946 | if (!err) | ||
1947 | return 0; | ||
1948 | |||
1949 | /* Clean everything up if vma_adjust failed. */ | ||
1950 | new->vm_ops->close(new); | ||
1951 | if (new->vm_file) { | ||
1952 | if (vma->vm_flags & VM_EXECUTABLE) | ||
1953 | removed_exe_file_vma(mm); | ||
1954 | fput(new->vm_file); | ||
1955 | } | ||
1956 | out_free_mpol: | ||
1957 | mpol_put(pol); | ||
1958 | out_free_vma: | ||
1959 | kmem_cache_free(vm_area_cachep, new); | ||
1960 | out_err: | ||
1961 | return err; | ||
1916 | } | 1962 | } |
1917 | 1963 | ||
1918 | /* | 1964 | /* |
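
Because vma_adjust() can now fail, __split_vma() gained the usual kernel-style goto ladder: each failure point jumps to the label that releases exactly what has been set up so far, unwinding in reverse order of acquisition. A generic illustration of that error-handling pattern, using plain malloc()s rather than VMAs:

    #include <stdio.h>
    #include <stdlib.h>

    static int build_thing(void)
    {
        int err = -1;
        char *vma_copy, *policy;

        vma_copy = malloc(64);
        if (!vma_copy)
            goto out_err;               /* nothing to undo yet */

        policy = malloc(64);
        if (!policy)
            goto out_free_vma;          /* undo the first allocation only */

        if (0 /* imagine the final adjust step failing here */)
            goto out_free_policy;       /* unwind everything, newest first */

        free(policy);                   /* success path for the demo */
        free(vma_copy);
        return 0;

    out_free_policy:
        free(policy);
    out_free_vma:
        free(vma_copy);
    out_err:
        return err;
    }

    int main(void)
    {
        printf("build_thing() = %d\n", build_thing());
        return 0;
    }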
@@ -2074,7 +2120,7 @@ unsigned long do_brk(unsigned long addr, unsigned long len) | |||
2074 | unsigned long locked, lock_limit; | 2120 | unsigned long locked, lock_limit; |
2075 | locked = len >> PAGE_SHIFT; | 2121 | locked = len >> PAGE_SHIFT; |
2076 | locked += mm->locked_vm; | 2122 | locked += mm->locked_vm; |
2077 | lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; | 2123 | lock_limit = rlimit(RLIMIT_MEMLOCK); |
2078 | lock_limit >>= PAGE_SHIFT; | 2124 | lock_limit >>= PAGE_SHIFT; |
2079 | if (locked > lock_limit && !capable(CAP_IPC_LOCK)) | 2125 | if (locked > lock_limit && !capable(CAP_IPC_LOCK)) |
2080 | return -EAGAIN; | 2126 | return -EAGAIN; |
@@ -2122,6 +2168,7 @@ unsigned long do_brk(unsigned long addr, unsigned long len) | |||
2122 | return -ENOMEM; | 2168 | return -ENOMEM; |
2123 | } | 2169 | } |
2124 | 2170 | ||
2171 | INIT_LIST_HEAD(&vma->anon_vma_chain); | ||
2125 | vma->vm_mm = mm; | 2172 | vma->vm_mm = mm; |
2126 | vma->vm_start = addr; | 2173 | vma->vm_start = addr; |
2127 | vma->vm_end = addr + len; | 2174 | vma->vm_end = addr + len; |
@@ -2258,10 +2305,11 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, | |||
2258 | if (new_vma) { | 2305 | if (new_vma) { |
2259 | *new_vma = *vma; | 2306 | *new_vma = *vma; |
2260 | pol = mpol_dup(vma_policy(vma)); | 2307 | pol = mpol_dup(vma_policy(vma)); |
2261 | if (IS_ERR(pol)) { | 2308 | if (IS_ERR(pol)) |
2262 | kmem_cache_free(vm_area_cachep, new_vma); | 2309 | goto out_free_vma; |
2263 | return NULL; | 2310 | INIT_LIST_HEAD(&new_vma->anon_vma_chain); |
2264 | } | 2311 | if (anon_vma_clone(new_vma, vma)) |
2312 | goto out_free_mempol; | ||
2265 | vma_set_policy(new_vma, pol); | 2313 | vma_set_policy(new_vma, pol); |
2266 | new_vma->vm_start = addr; | 2314 | new_vma->vm_start = addr; |
2267 | new_vma->vm_end = addr + len; | 2315 | new_vma->vm_end = addr + len; |
@@ -2277,6 +2325,12 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, | |||
2277 | } | 2325 | } |
2278 | } | 2326 | } |
2279 | return new_vma; | 2327 | return new_vma; |
2328 | |||
2329 | out_free_mempol: | ||
2330 | mpol_put(pol); | ||
2331 | out_free_vma: | ||
2332 | kmem_cache_free(vm_area_cachep, new_vma); | ||
2333 | return NULL; | ||
2280 | } | 2334 | } |
2281 | 2335 | ||
2282 | /* | 2336 | /* |
@@ -2288,7 +2342,7 @@ int may_expand_vm(struct mm_struct *mm, unsigned long npages) | |||
2288 | unsigned long cur = mm->total_vm; /* pages */ | 2342 | unsigned long cur = mm->total_vm; /* pages */ |
2289 | unsigned long lim; | 2343 | unsigned long lim; |
2290 | 2344 | ||
2291 | lim = current->signal->rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT; | 2345 | lim = rlimit(RLIMIT_AS) >> PAGE_SHIFT; |
2292 | 2346 | ||
2293 | if (cur + npages > lim) | 2347 | if (cur + npages > lim) |
2294 | return 0; | 2348 | return 0; |
@@ -2354,6 +2408,7 @@ int install_special_mapping(struct mm_struct *mm, | |||
2354 | if (unlikely(vma == NULL)) | 2408 | if (unlikely(vma == NULL)) |
2355 | return -ENOMEM; | 2409 | return -ENOMEM; |
2356 | 2410 | ||
2411 | INIT_LIST_HEAD(&vma->anon_vma_chain); | ||
2357 | vma->vm_mm = mm; | 2412 | vma->vm_mm = mm; |
2358 | vma->vm_start = addr; | 2413 | vma->vm_start = addr; |
2359 | vma->vm_end = addr + len; | 2414 | vma->vm_end = addr + len; |
@@ -2454,6 +2509,7 @@ static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping) | |||
2454 | int mm_take_all_locks(struct mm_struct *mm) | 2509 | int mm_take_all_locks(struct mm_struct *mm) |
2455 | { | 2510 | { |
2456 | struct vm_area_struct *vma; | 2511 | struct vm_area_struct *vma; |
2512 | struct anon_vma_chain *avc; | ||
2457 | int ret = -EINTR; | 2513 | int ret = -EINTR; |
2458 | 2514 | ||
2459 | BUG_ON(down_read_trylock(&mm->mmap_sem)); | 2515 | BUG_ON(down_read_trylock(&mm->mmap_sem)); |
@@ -2471,7 +2527,8 @@ int mm_take_all_locks(struct mm_struct *mm) | |||
2471 | if (signal_pending(current)) | 2527 | if (signal_pending(current)) |
2472 | goto out_unlock; | 2528 | goto out_unlock; |
2473 | if (vma->anon_vma) | 2529 | if (vma->anon_vma) |
2474 | vm_lock_anon_vma(mm, vma->anon_vma); | 2530 | list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) |
2531 | vm_lock_anon_vma(mm, avc->anon_vma); | ||
2475 | } | 2532 | } |
2476 | 2533 | ||
2477 | ret = 0; | 2534 | ret = 0; |
@@ -2526,13 +2583,15 @@ static void vm_unlock_mapping(struct address_space *mapping) | |||
2526 | void mm_drop_all_locks(struct mm_struct *mm) | 2583 | void mm_drop_all_locks(struct mm_struct *mm) |
2527 | { | 2584 | { |
2528 | struct vm_area_struct *vma; | 2585 | struct vm_area_struct *vma; |
2586 | struct anon_vma_chain *avc; | ||
2529 | 2587 | ||
2530 | BUG_ON(down_read_trylock(&mm->mmap_sem)); | 2588 | BUG_ON(down_read_trylock(&mm->mmap_sem)); |
2531 | BUG_ON(!mutex_is_locked(&mm_all_locks_mutex)); | 2589 | BUG_ON(!mutex_is_locked(&mm_all_locks_mutex)); |
2532 | 2590 | ||
2533 | for (vma = mm->mmap; vma; vma = vma->vm_next) { | 2591 | for (vma = mm->mmap; vma; vma = vma->vm_next) { |
2534 | if (vma->anon_vma) | 2592 | if (vma->anon_vma) |
2535 | vm_unlock_anon_vma(vma->anon_vma); | 2593 | list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) |
2594 | vm_unlock_anon_vma(avc->anon_vma); | ||
2536 | if (vma->vm_file && vma->vm_file->f_mapping) | 2595 | if (vma->vm_file && vma->vm_file->f_mapping) |
2537 | vm_unlock_mapping(vma->vm_file->f_mapping); | 2596 | vm_unlock_mapping(vma->vm_file->f_mapping); |
2538 | } | 2597 | } |
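
The locking loops above no longer take a single vma->anon_vma: each VMA now carries a list of anon_vma_chain entries, and every entry sits on both the VMA's own list (same_vma, walked here) and the anon_vma's list of mappers (same_anon_vma, walked by rmap code such as the memory-failure hunk earlier). A stripped-down model of that double linkage, using plain next pointers instead of the kernel's embedded list_heads:

    #include <stdio.h>
    #include <stdlib.h>

    struct anon_vma;
    struct vma;

    /* One entry ties a single vma to a single anon_vma, like anon_vma_chain. */
    struct chain {
        struct vma *vma;
        struct anon_vma *anon_vma;
        struct chain *next_same_vma;        /* the vma's own list */
        struct chain *next_same_anon_vma;   /* the anon_vma's list of mappers */
    };

    struct vma      { const char *name; struct chain *chain; };
    struct anon_vma { const char *name; struct chain *head;  };

    static void add_chain(struct vma *vma, struct anon_vma *av)
    {
        struct chain *c = calloc(1, sizeof(*c));

        c->vma = vma;
        c->anon_vma = av;
        c->next_same_vma = vma->chain;      vma->chain = c;
        c->next_same_anon_vma = av->head;   av->head = c;
    }

    int main(void)
    {
        struct anon_vma parent = { "parent_av", NULL };
        struct anon_vma child  = { "child_av",  NULL };
        struct vma v = { "child vma", NULL };

        /* a forked child's vma stays linked to the parent's anon_vma too */
        add_chain(&v, &parent);
        add_chain(&v, &child);

        for (struct chain *c = v.chain; c; c = c->next_same_vma)
            printf("%s would lock %s\n", v.name, c->anon_vma->name);
        return 0;
    }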
diff --git a/mm/mmu_context.c b/mm/mmu_context.c index ded9081f4021..0777654147c9 100644 --- a/mm/mmu_context.c +++ b/mm/mmu_context.c | |||
@@ -5,6 +5,7 @@ | |||
5 | 5 | ||
6 | #include <linux/mm.h> | 6 | #include <linux/mm.h> |
7 | #include <linux/mmu_context.h> | 7 | #include <linux/mmu_context.h> |
8 | #include <linux/module.h> | ||
8 | #include <linux/sched.h> | 9 | #include <linux/sched.h> |
9 | 10 | ||
10 | #include <asm/mmu_context.h> | 11 | #include <asm/mmu_context.h> |
@@ -37,6 +38,7 @@ void use_mm(struct mm_struct *mm) | |||
37 | if (active_mm != mm) | 38 | if (active_mm != mm) |
38 | mmdrop(active_mm); | 39 | mmdrop(active_mm); |
39 | } | 40 | } |
41 | EXPORT_SYMBOL_GPL(use_mm); | ||
40 | 42 | ||
41 | /* | 43 | /* |
42 | * unuse_mm | 44 | * unuse_mm |
@@ -56,3 +58,4 @@ void unuse_mm(struct mm_struct *mm) | |||
56 | enter_lazy_tlb(mm, tsk); | 58 | enter_lazy_tlb(mm, tsk); |
57 | task_unlock(tsk); | 59 | task_unlock(tsk); |
58 | } | 60 | } |
61 | EXPORT_SYMBOL_GPL(unuse_mm); | ||
diff --git a/mm/mremap.c b/mm/mremap.c index 845190898d59..e9c75efce609 100644 --- a/mm/mremap.c +++ b/mm/mremap.c | |||
@@ -285,7 +285,7 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr, | |||
285 | if (vma->vm_flags & VM_LOCKED) { | 285 | if (vma->vm_flags & VM_LOCKED) { |
286 | unsigned long locked, lock_limit; | 286 | unsigned long locked, lock_limit; |
287 | locked = mm->locked_vm << PAGE_SHIFT; | 287 | locked = mm->locked_vm << PAGE_SHIFT; |
288 | lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; | 288 | lock_limit = rlimit(RLIMIT_MEMLOCK); |
289 | locked += new_len - old_len; | 289 | locked += new_len - old_len; |
290 | if (locked > lock_limit && !capable(CAP_IPC_LOCK)) | 290 | if (locked > lock_limit && !capable(CAP_IPC_LOCK)) |
291 | goto Eagain; | 291 | goto Eagain; |
@@ -460,8 +460,11 @@ unsigned long do_mremap(unsigned long addr, | |||
460 | if (vma_expandable(vma, new_len - old_len)) { | 460 | if (vma_expandable(vma, new_len - old_len)) { |
461 | int pages = (new_len - old_len) >> PAGE_SHIFT; | 461 | int pages = (new_len - old_len) >> PAGE_SHIFT; |
462 | 462 | ||
463 | vma_adjust(vma, vma->vm_start, | 463 | if (vma_adjust(vma, vma->vm_start, addr + new_len, |
464 | addr + new_len, vma->vm_pgoff, NULL); | 464 | vma->vm_pgoff, NULL)) { |
465 | ret = -ENOMEM; | ||
466 | goto out; | ||
467 | } | ||
465 | 468 | ||
466 | mm->total_vm += pages; | 469 | mm->total_vm += pages; |
467 | vm_stat_account(mm, vma->vm_flags, vma->vm_file, pages); | 470 | vm_stat_account(mm, vma->vm_flags, vma->vm_file, pages); |
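The mremap hunks above replace the open-coded rlim[RLIMIT_MEMLOCK].rlim_cur read with the rlimit() accessor and propagate -ENOMEM when vma_adjust() fails. A small sketch of the same RLIMIT_MEMLOCK check written as a standalone helper (the function name is illustrative):

#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/capability.h>

/* Illustrative helper mirroring the check above: may the current task
 * grow its mlocked footprint by 'extra' bytes? */
static int may_grow_locked_vm(struct mm_struct *mm, unsigned long extra)
{
        unsigned long locked = (mm->locked_vm << PAGE_SHIFT) + extra;

        if (locked > rlimit(RLIMIT_MEMLOCK) && !capable(CAP_IPC_LOCK))
                return -EAGAIN;

        return 0;
}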
diff --git a/mm/nommu.c b/mm/nommu.c index 48a2ecfaf059..605ace8982a8 100644 --- a/mm/nommu.c +++ b/mm/nommu.c | |||
@@ -146,7 +146,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | |||
146 | (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE); | 146 | (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE); |
147 | 147 | ||
148 | for (i = 0; i < nr_pages; i++) { | 148 | for (i = 0; i < nr_pages; i++) { |
149 | vma = find_vma(mm, start); | 149 | vma = find_extend_vma(mm, start); |
150 | if (!vma) | 150 | if (!vma) |
151 | goto finish_or_fault; | 151 | goto finish_or_fault; |
152 | 152 | ||
@@ -764,7 +764,7 @@ EXPORT_SYMBOL(find_vma); | |||
764 | */ | 764 | */ |
765 | struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr) | 765 | struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr) |
766 | { | 766 | { |
767 | return find_vma(mm, addr); | 767 | return find_vma(mm, addr & PAGE_MASK); |
768 | } | 768 | } |
769 | 769 | ||
770 | /* | 770 | /* |
@@ -1209,7 +1209,7 @@ unsigned long do_mmap_pgoff(struct file *file, | |||
1209 | region->vm_flags = vm_flags; | 1209 | region->vm_flags = vm_flags; |
1210 | region->vm_pgoff = pgoff; | 1210 | region->vm_pgoff = pgoff; |
1211 | 1211 | ||
1212 | INIT_LIST_HEAD(&vma->anon_vma_node); | 1212 | INIT_LIST_HEAD(&vma->anon_vma_chain); |
1213 | vma->vm_flags = vm_flags; | 1213 | vma->vm_flags = vm_flags; |
1214 | vma->vm_pgoff = pgoff; | 1214 | vma->vm_pgoff = pgoff; |
1215 | 1215 | ||
@@ -1428,6 +1428,30 @@ out: | |||
1428 | return retval; | 1428 | return retval; |
1429 | } | 1429 | } |
1430 | 1430 | ||
1431 | #ifdef __ARCH_WANT_SYS_OLD_MMAP | ||
1432 | struct mmap_arg_struct { | ||
1433 | unsigned long addr; | ||
1434 | unsigned long len; | ||
1435 | unsigned long prot; | ||
1436 | unsigned long flags; | ||
1437 | unsigned long fd; | ||
1438 | unsigned long offset; | ||
1439 | }; | ||
1440 | |||
1441 | SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg) | ||
1442 | { | ||
1443 | struct mmap_arg_struct a; | ||
1444 | |||
1445 | if (copy_from_user(&a, arg, sizeof(a))) | ||
1446 | return -EFAULT; | ||
1447 | if (a.offset & ~PAGE_MASK) | ||
1448 | return -EINVAL; | ||
1449 | |||
1450 | return sys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd, | ||
1451 | a.offset >> PAGE_SHIFT); | ||
1452 | } | ||
1453 | #endif /* __ARCH_WANT_SYS_OLD_MMAP */ | ||
1454 | |||
1431 | /* | 1455 | /* |
1432 | * split a vma into two pieces at address 'addr', a new vma is allocated either | 1456 | * split a vma into two pieces at address 'addr', a new vma is allocated either |
1433 | * for the first part or the tail. | 1457 | * for the first part or the tail. |
diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 237050478f28..9b223af6a147 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c | |||
@@ -401,8 +401,8 @@ static void __oom_kill_task(struct task_struct *p, int verbose) | |||
401 | "vsz:%lukB, anon-rss:%lukB, file-rss:%lukB\n", | 401 | "vsz:%lukB, anon-rss:%lukB, file-rss:%lukB\n", |
402 | task_pid_nr(p), p->comm, | 402 | task_pid_nr(p), p->comm, |
403 | K(p->mm->total_vm), | 403 | K(p->mm->total_vm), |
404 | K(get_mm_counter(p->mm, anon_rss)), | 404 | K(get_mm_counter(p->mm, MM_ANONPAGES)), |
405 | K(get_mm_counter(p->mm, file_rss))); | 405 | K(get_mm_counter(p->mm, MM_FILEPAGES))); |
406 | task_unlock(p); | 406 | task_unlock(p); |
407 | 407 | ||
408 | /* | 408 | /* |
@@ -473,6 +473,8 @@ void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask) | |||
473 | unsigned long points = 0; | 473 | unsigned long points = 0; |
474 | struct task_struct *p; | 474 | struct task_struct *p; |
475 | 475 | ||
476 | if (sysctl_panic_on_oom == 2) | ||
477 | panic("out of memory(memcg). panic_on_oom is selected.\n"); | ||
476 | read_lock(&tasklist_lock); | 478 | read_lock(&tasklist_lock); |
477 | retry: | 479 | retry: |
478 | p = select_bad_process(&points, mem); | 480 | p = select_bad_process(&points, mem); |
@@ -601,13 +603,6 @@ void pagefault_out_of_memory(void) | |||
601 | /* Got some memory back in the last second. */ | 603 | /* Got some memory back in the last second. */ |
602 | return; | 604 | return; |
603 | 605 | ||
604 | /* | ||
605 | * If this is from memcg, oom-killer is already invoked. | ||
606 | * and not worth to go system-wide-oom. | ||
607 | */ | ||
608 | if (mem_cgroup_oom_called(current)) | ||
609 | goto rest_and_return; | ||
610 | |||
611 | if (sysctl_panic_on_oom) | 606 | if (sysctl_panic_on_oom) |
612 | panic("out of memory from page fault. panic_on_oom is selected.\n"); | 607 | panic("out of memory from page fault. panic_on_oom is selected.\n"); |
613 | 608 | ||
@@ -619,7 +614,6 @@ void pagefault_out_of_memory(void) | |||
619 | * Give "p" a good chance of killing itself before we | 614 | * Give "p" a good chance of killing itself before we |
620 | * retry to allocate memory. | 615 | * retry to allocate memory. |
621 | */ | 616 | */ |
622 | rest_and_return: | ||
623 | if (!test_thread_flag(TIF_MEMDIE)) | 617 | if (!test_thread_flag(TIF_MEMDIE)) |
624 | schedule_timeout_uninterruptible(1); | 618 | schedule_timeout_uninterruptible(1); |
625 | } | 619 | } |
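The oom-killer hunks reflect the switch from the old anon_rss/file_rss fields to indexed mm counters (MM_ANONPAGES and MM_FILEPAGES here; the rmap hunks later also use the new MM_SWAPENTS). A minimal sketch of reading them through get_mm_counter(); the helper is hypothetical and mirrors what get_mm_rss() provides:

#include <linux/mm.h>
#include <linux/mm_types.h>

/* Hypothetical helper: resident set size in pages, built from the same
 * indexed counters the oom report above now prints. */
static unsigned long mm_rss_pages(struct mm_struct *mm)
{
        return get_mm_counter(mm, MM_ANONPAGES) +
               get_mm_counter(mm, MM_FILEPAGES);
}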
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8deb9d0fd5b1..d03c946d5566 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
@@ -50,6 +50,7 @@ | |||
50 | #include <linux/kmemleak.h> | 50 | #include <linux/kmemleak.h> |
51 | #include <linux/memory.h> | 51 | #include <linux/memory.h> |
52 | #include <trace/events/kmem.h> | 52 | #include <trace/events/kmem.h> |
53 | #include <linux/ftrace_event.h> | ||
53 | 54 | ||
54 | #include <asm/tlbflush.h> | 55 | #include <asm/tlbflush.h> |
55 | #include <asm/div64.h> | 56 | #include <asm/div64.h> |
@@ -76,6 +77,31 @@ unsigned long totalreserve_pages __read_mostly; | |||
76 | int percpu_pagelist_fraction; | 77 | int percpu_pagelist_fraction; |
77 | gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK; | 78 | gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK; |
78 | 79 | ||
80 | #ifdef CONFIG_PM_SLEEP | ||
81 | /* | ||
82 | * The following functions are used by the suspend/hibernate code to temporarily | ||
83 | * change gfp_allowed_mask in order to avoid using I/O during memory allocations | ||
84 | * while devices are suspended. To avoid races with the suspend/hibernate code, | ||
85 | * they should always be called with pm_mutex held (gfp_allowed_mask also should | ||
86 | * only be modified with pm_mutex held, unless the suspend/hibernate code is | ||
87 | * guaranteed not to run in parallel with that modification). | ||
88 | */ | ||
89 | void set_gfp_allowed_mask(gfp_t mask) | ||
90 | { | ||
91 | WARN_ON(!mutex_is_locked(&pm_mutex)); | ||
92 | gfp_allowed_mask = mask; | ||
93 | } | ||
94 | |||
95 | gfp_t clear_gfp_allowed_mask(gfp_t mask) | ||
96 | { | ||
97 | gfp_t ret = gfp_allowed_mask; | ||
98 | |||
99 | WARN_ON(!mutex_is_locked(&pm_mutex)); | ||
100 | gfp_allowed_mask &= ~mask; | ||
101 | return ret; | ||
102 | } | ||
103 | #endif /* CONFIG_PM_SLEEP */ | ||
104 | |||
79 | #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE | 105 | #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE |
80 | int pageblock_order __read_mostly; | 106 | int pageblock_order __read_mostly; |
81 | #endif | 107 | #endif |
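The comment in the new CONFIG_PM_SLEEP block spells out the calling contract for set_gfp_allowed_mask()/clear_gfp_allowed_mask(). A sketch of how a suspend path might bracket the window in which devices are quiesced, masking off I/O and filesystem allocations and restoring the saved mask on resume; the function names are illustrative and both are assumed to run with pm_mutex held, as the comment requires:

#include <linux/gfp.h>

static gfp_t saved_gfp_mask;

/* Called with pm_mutex held, before devices are suspended. */
static void example_pm_restrict_gfp_mask(void)
{
        saved_gfp_mask = clear_gfp_allowed_mask(__GFP_IO | __GFP_FS);
}

/* Called with pm_mutex held, after devices are resumed. */
static void example_pm_restore_gfp_mask(void)
{
        set_gfp_allowed_mask(saved_gfp_mask);
}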
@@ -263,10 +289,7 @@ static void bad_page(struct page *page) | |||
263 | 289 | ||
264 | printk(KERN_ALERT "BUG: Bad page state in process %s pfn:%05lx\n", | 290 | printk(KERN_ALERT "BUG: Bad page state in process %s pfn:%05lx\n", |
265 | current->comm, page_to_pfn(page)); | 291 | current->comm, page_to_pfn(page)); |
266 | printk(KERN_ALERT | 292 | dump_page(page); |
267 | "page:%p flags:%p count:%d mapcount:%d mapping:%p index:%lx\n", | ||
268 | page, (void *)page->flags, page_count(page), | ||
269 | page_mapcount(page), page->mapping, page->index); | ||
270 | 293 | ||
271 | dump_stack(); | 294 | dump_stack(); |
272 | out: | 295 | out: |
@@ -530,7 +553,7 @@ static void free_pcppages_bulk(struct zone *zone, int count, | |||
530 | int batch_free = 0; | 553 | int batch_free = 0; |
531 | 554 | ||
532 | spin_lock(&zone->lock); | 555 | spin_lock(&zone->lock); |
533 | zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE); | 556 | zone->all_unreclaimable = 0; |
534 | zone->pages_scanned = 0; | 557 | zone->pages_scanned = 0; |
535 | 558 | ||
536 | __mod_zone_page_state(zone, NR_FREE_PAGES, count); | 559 | __mod_zone_page_state(zone, NR_FREE_PAGES, count); |
@@ -568,7 +591,7 @@ static void free_one_page(struct zone *zone, struct page *page, int order, | |||
568 | int migratetype) | 591 | int migratetype) |
569 | { | 592 | { |
570 | spin_lock(&zone->lock); | 593 | spin_lock(&zone->lock); |
571 | zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE); | 594 | zone->all_unreclaimable = 0; |
572 | zone->pages_scanned = 0; | 595 | zone->pages_scanned = 0; |
573 | 596 | ||
574 | __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order); | 597 | __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order); |
@@ -583,6 +606,7 @@ static void __free_pages_ok(struct page *page, unsigned int order) | |||
583 | int bad = 0; | 606 | int bad = 0; |
584 | int wasMlocked = __TestClearPageMlocked(page); | 607 | int wasMlocked = __TestClearPageMlocked(page); |
585 | 608 | ||
609 | trace_mm_page_free_direct(page, order); | ||
586 | kmemcheck_free_shadow(page, order); | 610 | kmemcheck_free_shadow(page, order); |
587 | 611 | ||
588 | for (i = 0 ; i < (1 << order) ; ++i) | 612 | for (i = 0 ; i < (1 << order) ; ++i) |
@@ -1009,10 +1033,10 @@ static void drain_pages(unsigned int cpu) | |||
1009 | struct per_cpu_pageset *pset; | 1033 | struct per_cpu_pageset *pset; |
1010 | struct per_cpu_pages *pcp; | 1034 | struct per_cpu_pages *pcp; |
1011 | 1035 | ||
1012 | pset = zone_pcp(zone, cpu); | 1036 | local_irq_save(flags); |
1037 | pset = per_cpu_ptr(zone->pageset, cpu); | ||
1013 | 1038 | ||
1014 | pcp = &pset->pcp; | 1039 | pcp = &pset->pcp; |
1015 | local_irq_save(flags); | ||
1016 | free_pcppages_bulk(zone, pcp->count, pcp); | 1040 | free_pcppages_bulk(zone, pcp->count, pcp); |
1017 | pcp->count = 0; | 1041 | pcp->count = 0; |
1018 | local_irq_restore(flags); | 1042 | local_irq_restore(flags); |
@@ -1073,8 +1097,9 @@ void mark_free_pages(struct zone *zone) | |||
1073 | 1097 | ||
1074 | /* | 1098 | /* |
1075 | * Free a 0-order page | 1099 | * Free a 0-order page |
1100 | * cold == 1 ? free a cold page : free a hot page | ||
1076 | */ | 1101 | */ |
1077 | static void free_hot_cold_page(struct page *page, int cold) | 1102 | void free_hot_cold_page(struct page *page, int cold) |
1078 | { | 1103 | { |
1079 | struct zone *zone = page_zone(page); | 1104 | struct zone *zone = page_zone(page); |
1080 | struct per_cpu_pages *pcp; | 1105 | struct per_cpu_pages *pcp; |
@@ -1082,6 +1107,7 @@ static void free_hot_cold_page(struct page *page, int cold) | |||
1082 | int migratetype; | 1107 | int migratetype; |
1083 | int wasMlocked = __TestClearPageMlocked(page); | 1108 | int wasMlocked = __TestClearPageMlocked(page); |
1084 | 1109 | ||
1110 | trace_mm_page_free_direct(page, 0); | ||
1085 | kmemcheck_free_shadow(page, 0); | 1111 | kmemcheck_free_shadow(page, 0); |
1086 | 1112 | ||
1087 | if (PageAnon(page)) | 1113 | if (PageAnon(page)) |
@@ -1096,7 +1122,6 @@ static void free_hot_cold_page(struct page *page, int cold) | |||
1096 | arch_free_page(page, 0); | 1122 | arch_free_page(page, 0); |
1097 | kernel_map_pages(page, 1, 0); | 1123 | kernel_map_pages(page, 1, 0); |
1098 | 1124 | ||
1099 | pcp = &zone_pcp(zone, get_cpu())->pcp; | ||
1100 | migratetype = get_pageblock_migratetype(page); | 1125 | migratetype = get_pageblock_migratetype(page); |
1101 | set_page_private(page, migratetype); | 1126 | set_page_private(page, migratetype); |
1102 | local_irq_save(flags); | 1127 | local_irq_save(flags); |
@@ -1119,6 +1144,7 @@ static void free_hot_cold_page(struct page *page, int cold) | |||
1119 | migratetype = MIGRATE_MOVABLE; | 1144 | migratetype = MIGRATE_MOVABLE; |
1120 | } | 1145 | } |
1121 | 1146 | ||
1147 | pcp = &this_cpu_ptr(zone->pageset)->pcp; | ||
1122 | if (cold) | 1148 | if (cold) |
1123 | list_add_tail(&page->lru, &pcp->lists[migratetype]); | 1149 | list_add_tail(&page->lru, &pcp->lists[migratetype]); |
1124 | else | 1150 | else |
@@ -1131,15 +1157,8 @@ static void free_hot_cold_page(struct page *page, int cold) | |||
1131 | 1157 | ||
1132 | out: | 1158 | out: |
1133 | local_irq_restore(flags); | 1159 | local_irq_restore(flags); |
1134 | put_cpu(); | ||
1135 | } | 1160 | } |
1136 | 1161 | ||
1137 | void free_hot_page(struct page *page) | ||
1138 | { | ||
1139 | trace_mm_page_free_direct(page, 0); | ||
1140 | free_hot_cold_page(page, 0); | ||
1141 | } | ||
1142 | |||
1143 | /* | 1162 | /* |
1144 | * split_page takes a non-compound higher-order page, and splits it into | 1163 | * split_page takes a non-compound higher-order page, and splits it into |
1145 | * n (1<<order) sub-pages: page[0..n] | 1164 | * n (1<<order) sub-pages: page[0..n] |
@@ -1181,17 +1200,15 @@ struct page *buffered_rmqueue(struct zone *preferred_zone, | |||
1181 | unsigned long flags; | 1200 | unsigned long flags; |
1182 | struct page *page; | 1201 | struct page *page; |
1183 | int cold = !!(gfp_flags & __GFP_COLD); | 1202 | int cold = !!(gfp_flags & __GFP_COLD); |
1184 | int cpu; | ||
1185 | 1203 | ||
1186 | again: | 1204 | again: |
1187 | cpu = get_cpu(); | ||
1188 | if (likely(order == 0)) { | 1205 | if (likely(order == 0)) { |
1189 | struct per_cpu_pages *pcp; | 1206 | struct per_cpu_pages *pcp; |
1190 | struct list_head *list; | 1207 | struct list_head *list; |
1191 | 1208 | ||
1192 | pcp = &zone_pcp(zone, cpu)->pcp; | ||
1193 | list = &pcp->lists[migratetype]; | ||
1194 | local_irq_save(flags); | 1209 | local_irq_save(flags); |
1210 | pcp = &this_cpu_ptr(zone->pageset)->pcp; | ||
1211 | list = &pcp->lists[migratetype]; | ||
1195 | if (list_empty(list)) { | 1212 | if (list_empty(list)) { |
1196 | pcp->count += rmqueue_bulk(zone, 0, | 1213 | pcp->count += rmqueue_bulk(zone, 0, |
1197 | pcp->batch, list, | 1214 | pcp->batch, list, |
@@ -1232,7 +1249,6 @@ again: | |||
1232 | __count_zone_vm_events(PGALLOC, zone, 1 << order); | 1249 | __count_zone_vm_events(PGALLOC, zone, 1 << order); |
1233 | zone_statistics(preferred_zone, zone); | 1250 | zone_statistics(preferred_zone, zone); |
1234 | local_irq_restore(flags); | 1251 | local_irq_restore(flags); |
1235 | put_cpu(); | ||
1236 | 1252 | ||
1237 | VM_BUG_ON(bad_range(zone, page)); | 1253 | VM_BUG_ON(bad_range(zone, page)); |
1238 | if (prep_new_page(page, order, gfp_flags)) | 1254 | if (prep_new_page(page, order, gfp_flags)) |
@@ -1241,7 +1257,6 @@ again: | |||
1241 | 1257 | ||
1242 | failed: | 1258 | failed: |
1243 | local_irq_restore(flags); | 1259 | local_irq_restore(flags); |
1244 | put_cpu(); | ||
1245 | return NULL; | 1260 | return NULL; |
1246 | } | 1261 | } |
1247 | 1262 | ||
@@ -2013,9 +2028,8 @@ void __pagevec_free(struct pagevec *pvec) | |||
2013 | void __free_pages(struct page *page, unsigned int order) | 2028 | void __free_pages(struct page *page, unsigned int order) |
2014 | { | 2029 | { |
2015 | if (put_page_testzero(page)) { | 2030 | if (put_page_testzero(page)) { |
2016 | trace_mm_page_free_direct(page, order); | ||
2017 | if (order == 0) | 2031 | if (order == 0) |
2018 | free_hot_page(page); | 2032 | free_hot_cold_page(page, 0); |
2019 | else | 2033 | else |
2020 | __free_pages_ok(page, order); | 2034 | __free_pages_ok(page, order); |
2021 | } | 2035 | } |
@@ -2180,7 +2194,7 @@ void show_free_areas(void) | |||
2180 | for_each_online_cpu(cpu) { | 2194 | for_each_online_cpu(cpu) { |
2181 | struct per_cpu_pageset *pageset; | 2195 | struct per_cpu_pageset *pageset; |
2182 | 2196 | ||
2183 | pageset = zone_pcp(zone, cpu); | 2197 | pageset = per_cpu_ptr(zone->pageset, cpu); |
2184 | 2198 | ||
2185 | printk("CPU %4d: hi:%5d, btch:%4d usd:%4d\n", | 2199 | printk("CPU %4d: hi:%5d, btch:%4d usd:%4d\n", |
2186 | cpu, pageset->pcp.high, | 2200 | cpu, pageset->pcp.high, |
@@ -2271,7 +2285,7 @@ void show_free_areas(void) | |||
2271 | K(zone_page_state(zone, NR_BOUNCE)), | 2285 | K(zone_page_state(zone, NR_BOUNCE)), |
2272 | K(zone_page_state(zone, NR_WRITEBACK_TEMP)), | 2286 | K(zone_page_state(zone, NR_WRITEBACK_TEMP)), |
2273 | zone->pages_scanned, | 2287 | zone->pages_scanned, |
2274 | (zone_is_all_unreclaimable(zone) ? "yes" : "no") | 2288 | (zone->all_unreclaimable ? "yes" : "no") |
2275 | ); | 2289 | ); |
2276 | printk("lowmem_reserve[]:"); | 2290 | printk("lowmem_reserve[]:"); |
2277 | for (i = 0; i < MAX_NR_ZONES; i++) | 2291 | for (i = 0; i < MAX_NR_ZONES; i++) |
@@ -2745,10 +2759,29 @@ static void build_zonelist_cache(pg_data_t *pgdat) | |||
2745 | 2759 | ||
2746 | #endif /* CONFIG_NUMA */ | 2760 | #endif /* CONFIG_NUMA */ |
2747 | 2761 | ||
2762 | /* | ||
2763 | * Boot pageset table. One per cpu which is going to be used for all | ||
2764 | * zones and all nodes. The parameters will be set in such a way | ||
2765 | * that an item put on a list will immediately be handed over to | ||
2766 | * the buddy list. This is safe since pageset manipulation is done | ||
2767 | * with interrupts disabled. | ||
2768 | * | ||
2769 | * The boot_pagesets must be kept even after bootup is complete for | ||
2770 | * unused processors and/or zones. They do play a role for bootstrapping | ||
2771 | * hotplugged processors. | ||
2772 | * | ||
2773 | * zoneinfo_show() and maybe other functions do | ||
2774 | * not check if the processor is online before following the pageset pointer. | ||
2775 | * Other parts of the kernel may not check if the zone is available. | ||
2776 | */ | ||
2777 | static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch); | ||
2778 | static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset); | ||
2779 | |||
2748 | /* return values int ....just for stop_machine() */ | 2780 | /* return values int ....just for stop_machine() */ |
2749 | static int __build_all_zonelists(void *dummy) | 2781 | static int __build_all_zonelists(void *dummy) |
2750 | { | 2782 | { |
2751 | int nid; | 2783 | int nid; |
2784 | int cpu; | ||
2752 | 2785 | ||
2753 | #ifdef CONFIG_NUMA | 2786 | #ifdef CONFIG_NUMA |
2754 | memset(node_load, 0, sizeof(node_load)); | 2787 | memset(node_load, 0, sizeof(node_load)); |
@@ -2759,6 +2792,23 @@ static int __build_all_zonelists(void *dummy) | |||
2759 | build_zonelists(pgdat); | 2792 | build_zonelists(pgdat); |
2760 | build_zonelist_cache(pgdat); | 2793 | build_zonelist_cache(pgdat); |
2761 | } | 2794 | } |
2795 | |||
2796 | /* | ||
2797 | * Initialize the boot_pagesets that are going to be used | ||
2798 | * for bootstrapping processors. The real pagesets for | ||
2799 | * each zone will be allocated later when the per cpu | ||
2800 | * allocator is available. | ||
2801 | * | ||
2802 | * boot_pagesets are used also for bootstrapping offline | ||
2803 | * cpus if the system is already booted because the pagesets | ||
2804 | * are needed to initialize allocators on a specific cpu too. | ||
2805 | * F.e. the percpu allocator needs the page allocator which | ||
2806 | * needs the percpu allocator in order to allocate its pagesets | ||
2807 | * (a chicken-egg dilemma). | ||
2808 | */ | ||
2809 | for_each_possible_cpu(cpu) | ||
2810 | setup_pageset(&per_cpu(boot_pageset, cpu), 0); | ||
2811 | |||
2762 | return 0; | 2812 | return 0; |
2763 | } | 2813 | } |
2764 | 2814 | ||
@@ -3096,121 +3146,33 @@ static void setup_pagelist_highmark(struct per_cpu_pageset *p, | |||
3096 | pcp->batch = PAGE_SHIFT * 8; | 3146 | pcp->batch = PAGE_SHIFT * 8; |
3097 | } | 3147 | } |
3098 | 3148 | ||
3099 | |||
3100 | #ifdef CONFIG_NUMA | ||
3101 | /* | ||
3102 | * Boot pageset table. One per cpu which is going to be used for all | ||
3103 | * zones and all nodes. The parameters will be set in such a way | ||
3104 | * that an item put on a list will immediately be handed over to | ||
3105 | * the buddy list. This is safe since pageset manipulation is done | ||
3106 | * with interrupts disabled. | ||
3107 | * | ||
3108 | * Some NUMA counter updates may also be caught by the boot pagesets. | ||
3109 | * | ||
3110 | * The boot_pagesets must be kept even after bootup is complete for | ||
3111 | * unused processors and/or zones. They do play a role for bootstrapping | ||
3112 | * hotplugged processors. | ||
3113 | * | ||
3114 | * zoneinfo_show() and maybe other functions do | ||
3115 | * not check if the processor is online before following the pageset pointer. | ||
3116 | * Other parts of the kernel may not check if the zone is available. | ||
3117 | */ | ||
3118 | static struct per_cpu_pageset boot_pageset[NR_CPUS]; | ||
3119 | |||
3120 | /* | 3149 | /* |
3121 | * Dynamically allocate memory for the | 3150 | * Allocate per cpu pagesets and initialize them. |
3122 | * per cpu pageset array in struct zone. | 3151 | * Before this call only boot pagesets were available. |
3152 | * Boot pagesets will no longer be used by this processor | ||
3153 | * after setup_per_cpu_pageset(). | ||
3123 | */ | 3154 | */ |
3124 | static int __cpuinit process_zones(int cpu) | 3155 | void __init setup_per_cpu_pageset(void) |
3125 | { | 3156 | { |
3126 | struct zone *zone, *dzone; | 3157 | struct zone *zone; |
3127 | int node = cpu_to_node(cpu); | 3158 | int cpu; |
3128 | |||
3129 | node_set_state(node, N_CPU); /* this node has a cpu */ | ||
3130 | 3159 | ||
3131 | for_each_populated_zone(zone) { | 3160 | for_each_populated_zone(zone) { |
3132 | zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset), | 3161 | zone->pageset = alloc_percpu(struct per_cpu_pageset); |
3133 | GFP_KERNEL, node); | ||
3134 | if (!zone_pcp(zone, cpu)) | ||
3135 | goto bad; | ||
3136 | 3162 | ||
3137 | setup_pageset(zone_pcp(zone, cpu), zone_batchsize(zone)); | 3163 | for_each_possible_cpu(cpu) { |
3164 | struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu); | ||
3138 | 3165 | ||
3139 | if (percpu_pagelist_fraction) | 3166 | setup_pageset(pcp, zone_batchsize(zone)); |
3140 | setup_pagelist_highmark(zone_pcp(zone, cpu), | ||
3141 | (zone->present_pages / percpu_pagelist_fraction)); | ||
3142 | } | ||
3143 | 3167 | ||
3144 | return 0; | 3168 | if (percpu_pagelist_fraction) |
3145 | bad: | 3169 | setup_pagelist_highmark(pcp, |
3146 | for_each_zone(dzone) { | 3170 | (zone->present_pages / |
3147 | if (!populated_zone(dzone)) | 3171 | percpu_pagelist_fraction)); |
3148 | continue; | 3172 | } |
3149 | if (dzone == zone) | ||
3150 | break; | ||
3151 | kfree(zone_pcp(dzone, cpu)); | ||
3152 | zone_pcp(dzone, cpu) = &boot_pageset[cpu]; | ||
3153 | } | ||
3154 | return -ENOMEM; | ||
3155 | } | ||
3156 | |||
3157 | static inline void free_zone_pagesets(int cpu) | ||
3158 | { | ||
3159 | struct zone *zone; | ||
3160 | |||
3161 | for_each_zone(zone) { | ||
3162 | struct per_cpu_pageset *pset = zone_pcp(zone, cpu); | ||
3163 | |||
3164 | /* Free per_cpu_pageset if it is slab allocated */ | ||
3165 | if (pset != &boot_pageset[cpu]) | ||
3166 | kfree(pset); | ||
3167 | zone_pcp(zone, cpu) = &boot_pageset[cpu]; | ||
3168 | } | ||
3169 | } | ||
3170 | |||
3171 | static int __cpuinit pageset_cpuup_callback(struct notifier_block *nfb, | ||
3172 | unsigned long action, | ||
3173 | void *hcpu) | ||
3174 | { | ||
3175 | int cpu = (long)hcpu; | ||
3176 | int ret = NOTIFY_OK; | ||
3177 | |||
3178 | switch (action) { | ||
3179 | case CPU_UP_PREPARE: | ||
3180 | case CPU_UP_PREPARE_FROZEN: | ||
3181 | if (process_zones(cpu)) | ||
3182 | ret = NOTIFY_BAD; | ||
3183 | break; | ||
3184 | case CPU_UP_CANCELED: | ||
3185 | case CPU_UP_CANCELED_FROZEN: | ||
3186 | case CPU_DEAD: | ||
3187 | case CPU_DEAD_FROZEN: | ||
3188 | free_zone_pagesets(cpu); | ||
3189 | break; | ||
3190 | default: | ||
3191 | break; | ||
3192 | } | 3173 | } |
3193 | return ret; | ||
3194 | } | 3174 | } |
3195 | 3175 | ||
3196 | static struct notifier_block __cpuinitdata pageset_notifier = | ||
3197 | { &pageset_cpuup_callback, NULL, 0 }; | ||
3198 | |||
3199 | void __init setup_per_cpu_pageset(void) | ||
3200 | { | ||
3201 | int err; | ||
3202 | |||
3203 | /* Initialize per_cpu_pageset for cpu 0. | ||
3204 | * A cpuup callback will do this for every cpu | ||
3205 | * as it comes online | ||
3206 | */ | ||
3207 | err = process_zones(smp_processor_id()); | ||
3208 | BUG_ON(err); | ||
3209 | register_cpu_notifier(&pageset_notifier); | ||
3210 | } | ||
3211 | |||
3212 | #endif | ||
3213 | |||
3214 | static noinline __init_refok | 3176 | static noinline __init_refok |
3215 | int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) | 3177 | int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) |
3216 | { | 3178 | { |
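With setup_per_cpu_pageset() above, each zone owns a single alloc_percpu() object instead of an NR_CPUS-sized array, so readers reach the per-cpu lists through per_cpu_ptr()/this_cpu_ptr(), as the surrounding hunks show. A small illustrative walk over the converted pagesets (the helper name is made up):

#include <linux/mm.h>
#include <linux/mmzone.h>
#include <linux/percpu.h>

/* Illustrative: total pages currently sitting on per-cpu free lists,
 * read through the new zone->pageset percpu pointer. */
static unsigned long pcp_pages_total(void)
{
        struct zone *zone;
        unsigned long total = 0;
        int cpu;

        for_each_populated_zone(zone)
                for_each_online_cpu(cpu)
                        total += per_cpu_ptr(zone->pageset, cpu)->pcp.count;

        return total;
}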
@@ -3260,11 +3222,11 @@ static int __zone_pcp_update(void *data) | |||
3260 | int cpu; | 3222 | int cpu; |
3261 | unsigned long batch = zone_batchsize(zone), flags; | 3223 | unsigned long batch = zone_batchsize(zone), flags; |
3262 | 3224 | ||
3263 | for (cpu = 0; cpu < NR_CPUS; cpu++) { | 3225 | for_each_possible_cpu(cpu) { |
3264 | struct per_cpu_pageset *pset; | 3226 | struct per_cpu_pageset *pset; |
3265 | struct per_cpu_pages *pcp; | 3227 | struct per_cpu_pages *pcp; |
3266 | 3228 | ||
3267 | pset = zone_pcp(zone, cpu); | 3229 | pset = per_cpu_ptr(zone->pageset, cpu); |
3268 | pcp = &pset->pcp; | 3230 | pcp = &pset->pcp; |
3269 | 3231 | ||
3270 | local_irq_save(flags); | 3232 | local_irq_save(flags); |
@@ -3282,21 +3244,17 @@ void zone_pcp_update(struct zone *zone) | |||
3282 | 3244 | ||
3283 | static __meminit void zone_pcp_init(struct zone *zone) | 3245 | static __meminit void zone_pcp_init(struct zone *zone) |
3284 | { | 3246 | { |
3285 | int cpu; | 3247 | /* |
3286 | unsigned long batch = zone_batchsize(zone); | 3248 | * per cpu subsystem is not up at this point. The following code |
3249 | * relies on the ability of the linker to provide the | ||
3250 | * offset of a (static) per cpu variable into the per cpu area. | ||
3251 | */ | ||
3252 | zone->pageset = &boot_pageset; | ||
3287 | 3253 | ||
3288 | for (cpu = 0; cpu < NR_CPUS; cpu++) { | ||
3289 | #ifdef CONFIG_NUMA | ||
3290 | /* Early boot. Slab allocator not functional yet */ | ||
3291 | zone_pcp(zone, cpu) = &boot_pageset[cpu]; | ||
3292 | setup_pageset(&boot_pageset[cpu],0); | ||
3293 | #else | ||
3294 | setup_pageset(zone_pcp(zone,cpu), batch); | ||
3295 | #endif | ||
3296 | } | ||
3297 | if (zone->present_pages) | 3254 | if (zone->present_pages) |
3298 | printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n", | 3255 | printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%u\n", |
3299 | zone->name, zone->present_pages, batch); | 3256 | zone->name, zone->present_pages, |
3257 | zone_batchsize(zone)); | ||
3300 | } | 3258 | } |
3301 | 3259 | ||
3302 | __meminit int init_currently_empty_zone(struct zone *zone, | 3260 | __meminit int init_currently_empty_zone(struct zone *zone, |
@@ -3435,6 +3393,61 @@ void __init free_bootmem_with_active_regions(int nid, | |||
3435 | } | 3393 | } |
3436 | } | 3394 | } |
3437 | 3395 | ||
3396 | int __init add_from_early_node_map(struct range *range, int az, | ||
3397 | int nr_range, int nid) | ||
3398 | { | ||
3399 | int i; | ||
3400 | u64 start, end; | ||
3401 | |||
3402 | /* need to go over early_node_map to find out good range for node */ | ||
3403 | for_each_active_range_index_in_nid(i, nid) { | ||
3404 | start = early_node_map[i].start_pfn; | ||
3405 | end = early_node_map[i].end_pfn; | ||
3406 | nr_range = add_range(range, az, nr_range, start, end); | ||
3407 | } | ||
3408 | return nr_range; | ||
3409 | } | ||
3410 | |||
3411 | #ifdef CONFIG_NO_BOOTMEM | ||
3412 | void * __init __alloc_memory_core_early(int nid, u64 size, u64 align, | ||
3413 | u64 goal, u64 limit) | ||
3414 | { | ||
3415 | int i; | ||
3416 | void *ptr; | ||
3417 | |||
3418 | /* need to go over early_node_map to find out good range for node */ | ||
3419 | for_each_active_range_index_in_nid(i, nid) { | ||
3420 | u64 addr; | ||
3421 | u64 ei_start, ei_last; | ||
3422 | |||
3423 | ei_last = early_node_map[i].end_pfn; | ||
3424 | ei_last <<= PAGE_SHIFT; | ||
3425 | ei_start = early_node_map[i].start_pfn; | ||
3426 | ei_start <<= PAGE_SHIFT; | ||
3427 | addr = find_early_area(ei_start, ei_last, | ||
3428 | goal, limit, size, align); | ||
3429 | |||
3430 | if (addr == -1ULL) | ||
3431 | continue; | ||
3432 | |||
3433 | #if 0 | ||
3434 | printk(KERN_DEBUG "alloc (nid=%d %llx - %llx) (%llx - %llx) %llx %llx => %llx\n", | ||
3435 | nid, | ||
3436 | ei_start, ei_last, goal, limit, size, | ||
3437 | align, addr); | ||
3438 | #endif | ||
3439 | |||
3440 | ptr = phys_to_virt(addr); | ||
3441 | memset(ptr, 0, size); | ||
3442 | reserve_early_without_check(addr, addr + size, "BOOTMEM"); | ||
3443 | return ptr; | ||
3444 | } | ||
3445 | |||
3446 | return NULL; | ||
3447 | } | ||
3448 | #endif | ||
3449 | |||
3450 | |||
3438 | void __init work_with_active_regions(int nid, work_fn_t work_fn, void *data) | 3451 | void __init work_with_active_regions(int nid, work_fn_t work_fn, void *data) |
3439 | { | 3452 | { |
3440 | int i; | 3453 | int i; |
@@ -4377,8 +4390,12 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) | |||
4377 | for (i = 0; i < MAX_NR_ZONES; i++) { | 4390 | for (i = 0; i < MAX_NR_ZONES; i++) { |
4378 | if (i == ZONE_MOVABLE) | 4391 | if (i == ZONE_MOVABLE) |
4379 | continue; | 4392 | continue; |
4380 | printk(" %-8s %0#10lx -> %0#10lx\n", | 4393 | printk(" %-8s ", zone_names[i]); |
4381 | zone_names[i], | 4394 | if (arch_zone_lowest_possible_pfn[i] == |
4395 | arch_zone_highest_possible_pfn[i]) | ||
4396 | printk("empty\n"); | ||
4397 | else | ||
4398 | printk("%0#10lx -> %0#10lx\n", | ||
4382 | arch_zone_lowest_possible_pfn[i], | 4399 | arch_zone_lowest_possible_pfn[i], |
4383 | arch_zone_highest_possible_pfn[i]); | 4400 | arch_zone_highest_possible_pfn[i]); |
4384 | } | 4401 | } |
@@ -4467,7 +4484,11 @@ void __init set_dma_reserve(unsigned long new_dma_reserve) | |||
4467 | } | 4484 | } |
4468 | 4485 | ||
4469 | #ifndef CONFIG_NEED_MULTIPLE_NODES | 4486 | #ifndef CONFIG_NEED_MULTIPLE_NODES |
4470 | struct pglist_data __refdata contig_page_data = { .bdata = &bootmem_node_data[0] }; | 4487 | struct pglist_data __refdata contig_page_data = { |
4488 | #ifndef CONFIG_NO_BOOTMEM | ||
4489 | .bdata = &bootmem_node_data[0] | ||
4490 | #endif | ||
4491 | }; | ||
4471 | EXPORT_SYMBOL(contig_page_data); | 4492 | EXPORT_SYMBOL(contig_page_data); |
4472 | #endif | 4493 | #endif |
4473 | 4494 | ||
@@ -4810,10 +4831,11 @@ int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write, | |||
4810 | if (!write || (ret == -EINVAL)) | 4831 | if (!write || (ret == -EINVAL)) |
4811 | return ret; | 4832 | return ret; |
4812 | for_each_populated_zone(zone) { | 4833 | for_each_populated_zone(zone) { |
4813 | for_each_online_cpu(cpu) { | 4834 | for_each_possible_cpu(cpu) { |
4814 | unsigned long high; | 4835 | unsigned long high; |
4815 | high = zone->present_pages / percpu_pagelist_fraction; | 4836 | high = zone->present_pages / percpu_pagelist_fraction; |
4816 | setup_pagelist_highmark(zone_pcp(zone, cpu), high); | 4837 | setup_pagelist_highmark( |
4838 | per_cpu_ptr(zone->pageset, cpu), high); | ||
4817 | } | 4839 | } |
4818 | } | 4840 | } |
4819 | return 0; | 4841 | return 0; |
@@ -5159,3 +5181,80 @@ bool is_free_buddy_page(struct page *page) | |||
5159 | return order < MAX_ORDER; | 5181 | return order < MAX_ORDER; |
5160 | } | 5182 | } |
5161 | #endif | 5183 | #endif |
5184 | |||
5185 | static struct trace_print_flags pageflag_names[] = { | ||
5186 | {1UL << PG_locked, "locked" }, | ||
5187 | {1UL << PG_error, "error" }, | ||
5188 | {1UL << PG_referenced, "referenced" }, | ||
5189 | {1UL << PG_uptodate, "uptodate" }, | ||
5190 | {1UL << PG_dirty, "dirty" }, | ||
5191 | {1UL << PG_lru, "lru" }, | ||
5192 | {1UL << PG_active, "active" }, | ||
5193 | {1UL << PG_slab, "slab" }, | ||
5194 | {1UL << PG_owner_priv_1, "owner_priv_1" }, | ||
5195 | {1UL << PG_arch_1, "arch_1" }, | ||
5196 | {1UL << PG_reserved, "reserved" }, | ||
5197 | {1UL << PG_private, "private" }, | ||
5198 | {1UL << PG_private_2, "private_2" }, | ||
5199 | {1UL << PG_writeback, "writeback" }, | ||
5200 | #ifdef CONFIG_PAGEFLAGS_EXTENDED | ||
5201 | {1UL << PG_head, "head" }, | ||
5202 | {1UL << PG_tail, "tail" }, | ||
5203 | #else | ||
5204 | {1UL << PG_compound, "compound" }, | ||
5205 | #endif | ||
5206 | {1UL << PG_swapcache, "swapcache" }, | ||
5207 | {1UL << PG_mappedtodisk, "mappedtodisk" }, | ||
5208 | {1UL << PG_reclaim, "reclaim" }, | ||
5209 | {1UL << PG_buddy, "buddy" }, | ||
5210 | {1UL << PG_swapbacked, "swapbacked" }, | ||
5211 | {1UL << PG_unevictable, "unevictable" }, | ||
5212 | #ifdef CONFIG_MMU | ||
5213 | {1UL << PG_mlocked, "mlocked" }, | ||
5214 | #endif | ||
5215 | #ifdef CONFIG_ARCH_USES_PG_UNCACHED | ||
5216 | {1UL << PG_uncached, "uncached" }, | ||
5217 | #endif | ||
5218 | #ifdef CONFIG_MEMORY_FAILURE | ||
5219 | {1UL << PG_hwpoison, "hwpoison" }, | ||
5220 | #endif | ||
5221 | {-1UL, NULL }, | ||
5222 | }; | ||
5223 | |||
5224 | static void dump_page_flags(unsigned long flags) | ||
5225 | { | ||
5226 | const char *delim = ""; | ||
5227 | unsigned long mask; | ||
5228 | int i; | ||
5229 | |||
5230 | printk(KERN_ALERT "page flags: %#lx(", flags); | ||
5231 | |||
5232 | /* remove zone id */ | ||
5233 | flags &= (1UL << NR_PAGEFLAGS) - 1; | ||
5234 | |||
5235 | for (i = 0; pageflag_names[i].name && flags; i++) { | ||
5236 | |||
5237 | mask = pageflag_names[i].mask; | ||
5238 | if ((flags & mask) != mask) | ||
5239 | continue; | ||
5240 | |||
5241 | flags &= ~mask; | ||
5242 | printk("%s%s", delim, pageflag_names[i].name); | ||
5243 | delim = "|"; | ||
5244 | } | ||
5245 | |||
5246 | /* check for left over flags */ | ||
5247 | if (flags) | ||
5248 | printk("%s%#lx", delim, flags); | ||
5249 | |||
5250 | printk(")\n"); | ||
5251 | } | ||
5252 | |||
5253 | void dump_page(struct page *page) | ||
5254 | { | ||
5255 | printk(KERN_ALERT | ||
5256 | "page:%p count:%d mapcount:%d mapping:%p index:%#lx\n", | ||
5257 | page, page_count(page), page_mapcount(page), | ||
5258 | page->mapping, page->index); | ||
5259 | dump_page_flags(page->flags); | ||
5260 | } | ||
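bad_page() above now delegates its diagnostics to the new dump_page()/dump_page_flags() helpers, which other debugging sites can reuse. A minimal sketch of such a caller; the check itself is hypothetical:

#include <linux/mm.h>
#include <linux/page-flags.h>

/* Hypothetical sanity check reusing dump_page() to report an
 * unexpected page state before warning. */
static void warn_if_reserved(struct page *page)
{
        if (unlikely(PageReserved(page))) {
                dump_page(page);
                WARN_ON(1);
        }
}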
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c index 3d535d594826..3dd88539a0e6 100644 --- a/mm/page_cgroup.c +++ b/mm/page_cgroup.c | |||
@@ -335,6 +335,37 @@ not_enough_page: | |||
335 | } | 335 | } |
336 | 336 | ||
337 | /** | 337 | /** |
338 | * swap_cgroup_cmpxchg - cmpxchg mem_cgroup's id for this swp_entry. | ||
339 | * @ent: swap entry to be cmpxchged | ||
340 | * @old: old id | ||
341 | * @new: new id | ||
342 | * | ||
343 | * Returns old id on success, 0 on failure. | ||
344 | * (There is no mem_cgroup using 0 as its id) | ||
345 | */ | ||
346 | unsigned short swap_cgroup_cmpxchg(swp_entry_t ent, | ||
347 | unsigned short old, unsigned short new) | ||
348 | { | ||
349 | int type = swp_type(ent); | ||
350 | unsigned long offset = swp_offset(ent); | ||
351 | unsigned long idx = offset / SC_PER_PAGE; | ||
352 | unsigned long pos = offset & SC_POS_MASK; | ||
353 | struct swap_cgroup_ctrl *ctrl; | ||
354 | struct page *mappage; | ||
355 | struct swap_cgroup *sc; | ||
356 | |||
357 | ctrl = &swap_cgroup_ctrl[type]; | ||
358 | |||
359 | mappage = ctrl->map[idx]; | ||
360 | sc = page_address(mappage); | ||
361 | sc += pos; | ||
362 | if (cmpxchg(&sc->id, old, new) == old) | ||
363 | return old; | ||
364 | else | ||
365 | return 0; | ||
366 | } | ||
367 | |||
368 | /** | ||
338 | * swap_cgroup_record - record mem_cgroup for this swp_entry. | 369 | * swap_cgroup_record - record mem_cgroup for this swp_entry. |
339 | * @ent: swap entry to be recorded into | 370 | * @ent: swap entry to be recorded into |
340 | * @mem: mem_cgroup to be recorded | 371 | * @mem: mem_cgroup to be recorded |
@@ -358,8 +389,7 @@ unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id) | |||
358 | mappage = ctrl->map[idx]; | 389 | mappage = ctrl->map[idx]; |
359 | sc = page_address(mappage); | 390 | sc = page_address(mappage); |
360 | sc += pos; | 391 | sc += pos; |
361 | old = sc->id; | 392 | old = xchg(&sc->id, id); |
362 | sc->id = id; | ||
363 | 393 | ||
364 | return old; | 394 | return old; |
365 | } | 395 | } |
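The page_cgroup hunks convert the swap-entry-to-memcg-id map to atomic xchg()/cmpxchg() updates and add swap_cgroup_cmpxchg(), so a caller can hand a swap charge from one cgroup to another only if the expected owner still holds it (the helper returns the old id on success and 0 on failure). A sketch of how a caller might wrap it; the wrapper below is hypothetical:

#include <linux/types.h>
#include <linux/swap.h>
#include <linux/page_cgroup.h>

/* Hypothetical wrapper: move the memcg id recorded for a swap entry
 * from 'from' to 'to', but only if 'from' still owns the entry. */
static bool try_move_swap_charge(swp_entry_t ent,
                                 unsigned short from, unsigned short to)
{
        return swap_cgroup_cmpxchg(ent, from, to) == from;
}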
diff --git a/mm/percpu.c b/mm/percpu.c index 083e7c91e5f6..768419d44ad7 100644 --- a/mm/percpu.c +++ b/mm/percpu.c | |||
@@ -80,13 +80,15 @@ | |||
80 | /* default addr <-> pcpu_ptr mapping, override in asm/percpu.h if necessary */ | 80 | /* default addr <-> pcpu_ptr mapping, override in asm/percpu.h if necessary */ |
81 | #ifndef __addr_to_pcpu_ptr | 81 | #ifndef __addr_to_pcpu_ptr |
82 | #define __addr_to_pcpu_ptr(addr) \ | 82 | #define __addr_to_pcpu_ptr(addr) \ |
83 | (void *)((unsigned long)(addr) - (unsigned long)pcpu_base_addr \ | 83 | (void __percpu *)((unsigned long)(addr) - \ |
84 | + (unsigned long)__per_cpu_start) | 84 | (unsigned long)pcpu_base_addr + \ |
85 | (unsigned long)__per_cpu_start) | ||
85 | #endif | 86 | #endif |
86 | #ifndef __pcpu_ptr_to_addr | 87 | #ifndef __pcpu_ptr_to_addr |
87 | #define __pcpu_ptr_to_addr(ptr) \ | 88 | #define __pcpu_ptr_to_addr(ptr) \ |
88 | (void *)((unsigned long)(ptr) + (unsigned long)pcpu_base_addr \ | 89 | (void __force *)((unsigned long)(ptr) + \ |
89 | - (unsigned long)__per_cpu_start) | 90 | (unsigned long)pcpu_base_addr - \ |
91 | (unsigned long)__per_cpu_start) | ||
90 | #endif | 92 | #endif |
91 | 93 | ||
92 | struct pcpu_chunk { | 94 | struct pcpu_chunk { |
@@ -913,11 +915,10 @@ static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size) | |||
913 | int rs, re; | 915 | int rs, re; |
914 | 916 | ||
915 | /* quick path, check whether it's empty already */ | 917 | /* quick path, check whether it's empty already */ |
916 | pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) { | 918 | rs = page_start; |
917 | if (rs == page_start && re == page_end) | 919 | pcpu_next_unpop(chunk, &rs, &re, page_end); |
918 | return; | 920 | if (rs == page_start && re == page_end) |
919 | break; | 921 | return; |
920 | } | ||
921 | 922 | ||
922 | /* immutable chunks can't be depopulated */ | 923 | /* immutable chunks can't be depopulated */ |
923 | WARN_ON(chunk->immutable); | 924 | WARN_ON(chunk->immutable); |
@@ -968,11 +969,10 @@ static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size) | |||
968 | int rs, re, rc; | 969 | int rs, re, rc; |
969 | 970 | ||
970 | /* quick path, check whether all pages are already there */ | 971 | /* quick path, check whether all pages are already there */ |
971 | pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end) { | 972 | rs = page_start; |
972 | if (rs == page_start && re == page_end) | 973 | pcpu_next_pop(chunk, &rs, &re, page_end); |
973 | goto clear; | 974 | if (rs == page_start && re == page_end) |
974 | break; | 975 | goto clear; |
975 | } | ||
976 | 976 | ||
977 | /* need to allocate and map pages, this chunk can't be immutable */ | 977 | /* need to allocate and map pages, this chunk can't be immutable */ |
978 | WARN_ON(chunk->immutable); | 978 | WARN_ON(chunk->immutable); |
@@ -1067,7 +1067,7 @@ static struct pcpu_chunk *alloc_pcpu_chunk(void) | |||
1067 | * RETURNS: | 1067 | * RETURNS: |
1068 | * Percpu pointer to the allocated area on success, NULL on failure. | 1068 | * Percpu pointer to the allocated area on success, NULL on failure. |
1069 | */ | 1069 | */ |
1070 | static void *pcpu_alloc(size_t size, size_t align, bool reserved) | 1070 | static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved) |
1071 | { | 1071 | { |
1072 | static int warn_limit = 10; | 1072 | static int warn_limit = 10; |
1073 | struct pcpu_chunk *chunk; | 1073 | struct pcpu_chunk *chunk; |
@@ -1196,7 +1196,7 @@ fail_unlock_mutex: | |||
1196 | * RETURNS: | 1196 | * RETURNS: |
1197 | * Percpu pointer to the allocated area on success, NULL on failure. | 1197 | * Percpu pointer to the allocated area on success, NULL on failure. |
1198 | */ | 1198 | */ |
1199 | void *__alloc_percpu(size_t size, size_t align) | 1199 | void __percpu *__alloc_percpu(size_t size, size_t align) |
1200 | { | 1200 | { |
1201 | return pcpu_alloc(size, align, false); | 1201 | return pcpu_alloc(size, align, false); |
1202 | } | 1202 | } |
@@ -1217,7 +1217,7 @@ EXPORT_SYMBOL_GPL(__alloc_percpu); | |||
1217 | * RETURNS: | 1217 | * RETURNS: |
1218 | * Percpu pointer to the allocated area on success, NULL on failure. | 1218 | * Percpu pointer to the allocated area on success, NULL on failure. |
1219 | */ | 1219 | */ |
1220 | void *__alloc_reserved_percpu(size_t size, size_t align) | 1220 | void __percpu *__alloc_reserved_percpu(size_t size, size_t align) |
1221 | { | 1221 | { |
1222 | return pcpu_alloc(size, align, true); | 1222 | return pcpu_alloc(size, align, true); |
1223 | } | 1223 | } |
@@ -1269,7 +1269,7 @@ static void pcpu_reclaim(struct work_struct *work) | |||
1269 | * CONTEXT: | 1269 | * CONTEXT: |
1270 | * Can be called from atomic context. | 1270 | * Can be called from atomic context. |
1271 | */ | 1271 | */ |
1272 | void free_percpu(void *ptr) | 1272 | void free_percpu(void __percpu *ptr) |
1273 | { | 1273 | { |
1274 | void *addr; | 1274 | void *addr; |
1275 | struct pcpu_chunk *chunk; | 1275 | struct pcpu_chunk *chunk; |
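The percpu hunks add __percpu sparse annotations to the allocator's entry points, so dynamically allocated per-cpu pointers are address-space-checked and must be dereferenced through the accessor macros rather than directly. A minimal usage sketch; the counter is illustrative:

#include <linux/errno.h>
#include <linux/percpu.h>

static int __percpu *hit_count;         /* illustrative per-cpu counter */

static int hits_init(void)
{
        hit_count = alloc_percpu(int);
        return hit_count ? 0 : -ENOMEM;
}

static void hits_record(void)
{
        int *c = get_cpu_ptr(hit_count);        /* disables preemption */

        (*c)++;
        put_cpu_ptr(hit_count);
}

static void hits_exit(void)
{
        free_percpu(hit_count);
}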
diff --git a/mm/readahead.c b/mm/readahead.c index 033bc135a41f..337b20e946f6 100644 --- a/mm/readahead.c +++ b/mm/readahead.c | |||
@@ -501,6 +501,12 @@ void page_cache_sync_readahead(struct address_space *mapping, | |||
501 | if (!ra->ra_pages) | 501 | if (!ra->ra_pages) |
502 | return; | 502 | return; |
503 | 503 | ||
504 | /* be dumb */ | ||
505 | if (filp->f_mode & FMODE_RANDOM) { | ||
506 | force_page_cache_readahead(mapping, filp, offset, req_size); | ||
507 | return; | ||
508 | } | ||
509 | |||
504 | /* do read-ahead */ | 510 | /* do read-ahead */ |
505 | ondemand_readahead(mapping, ra, filp, false, offset, req_size); | 511 | ondemand_readahead(mapping, ra, filp, false, offset, req_size); |
506 | } | 512 | } |
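The readahead hunk short-circuits the ondemand heuristics for files flagged FMODE_RANDOM and issues a plain force_page_cache_readahead() instead. Assuming FMODE_RANDOM is set from posix_fadvise(POSIX_FADV_RANDOM), as the companion fadvise change in this series does, the userspace-visible knob looks like this sketch:

#include <fcntl.h>
#include <stdio.h>

/* Userspace sketch: mark a descriptor as random-access so the kernel
 * stops applying its readahead heuristics to it. */
int open_for_random_access(const char *path)
{
        int fd = open(path, O_RDONLY);

        if (fd >= 0 && posix_fadvise(fd, 0, 0, POSIX_FADV_RANDOM) != 0)
                perror("posix_fadvise");

        return fd;
}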
@@ -62,6 +62,7 @@ | |||
62 | #include "internal.h" | 62 | #include "internal.h" |
63 | 63 | ||
64 | static struct kmem_cache *anon_vma_cachep; | 64 | static struct kmem_cache *anon_vma_cachep; |
65 | static struct kmem_cache *anon_vma_chain_cachep; | ||
65 | 66 | ||
66 | static inline struct anon_vma *anon_vma_alloc(void) | 67 | static inline struct anon_vma *anon_vma_alloc(void) |
67 | { | 68 | { |
@@ -73,6 +74,16 @@ void anon_vma_free(struct anon_vma *anon_vma) | |||
73 | kmem_cache_free(anon_vma_cachep, anon_vma); | 74 | kmem_cache_free(anon_vma_cachep, anon_vma); |
74 | } | 75 | } |
75 | 76 | ||
77 | static inline struct anon_vma_chain *anon_vma_chain_alloc(void) | ||
78 | { | ||
79 | return kmem_cache_alloc(anon_vma_chain_cachep, GFP_KERNEL); | ||
80 | } | ||
81 | |||
82 | void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain) | ||
83 | { | ||
84 | kmem_cache_free(anon_vma_chain_cachep, anon_vma_chain); | ||
85 | } | ||
86 | |||
76 | /** | 87 | /** |
77 | * anon_vma_prepare - attach an anon_vma to a memory region | 88 | * anon_vma_prepare - attach an anon_vma to a memory region |
78 | * @vma: the memory region in question | 89 | * @vma: the memory region in question |
@@ -103,18 +114,23 @@ void anon_vma_free(struct anon_vma *anon_vma) | |||
103 | int anon_vma_prepare(struct vm_area_struct *vma) | 114 | int anon_vma_prepare(struct vm_area_struct *vma) |
104 | { | 115 | { |
105 | struct anon_vma *anon_vma = vma->anon_vma; | 116 | struct anon_vma *anon_vma = vma->anon_vma; |
117 | struct anon_vma_chain *avc; | ||
106 | 118 | ||
107 | might_sleep(); | 119 | might_sleep(); |
108 | if (unlikely(!anon_vma)) { | 120 | if (unlikely(!anon_vma)) { |
109 | struct mm_struct *mm = vma->vm_mm; | 121 | struct mm_struct *mm = vma->vm_mm; |
110 | struct anon_vma *allocated; | 122 | struct anon_vma *allocated; |
111 | 123 | ||
124 | avc = anon_vma_chain_alloc(); | ||
125 | if (!avc) | ||
126 | goto out_enomem; | ||
127 | |||
112 | anon_vma = find_mergeable_anon_vma(vma); | 128 | anon_vma = find_mergeable_anon_vma(vma); |
113 | allocated = NULL; | 129 | allocated = NULL; |
114 | if (!anon_vma) { | 130 | if (!anon_vma) { |
115 | anon_vma = anon_vma_alloc(); | 131 | anon_vma = anon_vma_alloc(); |
116 | if (unlikely(!anon_vma)) | 132 | if (unlikely(!anon_vma)) |
117 | return -ENOMEM; | 133 | goto out_enomem_free_avc; |
118 | allocated = anon_vma; | 134 | allocated = anon_vma; |
119 | } | 135 | } |
120 | spin_lock(&anon_vma->lock); | 136 | spin_lock(&anon_vma->lock); |
@@ -123,53 +139,113 @@ int anon_vma_prepare(struct vm_area_struct *vma) | |||
123 | spin_lock(&mm->page_table_lock); | 139 | spin_lock(&mm->page_table_lock); |
124 | if (likely(!vma->anon_vma)) { | 140 | if (likely(!vma->anon_vma)) { |
125 | vma->anon_vma = anon_vma; | 141 | vma->anon_vma = anon_vma; |
126 | list_add_tail(&vma->anon_vma_node, &anon_vma->head); | 142 | avc->anon_vma = anon_vma; |
143 | avc->vma = vma; | ||
144 | list_add(&avc->same_vma, &vma->anon_vma_chain); | ||
145 | list_add(&avc->same_anon_vma, &anon_vma->head); | ||
127 | allocated = NULL; | 146 | allocated = NULL; |
128 | } | 147 | } |
129 | spin_unlock(&mm->page_table_lock); | 148 | spin_unlock(&mm->page_table_lock); |
130 | 149 | ||
131 | spin_unlock(&anon_vma->lock); | 150 | spin_unlock(&anon_vma->lock); |
132 | if (unlikely(allocated)) | 151 | if (unlikely(allocated)) { |
133 | anon_vma_free(allocated); | 152 | anon_vma_free(allocated); |
153 | anon_vma_chain_free(avc); | ||
154 | } | ||
134 | } | 155 | } |
135 | return 0; | 156 | return 0; |
157 | |||
158 | out_enomem_free_avc: | ||
159 | anon_vma_chain_free(avc); | ||
160 | out_enomem: | ||
161 | return -ENOMEM; | ||
136 | } | 162 | } |
137 | 163 | ||
138 | void __anon_vma_merge(struct vm_area_struct *vma, struct vm_area_struct *next) | 164 | static void anon_vma_chain_link(struct vm_area_struct *vma, |
165 | struct anon_vma_chain *avc, | ||
166 | struct anon_vma *anon_vma) | ||
139 | { | 167 | { |
140 | BUG_ON(vma->anon_vma != next->anon_vma); | 168 | avc->vma = vma; |
141 | list_del(&next->anon_vma_node); | 169 | avc->anon_vma = anon_vma; |
170 | list_add(&avc->same_vma, &vma->anon_vma_chain); | ||
171 | |||
172 | spin_lock(&anon_vma->lock); | ||
173 | list_add_tail(&avc->same_anon_vma, &anon_vma->head); | ||
174 | spin_unlock(&anon_vma->lock); | ||
142 | } | 175 | } |
143 | 176 | ||
144 | void __anon_vma_link(struct vm_area_struct *vma) | 177 | /* |
178 | * Attach the anon_vmas from src to dst. | ||
179 | * Returns 0 on success, -ENOMEM on failure. | ||
180 | */ | ||
181 | int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src) | ||
145 | { | 182 | { |
146 | struct anon_vma *anon_vma = vma->anon_vma; | 183 | struct anon_vma_chain *avc, *pavc; |
147 | 184 | ||
148 | if (anon_vma) | 185 | list_for_each_entry(pavc, &src->anon_vma_chain, same_vma) { |
149 | list_add_tail(&vma->anon_vma_node, &anon_vma->head); | 186 | avc = anon_vma_chain_alloc(); |
187 | if (!avc) | ||
188 | goto enomem_failure; | ||
189 | anon_vma_chain_link(dst, avc, pavc->anon_vma); | ||
190 | } | ||
191 | return 0; | ||
192 | |||
193 | enomem_failure: | ||
194 | unlink_anon_vmas(dst); | ||
195 | return -ENOMEM; | ||
150 | } | 196 | } |
151 | 197 | ||
152 | void anon_vma_link(struct vm_area_struct *vma) | 198 | /* |
199 | * Attach vma to its own anon_vma, as well as to the anon_vmas that | ||
200 | * the corresponding VMA in the parent process is attached to. | ||
201 | * Returns 0 on success, non-zero on failure. | ||
202 | */ | ||
203 | int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma) | ||
153 | { | 204 | { |
154 | struct anon_vma *anon_vma = vma->anon_vma; | 205 | struct anon_vma_chain *avc; |
206 | struct anon_vma *anon_vma; | ||
155 | 207 | ||
156 | if (anon_vma) { | 208 | /* Don't bother if the parent process has no anon_vma here. */ |
157 | spin_lock(&anon_vma->lock); | 209 | if (!pvma->anon_vma) |
158 | list_add_tail(&vma->anon_vma_node, &anon_vma->head); | 210 | return 0; |
159 | spin_unlock(&anon_vma->lock); | 211 | |
160 | } | 212 | /* |
213 | * First, attach the new VMA to the parent VMA's anon_vmas, | ||
214 | * so rmap can find non-COWed pages in child processes. | ||
215 | */ | ||
216 | if (anon_vma_clone(vma, pvma)) | ||
217 | return -ENOMEM; | ||
218 | |||
219 | /* Then add our own anon_vma. */ | ||
220 | anon_vma = anon_vma_alloc(); | ||
221 | if (!anon_vma) | ||
222 | goto out_error; | ||
223 | avc = anon_vma_chain_alloc(); | ||
224 | if (!avc) | ||
225 | goto out_error_free_anon_vma; | ||
226 | anon_vma_chain_link(vma, avc, anon_vma); | ||
227 | /* Mark this anon_vma as the one where our new (COWed) pages go. */ | ||
228 | vma->anon_vma = anon_vma; | ||
229 | |||
230 | return 0; | ||
231 | |||
232 | out_error_free_anon_vma: | ||
233 | anon_vma_free(anon_vma); | ||
234 | out_error: | ||
235 | return -ENOMEM; | ||
161 | } | 236 | } |
162 | 237 | ||
163 | void anon_vma_unlink(struct vm_area_struct *vma) | 238 | static void anon_vma_unlink(struct anon_vma_chain *anon_vma_chain) |
164 | { | 239 | { |
165 | struct anon_vma *anon_vma = vma->anon_vma; | 240 | struct anon_vma *anon_vma = anon_vma_chain->anon_vma; |
166 | int empty; | 241 | int empty; |
167 | 242 | ||
243 | /* If anon_vma_fork fails, we can get an empty anon_vma_chain. */ | ||
168 | if (!anon_vma) | 244 | if (!anon_vma) |
169 | return; | 245 | return; |
170 | 246 | ||
171 | spin_lock(&anon_vma->lock); | 247 | spin_lock(&anon_vma->lock); |
172 | list_del(&vma->anon_vma_node); | 248 | list_del(&anon_vma_chain->same_anon_vma); |
173 | 249 | ||
174 | /* We must garbage collect the anon_vma if it's empty */ | 250 | /* We must garbage collect the anon_vma if it's empty */ |
175 | empty = list_empty(&anon_vma->head) && !ksm_refcount(anon_vma); | 251 | empty = list_empty(&anon_vma->head) && !ksm_refcount(anon_vma); |
@@ -179,6 +255,18 @@ void anon_vma_unlink(struct vm_area_struct *vma) | |||
179 | anon_vma_free(anon_vma); | 255 | anon_vma_free(anon_vma); |
180 | } | 256 | } |
181 | 257 | ||
258 | void unlink_anon_vmas(struct vm_area_struct *vma) | ||
259 | { | ||
260 | struct anon_vma_chain *avc, *next; | ||
261 | |||
262 | /* Unlink each anon_vma chained to the VMA. */ | ||
263 | list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) { | ||
264 | anon_vma_unlink(avc); | ||
265 | list_del(&avc->same_vma); | ||
266 | anon_vma_chain_free(avc); | ||
267 | } | ||
268 | } | ||
269 | |||
182 | static void anon_vma_ctor(void *data) | 270 | static void anon_vma_ctor(void *data) |
183 | { | 271 | { |
184 | struct anon_vma *anon_vma = data; | 272 | struct anon_vma *anon_vma = data; |
@@ -192,6 +280,7 @@ void __init anon_vma_init(void) | |||
192 | { | 280 | { |
193 | anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma), | 281 | anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma), |
194 | 0, SLAB_DESTROY_BY_RCU|SLAB_PANIC, anon_vma_ctor); | 282 | 0, SLAB_DESTROY_BY_RCU|SLAB_PANIC, anon_vma_ctor); |
283 | anon_vma_chain_cachep = KMEM_CACHE(anon_vma_chain, SLAB_PANIC); | ||
195 | } | 284 | } |
196 | 285 | ||
197 | /* | 286 | /* |
@@ -396,7 +485,7 @@ static int page_referenced_anon(struct page *page, | |||
396 | { | 485 | { |
397 | unsigned int mapcount; | 486 | unsigned int mapcount; |
398 | struct anon_vma *anon_vma; | 487 | struct anon_vma *anon_vma; |
399 | struct vm_area_struct *vma; | 488 | struct anon_vma_chain *avc; |
400 | int referenced = 0; | 489 | int referenced = 0; |
401 | 490 | ||
402 | anon_vma = page_lock_anon_vma(page); | 491 | anon_vma = page_lock_anon_vma(page); |
@@ -404,7 +493,8 @@ static int page_referenced_anon(struct page *page, | |||
404 | return referenced; | 493 | return referenced; |
405 | 494 | ||
406 | mapcount = page_mapcount(page); | 495 | mapcount = page_mapcount(page); |
407 | list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { | 496 | list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { |
497 | struct vm_area_struct *vma = avc->vma; | ||
408 | unsigned long address = vma_address(page, vma); | 498 | unsigned long address = vma_address(page, vma); |
409 | if (address == -EFAULT) | 499 | if (address == -EFAULT) |
410 | continue; | 500 | continue; |
@@ -511,9 +601,6 @@ int page_referenced(struct page *page, | |||
511 | int referenced = 0; | 601 | int referenced = 0; |
512 | int we_locked = 0; | 602 | int we_locked = 0; |
513 | 603 | ||
514 | if (TestClearPageReferenced(page)) | ||
515 | referenced++; | ||
516 | |||
517 | *vm_flags = 0; | 604 | *vm_flags = 0; |
518 | if (page_mapped(page) && page_rmapping(page)) { | 605 | if (page_mapped(page) && page_rmapping(page)) { |
519 | if (!is_locked && (!PageAnon(page) || PageKsm(page))) { | 606 | if (!is_locked && (!PageAnon(page) || PageKsm(page))) { |
@@ -614,6 +701,30 @@ int page_mkclean(struct page *page) | |||
614 | EXPORT_SYMBOL_GPL(page_mkclean); | 701 | EXPORT_SYMBOL_GPL(page_mkclean); |
615 | 702 | ||
616 | /** | 703 | /** |
704 | * page_move_anon_rmap - move a page to our anon_vma | ||
705 | * @page: the page to move to our anon_vma | ||
706 | * @vma: the vma the page belongs to | ||
707 | * @address: the user virtual address mapped | ||
708 | * | ||
709 | * When a page belongs exclusively to one process after a COW event, | ||
710 | * that page can be moved into the anon_vma that belongs to just that | ||
711 | * process, so the rmap code will not search the parent or sibling | ||
712 | * processes. | ||
713 | */ | ||
714 | void page_move_anon_rmap(struct page *page, | ||
715 | struct vm_area_struct *vma, unsigned long address) | ||
716 | { | ||
717 | struct anon_vma *anon_vma = vma->anon_vma; | ||
718 | |||
719 | VM_BUG_ON(!PageLocked(page)); | ||
720 | VM_BUG_ON(!anon_vma); | ||
721 | VM_BUG_ON(page->index != linear_page_index(vma, address)); | ||
722 | |||
723 | anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; | ||
724 | page->mapping = (struct address_space *) anon_vma; | ||
725 | } | ||
726 | |||
727 | /** | ||
617 | * __page_set_anon_rmap - setup new anonymous rmap | 728 | * __page_set_anon_rmap - setup new anonymous rmap |
618 | * @page: the page to add the mapping to | 729 | * @page: the page to add the mapping to |
619 | * @vma: the vm area in which the mapping is added | 730 | * @vma: the vm area in which the mapping is added |
@@ -652,9 +763,6 @@ static void __page_check_anon_rmap(struct page *page, | |||
652 | * are initially only visible via the pagetables, and the pte is locked | 763 | * are initially only visible via the pagetables, and the pte is locked |
653 | * over the call to page_add_new_anon_rmap. | 764 | * over the call to page_add_new_anon_rmap. |
654 | */ | 765 | */ |
655 | struct anon_vma *anon_vma = vma->anon_vma; | ||
656 | anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; | ||
657 | BUG_ON(page->mapping != (struct address_space *)anon_vma); | ||
658 | BUG_ON(page->index != linear_page_index(vma, address)); | 766 | BUG_ON(page->index != linear_page_index(vma, address)); |
659 | #endif | 767 | #endif |
660 | } | 768 | } |
@@ -815,9 +923,9 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, | |||
815 | 923 | ||
816 | if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) { | 924 | if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) { |
817 | if (PageAnon(page)) | 925 | if (PageAnon(page)) |
818 | dec_mm_counter(mm, anon_rss); | 926 | dec_mm_counter(mm, MM_ANONPAGES); |
819 | else | 927 | else |
820 | dec_mm_counter(mm, file_rss); | 928 | dec_mm_counter(mm, MM_FILEPAGES); |
821 | set_pte_at(mm, address, pte, | 929 | set_pte_at(mm, address, pte, |
822 | swp_entry_to_pte(make_hwpoison_entry(page))); | 930 | swp_entry_to_pte(make_hwpoison_entry(page))); |
823 | } else if (PageAnon(page)) { | 931 | } else if (PageAnon(page)) { |
@@ -839,7 +947,8 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, | |||
839 | list_add(&mm->mmlist, &init_mm.mmlist); | 947 | list_add(&mm->mmlist, &init_mm.mmlist); |
840 | spin_unlock(&mmlist_lock); | 948 | spin_unlock(&mmlist_lock); |
841 | } | 949 | } |
842 | dec_mm_counter(mm, anon_rss); | 950 | dec_mm_counter(mm, MM_ANONPAGES); |
951 | inc_mm_counter(mm, MM_SWAPENTS); | ||
843 | } else if (PAGE_MIGRATION) { | 952 | } else if (PAGE_MIGRATION) { |
844 | /* | 953 | /* |
845 | * Store the pfn of the page in a special migration | 954 | * Store the pfn of the page in a special migration |
@@ -857,7 +966,7 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, | |||
857 | entry = make_migration_entry(page, pte_write(pteval)); | 966 | entry = make_migration_entry(page, pte_write(pteval)); |
858 | set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); | 967 | set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); |
859 | } else | 968 | } else |
860 | dec_mm_counter(mm, file_rss); | 969 | dec_mm_counter(mm, MM_FILEPAGES); |
861 | 970 | ||
862 | page_remove_rmap(page); | 971 | page_remove_rmap(page); |
863 | page_cache_release(page); | 972 | page_cache_release(page); |
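try_to_unmap_one() now updates array-indexed per-mm counters (MM_ANONPAGES, MM_FILEPAGES) instead of the old anon_rss/file_rss fields, and charges a new MM_SWAPENTS counter when an anonymous pte is converted into a swap entry. A rough standalone sketch of that accounting, using a simplified counter array rather than the real mm_struct layout:

/* illustrative only: indices mirror the names used in the hunk above */
enum mm_counter_sketch { MMS_FILEPAGES, MMS_ANONPAGES, MMS_SWAPENTS, NR_MMS };

struct mm_sketch { long rss[NR_MMS]; };

static void swapout_account(struct mm_sketch *mm)
{
        mm->rss[MMS_ANONPAGES]--;   /* one less resident anonymous page     */
        mm->rss[MMS_SWAPENTS]++;    /* one more swap entry owned by this mm */
}

static void unmap_file_account(struct mm_sketch *mm)
{
        mm->rss[MMS_FILEPAGES]--;   /* file-backed pte dropped */
}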
@@ -996,7 +1105,7 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, | |||
996 | 1105 | ||
997 | page_remove_rmap(page); | 1106 | page_remove_rmap(page); |
998 | page_cache_release(page); | 1107 | page_cache_release(page); |
999 | dec_mm_counter(mm, file_rss); | 1108 | dec_mm_counter(mm, MM_FILEPAGES); |
1000 | (*mapcount)--; | 1109 | (*mapcount)--; |
1001 | } | 1110 | } |
1002 | pte_unmap_unlock(pte - 1, ptl); | 1111 | pte_unmap_unlock(pte - 1, ptl); |
@@ -1024,14 +1133,15 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, | |||
1024 | static int try_to_unmap_anon(struct page *page, enum ttu_flags flags) | 1133 | static int try_to_unmap_anon(struct page *page, enum ttu_flags flags) |
1025 | { | 1134 | { |
1026 | struct anon_vma *anon_vma; | 1135 | struct anon_vma *anon_vma; |
1027 | struct vm_area_struct *vma; | 1136 | struct anon_vma_chain *avc; |
1028 | int ret = SWAP_AGAIN; | 1137 | int ret = SWAP_AGAIN; |
1029 | 1138 | ||
1030 | anon_vma = page_lock_anon_vma(page); | 1139 | anon_vma = page_lock_anon_vma(page); |
1031 | if (!anon_vma) | 1140 | if (!anon_vma) |
1032 | return ret; | 1141 | return ret; |
1033 | 1142 | ||
1034 | list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { | 1143 | list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { |
1144 | struct vm_area_struct *vma = avc->vma; | ||
1035 | unsigned long address = vma_address(page, vma); | 1145 | unsigned long address = vma_address(page, vma); |
1036 | if (address == -EFAULT) | 1146 | if (address == -EFAULT) |
1037 | continue; | 1147 | continue; |
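try_to_unmap_anon() (and rmap_walk_anon() in the next hunk) no longer walk vmas linked directly into the anon_vma; they walk anon_vma_chain links and take one extra dereference to reach the vma. A hedged sketch of that pattern, using only the field names the hunks themselves show:

#include <linux/list.h>

struct vm_area_struct;

struct anon_vma_chain_sketch {
        struct vm_area_struct *vma;      /* the vma this link represents */
        struct list_head same_anon_vma;  /* linked into anon_vma->head   */
};

static int for_each_vma_of_anon_vma(struct list_head *head,
                                    int (*fn)(struct vm_area_struct *, void *),
                                    void *arg)
{
        struct anon_vma_chain_sketch *avc;
        int ret = 0;

        list_for_each_entry(avc, head, same_anon_vma) {
                ret = fn(avc->vma, arg);      /* extra hop: avc -> vma */
                if (ret)
                        break;
        }
        return ret;
}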
@@ -1222,7 +1332,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *, | |||
1222 | struct vm_area_struct *, unsigned long, void *), void *arg) | 1332 | struct vm_area_struct *, unsigned long, void *), void *arg) |
1223 | { | 1333 | { |
1224 | struct anon_vma *anon_vma; | 1334 | struct anon_vma *anon_vma; |
1225 | struct vm_area_struct *vma; | 1335 | struct anon_vma_chain *avc; |
1226 | int ret = SWAP_AGAIN; | 1336 | int ret = SWAP_AGAIN; |
1227 | 1337 | ||
1228 | /* | 1338 | /* |
@@ -1237,7 +1347,8 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *, | |||
1237 | if (!anon_vma) | 1347 | if (!anon_vma) |
1238 | return ret; | 1348 | return ret; |
1239 | spin_lock(&anon_vma->lock); | 1349 | spin_lock(&anon_vma->lock); |
1240 | list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { | 1350 | list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { |
1351 | struct vm_area_struct *vma = avc->vma; | ||
1241 | unsigned long address = vma_address(page, vma); | 1352 | unsigned long address = vma_address(page, vma); |
1242 | if (address == -EFAULT) | 1353 | if (address == -EFAULT) |
1243 | continue; | 1354 | continue; |
@@ -935,7 +935,6 @@ static int transfer_objects(struct array_cache *to, | |||
935 | 935 | ||
936 | from->avail -= nr; | 936 | from->avail -= nr; |
937 | to->avail += nr; | 937 | to->avail += nr; |
938 | to->touched = 1; | ||
939 | return nr; | 938 | return nr; |
940 | } | 939 | } |
941 | 940 | ||
@@ -983,13 +982,11 @@ static struct array_cache **alloc_alien_cache(int node, int limit, gfp_t gfp) | |||
983 | 982 | ||
984 | if (limit > 1) | 983 | if (limit > 1) |
985 | limit = 12; | 984 | limit = 12; |
986 | ac_ptr = kmalloc_node(memsize, gfp, node); | 985 | ac_ptr = kzalloc_node(memsize, gfp, node); |
987 | if (ac_ptr) { | 986 | if (ac_ptr) { |
988 | for_each_node(i) { | 987 | for_each_node(i) { |
989 | if (i == node || !node_online(i)) { | 988 | if (i == node || !node_online(i)) |
990 | ac_ptr[i] = NULL; | ||
991 | continue; | 989 | continue; |
992 | } | ||
993 | ac_ptr[i] = alloc_arraycache(node, limit, 0xbaadf00d, gfp); | 990 | ac_ptr[i] = alloc_arraycache(node, limit, 0xbaadf00d, gfp); |
994 | if (!ac_ptr[i]) { | 991 | if (!ac_ptr[i]) { |
995 | for (i--; i >= 0; i--) | 992 | for (i--; i >= 0; i--) |
@@ -2963,8 +2960,10 @@ retry: | |||
2963 | spin_lock(&l3->list_lock); | 2960 | spin_lock(&l3->list_lock); |
2964 | 2961 | ||
2965 | /* See if we can refill from the shared array */ | 2962 | /* See if we can refill from the shared array */ |
2966 | if (l3->shared && transfer_objects(ac, l3->shared, batchcount)) | 2963 | if (l3->shared && transfer_objects(ac, l3->shared, batchcount)) { |
2964 | l3->shared->touched = 1; | ||
2967 | goto alloc_done; | 2965 | goto alloc_done; |
2966 | } | ||
2968 | 2967 | ||
2969 | while (batchcount > 0) { | 2968 | while (batchcount > 0) { |
2970 | struct list_head *entry; | 2969 | struct list_head *entry; |
@@ -3101,7 +3100,7 @@ static bool slab_should_failslab(struct kmem_cache *cachep, gfp_t flags) | |||
3101 | if (cachep == &cache_cache) | 3100 | if (cachep == &cache_cache) |
3102 | return false; | 3101 | return false; |
3103 | 3102 | ||
3104 | return should_failslab(obj_size(cachep), flags); | 3103 | return should_failslab(obj_size(cachep), flags, cachep->flags); |
3105 | } | 3104 | } |
3106 | 3105 | ||
3107 | static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags) | 3106 | static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags) |
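slab_should_failslab() (and the SLUB path further below) now passes the cache flags to should_failslab(), which allows fault injection to be limited to caches that opted in via SLAB_FAILSLAB, e.g. through slub_debug=a or the failslab sysfs attribute added later in this diff. The failslab internals are not part of this diff; the gate below, including its cache_filter knob, is an assumed illustration only:

#include <linux/gfp.h>
#include <linux/slab.h>

/* hypothetical sketch of the per-cache gate, not the real failslab code */
static bool should_failslab_sketch(size_t size, gfp_t gfpflags,
                                   unsigned long cache_flags,
                                   bool cache_filter)
{
        if (gfpflags & __GFP_NOFAIL)
                return false;            /* never inject into NOFAIL allocs */

        if (cache_filter && !(cache_flags & SLAB_FAILSLAB))
                return false;            /* this cache did not opt in       */

        return true;                     /* otherwise defer to the usual
                                          * fault-injection probability     */
}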
@@ -151,7 +151,8 @@ | |||
151 | * Set of flags that will prevent slab merging | 151 | * Set of flags that will prevent slab merging |
152 | */ | 152 | */ |
153 | #define SLUB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \ | 153 | #define SLUB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \ |
154 | SLAB_TRACE | SLAB_DESTROY_BY_RCU | SLAB_NOLEAKTRACE) | 154 | SLAB_TRACE | SLAB_DESTROY_BY_RCU | SLAB_NOLEAKTRACE | \ |
155 | SLAB_FAILSLAB) | ||
155 | 156 | ||
156 | #define SLUB_MERGE_SAME (SLAB_DEBUG_FREE | SLAB_RECLAIM_ACCOUNT | \ | 157 | #define SLUB_MERGE_SAME (SLAB_DEBUG_FREE | SLAB_RECLAIM_ACCOUNT | \ |
157 | SLAB_CACHE_DMA | SLAB_NOTRACK) | 158 | SLAB_CACHE_DMA | SLAB_NOTRACK) |
@@ -217,10 +218,10 @@ static inline void sysfs_slab_remove(struct kmem_cache *s) | |||
217 | 218 | ||
218 | #endif | 219 | #endif |
219 | 220 | ||
220 | static inline void stat(struct kmem_cache_cpu *c, enum stat_item si) | 221 | static inline void stat(struct kmem_cache *s, enum stat_item si) |
221 | { | 222 | { |
222 | #ifdef CONFIG_SLUB_STATS | 223 | #ifdef CONFIG_SLUB_STATS |
223 | c->stat[si]++; | 224 | __this_cpu_inc(s->cpu_slab->stat[si]); |
224 | #endif | 225 | #endif |
225 | } | 226 | } |
226 | 227 | ||
@@ -242,15 +243,6 @@ static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node) | |||
242 | #endif | 243 | #endif |
243 | } | 244 | } |
244 | 245 | ||
245 | static inline struct kmem_cache_cpu *get_cpu_slab(struct kmem_cache *s, int cpu) | ||
246 | { | ||
247 | #ifdef CONFIG_SMP | ||
248 | return s->cpu_slab[cpu]; | ||
249 | #else | ||
250 | return &s->cpu_slab; | ||
251 | #endif | ||
252 | } | ||
253 | |||
254 | /* Verify that a pointer has an address that is valid within a slab page */ | 246 | /* Verify that a pointer has an address that is valid within a slab page */ |
255 | static inline int check_valid_pointer(struct kmem_cache *s, | 247 | static inline int check_valid_pointer(struct kmem_cache *s, |
256 | struct page *page, const void *object) | 248 | struct page *page, const void *object) |
@@ -269,13 +261,6 @@ static inline int check_valid_pointer(struct kmem_cache *s, | |||
269 | return 1; | 261 | return 1; |
270 | } | 262 | } |
271 | 263 | ||
272 | /* | ||
273 | * Slow version of get and set free pointer. | ||
274 | * | ||
275 | * This version requires touching the cache lines of kmem_cache which | ||
276 | * we avoid to do in the fast alloc free paths. There we obtain the offset | ||
277 | * from the page struct. | ||
278 | */ | ||
279 | static inline void *get_freepointer(struct kmem_cache *s, void *object) | 264 | static inline void *get_freepointer(struct kmem_cache *s, void *object) |
280 | { | 265 | { |
281 | return *(void **)(object + s->offset); | 266 | return *(void **)(object + s->offset); |
@@ -1020,6 +1005,9 @@ static int __init setup_slub_debug(char *str) | |||
1020 | case 't': | 1005 | case 't': |
1021 | slub_debug |= SLAB_TRACE; | 1006 | slub_debug |= SLAB_TRACE; |
1022 | break; | 1007 | break; |
1008 | case 'a': | ||
1009 | slub_debug |= SLAB_FAILSLAB; | ||
1010 | break; | ||
1023 | default: | 1011 | default: |
1024 | printk(KERN_ERR "slub_debug option '%c' " | 1012 | printk(KERN_ERR "slub_debug option '%c' " |
1025 | "unknown. skipped\n", *str); | 1013 | "unknown. skipped\n", *str); |
@@ -1124,7 +1112,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) | |||
1124 | if (!page) | 1112 | if (!page) |
1125 | return NULL; | 1113 | return NULL; |
1126 | 1114 | ||
1127 | stat(get_cpu_slab(s, raw_smp_processor_id()), ORDER_FALLBACK); | 1115 | stat(s, ORDER_FALLBACK); |
1128 | } | 1116 | } |
1129 | 1117 | ||
1130 | if (kmemcheck_enabled | 1118 | if (kmemcheck_enabled |
@@ -1422,23 +1410,22 @@ static struct page *get_partial(struct kmem_cache *s, gfp_t flags, int node) | |||
1422 | static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail) | 1410 | static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail) |
1423 | { | 1411 | { |
1424 | struct kmem_cache_node *n = get_node(s, page_to_nid(page)); | 1412 | struct kmem_cache_node *n = get_node(s, page_to_nid(page)); |
1425 | struct kmem_cache_cpu *c = get_cpu_slab(s, smp_processor_id()); | ||
1426 | 1413 | ||
1427 | __ClearPageSlubFrozen(page); | 1414 | __ClearPageSlubFrozen(page); |
1428 | if (page->inuse) { | 1415 | if (page->inuse) { |
1429 | 1416 | ||
1430 | if (page->freelist) { | 1417 | if (page->freelist) { |
1431 | add_partial(n, page, tail); | 1418 | add_partial(n, page, tail); |
1432 | stat(c, tail ? DEACTIVATE_TO_TAIL : DEACTIVATE_TO_HEAD); | 1419 | stat(s, tail ? DEACTIVATE_TO_TAIL : DEACTIVATE_TO_HEAD); |
1433 | } else { | 1420 | } else { |
1434 | stat(c, DEACTIVATE_FULL); | 1421 | stat(s, DEACTIVATE_FULL); |
1435 | if (SLABDEBUG && PageSlubDebug(page) && | 1422 | if (SLABDEBUG && PageSlubDebug(page) && |
1436 | (s->flags & SLAB_STORE_USER)) | 1423 | (s->flags & SLAB_STORE_USER)) |
1437 | add_full(n, page); | 1424 | add_full(n, page); |
1438 | } | 1425 | } |
1439 | slab_unlock(page); | 1426 | slab_unlock(page); |
1440 | } else { | 1427 | } else { |
1441 | stat(c, DEACTIVATE_EMPTY); | 1428 | stat(s, DEACTIVATE_EMPTY); |
1442 | if (n->nr_partial < s->min_partial) { | 1429 | if (n->nr_partial < s->min_partial) { |
1443 | /* | 1430 | /* |
1444 | * Adding an empty slab to the partial slabs in order | 1431 | * Adding an empty slab to the partial slabs in order |
@@ -1454,7 +1441,7 @@ static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail) | |||
1454 | slab_unlock(page); | 1441 | slab_unlock(page); |
1455 | } else { | 1442 | } else { |
1456 | slab_unlock(page); | 1443 | slab_unlock(page); |
1457 | stat(get_cpu_slab(s, raw_smp_processor_id()), FREE_SLAB); | 1444 | stat(s, FREE_SLAB); |
1458 | discard_slab(s, page); | 1445 | discard_slab(s, page); |
1459 | } | 1446 | } |
1460 | } | 1447 | } |
@@ -1469,7 +1456,7 @@ static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) | |||
1469 | int tail = 1; | 1456 | int tail = 1; |
1470 | 1457 | ||
1471 | if (page->freelist) | 1458 | if (page->freelist) |
1472 | stat(c, DEACTIVATE_REMOTE_FREES); | 1459 | stat(s, DEACTIVATE_REMOTE_FREES); |
1473 | /* | 1460 | /* |
1474 | * Merge cpu freelist into slab freelist. Typically we get here | 1461 | * Merge cpu freelist into slab freelist. Typically we get here |
1475 | * because both freelists are empty. So this is unlikely | 1462 | * because both freelists are empty. So this is unlikely |
@@ -1482,10 +1469,10 @@ static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) | |||
1482 | 1469 | ||
1483 | /* Retrieve object from cpu_freelist */ | 1470 | /* Retrieve object from cpu_freelist */ |
1484 | object = c->freelist; | 1471 | object = c->freelist; |
1485 | c->freelist = c->freelist[c->offset]; | 1472 | c->freelist = get_freepointer(s, c->freelist); |
1486 | 1473 | ||
1487 | /* And put onto the regular freelist */ | 1474 | /* And put onto the regular freelist */ |
1488 | object[c->offset] = page->freelist; | 1475 | set_freepointer(s, object, page->freelist); |
1489 | page->freelist = object; | 1476 | page->freelist = object; |
1490 | page->inuse--; | 1477 | page->inuse--; |
1491 | } | 1478 | } |
@@ -1495,7 +1482,7 @@ static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) | |||
1495 | 1482 | ||
1496 | static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) | 1483 | static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) |
1497 | { | 1484 | { |
1498 | stat(c, CPUSLAB_FLUSH); | 1485 | stat(s, CPUSLAB_FLUSH); |
1499 | slab_lock(c->page); | 1486 | slab_lock(c->page); |
1500 | deactivate_slab(s, c); | 1487 | deactivate_slab(s, c); |
1501 | } | 1488 | } |
@@ -1507,7 +1494,7 @@ static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) | |||
1507 | */ | 1494 | */ |
1508 | static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu) | 1495 | static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu) |
1509 | { | 1496 | { |
1510 | struct kmem_cache_cpu *c = get_cpu_slab(s, cpu); | 1497 | struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); |
1511 | 1498 | ||
1512 | if (likely(c && c->page)) | 1499 | if (likely(c && c->page)) |
1513 | flush_slab(s, c); | 1500 | flush_slab(s, c); |
@@ -1635,7 +1622,7 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, | |||
1635 | if (unlikely(!node_match(c, node))) | 1622 | if (unlikely(!node_match(c, node))) |
1636 | goto another_slab; | 1623 | goto another_slab; |
1637 | 1624 | ||
1638 | stat(c, ALLOC_REFILL); | 1625 | stat(s, ALLOC_REFILL); |
1639 | 1626 | ||
1640 | load_freelist: | 1627 | load_freelist: |
1641 | object = c->page->freelist; | 1628 | object = c->page->freelist; |
@@ -1644,13 +1631,13 @@ load_freelist: | |||
1644 | if (unlikely(SLABDEBUG && PageSlubDebug(c->page))) | 1631 | if (unlikely(SLABDEBUG && PageSlubDebug(c->page))) |
1645 | goto debug; | 1632 | goto debug; |
1646 | 1633 | ||
1647 | c->freelist = object[c->offset]; | 1634 | c->freelist = get_freepointer(s, object); |
1648 | c->page->inuse = c->page->objects; | 1635 | c->page->inuse = c->page->objects; |
1649 | c->page->freelist = NULL; | 1636 | c->page->freelist = NULL; |
1650 | c->node = page_to_nid(c->page); | 1637 | c->node = page_to_nid(c->page); |
1651 | unlock_out: | 1638 | unlock_out: |
1652 | slab_unlock(c->page); | 1639 | slab_unlock(c->page); |
1653 | stat(c, ALLOC_SLOWPATH); | 1640 | stat(s, ALLOC_SLOWPATH); |
1654 | return object; | 1641 | return object; |
1655 | 1642 | ||
1656 | another_slab: | 1643 | another_slab: |
@@ -1660,7 +1647,7 @@ new_slab: | |||
1660 | new = get_partial(s, gfpflags, node); | 1647 | new = get_partial(s, gfpflags, node); |
1661 | if (new) { | 1648 | if (new) { |
1662 | c->page = new; | 1649 | c->page = new; |
1663 | stat(c, ALLOC_FROM_PARTIAL); | 1650 | stat(s, ALLOC_FROM_PARTIAL); |
1664 | goto load_freelist; | 1651 | goto load_freelist; |
1665 | } | 1652 | } |
1666 | 1653 | ||
@@ -1673,8 +1660,8 @@ new_slab: | |||
1673 | local_irq_disable(); | 1660 | local_irq_disable(); |
1674 | 1661 | ||
1675 | if (new) { | 1662 | if (new) { |
1676 | c = get_cpu_slab(s, smp_processor_id()); | 1663 | c = __this_cpu_ptr(s->cpu_slab); |
1677 | stat(c, ALLOC_SLAB); | 1664 | stat(s, ALLOC_SLAB); |
1678 | if (c->page) | 1665 | if (c->page) |
1679 | flush_slab(s, c); | 1666 | flush_slab(s, c); |
1680 | slab_lock(new); | 1667 | slab_lock(new); |
@@ -1690,7 +1677,7 @@ debug: | |||
1690 | goto another_slab; | 1677 | goto another_slab; |
1691 | 1678 | ||
1692 | c->page->inuse++; | 1679 | c->page->inuse++; |
1693 | c->page->freelist = object[c->offset]; | 1680 | c->page->freelist = get_freepointer(s, object); |
1694 | c->node = -1; | 1681 | c->node = -1; |
1695 | goto unlock_out; | 1682 | goto unlock_out; |
1696 | } | 1683 | } |
@@ -1711,35 +1698,33 @@ static __always_inline void *slab_alloc(struct kmem_cache *s, | |||
1711 | void **object; | 1698 | void **object; |
1712 | struct kmem_cache_cpu *c; | 1699 | struct kmem_cache_cpu *c; |
1713 | unsigned long flags; | 1700 | unsigned long flags; |
1714 | unsigned int objsize; | ||
1715 | 1701 | ||
1716 | gfpflags &= gfp_allowed_mask; | 1702 | gfpflags &= gfp_allowed_mask; |
1717 | 1703 | ||
1718 | lockdep_trace_alloc(gfpflags); | 1704 | lockdep_trace_alloc(gfpflags); |
1719 | might_sleep_if(gfpflags & __GFP_WAIT); | 1705 | might_sleep_if(gfpflags & __GFP_WAIT); |
1720 | 1706 | ||
1721 | if (should_failslab(s->objsize, gfpflags)) | 1707 | if (should_failslab(s->objsize, gfpflags, s->flags)) |
1722 | return NULL; | 1708 | return NULL; |
1723 | 1709 | ||
1724 | local_irq_save(flags); | 1710 | local_irq_save(flags); |
1725 | c = get_cpu_slab(s, smp_processor_id()); | 1711 | c = __this_cpu_ptr(s->cpu_slab); |
1726 | objsize = c->objsize; | 1712 | object = c->freelist; |
1727 | if (unlikely(!c->freelist || !node_match(c, node))) | 1713 | if (unlikely(!object || !node_match(c, node))) |
1728 | 1714 | ||
1729 | object = __slab_alloc(s, gfpflags, node, addr, c); | 1715 | object = __slab_alloc(s, gfpflags, node, addr, c); |
1730 | 1716 | ||
1731 | else { | 1717 | else { |
1732 | object = c->freelist; | 1718 | c->freelist = get_freepointer(s, object); |
1733 | c->freelist = object[c->offset]; | 1719 | stat(s, ALLOC_FASTPATH); |
1734 | stat(c, ALLOC_FASTPATH); | ||
1735 | } | 1720 | } |
1736 | local_irq_restore(flags); | 1721 | local_irq_restore(flags); |
1737 | 1722 | ||
1738 | if (unlikely(gfpflags & __GFP_ZERO) && object) | 1723 | if (unlikely(gfpflags & __GFP_ZERO) && object) |
1739 | memset(object, 0, objsize); | 1724 | memset(object, 0, s->objsize); |
1740 | 1725 | ||
1741 | kmemcheck_slab_alloc(s, gfpflags, object, c->objsize); | 1726 | kmemcheck_slab_alloc(s, gfpflags, object, s->objsize); |
1742 | kmemleak_alloc_recursive(object, objsize, 1, s->flags, gfpflags); | 1727 | kmemleak_alloc_recursive(object, s->objsize, 1, s->flags, gfpflags); |
1743 | 1728 | ||
1744 | return object; | 1729 | return object; |
1745 | } | 1730 | } |
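With objsize and offset now taken straight from the kmem_cache, the SLUB allocation fastpath reduces to popping the head of the per-cpu freelist via get_freepointer(). A minimal standalone sketch of that pop, with plain pointers and none of the per-cpu or irq handling:

#include <stddef.h>

/* illustrative freelist pop: each free object stores the pointer to the
 * next free object at byte offset 'offset' inside itself */
static void *freelist_pop(void **freelist, size_t offset)
{
        void *object = *freelist;

        if (object)
                *freelist = *(void **)((char *)object + offset);
        return object;
}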
@@ -1794,26 +1779,25 @@ EXPORT_SYMBOL(kmem_cache_alloc_node_notrace); | |||
1794 | * handling required then we can return immediately. | 1779 | * handling required then we can return immediately. |
1795 | */ | 1780 | */ |
1796 | static void __slab_free(struct kmem_cache *s, struct page *page, | 1781 | static void __slab_free(struct kmem_cache *s, struct page *page, |
1797 | void *x, unsigned long addr, unsigned int offset) | 1782 | void *x, unsigned long addr) |
1798 | { | 1783 | { |
1799 | void *prior; | 1784 | void *prior; |
1800 | void **object = (void *)x; | 1785 | void **object = (void *)x; |
1801 | struct kmem_cache_cpu *c; | ||
1802 | 1786 | ||
1803 | c = get_cpu_slab(s, raw_smp_processor_id()); | 1787 | stat(s, FREE_SLOWPATH); |
1804 | stat(c, FREE_SLOWPATH); | ||
1805 | slab_lock(page); | 1788 | slab_lock(page); |
1806 | 1789 | ||
1807 | if (unlikely(SLABDEBUG && PageSlubDebug(page))) | 1790 | if (unlikely(SLABDEBUG && PageSlubDebug(page))) |
1808 | goto debug; | 1791 | goto debug; |
1809 | 1792 | ||
1810 | checks_ok: | 1793 | checks_ok: |
1811 | prior = object[offset] = page->freelist; | 1794 | prior = page->freelist; |
1795 | set_freepointer(s, object, prior); | ||
1812 | page->freelist = object; | 1796 | page->freelist = object; |
1813 | page->inuse--; | 1797 | page->inuse--; |
1814 | 1798 | ||
1815 | if (unlikely(PageSlubFrozen(page))) { | 1799 | if (unlikely(PageSlubFrozen(page))) { |
1816 | stat(c, FREE_FROZEN); | 1800 | stat(s, FREE_FROZEN); |
1817 | goto out_unlock; | 1801 | goto out_unlock; |
1818 | } | 1802 | } |
1819 | 1803 | ||
@@ -1826,7 +1810,7 @@ checks_ok: | |||
1826 | */ | 1810 | */ |
1827 | if (unlikely(!prior)) { | 1811 | if (unlikely(!prior)) { |
1828 | add_partial(get_node(s, page_to_nid(page)), page, 1); | 1812 | add_partial(get_node(s, page_to_nid(page)), page, 1); |
1829 | stat(c, FREE_ADD_PARTIAL); | 1813 | stat(s, FREE_ADD_PARTIAL); |
1830 | } | 1814 | } |
1831 | 1815 | ||
1832 | out_unlock: | 1816 | out_unlock: |
@@ -1839,10 +1823,10 @@ slab_empty: | |||
1839 | * Slab still on the partial list. | 1823 | * Slab still on the partial list. |
1840 | */ | 1824 | */ |
1841 | remove_partial(s, page); | 1825 | remove_partial(s, page); |
1842 | stat(c, FREE_REMOVE_PARTIAL); | 1826 | stat(s, FREE_REMOVE_PARTIAL); |
1843 | } | 1827 | } |
1844 | slab_unlock(page); | 1828 | slab_unlock(page); |
1845 | stat(c, FREE_SLAB); | 1829 | stat(s, FREE_SLAB); |
1846 | discard_slab(s, page); | 1830 | discard_slab(s, page); |
1847 | return; | 1831 | return; |
1848 | 1832 | ||
@@ -1872,17 +1856,17 @@ static __always_inline void slab_free(struct kmem_cache *s, | |||
1872 | 1856 | ||
1873 | kmemleak_free_recursive(x, s->flags); | 1857 | kmemleak_free_recursive(x, s->flags); |
1874 | local_irq_save(flags); | 1858 | local_irq_save(flags); |
1875 | c = get_cpu_slab(s, smp_processor_id()); | 1859 | c = __this_cpu_ptr(s->cpu_slab); |
1876 | kmemcheck_slab_free(s, object, c->objsize); | 1860 | kmemcheck_slab_free(s, object, s->objsize); |
1877 | debug_check_no_locks_freed(object, c->objsize); | 1861 | debug_check_no_locks_freed(object, s->objsize); |
1878 | if (!(s->flags & SLAB_DEBUG_OBJECTS)) | 1862 | if (!(s->flags & SLAB_DEBUG_OBJECTS)) |
1879 | debug_check_no_obj_freed(object, c->objsize); | 1863 | debug_check_no_obj_freed(object, s->objsize); |
1880 | if (likely(page == c->page && c->node >= 0)) { | 1864 | if (likely(page == c->page && c->node >= 0)) { |
1881 | object[c->offset] = c->freelist; | 1865 | set_freepointer(s, object, c->freelist); |
1882 | c->freelist = object; | 1866 | c->freelist = object; |
1883 | stat(c, FREE_FASTPATH); | 1867 | stat(s, FREE_FASTPATH); |
1884 | } else | 1868 | } else |
1885 | __slab_free(s, page, x, addr, c->offset); | 1869 | __slab_free(s, page, x, addr); |
1886 | 1870 | ||
1887 | local_irq_restore(flags); | 1871 | local_irq_restore(flags); |
1888 | } | 1872 | } |
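slab_free() is the mirror image: set_freepointer() links the object back in front of the per-cpu freelist. The matching push, again as a bare sketch:

#include <stddef.h>

/* illustrative freelist push: store the old head inside the freed object,
 * then make the object the new head */
static void freelist_push(void **freelist, void *object, size_t offset)
{
        *(void **)((char *)object + offset) = *freelist;
        *freelist = object;
}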
@@ -2069,19 +2053,6 @@ static unsigned long calculate_alignment(unsigned long flags, | |||
2069 | return ALIGN(align, sizeof(void *)); | 2053 | return ALIGN(align, sizeof(void *)); |
2070 | } | 2054 | } |
2071 | 2055 | ||
2072 | static void init_kmem_cache_cpu(struct kmem_cache *s, | ||
2073 | struct kmem_cache_cpu *c) | ||
2074 | { | ||
2075 | c->page = NULL; | ||
2076 | c->freelist = NULL; | ||
2077 | c->node = 0; | ||
2078 | c->offset = s->offset / sizeof(void *); | ||
2079 | c->objsize = s->objsize; | ||
2080 | #ifdef CONFIG_SLUB_STATS | ||
2081 | memset(c->stat, 0, NR_SLUB_STAT_ITEMS * sizeof(unsigned)); | ||
2082 | #endif | ||
2083 | } | ||
2084 | |||
2085 | static void | 2056 | static void |
2086 | init_kmem_cache_node(struct kmem_cache_node *n, struct kmem_cache *s) | 2057 | init_kmem_cache_node(struct kmem_cache_node *n, struct kmem_cache *s) |
2087 | { | 2058 | { |
@@ -2095,130 +2066,24 @@ init_kmem_cache_node(struct kmem_cache_node *n, struct kmem_cache *s) | |||
2095 | #endif | 2066 | #endif |
2096 | } | 2067 | } |
2097 | 2068 | ||
2098 | #ifdef CONFIG_SMP | 2069 | static DEFINE_PER_CPU(struct kmem_cache_cpu, kmalloc_percpu[KMALLOC_CACHES]); |
2099 | /* | ||
2100 | * Per cpu array for per cpu structures. | ||
2101 | * | ||
2102 | * The per cpu array places all kmem_cache_cpu structures from one processor | ||
2103 | * close together meaning that it becomes possible that multiple per cpu | ||
2104 | * structures are contained in one cacheline. This may be particularly | ||
2105 | * beneficial for the kmalloc caches. | ||
2106 | * | ||
2107 | * A desktop system typically has around 60-80 slabs. With 100 here we are | ||
2108 | * likely able to get per cpu structures for all caches from the array defined | ||
2109 | * here. We must be able to cover all kmalloc caches during bootstrap. | ||
2110 | * | ||
2111 | * If the per cpu array is exhausted then fall back to kmalloc | ||
2112 | * of individual cachelines. No sharing is possible then. | ||
2113 | */ | ||
2114 | #define NR_KMEM_CACHE_CPU 100 | ||
2115 | |||
2116 | static DEFINE_PER_CPU(struct kmem_cache_cpu [NR_KMEM_CACHE_CPU], | ||
2117 | kmem_cache_cpu); | ||
2118 | |||
2119 | static DEFINE_PER_CPU(struct kmem_cache_cpu *, kmem_cache_cpu_free); | ||
2120 | static DECLARE_BITMAP(kmem_cach_cpu_free_init_once, CONFIG_NR_CPUS); | ||
2121 | |||
2122 | static struct kmem_cache_cpu *alloc_kmem_cache_cpu(struct kmem_cache *s, | ||
2123 | int cpu, gfp_t flags) | ||
2124 | { | ||
2125 | struct kmem_cache_cpu *c = per_cpu(kmem_cache_cpu_free, cpu); | ||
2126 | |||
2127 | if (c) | ||
2128 | per_cpu(kmem_cache_cpu_free, cpu) = | ||
2129 | (void *)c->freelist; | ||
2130 | else { | ||
2131 | /* Table overflow: So allocate ourselves */ | ||
2132 | c = kmalloc_node( | ||
2133 | ALIGN(sizeof(struct kmem_cache_cpu), cache_line_size()), | ||
2134 | flags, cpu_to_node(cpu)); | ||
2135 | if (!c) | ||
2136 | return NULL; | ||
2137 | } | ||
2138 | |||
2139 | init_kmem_cache_cpu(s, c); | ||
2140 | return c; | ||
2141 | } | ||
2142 | |||
2143 | static void free_kmem_cache_cpu(struct kmem_cache_cpu *c, int cpu) | ||
2144 | { | ||
2145 | if (c < per_cpu(kmem_cache_cpu, cpu) || | ||
2146 | c >= per_cpu(kmem_cache_cpu, cpu) + NR_KMEM_CACHE_CPU) { | ||
2147 | kfree(c); | ||
2148 | return; | ||
2149 | } | ||
2150 | c->freelist = (void *)per_cpu(kmem_cache_cpu_free, cpu); | ||
2151 | per_cpu(kmem_cache_cpu_free, cpu) = c; | ||
2152 | } | ||
2153 | |||
2154 | static void free_kmem_cache_cpus(struct kmem_cache *s) | ||
2155 | { | ||
2156 | int cpu; | ||
2157 | |||
2158 | for_each_online_cpu(cpu) { | ||
2159 | struct kmem_cache_cpu *c = get_cpu_slab(s, cpu); | ||
2160 | |||
2161 | if (c) { | ||
2162 | s->cpu_slab[cpu] = NULL; | ||
2163 | free_kmem_cache_cpu(c, cpu); | ||
2164 | } | ||
2165 | } | ||
2166 | } | ||
2167 | |||
2168 | static int alloc_kmem_cache_cpus(struct kmem_cache *s, gfp_t flags) | ||
2169 | { | ||
2170 | int cpu; | ||
2171 | |||
2172 | for_each_online_cpu(cpu) { | ||
2173 | struct kmem_cache_cpu *c = get_cpu_slab(s, cpu); | ||
2174 | |||
2175 | if (c) | ||
2176 | continue; | ||
2177 | |||
2178 | c = alloc_kmem_cache_cpu(s, cpu, flags); | ||
2179 | if (!c) { | ||
2180 | free_kmem_cache_cpus(s); | ||
2181 | return 0; | ||
2182 | } | ||
2183 | s->cpu_slab[cpu] = c; | ||
2184 | } | ||
2185 | return 1; | ||
2186 | } | ||
2187 | |||
2188 | /* | ||
2189 | * Initialize the per cpu array. | ||
2190 | */ | ||
2191 | static void init_alloc_cpu_cpu(int cpu) | ||
2192 | { | ||
2193 | int i; | ||
2194 | 2070 | ||
2195 | if (cpumask_test_cpu(cpu, to_cpumask(kmem_cach_cpu_free_init_once))) | 2071 | static inline int alloc_kmem_cache_cpus(struct kmem_cache *s, gfp_t flags) |
2196 | return; | ||
2197 | |||
2198 | for (i = NR_KMEM_CACHE_CPU - 1; i >= 0; i--) | ||
2199 | free_kmem_cache_cpu(&per_cpu(kmem_cache_cpu, cpu)[i], cpu); | ||
2200 | |||
2201 | cpumask_set_cpu(cpu, to_cpumask(kmem_cach_cpu_free_init_once)); | ||
2202 | } | ||
2203 | |||
2204 | static void __init init_alloc_cpu(void) | ||
2205 | { | 2072 | { |
2206 | int cpu; | 2073 | if (s < kmalloc_caches + KMALLOC_CACHES && s >= kmalloc_caches) |
2207 | 2074 | /* | |
2208 | for_each_online_cpu(cpu) | 2075 | * Boot time creation of the kmalloc array. Use static per cpu data |
2209 | init_alloc_cpu_cpu(cpu); | 2076 | * since the per cpu allocator is not available yet. |
2210 | } | 2077 | */ |
2078 | s->cpu_slab = kmalloc_percpu + (s - kmalloc_caches); | ||
2079 | else | ||
2080 | s->cpu_slab = alloc_percpu(struct kmem_cache_cpu); | ||
2211 | 2081 | ||
2212 | #else | 2082 | if (!s->cpu_slab) |
2213 | static inline void free_kmem_cache_cpus(struct kmem_cache *s) {} | 2083 | return 0; |
2214 | static inline void init_alloc_cpu(void) {} | ||
2215 | 2084 | ||
2216 | static inline int alloc_kmem_cache_cpus(struct kmem_cache *s, gfp_t flags) | ||
2217 | { | ||
2218 | init_kmem_cache_cpu(s, &s->cpu_slab); | ||
2219 | return 1; | 2085 | return 1; |
2220 | } | 2086 | } |
2221 | #endif | ||
2222 | 2087 | ||
2223 | #ifdef CONFIG_NUMA | 2088 | #ifdef CONFIG_NUMA |
2224 | /* | 2089 | /* |
@@ -2287,7 +2152,8 @@ static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags) | |||
2287 | int node; | 2152 | int node; |
2288 | int local_node; | 2153 | int local_node; |
2289 | 2154 | ||
2290 | if (slab_state >= UP) | 2155 | if (slab_state >= UP && (s < kmalloc_caches || |
2156 | s > kmalloc_caches + KMALLOC_CACHES)) | ||
2291 | local_node = page_to_nid(virt_to_page(s)); | 2157 | local_node = page_to_nid(virt_to_page(s)); |
2292 | else | 2158 | else |
2293 | local_node = 0; | 2159 | local_node = 0; |
@@ -2502,6 +2368,7 @@ static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags, | |||
2502 | 2368 | ||
2503 | if (alloc_kmem_cache_cpus(s, gfpflags & ~SLUB_DMA)) | 2369 | if (alloc_kmem_cache_cpus(s, gfpflags & ~SLUB_DMA)) |
2504 | return 1; | 2370 | return 1; |
2371 | |||
2505 | free_kmem_cache_nodes(s); | 2372 | free_kmem_cache_nodes(s); |
2506 | error: | 2373 | error: |
2507 | if (flags & SLAB_PANIC) | 2374 | if (flags & SLAB_PANIC) |
@@ -2609,9 +2476,8 @@ static inline int kmem_cache_close(struct kmem_cache *s) | |||
2609 | int node; | 2476 | int node; |
2610 | 2477 | ||
2611 | flush_all(s); | 2478 | flush_all(s); |
2612 | 2479 | free_percpu(s->cpu_slab); | |
2613 | /* Attempt to free all objects */ | 2480 | /* Attempt to free all objects */ |
2614 | free_kmem_cache_cpus(s); | ||
2615 | for_each_node_state(node, N_NORMAL_MEMORY) { | 2481 | for_each_node_state(node, N_NORMAL_MEMORY) { |
2616 | struct kmem_cache_node *n = get_node(s, node); | 2482 | struct kmem_cache_node *n = get_node(s, node); |
2617 | 2483 | ||
@@ -2651,7 +2517,7 @@ EXPORT_SYMBOL(kmem_cache_destroy); | |||
2651 | * Kmalloc subsystem | 2517 | * Kmalloc subsystem |
2652 | *******************************************************************/ | 2518 | *******************************************************************/ |
2653 | 2519 | ||
2654 | struct kmem_cache kmalloc_caches[SLUB_PAGE_SHIFT] __cacheline_aligned; | 2520 | struct kmem_cache kmalloc_caches[KMALLOC_CACHES] __cacheline_aligned; |
2655 | EXPORT_SYMBOL(kmalloc_caches); | 2521 | EXPORT_SYMBOL(kmalloc_caches); |
2656 | 2522 | ||
2657 | static int __init setup_slub_min_order(char *str) | 2523 | static int __init setup_slub_min_order(char *str) |
@@ -2741,6 +2607,7 @@ static noinline struct kmem_cache *dma_kmalloc_cache(int index, gfp_t flags) | |||
2741 | char *text; | 2607 | char *text; |
2742 | size_t realsize; | 2608 | size_t realsize; |
2743 | unsigned long slabflags; | 2609 | unsigned long slabflags; |
2610 | int i; | ||
2744 | 2611 | ||
2745 | s = kmalloc_caches_dma[index]; | 2612 | s = kmalloc_caches_dma[index]; |
2746 | if (s) | 2613 | if (s) |
@@ -2760,7 +2627,14 @@ static noinline struct kmem_cache *dma_kmalloc_cache(int index, gfp_t flags) | |||
2760 | realsize = kmalloc_caches[index].objsize; | 2627 | realsize = kmalloc_caches[index].objsize; |
2761 | text = kasprintf(flags & ~SLUB_DMA, "kmalloc_dma-%d", | 2628 | text = kasprintf(flags & ~SLUB_DMA, "kmalloc_dma-%d", |
2762 | (unsigned int)realsize); | 2629 | (unsigned int)realsize); |
2763 | s = kmalloc(kmem_size, flags & ~SLUB_DMA); | 2630 | |
2631 | s = NULL; | ||
2632 | for (i = 0; i < KMALLOC_CACHES; i++) | ||
2633 | if (!kmalloc_caches[i].size) | ||
2634 | break; | ||
2635 | |||
2636 | BUG_ON(i >= KMALLOC_CACHES); | ||
2637 | s = kmalloc_caches + i; | ||
2764 | 2638 | ||
2765 | /* | 2639 | /* |
2766 | * Must defer sysfs creation to a workqueue because we don't know | 2640 | * Must defer sysfs creation to a workqueue because we don't know |
@@ -2772,9 +2646,9 @@ static noinline struct kmem_cache *dma_kmalloc_cache(int index, gfp_t flags) | |||
2772 | if (slab_state >= SYSFS) | 2646 | if (slab_state >= SYSFS) |
2773 | slabflags |= __SYSFS_ADD_DEFERRED; | 2647 | slabflags |= __SYSFS_ADD_DEFERRED; |
2774 | 2648 | ||
2775 | if (!s || !text || !kmem_cache_open(s, flags, text, | 2649 | if (!text || !kmem_cache_open(s, flags, text, |
2776 | realsize, ARCH_KMALLOC_MINALIGN, slabflags, NULL)) { | 2650 | realsize, ARCH_KMALLOC_MINALIGN, slabflags, NULL)) { |
2777 | kfree(s); | 2651 | s->size = 0; |
2778 | kfree(text); | 2652 | kfree(text); |
2779 | goto unlock_out; | 2653 | goto unlock_out; |
2780 | } | 2654 | } |
@@ -3086,7 +2960,7 @@ static void slab_mem_offline_callback(void *arg) | |||
3086 | /* | 2960 | /* |
3087 | * if n->nr_slabs > 0, slabs still exist on the node | 2961 | * if n->nr_slabs > 0, slabs still exist on the node |
3088 | * that is going down. We were unable to free them, | 2962 | * that is going down. We were unable to free them, |
3089 | * and offline_pages() function shoudn't call this | 2963 | * and offline_pages() function shouldn't call this |
3090 | * callback. So, we must fail. | 2964 | * callback. So, we must fail. |
3091 | */ | 2965 | */ |
3092 | BUG_ON(slabs_node(s, offline_node)); | 2966 | BUG_ON(slabs_node(s, offline_node)); |
@@ -3176,8 +3050,6 @@ void __init kmem_cache_init(void) | |||
3176 | int i; | 3050 | int i; |
3177 | int caches = 0; | 3051 | int caches = 0; |
3178 | 3052 | ||
3179 | init_alloc_cpu(); | ||
3180 | |||
3181 | #ifdef CONFIG_NUMA | 3053 | #ifdef CONFIG_NUMA |
3182 | /* | 3054 | /* |
3183 | * Must first have the slab cache available for the allocations of the | 3055 | * Must first have the slab cache available for the allocations of the |
@@ -3261,8 +3133,10 @@ void __init kmem_cache_init(void) | |||
3261 | 3133 | ||
3262 | #ifdef CONFIG_SMP | 3134 | #ifdef CONFIG_SMP |
3263 | register_cpu_notifier(&slab_notifier); | 3135 | register_cpu_notifier(&slab_notifier); |
3264 | kmem_size = offsetof(struct kmem_cache, cpu_slab) + | 3136 | #endif |
3265 | nr_cpu_ids * sizeof(struct kmem_cache_cpu *); | 3137 | #ifdef CONFIG_NUMA |
3138 | kmem_size = offsetof(struct kmem_cache, node) + | ||
3139 | nr_node_ids * sizeof(struct kmem_cache_node *); | ||
3266 | #else | 3140 | #else |
3267 | kmem_size = sizeof(struct kmem_cache); | 3141 | kmem_size = sizeof(struct kmem_cache); |
3268 | #endif | 3142 | #endif |
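kmem_size is now derived from the node[] array at the end of struct kmem_cache instead of the removed per-cpu pointer array: allocate up to offsetof(..., node) plus only as many node pointers as the machine actually has. A small standalone illustration of that trailing-array sizing idiom (the names and the userspace allocator are for the example only):

#include <stddef.h>
#include <stdlib.h>

#define MAX_NODES 64                      /* compile-time maximum */

struct cache_like {
        int flags;
        void *node[MAX_NODES];            /* only nr_node_ids entries used */
};

static struct cache_like *alloc_cache_like(int nr_node_ids)
{
        /* size the allocation by the live node count, not MAX_NODES;
         * callers must never touch node[i] for i >= nr_node_ids */
        size_t size = offsetof(struct cache_like, node) +
                      (size_t)nr_node_ids * sizeof(void *);

        return calloc(1, size);
}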
@@ -3351,22 +3225,12 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, | |||
3351 | down_write(&slub_lock); | 3225 | down_write(&slub_lock); |
3352 | s = find_mergeable(size, align, flags, name, ctor); | 3226 | s = find_mergeable(size, align, flags, name, ctor); |
3353 | if (s) { | 3227 | if (s) { |
3354 | int cpu; | ||
3355 | |||
3356 | s->refcount++; | 3228 | s->refcount++; |
3357 | /* | 3229 | /* |
3358 | * Adjust the object sizes so that we clear | 3230 | * Adjust the object sizes so that we clear |
3359 | * the complete object on kzalloc. | 3231 | * the complete object on kzalloc. |
3360 | */ | 3232 | */ |
3361 | s->objsize = max(s->objsize, (int)size); | 3233 | s->objsize = max(s->objsize, (int)size); |
3362 | |||
3363 | /* | ||
3364 | * And then we need to update the object size in the | ||
3365 | * per cpu structures | ||
3366 | */ | ||
3367 | for_each_online_cpu(cpu) | ||
3368 | get_cpu_slab(s, cpu)->objsize = s->objsize; | ||
3369 | |||
3370 | s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *))); | 3234 | s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *))); |
3371 | up_write(&slub_lock); | 3235 | up_write(&slub_lock); |
3372 | 3236 | ||
@@ -3420,29 +3284,15 @@ static int __cpuinit slab_cpuup_callback(struct notifier_block *nfb, | |||
3420 | unsigned long flags; | 3284 | unsigned long flags; |
3421 | 3285 | ||
3422 | switch (action) { | 3286 | switch (action) { |
3423 | case CPU_UP_PREPARE: | ||
3424 | case CPU_UP_PREPARE_FROZEN: | ||
3425 | init_alloc_cpu_cpu(cpu); | ||
3426 | down_read(&slub_lock); | ||
3427 | list_for_each_entry(s, &slab_caches, list) | ||
3428 | s->cpu_slab[cpu] = alloc_kmem_cache_cpu(s, cpu, | ||
3429 | GFP_KERNEL); | ||
3430 | up_read(&slub_lock); | ||
3431 | break; | ||
3432 | |||
3433 | case CPU_UP_CANCELED: | 3287 | case CPU_UP_CANCELED: |
3434 | case CPU_UP_CANCELED_FROZEN: | 3288 | case CPU_UP_CANCELED_FROZEN: |
3435 | case CPU_DEAD: | 3289 | case CPU_DEAD: |
3436 | case CPU_DEAD_FROZEN: | 3290 | case CPU_DEAD_FROZEN: |
3437 | down_read(&slub_lock); | 3291 | down_read(&slub_lock); |
3438 | list_for_each_entry(s, &slab_caches, list) { | 3292 | list_for_each_entry(s, &slab_caches, list) { |
3439 | struct kmem_cache_cpu *c = get_cpu_slab(s, cpu); | ||
3440 | |||
3441 | local_irq_save(flags); | 3293 | local_irq_save(flags); |
3442 | __flush_cpu_slab(s, cpu); | 3294 | __flush_cpu_slab(s, cpu); |
3443 | local_irq_restore(flags); | 3295 | local_irq_restore(flags); |
3444 | free_kmem_cache_cpu(c, cpu); | ||
3445 | s->cpu_slab[cpu] = NULL; | ||
3446 | } | 3296 | } |
3447 | up_read(&slub_lock); | 3297 | up_read(&slub_lock); |
3448 | break; | 3298 | break; |
@@ -3928,7 +3778,7 @@ static ssize_t show_slab_objects(struct kmem_cache *s, | |||
3928 | int cpu; | 3778 | int cpu; |
3929 | 3779 | ||
3930 | for_each_possible_cpu(cpu) { | 3780 | for_each_possible_cpu(cpu) { |
3931 | struct kmem_cache_cpu *c = get_cpu_slab(s, cpu); | 3781 | struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); |
3932 | 3782 | ||
3933 | if (!c || c->node < 0) | 3783 | if (!c || c->node < 0) |
3934 | continue; | 3784 | continue; |
@@ -4171,6 +4021,23 @@ static ssize_t trace_store(struct kmem_cache *s, const char *buf, | |||
4171 | } | 4021 | } |
4172 | SLAB_ATTR(trace); | 4022 | SLAB_ATTR(trace); |
4173 | 4023 | ||
4024 | #ifdef CONFIG_FAILSLAB | ||
4025 | static ssize_t failslab_show(struct kmem_cache *s, char *buf) | ||
4026 | { | ||
4027 | return sprintf(buf, "%d\n", !!(s->flags & SLAB_FAILSLAB)); | ||
4028 | } | ||
4029 | |||
4030 | static ssize_t failslab_store(struct kmem_cache *s, const char *buf, | ||
4031 | size_t length) | ||
4032 | { | ||
4033 | s->flags &= ~SLAB_FAILSLAB; | ||
4034 | if (buf[0] == '1') | ||
4035 | s->flags |= SLAB_FAILSLAB; | ||
4036 | return length; | ||
4037 | } | ||
4038 | SLAB_ATTR(failslab); | ||
4039 | #endif | ||
4040 | |||
4174 | static ssize_t reclaim_account_show(struct kmem_cache *s, char *buf) | 4041 | static ssize_t reclaim_account_show(struct kmem_cache *s, char *buf) |
4175 | { | 4042 | { |
4176 | return sprintf(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT)); | 4043 | return sprintf(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT)); |
@@ -4353,7 +4220,7 @@ static int show_stat(struct kmem_cache *s, char *buf, enum stat_item si) | |||
4353 | return -ENOMEM; | 4220 | return -ENOMEM; |
4354 | 4221 | ||
4355 | for_each_online_cpu(cpu) { | 4222 | for_each_online_cpu(cpu) { |
4356 | unsigned x = get_cpu_slab(s, cpu)->stat[si]; | 4223 | unsigned x = per_cpu_ptr(s->cpu_slab, cpu)->stat[si]; |
4357 | 4224 | ||
4358 | data[cpu] = x; | 4225 | data[cpu] = x; |
4359 | sum += x; | 4226 | sum += x; |
@@ -4376,7 +4243,7 @@ static void clear_stat(struct kmem_cache *s, enum stat_item si) | |||
4376 | int cpu; | 4243 | int cpu; |
4377 | 4244 | ||
4378 | for_each_online_cpu(cpu) | 4245 | for_each_online_cpu(cpu) |
4379 | get_cpu_slab(s, cpu)->stat[si] = 0; | 4246 | per_cpu_ptr(s->cpu_slab, cpu)->stat[si] = 0; |
4380 | } | 4247 | } |
4381 | 4248 | ||
4382 | #define STAT_ATTR(si, text) \ | 4249 | #define STAT_ATTR(si, text) \ |
@@ -4467,6 +4334,10 @@ static struct attribute *slab_attrs[] = { | |||
4467 | &deactivate_remote_frees_attr.attr, | 4334 | &deactivate_remote_frees_attr.attr, |
4468 | &order_fallback_attr.attr, | 4335 | &order_fallback_attr.attr, |
4469 | #endif | 4336 | #endif |
4337 | #ifdef CONFIG_FAILSLAB | ||
4338 | &failslab_attr.attr, | ||
4339 | #endif | ||
4340 | |||
4470 | NULL | 4341 | NULL |
4471 | }; | 4342 | }; |
4472 | 4343 | ||
@@ -4519,7 +4390,7 @@ static void kmem_cache_release(struct kobject *kobj) | |||
4519 | kfree(s); | 4390 | kfree(s); |
4520 | } | 4391 | } |
4521 | 4392 | ||
4522 | static struct sysfs_ops slab_sysfs_ops = { | 4393 | static const struct sysfs_ops slab_sysfs_ops = { |
4523 | .show = slab_attr_show, | 4394 | .show = slab_attr_show, |
4524 | .store = slab_attr_store, | 4395 | .store = slab_attr_store, |
4525 | }; | 4396 | }; |
@@ -4538,7 +4409,7 @@ static int uevent_filter(struct kset *kset, struct kobject *kobj) | |||
4538 | return 0; | 4409 | return 0; |
4539 | } | 4410 | } |
4540 | 4411 | ||
4541 | static struct kset_uevent_ops slab_uevent_ops = { | 4412 | static const struct kset_uevent_ops slab_uevent_ops = { |
4542 | .filter = uevent_filter, | 4413 | .filter = uevent_filter, |
4543 | }; | 4414 | }; |
4544 | 4415 | ||
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index d9714bdcb4a3..392b9bb5bc01 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c | |||
@@ -40,9 +40,11 @@ static void * __init_refok __earlyonly_bootmem_alloc(int node, | |||
40 | unsigned long align, | 40 | unsigned long align, |
41 | unsigned long goal) | 41 | unsigned long goal) |
42 | { | 42 | { |
43 | return __alloc_bootmem_node(NODE_DATA(node), size, align, goal); | 43 | return __alloc_bootmem_node_high(NODE_DATA(node), size, align, goal); |
44 | } | 44 | } |
45 | 45 | ||
46 | static void *vmemmap_buf; | ||
47 | static void *vmemmap_buf_end; | ||
46 | 48 | ||
47 | void * __meminit vmemmap_alloc_block(unsigned long size, int node) | 49 | void * __meminit vmemmap_alloc_block(unsigned long size, int node) |
48 | { | 50 | { |
@@ -64,6 +66,24 @@ void * __meminit vmemmap_alloc_block(unsigned long size, int node) | |||
64 | __pa(MAX_DMA_ADDRESS)); | 66 | __pa(MAX_DMA_ADDRESS)); |
65 | } | 67 | } |
66 | 68 | ||
69 | /* all callers need to pass the same size during the early stage */ | ||
70 | void * __meminit vmemmap_alloc_block_buf(unsigned long size, int node) | ||
71 | { | ||
72 | void *ptr; | ||
73 | |||
74 | if (!vmemmap_buf) | ||
75 | return vmemmap_alloc_block(size, node); | ||
76 | |||
77 | /* take it from the buf */ | ||
78 | ptr = (void *)ALIGN((unsigned long)vmemmap_buf, size); | ||
79 | if (ptr + size > vmemmap_buf_end) | ||
80 | return vmemmap_alloc_block(size, node); | ||
81 | |||
82 | vmemmap_buf = ptr + size; | ||
83 | |||
84 | return ptr; | ||
85 | } | ||
86 | |||
67 | void __meminit vmemmap_verify(pte_t *pte, int node, | 87 | void __meminit vmemmap_verify(pte_t *pte, int node, |
68 | unsigned long start, unsigned long end) | 88 | unsigned long start, unsigned long end) |
69 | { | 89 | { |
@@ -80,7 +100,7 @@ pte_t * __meminit vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node) | |||
80 | pte_t *pte = pte_offset_kernel(pmd, addr); | 100 | pte_t *pte = pte_offset_kernel(pmd, addr); |
81 | if (pte_none(*pte)) { | 101 | if (pte_none(*pte)) { |
82 | pte_t entry; | 102 | pte_t entry; |
83 | void *p = vmemmap_alloc_block(PAGE_SIZE, node); | 103 | void *p = vmemmap_alloc_block_buf(PAGE_SIZE, node); |
84 | if (!p) | 104 | if (!p) |
85 | return NULL; | 105 | return NULL; |
86 | entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL); | 106 | entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL); |
@@ -163,3 +183,55 @@ struct page * __meminit sparse_mem_map_populate(unsigned long pnum, int nid) | |||
163 | 183 | ||
164 | return map; | 184 | return map; |
165 | } | 185 | } |
186 | |||
187 | void __init sparse_mem_maps_populate_node(struct page **map_map, | ||
188 | unsigned long pnum_begin, | ||
189 | unsigned long pnum_end, | ||
190 | unsigned long map_count, int nodeid) | ||
191 | { | ||
192 | unsigned long pnum; | ||
193 | unsigned long size = sizeof(struct page) * PAGES_PER_SECTION; | ||
194 | void *vmemmap_buf_start; | ||
195 | |||
196 | size = ALIGN(size, PMD_SIZE); | ||
197 | vmemmap_buf_start = __earlyonly_bootmem_alloc(nodeid, size * map_count, | ||
198 | PMD_SIZE, __pa(MAX_DMA_ADDRESS)); | ||
199 | |||
200 | if (vmemmap_buf_start) { | ||
201 | vmemmap_buf = vmemmap_buf_start; | ||
202 | vmemmap_buf_end = vmemmap_buf_start + size * map_count; | ||
203 | } | ||
204 | |||
205 | for (pnum = pnum_begin; pnum < pnum_end; pnum++) { | ||
206 | struct mem_section *ms; | ||
207 | |||
208 | if (!present_section_nr(pnum)) | ||
209 | continue; | ||
210 | |||
211 | map_map[pnum] = sparse_mem_map_populate(pnum, nodeid); | ||
212 | if (map_map[pnum]) | ||
213 | continue; | ||
214 | ms = __nr_to_section(pnum); | ||
215 | printk(KERN_ERR "%s: sparsemem memory map backing failed, " | ||
216 | "some memory will not be available.\n", __func__); | ||
217 | ms->section_mem_map = 0; | ||
218 | } | ||
219 | |||
220 | if (vmemmap_buf_start) { | ||
221 | /* need to free the leftover buf */ | ||
222 | #ifdef CONFIG_NO_BOOTMEM | ||
223 | free_early(__pa(vmemmap_buf_start), __pa(vmemmap_buf_end)); | ||
224 | if (vmemmap_buf_start < vmemmap_buf) { | ||
225 | char name[15]; | ||
226 | |||
227 | snprintf(name, sizeof(name), "MEMMAP %d", nodeid); | ||
228 | reserve_early_without_check(__pa(vmemmap_buf_start), | ||
229 | __pa(vmemmap_buf), name); | ||
230 | } | ||
231 | #else | ||
232 | free_bootmem(__pa(vmemmap_buf), vmemmap_buf_end - vmemmap_buf); | ||
233 | #endif | ||
234 | vmemmap_buf = NULL; | ||
235 | vmemmap_buf_end = NULL; | ||
236 | } | ||
237 | } | ||
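sparse_mem_maps_populate_node() together with vmemmap_alloc_block_buf() forms a bootstrap bump allocator: reserve one large PMD-aligned buffer per node, carve aligned chunks out of it for each section's memmap, and fall back to the regular allocation path once the buffer is exhausted (the unused tail is freed afterwards). A compact standalone sketch of the carve step, with malloc() standing in for the fallback allocator:

#include <stdint.h>
#include <stdlib.h>

/* size must be a power of two, as PAGE_SIZE is */
#define ALIGN_UP(x, a)  (((x) + (a) - 1) & ~((uintptr_t)(a) - 1))

static char *buf_cur;   /* current carve position, NULL if no buffer */
static char *buf_end;   /* one past the end of the buffer            */

static void *carve(size_t size)
{
        char *ptr;

        if (!buf_cur)
                return malloc(size);          /* stands in for the regular
                                                 allocator               */

        ptr = (char *)ALIGN_UP((uintptr_t)buf_cur, size);
        if (ptr + size > buf_end)
                return malloc(size);          /* buffer exhausted        */

        buf_cur = ptr + size;                 /* bump past the chunk     */
        return ptr;
}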
diff --git a/mm/sparse.c b/mm/sparse.c index 6ce4aab69e99..22896d589133 100644 --- a/mm/sparse.c +++ b/mm/sparse.c | |||
@@ -271,7 +271,8 @@ static unsigned long *__kmalloc_section_usemap(void) | |||
271 | 271 | ||
272 | #ifdef CONFIG_MEMORY_HOTREMOVE | 272 | #ifdef CONFIG_MEMORY_HOTREMOVE |
273 | static unsigned long * __init | 273 | static unsigned long * __init |
274 | sparse_early_usemap_alloc_pgdat_section(struct pglist_data *pgdat) | 274 | sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat, |
275 | unsigned long count) | ||
275 | { | 276 | { |
276 | unsigned long section_nr; | 277 | unsigned long section_nr; |
277 | 278 | ||
@@ -286,7 +287,7 @@ sparse_early_usemap_alloc_pgdat_section(struct pglist_data *pgdat) | |||
286 | * this problem. | 287 | * this problem. |
287 | */ | 288 | */ |
288 | section_nr = pfn_to_section_nr(__pa(pgdat) >> PAGE_SHIFT); | 289 | section_nr = pfn_to_section_nr(__pa(pgdat) >> PAGE_SHIFT); |
289 | return alloc_bootmem_section(usemap_size(), section_nr); | 290 | return alloc_bootmem_section(usemap_size() * count, section_nr); |
290 | } | 291 | } |
291 | 292 | ||
292 | static void __init check_usemap_section_nr(int nid, unsigned long *usemap) | 293 | static void __init check_usemap_section_nr(int nid, unsigned long *usemap) |
@@ -329,7 +330,8 @@ static void __init check_usemap_section_nr(int nid, unsigned long *usemap) | |||
329 | } | 330 | } |
330 | #else | 331 | #else |
331 | static unsigned long * __init | 332 | static unsigned long * __init |
332 | sparse_early_usemap_alloc_pgdat_section(struct pglist_data *pgdat) | 333 | sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat, |
334 | unsigned long count) | ||
333 | { | 335 | { |
334 | return NULL; | 336 | return NULL; |
335 | } | 337 | } |
@@ -339,27 +341,40 @@ static void __init check_usemap_section_nr(int nid, unsigned long *usemap) | |||
339 | } | 341 | } |
340 | #endif /* CONFIG_MEMORY_HOTREMOVE */ | 342 | #endif /* CONFIG_MEMORY_HOTREMOVE */ |
341 | 343 | ||
342 | static unsigned long *__init sparse_early_usemap_alloc(unsigned long pnum) | 344 | static void __init sparse_early_usemaps_alloc_node(unsigned long**usemap_map, |
345 | unsigned long pnum_begin, | ||
346 | unsigned long pnum_end, | ||
347 | unsigned long usemap_count, int nodeid) | ||
343 | { | 348 | { |
344 | unsigned long *usemap; | 349 | void *usemap; |
345 | struct mem_section *ms = __nr_to_section(pnum); | 350 | unsigned long pnum; |
346 | int nid = sparse_early_nid(ms); | 351 | int size = usemap_size(); |
347 | |||
348 | usemap = sparse_early_usemap_alloc_pgdat_section(NODE_DATA(nid)); | ||
349 | if (usemap) | ||
350 | return usemap; | ||
351 | 352 | ||
352 | usemap = alloc_bootmem_node(NODE_DATA(nid), usemap_size()); | 353 | usemap = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nodeid), |
354 | usemap_count); | ||
353 | if (usemap) { | 355 | if (usemap) { |
354 | check_usemap_section_nr(nid, usemap); | 356 | for (pnum = pnum_begin; pnum < pnum_end; pnum++) { |
355 | return usemap; | 357 | if (!present_section_nr(pnum)) |
358 | continue; | ||
359 | usemap_map[pnum] = usemap; | ||
360 | usemap += size; | ||
361 | } | ||
362 | return; | ||
356 | } | 363 | } |
357 | 364 | ||
358 | /* Stupid: suppress gcc warning for SPARSEMEM && !NUMA */ | 365 | usemap = alloc_bootmem_node(NODE_DATA(nodeid), size * usemap_count); |
359 | nid = 0; | 366 | if (usemap) { |
367 | for (pnum = pnum_begin; pnum < pnum_end; pnum++) { | ||
368 | if (!present_section_nr(pnum)) | ||
369 | continue; | ||
370 | usemap_map[pnum] = usemap; | ||
371 | usemap += size; | ||
372 | check_usemap_section_nr(nodeid, usemap_map[pnum]); | ||
373 | } | ||
374 | return; | ||
375 | } | ||
360 | 376 | ||
361 | printk(KERN_WARNING "%s: allocation failed\n", __func__); | 377 | printk(KERN_WARNING "%s: allocation failed\n", __func__); |
362 | return NULL; | ||
363 | } | 378 | } |
364 | 379 | ||
365 | #ifndef CONFIG_SPARSEMEM_VMEMMAP | 380 | #ifndef CONFIG_SPARSEMEM_VMEMMAP |
@@ -375,8 +390,65 @@ struct page __init *sparse_mem_map_populate(unsigned long pnum, int nid) | |||
375 | PAGE_ALIGN(sizeof(struct page) * PAGES_PER_SECTION)); | 390 | PAGE_ALIGN(sizeof(struct page) * PAGES_PER_SECTION)); |
376 | return map; | 391 | return map; |
377 | } | 392 | } |
393 | void __init sparse_mem_maps_populate_node(struct page **map_map, | ||
394 | unsigned long pnum_begin, | ||
395 | unsigned long pnum_end, | ||
396 | unsigned long map_count, int nodeid) | ||
397 | { | ||
398 | void *map; | ||
399 | unsigned long pnum; | ||
400 | unsigned long size = sizeof(struct page) * PAGES_PER_SECTION; | ||
401 | |||
402 | map = alloc_remap(nodeid, size * map_count); | ||
403 | if (map) { | ||
404 | for (pnum = pnum_begin; pnum < pnum_end; pnum++) { | ||
405 | if (!present_section_nr(pnum)) | ||
406 | continue; | ||
407 | map_map[pnum] = map; | ||
408 | map += size; | ||
409 | } | ||
410 | return; | ||
411 | } | ||
412 | |||
413 | size = PAGE_ALIGN(size); | ||
414 | map = alloc_bootmem_pages_node(NODE_DATA(nodeid), size * map_count); | ||
415 | if (map) { | ||
416 | for (pnum = pnum_begin; pnum < pnum_end; pnum++) { | ||
417 | if (!present_section_nr(pnum)) | ||
418 | continue; | ||
419 | map_map[pnum] = map; | ||
420 | map += size; | ||
421 | } | ||
422 | return; | ||
423 | } | ||
424 | |||
425 | /* fallback */ | ||
426 | for (pnum = pnum_begin; pnum < pnum_end; pnum++) { | ||
427 | struct mem_section *ms; | ||
428 | |||
429 | if (!present_section_nr(pnum)) | ||
430 | continue; | ||
431 | map_map[pnum] = sparse_mem_map_populate(pnum, nodeid); | ||
432 | if (map_map[pnum]) | ||
433 | continue; | ||
434 | ms = __nr_to_section(pnum); | ||
435 | printk(KERN_ERR "%s: sparsemem memory map backing failed, " | ||
436 | "some memory will not be available.\n", __func__); | ||
437 | ms->section_mem_map = 0; | ||
438 | } | ||
439 | } | ||
378 | #endif /* !CONFIG_SPARSEMEM_VMEMMAP */ | 440 | #endif /* !CONFIG_SPARSEMEM_VMEMMAP */ |
379 | 441 | ||
442 | #ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER | ||
443 | static void __init sparse_early_mem_maps_alloc_node(struct page **map_map, | ||
444 | unsigned long pnum_begin, | ||
445 | unsigned long pnum_end, | ||
446 | unsigned long map_count, int nodeid) | ||
447 | { | ||
448 | sparse_mem_maps_populate_node(map_map, pnum_begin, pnum_end, | ||
449 | map_count, nodeid); | ||
450 | } | ||
451 | #else | ||
380 | static struct page __init *sparse_early_mem_map_alloc(unsigned long pnum) | 452 | static struct page __init *sparse_early_mem_map_alloc(unsigned long pnum) |
381 | { | 453 | { |
382 | struct page *map; | 454 | struct page *map; |
@@ -392,10 +464,12 @@ static struct page __init *sparse_early_mem_map_alloc(unsigned long pnum) | |||
392 | ms->section_mem_map = 0; | 464 | ms->section_mem_map = 0; |
393 | return NULL; | 465 | return NULL; |
394 | } | 466 | } |
467 | #endif | ||
395 | 468 | ||
396 | void __attribute__((weak)) __meminit vmemmap_populate_print_last(void) | 469 | void __attribute__((weak)) __meminit vmemmap_populate_print_last(void) |
397 | { | 470 | { |
398 | } | 471 | } |
472 | |||
399 | /* | 473 | /* |
400 | * Allocate the accumulated non-linear sections, allocate a mem_map | 474 | * Allocate the accumulated non-linear sections, allocate a mem_map |
401 | * for each and record the physical to section mapping. | 475 | * for each and record the physical to section mapping. |
@@ -407,6 +481,14 @@ void __init sparse_init(void) | |||
407 | unsigned long *usemap; | 481 | unsigned long *usemap; |
408 | unsigned long **usemap_map; | 482 | unsigned long **usemap_map; |
409 | int size; | 483 | int size; |
484 | int nodeid_begin = 0; | ||
485 | unsigned long pnum_begin = 0; | ||
486 | unsigned long usemap_count; | ||
487 | #ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER | ||
488 | unsigned long map_count; | ||
489 | int size2; | ||
490 | struct page **map_map; | ||
491 | #endif | ||
410 | 492 | ||
411 | /* | 493 | /* |
412 | * map is using big page (aka 2M in x86 64 bit) | 494 | * map is using big page (aka 2M in x86 64 bit) |
@@ -425,10 +507,81 @@ void __init sparse_init(void) | |||
425 | panic("can not allocate usemap_map\n"); | 507 | panic("can not allocate usemap_map\n"); |
426 | 508 | ||
427 | for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) { | 509 | for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) { |
510 | struct mem_section *ms; | ||
511 | |||
428 | if (!present_section_nr(pnum)) | 512 | if (!present_section_nr(pnum)) |
429 | continue; | 513 | continue; |
430 | usemap_map[pnum] = sparse_early_usemap_alloc(pnum); | 514 | ms = __nr_to_section(pnum); |
515 | nodeid_begin = sparse_early_nid(ms); | ||
516 | pnum_begin = pnum; | ||
517 | break; | ||
431 | } | 518 | } |
519 | usemap_count = 1; | ||
520 | for (pnum = pnum_begin + 1; pnum < NR_MEM_SECTIONS; pnum++) { | ||
521 | struct mem_section *ms; | ||
522 | int nodeid; | ||
523 | |||
524 | if (!present_section_nr(pnum)) | ||
525 | continue; | ||
526 | ms = __nr_to_section(pnum); | ||
527 | nodeid = sparse_early_nid(ms); | ||
528 | if (nodeid == nodeid_begin) { | ||
529 | usemap_count++; | ||
530 | continue; | ||
531 | } | ||
532 | /* ok, we need to take care of the range from pnum_begin to pnum - 1 */ | ||
533 | sparse_early_usemaps_alloc_node(usemap_map, pnum_begin, pnum, | ||
534 | usemap_count, nodeid_begin); | ||
535 | /* new start, update count etc. */ | ||
536 | nodeid_begin = nodeid; | ||
537 | pnum_begin = pnum; | ||
538 | usemap_count = 1; | ||
539 | } | ||
540 | /* ok, last chunk */ | ||
541 | sparse_early_usemaps_alloc_node(usemap_map, pnum_begin, NR_MEM_SECTIONS, | ||
542 | usemap_count, nodeid_begin); | ||
543 | |||
544 | #ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER | ||
545 | size2 = sizeof(struct page *) * NR_MEM_SECTIONS; | ||
546 | map_map = alloc_bootmem(size2); | ||
547 | if (!map_map) | ||
548 | panic("can not allocate map_map\n"); | ||
549 | |||
550 | for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) { | ||
551 | struct mem_section *ms; | ||
552 | |||
553 | if (!present_section_nr(pnum)) | ||
554 | continue; | ||
555 | ms = __nr_to_section(pnum); | ||
556 | nodeid_begin = sparse_early_nid(ms); | ||
557 | pnum_begin = pnum; | ||
558 | break; | ||
559 | } | ||
560 | map_count = 1; | ||
561 | for (pnum = pnum_begin + 1; pnum < NR_MEM_SECTIONS; pnum++) { | ||
562 | struct mem_section *ms; | ||
563 | int nodeid; | ||
564 | |||
565 | if (!present_section_nr(pnum)) | ||
566 | continue; | ||
567 | ms = __nr_to_section(pnum); | ||
568 | nodeid = sparse_early_nid(ms); | ||
569 | if (nodeid == nodeid_begin) { | ||
570 | map_count++; | ||
571 | continue; | ||
572 | } | ||
573 | /* ok, we need to take care of the range from pnum_begin to pnum - 1 */ | ||
574 | sparse_early_mem_maps_alloc_node(map_map, pnum_begin, pnum, | ||
575 | map_count, nodeid_begin); | ||
576 | /* new start, update count etc. */ | ||
577 | nodeid_begin = nodeid; | ||
578 | pnum_begin = pnum; | ||
579 | map_count = 1; | ||
580 | } | ||
581 | /* ok, last chunk */ | ||
582 | sparse_early_mem_maps_alloc_node(map_map, pnum_begin, NR_MEM_SECTIONS, | ||
583 | map_count, nodeid_begin); | ||
584 | #endif | ||
432 | 585 | ||
433 | for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) { | 586 | for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) { |
434 | if (!present_section_nr(pnum)) | 587 | if (!present_section_nr(pnum)) |
@@ -438,7 +591,11 @@ void __init sparse_init(void) | |||
438 | if (!usemap) | 591 | if (!usemap) |
439 | continue; | 592 | continue; |
440 | 593 | ||
594 | #ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER | ||
595 | map = map_map[pnum]; | ||
596 | #else | ||
441 | map = sparse_early_mem_map_alloc(pnum); | 597 | map = sparse_early_mem_map_alloc(pnum); |
598 | #endif | ||
442 | if (!map) | 599 | if (!map) |
443 | continue; | 600 | continue; |
444 | 601 | ||
@@ -448,6 +605,9 @@ void __init sparse_init(void) | |||
448 | 605 | ||
449 | vmemmap_populate_print_last(); | 606 | vmemmap_populate_print_last(); |
450 | 607 | ||
608 | #ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER | ||
609 | free_bootmem(__pa(map_map), size2); | ||
610 | #endif | ||
451 | free_bootmem(__pa(usemap_map), size); | 611 | free_bootmem(__pa(usemap_map), size); |
452 | } | 612 | } |
453 | 613 | ||
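The two walks above share one pattern: scan the present sections in order, grow a [pnum_begin, pnum) window while the node id stays the same, and hand the whole run to a per-node allocator when the node changes, plus once more for the final run. The standalone sketch below only models that walk; NR_SECTIONS, section_nid[] and alloc_batch() are toy stand-ins invented here, not kernel interfaces.

/* Illustrative model of the per-node batching used for usemap_map[] and
 * map_map[] above; the section-to-node data is made up. */
#include <stdio.h>

#define NR_SECTIONS 12
/* -1 marks a hole (section not present); other values are node ids. */
static const int section_nid[NR_SECTIONS] = {
	0, 0, -1, 0, 1, 1, -1, 1, 1, 2, -1, 2
};

static void alloc_batch(int nid, int pnum_begin, int pnum_end, int count)
{
	/* stand-in for sparse_early_usemaps_alloc_node() and friends */
	printf("node %d: sections [%d, %d), %d present\n",
	       nid, pnum_begin, pnum_end, count);
}

int main(void)
{
	int pnum, pnum_begin = 0, nid_begin = -1, count = 0;

	for (pnum = 0; pnum < NR_SECTIONS; pnum++) {
		if (section_nid[pnum] < 0)
			continue;		/* skip holes */
		if (count == 0) {		/* first present section */
			nid_begin = section_nid[pnum];
			pnum_begin = pnum;
			count = 1;
			continue;
		}
		if (section_nid[pnum] == nid_begin) {
			count++;
			continue;
		}
		/* node changed: flush [pnum_begin, pnum), then start over */
		alloc_batch(nid_begin, pnum_begin, pnum, count);
		nid_begin = section_nid[pnum];
		pnum_begin = pnum;
		count = 1;
	}
	if (count)				/* last chunk */
		alloc_batch(nid_begin, pnum_begin, NR_SECTIONS, count);
	return 0;
}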
diff --git a/mm/swap.c b/mm/swap.c --- a/mm/swap.c +++ b/mm/swap.c | |||
@@ -55,7 +55,7 @@ static void __page_cache_release(struct page *page) | |||
55 | del_page_from_lru(zone, page); | 55 | del_page_from_lru(zone, page); |
56 | spin_unlock_irqrestore(&zone->lru_lock, flags); | 56 | spin_unlock_irqrestore(&zone->lru_lock, flags); |
57 | } | 57 | } |
58 | free_hot_page(page); | 58 | free_hot_cold_page(page, 0); |
59 | } | 59 | } |
60 | 60 | ||
61 | static void put_compound_page(struct page *page) | 61 | static void put_compound_page(struct page *page) |
diff --git a/mm/swapfile.c b/mm/swapfile.c index 6c0585b16418..6cd0a8f90dc7 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c | |||
@@ -723,6 +723,37 @@ int free_swap_and_cache(swp_entry_t entry) | |||
723 | return p != NULL; | 723 | return p != NULL; |
724 | } | 724 | } |
725 | 725 | ||
726 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR | ||
727 | /** | ||
728 | * mem_cgroup_count_swap_user - count the users of a swap entry | ||
729 | * @ent: the swap entry to be checked | ||
730 | * @pagep: where to return the entry's swap cache page, if any | ||
731 | * | ||
732 | * Returns the number of users of the swap entry. The count is valid only | ||
733 | * for swaps of anonymous pages. | ||
734 | * If the entry is found in the swap cache, the page is stored to @pagep | ||
735 | * with its refcount incremented. | ||
736 | */ | ||
737 | int mem_cgroup_count_swap_user(swp_entry_t ent, struct page **pagep) | ||
738 | { | ||
739 | struct page *page; | ||
740 | struct swap_info_struct *p; | ||
741 | int count = 0; | ||
742 | |||
743 | page = find_get_page(&swapper_space, ent.val); | ||
744 | if (page) | ||
745 | count += page_mapcount(page); | ||
746 | p = swap_info_get(ent); | ||
747 | if (p) { | ||
748 | count += swap_count(p->swap_map[swp_offset(ent)]); | ||
749 | spin_unlock(&swap_lock); | ||
750 | } | ||
751 | |||
752 | *pagep = page; | ||
753 | return count; | ||
754 | } | ||
755 | #endif | ||
756 | |||
726 | #ifdef CONFIG_HIBERNATION | 757 | #ifdef CONFIG_HIBERNATION |
727 | /* | 758 | /* |
728 | * Find the swap type that corresponds to given device (if any). | 759 | * Find the swap type that corresponds to given device (if any). |
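The helper above hands back both a user count and, via @pagep, a referenced swap cache page. The sketch below is a hypothetical caller, written here only to illustrate the reference contract; swap_entry_users() is not part of this patch, and it is kernel-style pseudocode rather than a standalone program.

/* Hypothetical caller: whatever page comes back through @pagep carries the
 * extra reference taken by find_get_page(), so it must be dropped. */
static int swap_entry_users(swp_entry_t ent)
{
	struct page *page = NULL;
	int count;

	count = mem_cgroup_count_swap_user(ent, &page);
	if (page)
		put_page(page);		/* only the count was wanted here */
	return count;
}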
@@ -840,7 +871,8 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, | |||
840 | goto out; | 871 | goto out; |
841 | } | 872 | } |
842 | 873 | ||
843 | inc_mm_counter(vma->vm_mm, anon_rss); | 874 | dec_mm_counter(vma->vm_mm, MM_SWAPENTS); |
875 | inc_mm_counter(vma->vm_mm, MM_ANONPAGES); | ||
844 | get_page(page); | 876 | get_page(page); |
845 | set_pte_at(vma->vm_mm, addr, pte, | 877 | set_pte_at(vma->vm_mm, addr, pte, |
846 | pte_mkold(mk_pte(page, vma->vm_page_prot))); | 878 | pte_mkold(mk_pte(page, vma->vm_page_prot))); |
@@ -1759,11 +1791,11 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) | |||
1759 | unsigned int type; | 1791 | unsigned int type; |
1760 | int i, prev; | 1792 | int i, prev; |
1761 | int error; | 1793 | int error; |
1762 | union swap_header *swap_header = NULL; | 1794 | union swap_header *swap_header; |
1763 | unsigned int nr_good_pages = 0; | 1795 | unsigned int nr_good_pages; |
1764 | int nr_extents = 0; | 1796 | int nr_extents = 0; |
1765 | sector_t span; | 1797 | sector_t span; |
1766 | unsigned long maxpages = 1; | 1798 | unsigned long maxpages; |
1767 | unsigned long swapfilepages; | 1799 | unsigned long swapfilepages; |
1768 | unsigned char *swap_map = NULL; | 1800 | unsigned char *swap_map = NULL; |
1769 | struct page *page = NULL; | 1801 | struct page *page = NULL; |
@@ -1922,9 +1954,13 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) | |||
1922 | * swap pte. | 1954 | * swap pte. |
1923 | */ | 1955 | */ |
1924 | maxpages = swp_offset(pte_to_swp_entry( | 1956 | maxpages = swp_offset(pte_to_swp_entry( |
1925 | swp_entry_to_pte(swp_entry(0, ~0UL)))) - 1; | 1957 | swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1; |
1926 | if (maxpages > swap_header->info.last_page) | 1958 | if (maxpages > swap_header->info.last_page) { |
1927 | maxpages = swap_header->info.last_page; | 1959 | maxpages = swap_header->info.last_page + 1; |
1960 | /* p->max is an unsigned int: don't overflow it */ | ||
1961 | if ((unsigned int)maxpages == 0) | ||
1962 | maxpages = UINT_MAX; | ||
1963 | } | ||
1928 | p->highest_bit = maxpages - 1; | 1964 | p->highest_bit = maxpages - 1; |
1929 | 1965 | ||
1930 | error = -EINVAL; | 1966 | error = -EINVAL; |
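The reworked calculation reads: the number of addressable swap slots is the largest offset an arch swap pte can encode plus one, clamped to last_page + 1 from the on-disk header, with a final guard because p->max is only an unsigned int. The standalone model below reproduces that arithmetic; max_encodable_offset is a made-up parameter standing in for the swp_entry/pte round trip, and the figures in main() are invented.

/* Model of the swapon() maxpages arithmetic above (assumes 64-bit long). */
#include <stdio.h>
#include <limits.h>

static unsigned long compute_maxpages(unsigned long max_encodable_offset,
				      unsigned long last_page)
{
	/* offsets run 0..max_encodable_offset, so that many + 1 slots fit */
	unsigned long maxpages = max_encodable_offset + 1;

	if (maxpages > last_page) {
		/* pages 0..last_page inclusive: last_page + 1 slots */
		maxpages = last_page + 1;
		/* p->max is an unsigned int: don't let it wrap to 0 */
		if ((unsigned int)maxpages == 0)
			maxpages = UINT_MAX;
	}
	return maxpages;
}

int main(void)
{
	/* small device: the header's last_page is the limiting factor */
	printf("%lu\n", compute_maxpages(0xffffffUL, 1000UL));
	/* huge header value: clamping would overflow an unsigned int */
	printf("%lu\n", compute_maxpages(ULONG_MAX - 1, 0xffffffffUL));
	return 0;
}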
@@ -1948,23 +1984,24 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) | |||
1948 | } | 1984 | } |
1949 | 1985 | ||
1950 | memset(swap_map, 0, maxpages); | 1986 | memset(swap_map, 0, maxpages); |
1987 | nr_good_pages = maxpages - 1; /* omit header page */ | ||
1988 | |||
1951 | for (i = 0; i < swap_header->info.nr_badpages; i++) { | 1989 | for (i = 0; i < swap_header->info.nr_badpages; i++) { |
1952 | int page_nr = swap_header->info.badpages[i]; | 1990 | unsigned int page_nr = swap_header->info.badpages[i]; |
1953 | if (page_nr <= 0 || page_nr >= swap_header->info.last_page) { | 1991 | if (page_nr == 0 || page_nr > swap_header->info.last_page) { |
1954 | error = -EINVAL; | 1992 | error = -EINVAL; |
1955 | goto bad_swap; | 1993 | goto bad_swap; |
1956 | } | 1994 | } |
1957 | swap_map[page_nr] = SWAP_MAP_BAD; | 1995 | if (page_nr < maxpages) { |
1996 | swap_map[page_nr] = SWAP_MAP_BAD; | ||
1997 | nr_good_pages--; | ||
1998 | } | ||
1958 | } | 1999 | } |
1959 | 2000 | ||
1960 | error = swap_cgroup_swapon(type, maxpages); | 2001 | error = swap_cgroup_swapon(type, maxpages); |
1961 | if (error) | 2002 | if (error) |
1962 | goto bad_swap; | 2003 | goto bad_swap; |
1963 | 2004 | ||
1964 | nr_good_pages = swap_header->info.last_page - | ||
1965 | swap_header->info.nr_badpages - | ||
1966 | 1 /* header page */; | ||
1967 | |||
1968 | if (nr_good_pages) { | 2005 | if (nr_good_pages) { |
1969 | swap_map[0] = SWAP_MAP_BAD; | 2006 | swap_map[0] = SWAP_MAP_BAD; |
1970 | p->max = maxpages; | 2007 | p->max = maxpages; |
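With nr_good_pages seeded from maxpages - 1 (the header page is never usable), the bad-page loop only subtracts entries that actually fall inside the mapped range, while a bad page number of 0 or beyond last_page still marks the header as corrupt. A small standalone model of that accounting, with invented numbers:

/* Model of the bad-page bookkeeping above; -1 plays the role of -EINVAL. */
#include <stdio.h>

static long count_good_pages(unsigned long maxpages, unsigned long last_page,
			     const unsigned int *badpages, int nr_badpages)
{
	unsigned long nr_good_pages = maxpages - 1;	/* omit header page */
	int i;

	for (i = 0; i < nr_badpages; i++) {
		unsigned int page_nr = badpages[i];

		if (page_nr == 0 || page_nr > last_page)
			return -1;		/* corrupt swap header */
		if (page_nr < maxpages)
			nr_good_pages--;	/* inside the map: unusable */
	}
	return (long)nr_good_pages;
}

int main(void)
{
	unsigned int bad[] = { 3, 7, 2000 };

	/* last_page 1999: bad page 2000 lies outside it -> corrupt header */
	printf("%ld\n", count_good_pages(1024, 1999, bad, 3));
	/* last_page 2047: 2000 is legal but beyond maxpages, so ignored */
	printf("%ld\n", count_good_pages(1024, 2047, bad, 3));
	return 0;
}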
@@ -2155,7 +2192,11 @@ void swap_shmem_alloc(swp_entry_t entry) | |||
2155 | } | 2192 | } |
2156 | 2193 | ||
2157 | /* | 2194 | /* |
2158 | * increase reference count of swap entry by 1. | 2195 | * Increase reference count of swap entry by 1. |
2196 | * Returns 0 for success, or -ENOMEM if a swap_count_continuation is required | ||
2197 | * but could not be atomically allocated. Returns 0, just as if it succeeded, | ||
2198 | * if __swap_duplicate() fails for another reason (-EINVAL or -ENOENT), which | ||
2199 | * might occur if a page table entry has got corrupted. | ||
2159 | */ | 2200 | */ |
2160 | int swap_duplicate(swp_entry_t entry) | 2201 | int swap_duplicate(swp_entry_t entry) |
2161 | { | 2202 | { |
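Per the new comment, -ENOMEM from swap_duplicate() only means there was no room to grow the count atomically. The kernel-style sketch below shows one way a caller can react; dup_swap_entry() is a name made up for this note, and the in-tree callers perform this retry at an outer level, but add_swap_count_continuation() is the real companion interface.

/* Illustrative retry pattern, not code from this patch. */
static int dup_swap_entry(swp_entry_t entry)
{
	while (swap_duplicate(entry) == -ENOMEM) {
		/* make room for a larger count, then try again */
		if (add_swap_count_continuation(entry, GFP_KERNEL))
			return -ENOMEM;
	}
	return 0;
}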
diff --git a/mm/vmscan.c b/mm/vmscan.c index c26986c85ce0..79c809895fba 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
@@ -262,27 +262,6 @@ unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask, | |||
262 | return ret; | 262 | return ret; |
263 | } | 263 | } |
264 | 264 | ||
265 | /* Called without lock on whether page is mapped, so answer is unstable */ | ||
266 | static inline int page_mapping_inuse(struct page *page) | ||
267 | { | ||
268 | struct address_space *mapping; | ||
269 | |||
270 | /* Page is in somebody's page tables. */ | ||
271 | if (page_mapped(page)) | ||
272 | return 1; | ||
273 | |||
274 | /* Be more reluctant to reclaim swapcache than pagecache */ | ||
275 | if (PageSwapCache(page)) | ||
276 | return 1; | ||
277 | |||
278 | mapping = page_mapping(page); | ||
279 | if (!mapping) | ||
280 | return 0; | ||
281 | |||
282 | /* File is mmap'd by somebody? */ | ||
283 | return mapping_mapped(mapping); | ||
284 | } | ||
285 | |||
286 | static inline int is_page_cache_freeable(struct page *page) | 265 | static inline int is_page_cache_freeable(struct page *page) |
287 | { | 266 | { |
288 | /* | 267 | /* |
@@ -579,6 +558,65 @@ redo: | |||
579 | put_page(page); /* drop ref from isolate */ | 558 | put_page(page); /* drop ref from isolate */ |
580 | } | 559 | } |
581 | 560 | ||
561 | enum page_references { | ||
562 | PAGEREF_RECLAIM, | ||
563 | PAGEREF_RECLAIM_CLEAN, | ||
564 | PAGEREF_KEEP, | ||
565 | PAGEREF_ACTIVATE, | ||
566 | }; | ||
567 | |||
568 | static enum page_references page_check_references(struct page *page, | ||
569 | struct scan_control *sc) | ||
570 | { | ||
571 | int referenced_ptes, referenced_page; | ||
572 | unsigned long vm_flags; | ||
573 | |||
574 | referenced_ptes = page_referenced(page, 1, sc->mem_cgroup, &vm_flags); | ||
575 | referenced_page = TestClearPageReferenced(page); | ||
576 | |||
577 | /* Lumpy reclaim - ignore references */ | ||
578 | if (sc->order > PAGE_ALLOC_COSTLY_ORDER) | ||
579 | return PAGEREF_RECLAIM; | ||
580 | |||
581 | /* | ||
582 | * Mlock lost the isolation race with us. Let try_to_unmap() | ||
583 | * move the page to the unevictable list. | ||
584 | */ | ||
585 | if (vm_flags & VM_LOCKED) | ||
586 | return PAGEREF_RECLAIM; | ||
587 | |||
588 | if (referenced_ptes) { | ||
589 | if (PageAnon(page)) | ||
590 | return PAGEREF_ACTIVATE; | ||
591 | /* | ||
592 | * All mapped pages start out with page table | ||
593 | * references from the instantiating fault, so we need | ||
594 | * to look twice if a mapped file page is used more | ||
595 | * than once. | ||
596 | * | ||
597 | * Mark it and spare it for another trip around the | ||
598 | * inactive list. Another page table reference will | ||
599 | * lead to its activation. | ||
600 | * | ||
601 | * Note: the mark is set for activated pages as well | ||
602 | * so that recently deactivated but used pages are | ||
603 | * quickly recovered. | ||
604 | */ | ||
605 | SetPageReferenced(page); | ||
606 | |||
607 | if (referenced_page) | ||
608 | return PAGEREF_ACTIVATE; | ||
609 | |||
610 | return PAGEREF_KEEP; | ||
611 | } | ||
612 | |||
613 | /* Reclaim if clean, defer dirty pages to writeback */ | ||
614 | if (referenced_page) | ||
615 | return PAGEREF_RECLAIM_CLEAN; | ||
616 | |||
617 | return PAGEREF_RECLAIM; | ||
618 | } | ||
619 | |||
582 | /* | 620 | /* |
583 | * shrink_page_list() returns the number of reclaimed pages | 621 | * shrink_page_list() returns the number of reclaimed pages |
584 | */ | 622 | */ |
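page_check_references() folds the old referenced/page_mapping_inuse/VM_LOCKED tests into one ordered decision. The standalone model below mirrors that order so the four outcomes can be exercised in isolation; it deliberately omits the SetPageReferenced() side effect on mapped file pages, and all the flag plumbing is reduced to plain ints.

/* Model of the reference decision above; inputs are simple flags. */
#include <stdio.h>

enum page_references {
	PAGEREF_RECLAIM,
	PAGEREF_RECLAIM_CLEAN,
	PAGEREF_KEEP,
	PAGEREF_ACTIVATE,
};

static enum page_references check_refs(int lumpy, int vm_locked, int anon,
				       int referenced_ptes, int referenced_page)
{
	if (lumpy)			/* lumpy reclaim ignores references */
		return PAGEREF_RECLAIM;
	if (vm_locked)			/* let try_to_unmap() sort it out */
		return PAGEREF_RECLAIM;
	if (referenced_ptes) {
		if (anon)
			return PAGEREF_ACTIVATE;
		/* mapped file page: first use only earns it another trip */
		return referenced_page ? PAGEREF_ACTIVATE : PAGEREF_KEEP;
	}
	return referenced_page ? PAGEREF_RECLAIM_CLEAN : PAGEREF_RECLAIM;
}

int main(void)
{
	/* referenced anon page -> 3 (PAGEREF_ACTIVATE) */
	printf("%d\n", check_refs(0, 0, 1, 1, 0));
	/* mapped file page referenced once -> 2 (PAGEREF_KEEP) */
	printf("%d\n", check_refs(0, 0, 0, 1, 0));
	/* unmapped but recently used cache page -> 1 (PAGEREF_RECLAIM_CLEAN) */
	printf("%d\n", check_refs(0, 0, 0, 0, 1));
	return 0;
}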
@@ -590,16 +628,15 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
590 | struct pagevec freed_pvec; | 628 | struct pagevec freed_pvec; |
591 | int pgactivate = 0; | 629 | int pgactivate = 0; |
592 | unsigned long nr_reclaimed = 0; | 630 | unsigned long nr_reclaimed = 0; |
593 | unsigned long vm_flags; | ||
594 | 631 | ||
595 | cond_resched(); | 632 | cond_resched(); |
596 | 633 | ||
597 | pagevec_init(&freed_pvec, 1); | 634 | pagevec_init(&freed_pvec, 1); |
598 | while (!list_empty(page_list)) { | 635 | while (!list_empty(page_list)) { |
636 | enum page_references references; | ||
599 | struct address_space *mapping; | 637 | struct address_space *mapping; |
600 | struct page *page; | 638 | struct page *page; |
601 | int may_enter_fs; | 639 | int may_enter_fs; |
602 | int referenced; | ||
603 | 640 | ||
604 | cond_resched(); | 641 | cond_resched(); |
605 | 642 | ||
@@ -641,17 +678,16 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
641 | goto keep_locked; | 678 | goto keep_locked; |
642 | } | 679 | } |
643 | 680 | ||
644 | referenced = page_referenced(page, 1, | 681 | references = page_check_references(page, sc); |
645 | sc->mem_cgroup, &vm_flags); | 682 | switch (references) { |
646 | /* | 683 | case PAGEREF_ACTIVATE: |
647 | * In active use or really unfreeable? Activate it. | ||
648 | * If page which have PG_mlocked lost isoltation race, | ||
649 | * try_to_unmap moves it to unevictable list | ||
650 | */ | ||
651 | if (sc->order <= PAGE_ALLOC_COSTLY_ORDER && | ||
652 | referenced && page_mapping_inuse(page) | ||
653 | && !(vm_flags & VM_LOCKED)) | ||
654 | goto activate_locked; | 684 | goto activate_locked; |
685 | case PAGEREF_KEEP: | ||
686 | goto keep_locked; | ||
687 | case PAGEREF_RECLAIM: | ||
688 | case PAGEREF_RECLAIM_CLEAN: | ||
689 | ; /* try to reclaim the page below */ | ||
690 | } | ||
655 | 691 | ||
656 | /* | 692 | /* |
657 | * Anonymous process memory has backing store? | 693 | * Anonymous process memory has backing store? |
@@ -685,7 +721,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
685 | } | 721 | } |
686 | 722 | ||
687 | if (PageDirty(page)) { | 723 | if (PageDirty(page)) { |
688 | if (sc->order <= PAGE_ALLOC_COSTLY_ORDER && referenced) | 724 | if (references == PAGEREF_RECLAIM_CLEAN) |
689 | goto keep_locked; | 725 | goto keep_locked; |
690 | if (!may_enter_fs) | 726 | if (!may_enter_fs) |
691 | goto keep_locked; | 727 | goto keep_locked; |
@@ -1350,9 +1386,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, | |||
1350 | continue; | 1386 | continue; |
1351 | } | 1387 | } |
1352 | 1388 | ||
1353 | /* page_referenced clears PageReferenced */ | 1389 | if (page_referenced(page, 0, sc->mem_cgroup, &vm_flags)) { |
1354 | if (page_mapping_inuse(page) && | ||
1355 | page_referenced(page, 0, sc->mem_cgroup, &vm_flags)) { | ||
1356 | nr_rotated++; | 1390 | nr_rotated++; |
1357 | /* | 1391 | /* |
1358 | * Identify referenced, file-backed active pages and | 1392 | * Identify referenced, file-backed active pages and |
@@ -1501,6 +1535,13 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc, | |||
1501 | unsigned long ap, fp; | 1535 | unsigned long ap, fp; |
1502 | struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); | 1536 | struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); |
1503 | 1537 | ||
1538 | /* If we have no swap space, do not bother scanning anon pages. */ | ||
1539 | if (!sc->may_swap || (nr_swap_pages <= 0)) { | ||
1540 | percent[0] = 0; | ||
1541 | percent[1] = 100; | ||
1542 | return; | ||
1543 | } | ||
1544 | |||
1504 | anon = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) + | 1545 | anon = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) + |
1505 | zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON); | 1546 | zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON); |
1506 | file = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_FILE) + | 1547 | file = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_FILE) + |
@@ -1598,22 +1639,20 @@ static void shrink_zone(int priority, struct zone *zone, | |||
1598 | unsigned long nr_reclaimed = sc->nr_reclaimed; | 1639 | unsigned long nr_reclaimed = sc->nr_reclaimed; |
1599 | unsigned long nr_to_reclaim = sc->nr_to_reclaim; | 1640 | unsigned long nr_to_reclaim = sc->nr_to_reclaim; |
1600 | struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); | 1641 | struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); |
1601 | int noswap = 0; | ||
1602 | 1642 | ||
1603 | /* If we have no swap space, do not bother scanning anon pages. */ | 1643 | get_scan_ratio(zone, sc, percent); |
1604 | if (!sc->may_swap || (nr_swap_pages <= 0)) { | ||
1605 | noswap = 1; | ||
1606 | percent[0] = 0; | ||
1607 | percent[1] = 100; | ||
1608 | } else | ||
1609 | get_scan_ratio(zone, sc, percent); | ||
1610 | 1644 | ||
1611 | for_each_evictable_lru(l) { | 1645 | for_each_evictable_lru(l) { |
1612 | int file = is_file_lru(l); | 1646 | int file = is_file_lru(l); |
1613 | unsigned long scan; | 1647 | unsigned long scan; |
1614 | 1648 | ||
1649 | if (percent[file] == 0) { | ||
1650 | nr[l] = 0; | ||
1651 | continue; | ||
1652 | } | ||
1653 | |||
1615 | scan = zone_nr_lru_pages(zone, sc, l); | 1654 | scan = zone_nr_lru_pages(zone, sc, l); |
1616 | if (priority || noswap) { | 1655 | if (priority) { |
1617 | scan >>= priority; | 1656 | scan >>= priority; |
1618 | scan = (scan * percent[file]) / 100; | 1657 | scan = (scan * percent[file]) / 100; |
1619 | } | 1658 | } |
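With the no-swap case folded into get_scan_ratio(), shrink_zone() only needs to skip an LRU whose percentage is zero and otherwise scale the LRU size by priority and percentage. A standalone model of that per-LRU arithmetic, with invented page counts:

/* Model of the scan-target calculation above. */
#include <stdio.h>

static unsigned long scan_target(unsigned long lru_pages, int priority,
				 unsigned long percent)
{
	unsigned long scan;

	if (percent == 0)
		return 0;		/* e.g. anon LRUs with no swap */
	scan = lru_pages;
	if (priority) {
		scan >>= priority;	/* smaller slice when pressure is low */
		scan = (scan * percent) / 100;
	}
	return scan;
}

int main(void)
{
	/* 1,000,000 file pages at relaxed priority 12, 70% file pressure */
	printf("%lu\n", scan_target(1000000, 12, 70));	/* 244 -> 170 */
	/* same LRU at priority 0: scan it all, percentage not applied */
	printf("%lu\n", scan_target(1000000, 0, 70));	/* 1000000 */
	/* anon LRU with no swap space: skipped entirely */
	printf("%lu\n", scan_target(500000, 12, 0));	/* 0 */
	return 0;
}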
@@ -1694,8 +1733,7 @@ static void shrink_zones(int priority, struct zonelist *zonelist, | |||
1694 | continue; | 1733 | continue; |
1695 | note_zone_scanning_priority(zone, priority); | 1734 | note_zone_scanning_priority(zone, priority); |
1696 | 1735 | ||
1697 | if (zone_is_all_unreclaimable(zone) && | 1736 | if (zone->all_unreclaimable && priority != DEF_PRIORITY) |
1698 | priority != DEF_PRIORITY) | ||
1699 | continue; /* Let kswapd poll it */ | 1737 | continue; /* Let kswapd poll it */ |
1700 | sc->all_unreclaimable = 0; | 1738 | sc->all_unreclaimable = 0; |
1701 | } else { | 1739 | } else { |
@@ -1922,7 +1960,7 @@ static int sleeping_prematurely(pg_data_t *pgdat, int order, long remaining) | |||
1922 | if (!populated_zone(zone)) | 1960 | if (!populated_zone(zone)) |
1923 | continue; | 1961 | continue; |
1924 | 1962 | ||
1925 | if (zone_is_all_unreclaimable(zone)) | 1963 | if (zone->all_unreclaimable) |
1926 | continue; | 1964 | continue; |
1927 | 1965 | ||
1928 | if (!zone_watermark_ok(zone, order, high_wmark_pages(zone), | 1966 | if (!zone_watermark_ok(zone, order, high_wmark_pages(zone), |
@@ -2012,8 +2050,7 @@ loop_again: | |||
2012 | if (!populated_zone(zone)) | 2050 | if (!populated_zone(zone)) |
2013 | continue; | 2051 | continue; |
2014 | 2052 | ||
2015 | if (zone_is_all_unreclaimable(zone) && | 2053 | if (zone->all_unreclaimable && priority != DEF_PRIORITY) |
2016 | priority != DEF_PRIORITY) | ||
2017 | continue; | 2054 | continue; |
2018 | 2055 | ||
2019 | /* | 2056 | /* |
@@ -2056,13 +2093,9 @@ loop_again: | |||
2056 | if (!populated_zone(zone)) | 2093 | if (!populated_zone(zone)) |
2057 | continue; | 2094 | continue; |
2058 | 2095 | ||
2059 | if (zone_is_all_unreclaimable(zone) && | 2096 | if (zone->all_unreclaimable && priority != DEF_PRIORITY) |
2060 | priority != DEF_PRIORITY) | ||
2061 | continue; | 2097 | continue; |
2062 | 2098 | ||
2063 | if (!zone_watermark_ok(zone, order, | ||
2064 | high_wmark_pages(zone), end_zone, 0)) | ||
2065 | all_zones_ok = 0; | ||
2066 | temp_priority[i] = priority; | 2099 | temp_priority[i] = priority; |
2067 | sc.nr_scanned = 0; | 2100 | sc.nr_scanned = 0; |
2068 | note_zone_scanning_priority(zone, priority); | 2101 | note_zone_scanning_priority(zone, priority); |
@@ -2087,12 +2120,11 @@ loop_again: | |||
2087 | lru_pages); | 2120 | lru_pages); |
2088 | sc.nr_reclaimed += reclaim_state->reclaimed_slab; | 2121 | sc.nr_reclaimed += reclaim_state->reclaimed_slab; |
2089 | total_scanned += sc.nr_scanned; | 2122 | total_scanned += sc.nr_scanned; |
2090 | if (zone_is_all_unreclaimable(zone)) | 2123 | if (zone->all_unreclaimable) |
2091 | continue; | 2124 | continue; |
2092 | if (nr_slab == 0 && zone->pages_scanned >= | 2125 | if (nr_slab == 0 && |
2093 | (zone_reclaimable_pages(zone) * 6)) | 2126 | zone->pages_scanned >= (zone_reclaimable_pages(zone) * 6)) |
2094 | zone_set_flag(zone, | 2127 | zone->all_unreclaimable = 1; |
2095 | ZONE_ALL_UNRECLAIMABLE); | ||
2096 | /* | 2128 | /* |
2097 | * If we've done a decent amount of scanning and | 2129 | * If we've done a decent amount of scanning and |
2098 | * the reclaim ratio is low, start doing writepage | 2130 | * the reclaim ratio is low, start doing writepage |
@@ -2102,13 +2134,18 @@ loop_again: | |||
2102 | total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2) | 2134 | total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2) |
2103 | sc.may_writepage = 1; | 2135 | sc.may_writepage = 1; |
2104 | 2136 | ||
2105 | /* | 2137 | if (!zone_watermark_ok(zone, order, |
2106 | * We are still under min water mark. it mean we have | 2138 | high_wmark_pages(zone), end_zone, 0)) { |
2107 | * GFP_ATOMIC allocation failure risk. Hurry up! | 2139 | all_zones_ok = 0; |
2108 | */ | 2140 | /* |
2109 | if (!zone_watermark_ok(zone, order, min_wmark_pages(zone), | 2141 | * We are still under min water mark. This |
2110 | end_zone, 0)) | 2142 | * means that we have a GFP_ATOMIC allocation |
2111 | has_under_min_watermark_zone = 1; | 2143 | * failure risk. Hurry up! |
2144 | */ | ||
2145 | if (!zone_watermark_ok(zone, order, | ||
2146 | min_wmark_pages(zone), end_zone, 0)) | ||
2147 | has_under_min_watermark_zone = 1; | ||
2148 | } | ||
2112 | 2149 | ||
2113 | } | 2150 | } |
2114 | if (all_zones_ok) | 2151 | if (all_zones_ok) |
@@ -2550,6 +2587,7 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) | |||
2550 | * and RECLAIM_SWAP. | 2587 | * and RECLAIM_SWAP. |
2551 | */ | 2588 | */ |
2552 | p->flags |= PF_MEMALLOC | PF_SWAPWRITE; | 2589 | p->flags |= PF_MEMALLOC | PF_SWAPWRITE; |
2590 | lockdep_set_current_reclaim_state(gfp_mask); | ||
2553 | reclaim_state.reclaimed_slab = 0; | 2591 | reclaim_state.reclaimed_slab = 0; |
2554 | p->reclaim_state = &reclaim_state; | 2592 | p->reclaim_state = &reclaim_state; |
2555 | 2593 | ||
@@ -2593,6 +2631,7 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) | |||
2593 | 2631 | ||
2594 | p->reclaim_state = NULL; | 2632 | p->reclaim_state = NULL; |
2595 | current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE); | 2633 | current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE); |
2634 | lockdep_clear_current_reclaim_state(); | ||
2596 | return sc.nr_reclaimed >= nr_pages; | 2635 | return sc.nr_reclaimed >= nr_pages; |
2597 | } | 2636 | } |
2598 | 2637 | ||
@@ -2615,7 +2654,7 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) | |||
2615 | zone_page_state(zone, NR_SLAB_RECLAIMABLE) <= zone->min_slab_pages) | 2654 | zone_page_state(zone, NR_SLAB_RECLAIMABLE) <= zone->min_slab_pages) |
2616 | return ZONE_RECLAIM_FULL; | 2655 | return ZONE_RECLAIM_FULL; |
2617 | 2656 | ||
2618 | if (zone_is_all_unreclaimable(zone)) | 2657 | if (zone->all_unreclaimable) |
2619 | return ZONE_RECLAIM_FULL; | 2658 | return ZONE_RECLAIM_FULL; |
2620 | 2659 | ||
2621 | /* | 2660 | /* |
diff --git a/mm/vmstat.c b/mm/vmstat.c index 6051fbab67ba..7f760cbc73f3 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c | |||
@@ -139,7 +139,8 @@ static void refresh_zone_stat_thresholds(void) | |||
139 | threshold = calculate_threshold(zone); | 139 | threshold = calculate_threshold(zone); |
140 | 140 | ||
141 | for_each_online_cpu(cpu) | 141 | for_each_online_cpu(cpu) |
142 | zone_pcp(zone, cpu)->stat_threshold = threshold; | 142 | per_cpu_ptr(zone->pageset, cpu)->stat_threshold |
143 | = threshold; | ||
143 | } | 144 | } |
144 | } | 145 | } |
145 | 146 | ||
@@ -149,7 +150,8 @@ static void refresh_zone_stat_thresholds(void) | |||
149 | void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item, | 150 | void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item, |
150 | int delta) | 151 | int delta) |
151 | { | 152 | { |
152 | struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id()); | 153 | struct per_cpu_pageset *pcp = this_cpu_ptr(zone->pageset); |
154 | |||
153 | s8 *p = pcp->vm_stat_diff + item; | 155 | s8 *p = pcp->vm_stat_diff + item; |
154 | long x; | 156 | long x; |
155 | 157 | ||
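For context on the vm_stat_diff/stat_threshold pair touched here: each cpu batches small deltas in its own pageset and folds them into the zone-wide counter only once the local drift exceeds the threshold. The folding itself is outside this hunk, so the toy model below reconstructs it from the function's known behavior; it is not a line-for-line copy, and the plain cpu indexing stands in for the real percpu accessors.

/* Toy model of threshold-based per-cpu counter folding. */
#include <stdio.h>
#include <stdlib.h>

#define NR_CPUS 4

struct pageset_model {
	signed char stat_diff;		/* like vm_stat_diff[item] (an s8) */
	int stat_threshold;
};

static long global_count;
static struct pageset_model pcp[NR_CPUS];

static void mod_state(int cpu, int delta)
{
	struct pageset_model *p = &pcp[cpu];	/* per_cpu_ptr() stand-in */
	long x = p->stat_diff + delta;

	if (labs(x) > p->stat_threshold) {	/* drift too big: fold it */
		global_count += x;
		x = 0;
	}
	p->stat_diff = (signed char)x;
}

int main(void)
{
	int cpu, i;

	for (cpu = 0; cpu < NR_CPUS; cpu++)
		pcp[cpu].stat_threshold = 32;	/* refresh_zone_stat_thresholds() */

	for (i = 0; i < 100; i++)
		mod_state(i % NR_CPUS, 2);	/* 200 pages added in total */

	/* the global counter lags by at most NR_CPUS * stat_threshold */
	printf("global=%ld (exact total is 200)\n", global_count);
	return 0;
}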
@@ -202,7 +204,7 @@ EXPORT_SYMBOL(mod_zone_page_state); | |||
202 | */ | 204 | */ |
203 | void __inc_zone_state(struct zone *zone, enum zone_stat_item item) | 205 | void __inc_zone_state(struct zone *zone, enum zone_stat_item item) |
204 | { | 206 | { |
205 | struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id()); | 207 | struct per_cpu_pageset *pcp = this_cpu_ptr(zone->pageset); |
206 | s8 *p = pcp->vm_stat_diff + item; | 208 | s8 *p = pcp->vm_stat_diff + item; |
207 | 209 | ||
208 | (*p)++; | 210 | (*p)++; |
@@ -223,7 +225,7 @@ EXPORT_SYMBOL(__inc_zone_page_state); | |||
223 | 225 | ||
224 | void __dec_zone_state(struct zone *zone, enum zone_stat_item item) | 226 | void __dec_zone_state(struct zone *zone, enum zone_stat_item item) |
225 | { | 227 | { |
226 | struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id()); | 228 | struct per_cpu_pageset *pcp = this_cpu_ptr(zone->pageset); |
227 | s8 *p = pcp->vm_stat_diff + item; | 229 | s8 *p = pcp->vm_stat_diff + item; |
228 | 230 | ||
229 | (*p)--; | 231 | (*p)--; |
@@ -300,7 +302,7 @@ void refresh_cpu_vm_stats(int cpu) | |||
300 | for_each_populated_zone(zone) { | 302 | for_each_populated_zone(zone) { |
301 | struct per_cpu_pageset *p; | 303 | struct per_cpu_pageset *p; |
302 | 304 | ||
303 | p = zone_pcp(zone, cpu); | 305 | p = per_cpu_ptr(zone->pageset, cpu); |
304 | 306 | ||
305 | for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) | 307 | for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) |
306 | if (p->vm_stat_diff[i]) { | 308 | if (p->vm_stat_diff[i]) { |
@@ -741,7 +743,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, | |||
741 | for_each_online_cpu(i) { | 743 | for_each_online_cpu(i) { |
742 | struct per_cpu_pageset *pageset; | 744 | struct per_cpu_pageset *pageset; |
743 | 745 | ||
744 | pageset = zone_pcp(zone, i); | 746 | pageset = per_cpu_ptr(zone->pageset, i); |
745 | seq_printf(m, | 747 | seq_printf(m, |
746 | "\n cpu: %i" | 748 | "\n cpu: %i" |
747 | "\n count: %i" | 749 | "\n count: %i" |
@@ -761,7 +763,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, | |||
761 | "\n prev_priority: %i" | 763 | "\n prev_priority: %i" |
762 | "\n start_pfn: %lu" | 764 | "\n start_pfn: %lu" |
763 | "\n inactive_ratio: %u", | 765 | "\n inactive_ratio: %u", |
764 | zone_is_all_unreclaimable(zone), | 766 | zone->all_unreclaimable, |
765 | zone->prev_priority, | 767 | zone->prev_priority, |
766 | zone->zone_start_pfn, | 768 | zone->zone_start_pfn, |
767 | zone->inactive_ratio); | 769 | zone->inactive_ratio); |
@@ -906,6 +908,7 @@ static int __cpuinit vmstat_cpuup_callback(struct notifier_block *nfb, | |||
906 | case CPU_ONLINE: | 908 | case CPU_ONLINE: |
907 | case CPU_ONLINE_FROZEN: | 909 | case CPU_ONLINE_FROZEN: |
908 | start_cpu_timer(cpu); | 910 | start_cpu_timer(cpu); |
911 | node_set_state(cpu_to_node(cpu), N_CPU); | ||
909 | break; | 912 | break; |
910 | case CPU_DOWN_PREPARE: | 913 | case CPU_DOWN_PREPARE: |
911 | case CPU_DOWN_PREPARE_FROZEN: | 914 | case CPU_DOWN_PREPARE_FROZEN: |